diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..19261f54078ba22100bc5c0f0c9711ee692415d8
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "demo/ocr/PaddleOCR"]
+ path = demo/ocr/PaddleOCR
+ url = https://github.com/PaddlePaddle/PaddleOCR
diff --git a/.style.yapf b/.style.yapf
new file mode 100644
index 0000000000000000000000000000000000000000..b62febf509036e6b75d4d3ffa76754d6e2e80d98
--- /dev/null
+++ b/.style.yapf
@@ -0,0 +1,3 @@
+[style]
+based_on_style = pep8
+column_limit = 80
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..57bc88a15a0ee8266c259b2667e64608d3f7e292
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,202 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
diff --git a/README.md b/README.md
index 1c0f22919759f6bf2d0d8f980e18079ff1f3a220..68b203b9da5bbcd35eb54b595900bef9a683a859 100644
--- a/README.md
+++ b/README.md
@@ -1,61 +1,287 @@
-
-
# PaddleSlim
-PaddleSlim is a submodule of the PaddlePaddle framework, mainly used to compress models in the vision domain. It implements the three mainstream compression strategies (network pruning, quantization, and distillation) as well as hyperparameter search and small-model architecture search. Later versions will add more compression strategies and improve support for NLP models.
+中文 | [English](README_en.md)
-## Features
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://paddleslim.readthedocs.io/en/latest/)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddleslim.readthedocs.io/zh_CN/latest/)
+[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
-- Model pruning
- - Uniform channel pruning
- - Sensitivity-based pruning
- - Automated pruning based on evolutionary algorithms
+PaddleSlim is a toolkit for model compression. It contains a collection of compression strategies such as pruning, fixed-point quantization, knowledge distillation, hyperparameter search, and neural architecture search.
-- Quantization
- - Quantization-aware training (training aware)
- - Post-training quantization (post training)
- - Support for both global and channel-wise weight quantization
+For industrial users, PaddleSlim provides complete compression solutions for all kinds of vision tasks, such as image classification, detection, and segmentation.
+It also keeps exploring compression solutions for NLP models. In addition, PaddleSlim provides, and keeps improving, benchmarks of the compression strategies on classic open-source tasks
+for reference.
-- Distillation
+For researchers and developers of compression algorithms, PaddleSlim provides low-level auxiliary APIs for each compression strategy, making it easy to reproduce, survey, and apply the methods in the latest papers.
+PaddleSlim supports developers in innovating on compression strategies through framework capabilities, technical consulting, and real business scenarios.
-- Lightweight neural architecture search (Light-NAS)
- - Evolution-based lightweight architecture search (Light-NAS)
- - FLOPs / hardware-latency constrained search
- - Latency evaluation on multiple platforms
+## Features
-## Install
+
+| Module | Algorithms |
+|:---:|:---|
+| Pruning | Sensitivity Pruner: Li H., Kadav A., Durdanovic I., et al. "Pruning Filters for Efficient ConvNets." 2016.<br>AMC Pruner: He, Yihui, et al. "AMC: AutoML for Model Compression and Acceleration on Mobile Devices." 2018.<br>FPGM Pruner: He Y., Liu P., Wang Z., et al. "Filter Pruning via Geometric Median for Deep Convolutional Neural Networks Acceleration." CVPR 2019.<br>Slim Pruner: Liu Z., Li J., Shen Z., et al. "Learning Efficient Convolutional Networks through Network Slimming." 2017.<br>Opt Slim Pruner: Ye Y., You G., Fwu J. K., et al. "Channel Pruning via Optimal Thresholding." 2020. |
+| Quantization | – |
+| Distillation | Knowledge Distillation: Hinton, Geoffrey, Oriol Vinyals, and Jeff Dean. "Distilling the knowledge in a neural network." arXiv preprint arXiv:1503.02531 (2015).<br>FSP Knowledge Distillation: Yim, Junho, et al. "A gift from knowledge distillation: Fast optimization, network minimization and transfer learning." CVPR 2017.<br>YOLO Knowledge Distillation: Mehta, Rakesh, and Cemalettin Ozturk. "Object detection at 200 frames per second." ECCV 2018.<br>DML: Zhang, Ying, et al. "Deep mutual learning." CVPR 2018. |
+| Neural architecture search (NAS) | – |
+
-Before installing PaddleSlim, please make sure Paddle 1.6 or later is installed correctly; see the [Paddle installation guide](https://www.paddlepaddle.org.cn/install/quick).
+## Install
+```bash
+pip install paddleslim -i https://pypi.tuna.tsinghua.edu.cn/simple
+```
+### Matching quantization to your Paddle version
-- Install the develop version
+For inference on ARM or GPU, any version works; for inference on CPU, use PaddleSlim 1.1.0, which matches Paddle 2.0.
+- For the Paddle 1.7 series, install PaddleSlim 1.0.1
-```
-git clone https://github.com/PaddlePaddle/PaddleSlim.git
-cd PaddleSlim
-python setup.py install
+```bash
+pip install paddleslim==1.0.1 -i https://pypi.tuna.tsinghua.edu.cn/simple
```
-- Install the latest official release
+- For the Paddle 1.8 series, install PaddleSlim 1.1.1
+```bash
+pip install paddleslim==1.1.1 -i https://pypi.tuna.tsinghua.edu.cn/simple
```
-pip install paddleslim -i https://pypi.org/simple
+
+- For the Paddle 2.0 series, install PaddleSlim 1.1.0
+
+```bash
+pip install paddleslim==1.1.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
```
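+
+If you are unsure which combination is installed, a quick check (a minimal sketch; use `paddlepaddle-gpu` instead of `paddlepaddle` if you installed the GPU build) is:
+
+```bash
+# print the locally installed versions of Paddle and PaddleSlim
+pip show paddlepaddle paddleslim
+```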
-- Install a historical version
-See [pypi.org](https://pypi.org/project/paddleslim/#history) for the installable historical versions.
## Usage
-- [API documentation](doc/api_guide.md): introduction to the APIs, covering [distillation](), [pruning](), [quantization](), and [architecture search]().
-- [Examples](doc/demo_guide.md): compression examples on simple classification tasks such as MNIST and CIFAR-10; a quick way to experience and understand what PaddleSlim offers.
-- [Hands-on tutorials](): tutorials on analyzing and compressing classic models.
-- [Model zoo](): compressed classification, detection, and semantic segmentation models, including weights, network definitions, and performance data.
-- [PaddleDetection](): how to use PaddleSlim in the detection library.
-- [PaddleSeg](): how to use PaddleSlim in the segmentation library.
-- [PaddleLite](): how to deploy models produced by PaddleSlim with the PaddleLite inference library.
+- [Quick start](docs/zh_cn/quick_start): simple examples showing how to get started with PaddleSlim; a minimal sketch also follows this list.
+- [Advanced tutorials](docs/zh_cn/tutorials): advanced usage of PaddleSlim.
+- [Model zoo](docs/zh_cn/model_zoo.md): results of each compression strategy on image classification, object detection, and semantic segmentation models, including accuracy, inference speed, and downloadable pretrained models.
+- [API documentation](https://paddlepaddle.github.io/PaddleSlim/api_cn/index.html)
+- [Algorithm background](https://paddlepaddle.github.io/PaddleSlim/algo/algo.html): background on quantization, pruning, distillation, and NAS.
+- Compression of vision models
+  - [SlimMobileNet](paddleslim/models#slimmobilenet系列指标)
+  - [SlimFaceNet](demo/slimfacenet/README.md)
+  - [OCR model compression (based on PaddleOCR)](demo/ocr/README.md)
+  - [Detection model compression (based on PaddleDetection)](demo/detection/README.md)
+
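+As a first taste of the APIs linked above, the sketch below builds a hypothetical single-conv network in a Paddle 1.x static graph and reports its FLOPs with `paddleslim.analysis.flops`, the same helper used by the demos in this repository:
+
+```python
+import paddle.fluid as fluid
+from paddleslim.analysis import flops
+
+main_prog = fluid.Program()
+startup_prog = fluid.Program()
+with fluid.program_guard(main_prog, startup_prog):
+    # hypothetical toy network: one conv layer on a 224x224 RGB input
+    image = fluid.layers.data(name='image', shape=[3, 224, 224], dtype='float32')
+    conv = fluid.layers.conv2d(image, num_filters=32, filter_size=3)
+
+print("FLOPs: {}".format(flops(main_prog)))
+```
+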
+## Results of selected compression strategies
+
+### Image classification
+
+Dataset: ImageNet2012; Model: MobileNetV1
+
+|Strategy |Accuracy gain (baseline: 70.91%) |Model size (baseline: 17.0M)|
+|:---:|:---:|:---:|
+| Knowledge distillation (ResNet50)| [+1.06%]() |-|
+| Knowledge distillation (ResNet50) + int8 quantization-aware training |[+1.10%]()| [-71.76%]()|
+| Pruning (FLOPs -50%) + int8 quantization-aware training|[-1.71%]()|[-86.47%]()|
+
+
+### Object detection
+
+#### Dataset: Pascal VOC; Model: MobileNet-V1-YOLOv3
+
+| Method | mAP (baseline: 76.2%) | Model size (baseline: 94MB) |
+| :---------------------: | :------------: | :------------:|
+| Knowledge distillation (ResNet34-YOLOv3) | [+2.8%](#) | - |
+| Pruning (FLOPs -52.88%) | [+1.4%]() | [-67.76%]() |
+|Knowledge distillation (ResNet34-YOLOv3) + pruning (FLOPs -69.57%)| [+2.6%]()|[-67.00%]()|
+
+
+#### Dataset: COCO; Model: MobileNet-V1-YOLOv3
+
+| Method | mAP (baseline: 29.3%) | Model size |
+| :---------------------: | :------------: | :------:|
+| Knowledge distillation (ResNet34-YOLOv3) | [+2.1%]() |-|
+| Knowledge distillation (ResNet34-YOLOv3) + pruning (FLOPs -67.56%) | [-0.3%]() | [-66.90%]()|
+
+### NAS
+
+Dataset: ImageNet2012; Model: MobileNetV2
+
+|Device | Inference latency | Top-1 accuracy (baseline: 71.90%) |
+|:---------------:|:---------:|:--------------------:|
+| RK3288 | [-23%]() | +0.07% |
+| Android cellphone | [-20%]() | +0.16% |
+| iPhone 6s | [-17%]() | +0.32% |
+
+## License
+This project is released under the [Apache 2.0 license](LICENSE).
+
+## Contributing
-## Contributions and feedback
+Contributions to PaddleSlim are very welcome, and we greatly appreciate your feedback.
diff --git a/README_en.md b/README_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..fd7cf0fef814b65dca8336bcff0e08084b5c0a8d
--- /dev/null
+++ b/README_en.md
@@ -0,0 +1,133 @@
+[中文](README.md) | English
+
+Documentation: https://paddlepaddle.github.io/PaddleSlim
+
+# PaddleSlim
+
+PaddleSlim is a toolkit for model compression. It contains a collection of compression strategies, such as pruning, fixed point quantization, knowledge distillation, hyperparameter searching and neural architecture search.
+
+PaddleSlim provides compression solutions for computer vision models, such as image classification, object detection, and semantic segmentation. Meanwhile, PaddleSlim keeps exploring advanced compression strategies for language models. Furthermore, benchmarks of compression strategies on some open tasks are available for your reference.
+
+PaddleSlim also provides auxiliary and primitive APIs for developers and researchers to survey, implement, and apply the methods in the latest papers. PaddleSlim supports developers with framework capabilities and technology consulting.
+
+## Features
+
+### Pruning
+
+ - Uniform pruning of convolution
+ - Sensitivity-based pruning
+ - Automated pruning based on an evolutionary search strategy
+ - Support pruning of various deep architectures such as VGG, ResNet, and MobileNet.
+ - Support user-defined pruning ranges, i.e., which layers to prune.
+
+### Fixed Point Quantization
+
+ - **Training aware**
+ - Dynamic strategy: During inference, we quantize models with hyperparameters dynamically estimated from small batches of samples.
+ - Static strategy: During inference, we quantize models with the same hyperparameters estimated from training data.
+ - Support layer-wise and channel-wise quantization.
+ - **Post training**
+
+### Knowledge Distillation
+
+ - **Naive knowledge distillation:** transfers dark knowledge by merging the teacher and student model into the same Program
+ - **Paddle large-scale scalable knowledge distillation framework Pantheon:** a universal solution for knowledge distillation, more flexible than naive knowledge distillation and easier to scale to large-scale applications.
+
+ - Decouple the teacher and student models --- they run in different processes in the same or different nodes, and transfer knowledge via TCP/IP ports or local files;
+ - Friendly to assemble multiple teacher models and each of them can work in either online or offline mode independently;
+ - Merge knowledge from different teachers and make batch data for the student model automatically;
+ - Support the large-scale knowledge prediction of teacher models on multiple devices.
+
+### Neural Architecture Search
+
+ - Neural architecture search based on evolution strategy.
+ - Support distributed search.
+ - One-Shot neural architecture search.
+ - Support FLOPs and latency constrained search.
+ - Support the latency estimation on different hardware and platforms.
+
+## Install
+
+Requires:
+
+Paddle >= 1.7.0
+
+```bash
+pip install paddleslim -i https://pypi.org/simple
+```
+
+### Quantization
+
+If you want to use quantization in PaddleSlim, please install PaddleSlim as follows.
+
+If you want to deploy the quantized model on ARM or GPU, any PaddleSlim version works; for deployment on CPU, install PaddleSlim 1.1.0.
+
+- For Paddle 1.7, install PaddleSlim 1.0.1
+
+```bash
+pip install paddleslim==1.0.1 -i https://pypi.tuna.tsinghua.edu.cn/simple
+```
+
+- For Paddle 1.8, install PaddleSlim 1.1.1
+
+```bash
+pip install paddleslim==1.1.1 -i https://pypi.tuna.tsinghua.edu.cn/simple
+```
+
+- For Paddle 2.0, install PaddleSlim 1.1.0
+
+```bash
+pip install paddleslim==1.1.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
+```
+
+## Usage
+
+- [QuickStart](https://paddlepaddle.github.io/PaddleSlim/quick_start/index_en.html): Introduces how to use PaddleSlim through simple examples; a minimal pruning sketch also follows this list.
+- [Advanced Tutorials](https://paddlepaddle.github.io/PaddleSlim/tutorials/index_en.html): Tutorials on advanced usage of PaddleSlim.
+- [Model Zoo](https://paddlepaddle.github.io/PaddleSlim/model_zoo_en.html): Benchmarks and pretrained models.
+- [API Documents](https://paddlepaddle.github.io/PaddleSlim/api_en/index_en.html)
+- [Algorithm Background](https://paddlepaddle.github.io/PaddleSlim/algo/algo.html): Introduces the background of quantization, pruning, distillation, and NAS.
+- [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection/tree/master/slim): Introduce how to use PaddleSlim in PaddleDetection library.
+- [PaddleSeg](https://github.com/PaddlePaddle/PaddleSeg/tree/develop/slim): Introduce how to use PaddleSlim in PaddleSeg library.
+- [PaddleLite](https://paddlepaddle.github.io/Paddle-Lite/): How to use PaddleLite to deploy models generated by PaddleSlim.
+
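+The snippet below is a minimal sketch of the pruning API, mirroring the call pattern used by `demo/auto_prune` in this repository; `main_program` is assumed to hold an already-built network, and the parameter name `conv2d_0.w_0` is a hypothetical placeholder:
+
+```python
+import paddle.fluid as fluid
+from paddleslim.prune import Pruner
+
+pruner = Pruner()
+pruned_program = pruner.prune(
+    main_program,
+    fluid.global_scope(),
+    params=["conv2d_0.w_0"],  # hypothetical parameter name
+    ratios=[0.5],             # prune 50% of that weight's output channels
+    place=fluid.CPUPlace())
+```
+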
+## Performance
+
+### Image Classification
+
+Dataset: ImageNet2012; Model: MobileNetV1;
+
+|Method |Accuracy(baseline: 70.91%) |Model Size(baseline: 17.0M)|
+|:---:|:---:|:---:|
+| Knowledge Distillation(ResNet50)| [+1.06%]() |-|
+| Knowledge Distillation(ResNet50) + int8 quantization |[+1.10%]()| [-71.76%]()|
+| Pruning(FLOPs-50%) + int8 quantization|[-1.71%]()|[-86.47%]()|
+
+
+### Object Detection
+
+#### Dataset: Pascal VOC; Model: MobileNet-V1-YOLOv3
+
+| Method | mAP(baseline: 76.2%) | Model Size(baseline: 94MB) |
+| :---------------------: | :------------: | :------------:|
+| Knowledge Distillation(ResNet34-YOLOv3) | [+2.8%]() | - |
+| Pruning(FLOPs -52.88%) | [+1.4%]() | [-67.76%]() |
+|Knowledge Distillation(ResNet34-YOLOv3)+Pruning(FLOPs-69.57%)| [+2.6%]()|[-67.00%]()|
+
+
+#### Dataset: COCO; Model: MobileNet-V1-YOLOv3
+
+| Method | mAP(baseline: 29.3%) | Model Size|
+| :---------------------: | :------------: | :------:|
+| Knowledge Distillation(ResNet34-YOLOv3) | [+2.1%]() |-|
+| Knowledge Distillation(ResNet34-YOLOv3)+Pruning(FLOPs-67.56%) | [-0.3%]() | [-66.90%]()|
+
+### NAS
+
+Dataset: ImageNet2012; Model: MobileNetV2
+
+|Device | Infer time cost | Top1 accuracy(baseline:71.90%) |
+|:---------------:|:---------:|:--------------------:|
+| RK3288 | [-23%]() | +0.07% |
+| Android cellphone | [-20%]() | +0.16% |
+| iPhone 6s | [-17%]() | +0.32% |
diff --git a/demo/auto_prune/README.md b/demo/auto_prune/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9ada4cf4c848ef24e296aa3ebf4724c707354199
--- /dev/null
+++ b/demo/auto_prune/README.md
@@ -0,0 +1,63 @@
+This demo shows how to use automated pruning.
+By default, the demo automatically downloads and uses the MNIST dataset. The following models are supported:
+
+- MobileNetV1
+- MobileNetV2
+- ResNet50
+
+## 1. APIs
+
+This demo involves the following APIs:
+
+- [paddleslim.prune.AutoPruner]
+- [paddleslim.prune.Pruner]
+
+## 2. Running the demo
+
+
+Two automated pruning modes are provided: pruning to the target ratio in a single pass, and pruning iteratively over multiple rounds.
+
+### 2.1 One-shot pruning
+
+Run the following from the `PaddleSlim/demo/auto_prune` directory:
+
+```
+export CUDA_VISIBLE_DEVICES=0
+python train.py --model "MobileNet"
+# Collect the best pruning-ratio list from the log, add it to `ratiolist` in
+# train_finetune.py, then run the following command to finetune and get the final result:
+python train_finetune.py --model "MobileNet" --lr 0.1 --num_epochs 120 --step_epochs 30 60 90
+```
+
+Run `python train.py --help` to see more options.
+
+
+### 2.2 Iterative pruning
+
+Run the following from the `PaddleSlim/demo/auto_prune` directory:
+
+```
+export CUDA_VISIBLE_DEVICES=0
+python train_iterator.py --model "MobileNet"
+# Collect the best pruning-ratio list of this round from the log, add it to
+# `ratiolist` in train_finetune.py, then finetune the searched result:
+python train_finetune.py --model "MobileNet"
+# Add the best ratio list of the first round to `ratiolist` in train_iterator.py,
+# then run the second round:
+python train_iterator.py --model "MobileNet" --pretrained_model "checkpoint/Mobilenet/19"
+# Finetune the result of the second round, and keep iterating until the target
+# pruning ratio is reached.
+...
+# Finetune the final result:
+python train_finetune.py --model "MobileNet" --pretrained_model "checkpoint/Mobilenet/19" --num_epochs 70 --step_epochs 10 40
+```
+
+
+## 3. Notes
+
+### 3.1 One-shot pruning
+
+Among the arguments of `paddleslim.prune.AutoPruner`, `pruned_flops` specifies the minimum expected FLOPs pruning ratio; a construction sketch follows.
+
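+A minimal sketch of constructing the pruner, mirroring how train_iterator.py in this directory sets up the search (`val_program`, `train_program`, `place`, and `params` are assumed to come from your own setup, and `eval_fn` is a hypothetical scoring helper):
+
+```python
+from paddleslim.prune import AutoPruner
+
+pruner = AutoPruner(
+    val_program,
+    fluid.global_scope(),
+    place,
+    params=params,                    # names of the weights to prune
+    init_ratios=[0.1] * len(params),
+    pruned_flops=0.1,                 # expect at least a 10% FLOPs reduction
+    max_ratios=0.2,
+    is_server=True)
+
+# each search step proposes new ratios, is scored on the pruned eval program,
+# and the score is reported back to the pruner
+while True:
+    pruned_program, pruned_val_program = pruner.prune(train_program, val_program)
+    score = eval_fn(pruned_val_program)  # hypothetical evaluation helper
+    pruner.reward(score)
+```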
+
+### 3.2 Iterative pruning
+
+The pruning target of a single iteration should preferably not exceed 10%.
+When loading the result of the previous iteration, delete files such as `learning_rate` and `@LR_DECAY_COUNTER@` under the checkpoint directory; otherwise the old learning rate is inherited and hurts finetuning.
diff --git a/demo/auto_prune/train.py b/demo/auto_prune/train.py
index d65dd875a8d650b57bbe429514e99dc6fa46e630..ff55a54597ce7e923764f9e6f8dff232d2d28658 100644
--- a/demo/auto_prune/train.py
+++ b/demo/auto_prune/train.py
@@ -116,8 +116,8 @@ def compress(args):
fluid.io.load_vars(exe, args.pretrained_model, predicate=if_exist)
- val_reader = paddle.batch(val_reader, batch_size=args.batch_size)
- train_reader = paddle.batch(
+ val_reader = paddle.fluid.io.batch(val_reader, batch_size=args.batch_size)
+ train_reader = paddle.fluid.io.batch(
train_reader, batch_size=args.batch_size, drop_last=True)
train_feeder = feeder = fluid.DataFeeder([image, label], place)
diff --git a/demo/auto_prune/train_finetune.py b/demo/auto_prune/train_finetune.py
new file mode 100644
index 0000000000000000000000000000000000000000..0dcb56bfe7988475cc7c63ebde257897456bfb22
--- /dev/null
+++ b/demo/auto_prune/train_finetune.py
@@ -0,0 +1,227 @@
+import os
+import sys
+import logging
+import paddle
+import argparse
+import functools
+import math
+import paddle.fluid as fluid
+import imagenet_reader as reader
+import models
+from utility import add_arguments, print_arguments
+import numpy as np
+import time
+from paddleslim.prune import Pruner
+from paddleslim.analysis import flops
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('batch_size', int, 64 * 4, "Minibatch size.")
+add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
+add_arg('model', str, "MobileNet", "The target model.")
+add_arg('model_save_dir', str, "./", "The directory to save checkpoints.")
+add_arg('pretrained_model', str, "../pretrained_model/MobileNetV1_pretained", "The path of the pretrained model.")
+add_arg('lr', float, 0.01, "The learning rate used to fine-tune pruned model.")
+add_arg('lr_strategy', str, "piecewise_decay", "The learning rate decay strategy.")
+add_arg('l2_decay', float, 3e-5, "The l2_decay parameter.")
+add_arg('momentum_rate', float, 0.9, "The value of momentum_rate.")
+add_arg('num_epochs', int, 20, "The number of total epochs.")
+add_arg('total_images', int, 1281167, "The number of total training images.")
+parser.add_argument('--step_epochs', nargs='+', type=int, default=[5, 15], help="piecewise decay step")
+add_arg('config_file', str, None, "The config file for compression with yaml format.")
+# yapf: enable
+
+model_list = [m for m in dir(models) if "__" not in m]
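+# Paste the best pruning-ratio lists reported in the search log here (see
+# README.md); each inner list is one round of per-layer pruning ratios.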
+ratiolist = [
+ # [0.06, 0.0, 0.09, 0.03, 0.09, 0.02, 0.05, 0.03, 0.0, 0.07, 0.07, 0.05, 0.08],
+ # [0.08, 0.02, 0.03, 0.13, 0.1, 0.06, 0.03, 0.04, 0.14, 0.02, 0.03, 0.02, 0.01],
+]
+
+
+def save_model(args, exe, train_prog, eval_prog, info):
+ model_path = os.path.join(args.model_save_dir, args.model, str(info))
+ if not os.path.isdir(model_path):
+ os.makedirs(model_path)
+ fluid.io.save_persistables(exe, model_path, main_program=train_prog)
+    print("Saved model to %s" % model_path)
+
+
+def piecewise_decay(args):
+ step = int(math.ceil(float(args.total_images) / args.batch_size))
+ bd = [step * e for e in args.step_epochs]
+ lr = [args.lr * (0.1**i) for i in range(len(bd) + 1)]
+ learning_rate = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
+ optimizer = fluid.optimizer.Momentum(
+ learning_rate=learning_rate,
+ momentum=args.momentum_rate,
+ regularization=fluid.regularizer.L2Decay(args.l2_decay))
+ return optimizer
+
+
+def cosine_decay(args):
+ step = int(math.ceil(float(args.total_images) / args.batch_size))
+ learning_rate = fluid.layers.cosine_decay(
+ learning_rate=args.lr, step_each_epoch=step, epochs=args.num_epochs)
+ optimizer = fluid.optimizer.Momentum(
+ learning_rate=learning_rate,
+ momentum=args.momentum_rate,
+ regularization=fluid.regularizer.L2Decay(args.l2_decay))
+ return optimizer
+
+
+def create_optimizer(args):
+ if args.lr_strategy == "piecewise_decay":
+ return piecewise_decay(args)
+ elif args.lr_strategy == "cosine_decay":
+ return cosine_decay(args)
+
+
+def compress(args):
+ class_dim = 1000
+ image_shape = "3,224,224"
+ image_shape = [int(m) for m in image_shape.split(",")]
+ assert args.model in model_list, "{} is not in lists: {}".format(
+ args.model, model_list)
+ image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+ # model definition
+ model = models.__dict__[args.model]()
+ out = model.net(input=image, class_dim=class_dim)
+ cost = fluid.layers.cross_entropy(input=out, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+ acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+ acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+ val_program = fluid.default_main_program().clone(for_test=True)
+ opt = create_optimizer(args)
+ opt.minimize(avg_cost)
+ place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+
+ if args.pretrained_model:
+
+ def if_exist(var):
+ exist = os.path.exists(
+ os.path.join(args.pretrained_model, var.name))
+ print("exist", exist)
+ return exist
+
+ #fluid.io.load_vars(exe, args.pretrained_model, predicate=if_exist)
+
+ val_reader = paddle.fluid.io.batch(reader.val(), batch_size=args.batch_size)
+ train_reader = paddle.fluid.io.batch(
+ reader.train(), batch_size=args.batch_size, drop_last=True)
+
+ train_feeder = feeder = fluid.DataFeeder([image, label], place)
+ val_feeder = feeder = fluid.DataFeeder(
+ [image, label], place, program=val_program)
+
+ def test(epoch, program):
+ batch_id = 0
+ acc_top1_ns = []
+ acc_top5_ns = []
+ for data in val_reader():
+ start_time = time.time()
+ acc_top1_n, acc_top5_n = exe.run(
+ program,
+ feed=train_feeder.feed(data),
+ fetch_list=[acc_top1.name, acc_top5.name])
+ end_time = time.time()
+ print(
+ "Eval epoch[{}] batch[{}] - acc_top1: {}; acc_top5: {}; time: {}".
+ format(epoch, batch_id,
+ np.mean(acc_top1_n),
+ np.mean(acc_top5_n), end_time - start_time))
+ acc_top1_ns.append(np.mean(acc_top1_n))
+ acc_top5_ns.append(np.mean(acc_top5_n))
+ batch_id += 1
+
+ print("Final eval epoch[{}] - acc_top1: {}; acc_top5: {}".format(
+ epoch,
+ np.mean(np.array(acc_top1_ns)), np.mean(np.array(acc_top5_ns))))
+
+ def train(epoch, program):
+
+ build_strategy = fluid.BuildStrategy()
+ exec_strategy = fluid.ExecutionStrategy()
+ train_program = fluid.compiler.CompiledProgram(
+ program).with_data_parallel(
+ loss_name=avg_cost.name,
+ build_strategy=build_strategy,
+ exec_strategy=exec_strategy)
+
+ batch_id = 0
+ for data in train_reader():
+ start_time = time.time()
+ loss_n, acc_top1_n, acc_top5_n, lr_n = exe.run(
+ train_program,
+ feed=train_feeder.feed(data),
+ fetch_list=[
+ avg_cost.name, acc_top1.name, acc_top5.name,
+ "learning_rate"
+ ])
+ end_time = time.time()
+ loss_n = np.mean(loss_n)
+ acc_top1_n = np.mean(acc_top1_n)
+ acc_top5_n = np.mean(acc_top5_n)
+ lr_n = np.mean(lr_n)
+ print(
+ "epoch[{}]-batch[{}] - loss: {}; acc_top1: {}; acc_top5: {};lrn: {}; time: {}".
+ format(epoch, batch_id, loss_n, acc_top1_n, acc_top5_n, lr_n,
+ end_time - start_time))
+ batch_id += 1
+
+ params = []
+ for param in fluid.default_main_program().global_block().all_parameters():
+ #if "_weights" in param.name and "conv1_weights" not in param.name:
+ if "_sep_weights" in param.name:
+ params.append(param.name)
+    print("FLOPs before pruning: {}".format(
+        flops(fluid.default_main_program())))
+ pruned_program_iter = fluid.default_main_program()
+ pruned_val_program_iter = val_program
+ for ratios in ratiolist:
+ pruner = Pruner()
+ pruned_val_program_iter = pruner.prune(
+ pruned_val_program_iter,
+ fluid.global_scope(),
+ params=params,
+ ratios=ratios,
+ place=place,
+ only_graph=True)
+
+ pruned_program_iter = pruner.prune(
+ pruned_program_iter,
+ fluid.global_scope(),
+ params=params,
+ ratios=ratios,
+ place=place)
+        print("FLOPs after pruning: {}".format(flops(pruned_program_iter)))
+        # do not inherit the learning rate from the previous checkpoint
+ if (os.path.exists(args.pretrained_model + "/learning_rate")):
+ os.remove(args.pretrained_model + "/learning_rate")
+ if (os.path.exists(args.pretrained_model + "/@LR_DECAY_COUNTER@")):
+ os.remove(args.pretrained_model + "/@LR_DECAY_COUNTER@")
+ fluid.io.load_vars(
+ exe,
+ args.pretrained_model,
+ main_program=pruned_program_iter,
+ predicate=if_exist)
+
+ pruned_program = pruned_program_iter
+ pruned_val_program = pruned_val_program_iter
+ for i in range(args.num_epochs):
+ train(i, pruned_program)
+ test(i, pruned_val_program)
+ save_model(args, exe, pruned_program, pruned_val_program, i)
+
+
+def main():
+ args = parser.parse_args()
+ print_arguments(args)
+ compress(args)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/demo/auto_prune/train_iterator.py b/demo/auto_prune/train_iterator.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b7ca07f768e661476a5943dbc623291a37f939a
--- /dev/null
+++ b/demo/auto_prune/train_iterator.py
@@ -0,0 +1,254 @@
+import os
+import sys
+import logging
+import paddle
+import argparse
+import functools
+import math
+import time
+import numpy as np
+import paddle.fluid as fluid
+from paddleslim.prune import AutoPruner
+from paddleslim.common import get_logger
+from paddleslim.analysis import flops
+from paddleslim.prune import Pruner
+sys.path.append(sys.path[0] + "/../")
+import models
+from utility import add_arguments, print_arguments
+
+_logger = get_logger(__name__, level=logging.INFO)
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('batch_size', int, 64 * 4, "Minibatch size.")
+add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
+add_arg('model', str, "MobileNet", "The target model.")
+add_arg('pretrained_model', str, "../pretrained_model/MobileNetV1_pretained", "The path of the pretrained model.")
+add_arg('model_save_dir', str, "./", "The directory to save checkpoints.")
+add_arg('lr', float, 0.1, "The learning rate used to fine-tune pruned model.")
+add_arg('lr_strategy', str, "piecewise_decay", "The learning rate decay strategy.")
+add_arg('l2_decay', float, 3e-5, "The l2_decay parameter.")
+add_arg('momentum_rate', float, 0.9, "The value of momentum_rate.")
+add_arg('num_epochs', int, 120, "The number of total epochs.")
+add_arg('total_images', int, 1281167, "The number of total training images.")
+parser.add_argument('--step_epochs', nargs='+', type=int, default=[30, 60, 90], help="piecewise decay step")
+add_arg('config_file', str, None, "The config file for compression with yaml format.")
+add_arg('data', str, "mnist", "Which data to use. 'mnist' or 'imagenet'")
+add_arg('log_period', int, 10, "Log period in batches.")
+add_arg('test_period', int, 10, "Test period in epoches.")
+# yapf: enable
+
+model_list = [m for m in dir(models) if "__" not in m]
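+# Paste the best pruning-ratio lists reported in the search log here (see
+# README.md); each inner list is one round of per-layer pruning ratios.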
+ratiolist = [
+ # [0.06, 0.0, 0.09, 0.03, 0.09, 0.02, 0.05, 0.03, 0.0, 0.07, 0.07, 0.05, 0.08],
+ # [0.08, 0.02, 0.03, 0.13, 0.1, 0.06, 0.03, 0.04, 0.14, 0.02, 0.03, 0.02, 0.01],
+]
+
+
+def piecewise_decay(args):
+ step = int(math.ceil(float(args.total_images) / args.batch_size))
+ bd = [step * e for e in args.step_epochs]
+ lr = [args.lr * (0.1**i) for i in range(len(bd) + 1)]
+ learning_rate = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
+ optimizer = fluid.optimizer.Momentum(
+ learning_rate=learning_rate,
+ momentum=args.momentum_rate,
+ regularization=fluid.regularizer.L2Decay(args.l2_decay))
+ return optimizer
+
+
+def cosine_decay(args):
+ step = int(math.ceil(float(args.total_images) / args.batch_size))
+ learning_rate = fluid.layers.cosine_decay(
+ learning_rate=args.lr, step_each_epoch=step, epochs=args.num_epochs)
+ optimizer = fluid.optimizer.Momentum(
+ learning_rate=learning_rate,
+ momentum=args.momentum_rate,
+ regularization=fluid.regularizer.L2Decay(args.l2_decay))
+ return optimizer
+
+
+def create_optimizer(args):
+ if args.lr_strategy == "piecewise_decay":
+ return piecewise_decay(args)
+ elif args.lr_strategy == "cosine_decay":
+ return cosine_decay(args)
+
+
+def compress(args):
+
+ train_reader = None
+ test_reader = None
+ if args.data == "mnist":
+ import paddle.dataset.mnist as reader
+ train_reader = reader.train()
+ val_reader = reader.test()
+ class_dim = 10
+ image_shape = "1,28,28"
+ elif args.data == "imagenet":
+ import imagenet_reader as reader
+ train_reader = reader.train()
+ val_reader = reader.val()
+ class_dim = 1000
+ image_shape = "3,224,224"
+ else:
+ raise ValueError("{} is not supported.".format(args.data))
+
+ image_shape = [int(m) for m in image_shape.split(",")]
+ assert args.model in model_list, "{} is not in lists: {}".format(
+ args.model, model_list)
+ image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+ # model definition
+ model = models.__dict__[args.model]()
+ out = model.net(input=image, class_dim=class_dim)
+ cost = fluid.layers.cross_entropy(input=out, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+ acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+ acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+ val_program = fluid.default_main_program().clone(for_test=True)
+ opt = create_optimizer(args)
+ opt.minimize(avg_cost)
+ place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+
+ if args.pretrained_model:
+
+ def if_exist(var):
+ return os.path.exists(
+ os.path.join(args.pretrained_model, var.name))
+
+# fluid.io.load_vars(exe, args.pretrained_model, predicate=if_exist)
+
+ val_reader = paddle.fluid.io.batch(val_reader, batch_size=args.batch_size)
+ train_reader = paddle.fluid.io.batch(
+ train_reader, batch_size=args.batch_size, drop_last=True)
+
+ train_feeder = feeder = fluid.DataFeeder([image, label], place)
+ val_feeder = feeder = fluid.DataFeeder(
+ [image, label], place, program=val_program)
+
+ def test(epoch, program):
+ batch_id = 0
+ acc_top1_ns = []
+ acc_top5_ns = []
+ for data in val_reader():
+ start_time = time.time()
+ acc_top1_n, acc_top5_n = exe.run(
+ program,
+ feed=train_feeder.feed(data),
+ fetch_list=[acc_top1.name, acc_top5.name])
+ end_time = time.time()
+ if batch_id % args.log_period == 0:
+ _logger.info(
+ "Eval epoch[{}] batch[{}] - acc_top1: {}; acc_top5: {}; time: {}".
+ format(epoch, batch_id,
+ np.mean(acc_top1_n),
+ np.mean(acc_top5_n), end_time - start_time))
+ acc_top1_ns.append(np.mean(acc_top1_n))
+ acc_top5_ns.append(np.mean(acc_top5_n))
+ batch_id += 1
+
+ _logger.info("Final eval epoch[{}] - acc_top1: {}; acc_top5: {}".
+ format(epoch,
+ np.mean(np.array(acc_top1_ns)),
+ np.mean(np.array(acc_top5_ns))))
+ return np.mean(np.array(acc_top1_ns))
+
+ def train(epoch, program):
+
+ build_strategy = fluid.BuildStrategy()
+ exec_strategy = fluid.ExecutionStrategy()
+ train_program = fluid.compiler.CompiledProgram(
+ program).with_data_parallel(
+ loss_name=avg_cost.name,
+ build_strategy=build_strategy,
+ exec_strategy=exec_strategy)
+
+ batch_id = 0
+ for data in train_reader():
+ start_time = time.time()
+ loss_n, acc_top1_n, acc_top5_n = exe.run(
+ train_program,
+ feed=train_feeder.feed(data),
+ fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name])
+ end_time = time.time()
+ loss_n = np.mean(loss_n)
+ acc_top1_n = np.mean(acc_top1_n)
+ acc_top5_n = np.mean(acc_top5_n)
+ if batch_id % args.log_period == 0:
+ _logger.info(
+ "epoch[{}]-batch[{}] - loss: {}; acc_top1: {}; acc_top5: {}; time: {}".
+ format(epoch, batch_id, loss_n, acc_top1_n, acc_top5_n,
+ end_time - start_time))
+ batch_id += 1
+
+ params = []
+ for param in fluid.default_main_program().global_block().all_parameters():
+ if "_sep_weights" in param.name:
+ params.append(param.name)
+
+ pruned_program_iter = fluid.default_main_program()
+ pruned_val_program_iter = val_program
+
+ for ratios in ratiolist:
+ pruner = Pruner()
+ pruned_val_program_iter = pruner.prune(
+ pruned_val_program_iter,
+ fluid.global_scope(),
+ params=params,
+ ratios=ratios,
+ place=place,
+ only_graph=True)
+
+ pruned_program_iter = pruner.prune(
+ pruned_program_iter,
+ fluid.global_scope(),
+ params=params,
+ ratios=ratios,
+ place=place)
+        print("FLOPs after pruning: {}".format(flops(pruned_program_iter)))
+ fluid.io.load_vars(
+ exe,
+ args.pretrained_model,
+ main_program=pruned_program_iter,
+ predicate=if_exist)
+
+ pruner = AutoPruner(
+ pruned_val_program_iter,
+ fluid.global_scope(),
+ place,
+ params=params,
+ init_ratios=[0.1] * len(params),
+ pruned_flops=0.1,
+ pruned_latency=None,
+ server_addr=("", 0),
+ init_temperature=100,
+ reduce_rate=0.85,
+ max_try_times=300,
+ max_client_num=10,
+ search_steps=100,
+ max_ratios=0.2,
+ min_ratios=0.,
+ is_server=True,
+ key="auto_pruner")
+
+ while True:
+ pruned_program, pruned_val_program = pruner.prune(
+ pruned_program_iter, pruned_val_program_iter)
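+        # finetuning is skipped during the search (range(0) runs zero epochs);
+        # each candidate is scored directly by test() below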
+ for i in range(0):
+ train(i, pruned_program)
+ score = test(0, pruned_val_program)
+ pruner.reward(score)
+
+
+def main():
+ args = parser.parse_args()
+ print_arguments(args)
+ compress(args)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/demo/bert/run.sh b/demo/bert/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..781d9314ba69508ed055adbc2cd0cc7c69fc4856
--- /dev/null
+++ b/demo/bert/run.sh
@@ -0,0 +1 @@
+CUDA_VISIBLE_DEVICES=0 python2 -u train_cell_base.py
diff --git a/demo/bert/train_distill.py b/demo/bert/train_distill.py
new file mode 100755
index 0000000000000000000000000000000000000000..8f4758b05b700a6d39440e6e08bc68913dc4f7e9
--- /dev/null
+++ b/demo/bert/train_distill.py
@@ -0,0 +1,204 @@
+import numpy as np
+import paddle.fluid as fluid
+from paddleslim.teachers.bert.reader.cls import *
+from paddleslim.nas.darts.search_space import AdaBERTClassifier
+from paddle.fluid.dygraph.base import to_variable
+from tqdm import tqdm
+import os
+import pickle
+
+import logging
+from paddleslim.common import AvgrageMeter, get_logger
+logger = get_logger(__name__, level=logging.INFO)
+
+
+def valid_one_epoch(model, valid_loader, epoch, log_freq):
+ accs = AvgrageMeter()
+ ce_losses = AvgrageMeter()
+ model.student.eval()
+
+ step_id = 0
+ for valid_data in valid_loader():
+ try:
+ loss, acc, ce_loss, _, _ = model._layers.loss(valid_data, epoch)
+        except AttributeError:  # model is not wrapped by DataParallel
+ loss, acc, ce_loss, _, _ = model.loss(valid_data, epoch)
+
+ batch_size = valid_data[0].shape[0]
+ ce_losses.update(ce_loss.numpy(), batch_size)
+ accs.update(acc.numpy(), batch_size)
+ step_id += 1
+ return ce_losses.avg[0], accs.avg[0]
+
+
+def train_one_epoch(model, train_loader, optimizer, epoch, use_data_parallel,
+ log_freq):
+ total_losses = AvgrageMeter()
+ accs = AvgrageMeter()
+ ce_losses = AvgrageMeter()
+ kd_losses = AvgrageMeter()
+ model.student.train()
+
+ step_id = 0
+ for train_data in train_loader():
+ batch_size = train_data[0].shape[0]
+
+ if use_data_parallel:
+ total_loss, acc, ce_loss, kd_loss, _ = model._layers.loss(
+ train_data, epoch)
+ else:
+ total_loss, acc, ce_loss, kd_loss, _ = model.loss(train_data,
+ epoch)
+
+ if use_data_parallel:
+ total_loss = model.scale_loss(total_loss)
+ total_loss.backward()
+ model.apply_collective_grads()
+ else:
+ total_loss.backward()
+ optimizer.minimize(total_loss)
+ model.clear_gradients()
+ total_losses.update(total_loss.numpy(), batch_size)
+ accs.update(acc.numpy(), batch_size)
+ ce_losses.update(ce_loss.numpy(), batch_size)
+ kd_losses.update(kd_loss.numpy(), batch_size)
+
+ if step_id % log_freq == 0:
+ logger.info(
+ "Train Epoch {}, Step {}, Lr {:.6f} total_loss {:.6f}; ce_loss {:.6f}, kd_loss {:.6f}, train_acc {:.6f};".
+ format(epoch, step_id,
+ optimizer.current_step_lr(), total_losses.avg[0],
+ ce_losses.avg[0], kd_losses.avg[0], accs.avg[0]))
+ step_id += 1
+
+
+def main():
+ # whether use multi-gpus
+ use_data_parallel = False
+ place = fluid.CUDAPlace(fluid.dygraph.parallel.Env(
+ ).dev_id) if use_data_parallel else fluid.CUDAPlace(0)
+
+ BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12"
+ vocab_path = BERT_BASE_PATH + "/vocab.txt"
+
+ do_lower_case = True
+ # augmented dataset nums
+ # num_samples = 8016987
+
+ max_seq_len = 128
+ batch_size = 192
+ hidden_size = 768
+ emb_size = 768
+ epoch = 80
+ log_freq = 10
+
+ task_name = 'mnli'
+
+ if task_name == 'mrpc':
+ data_dir = "./data/glue_data/MRPC/"
+ teacher_model_dir = "./data/teacher_model/mrpc"
+ num_samples = 3668
+ max_layer = 4
+ num_labels = 2
+ processor_func = MrpcProcessor
+ elif task_name == 'mnli':
+ data_dir = "./data/glue_data/MNLI/"
+ teacher_model_dir = "./data/teacher_model/steps_23000"
+ num_samples = 392702
+ max_layer = 8
+ num_labels = 3
+ processor_func = MnliProcessor
+
+ device_num = fluid.dygraph.parallel.Env().nranks
+ use_fixed_gumbel = True
+ train_phase = "train"
+ val_phase = "dev"
+ step_per_epoch = int(num_samples / (batch_size * device_num))
+
+ with fluid.dygraph.guard(place):
+ if use_fixed_gumbel:
+ # make sure gumbel arch is constant
+ np.random.seed(1)
+ fluid.default_main_program().random_seed = 1
+ model = AdaBERTClassifier(
+ num_labels,
+ n_layer=max_layer,
+ hidden_size=hidden_size,
+ task_name=task_name,
+ emb_size=emb_size,
+ teacher_model=teacher_model_dir,
+ data_dir=data_dir,
+ use_fixed_gumbel=use_fixed_gumbel)
+
+ learning_rate = fluid.dygraph.CosineDecay(2e-2, step_per_epoch, epoch)
+
+ model_parameters = []
+ for p in model.parameters():
+ if (p.name not in [a.name for a in model.arch_parameters()] and
+ p.name not in
+ [a.name for a in model.teacher.parameters()]):
+ model_parameters.append(p)
+
+ optimizer = fluid.optimizer.MomentumOptimizer(
+ learning_rate,
+ 0.9,
+ regularization=fluid.regularizer.L2DecayRegularizer(3e-4),
+ parameter_list=model_parameters)
+
+ processor = processor_func(
+ data_dir=data_dir,
+ vocab_path=vocab_path,
+ max_seq_len=max_seq_len,
+ do_lower_case=do_lower_case,
+ in_tokens=False)
+
+ train_reader = processor.data_generator(
+ batch_size=batch_size,
+ phase=train_phase,
+ epoch=1,
+ dev_count=1,
+ shuffle=True)
+ dev_reader = processor.data_generator(
+ batch_size=batch_size,
+ phase=val_phase,
+ epoch=1,
+ dev_count=1,
+ shuffle=False)
+
+ if use_data_parallel:
+ train_reader = fluid.contrib.reader.distributed_batch_reader(
+ train_reader)
+
+ train_loader = fluid.io.DataLoader.from_generator(
+ capacity=128,
+ use_double_buffer=True,
+ iterable=True,
+ return_list=True)
+ dev_loader = fluid.io.DataLoader.from_generator(
+ capacity=128,
+ use_double_buffer=True,
+ iterable=True,
+ return_list=True)
+
+ train_loader.set_batch_generator(train_reader, places=place)
+ dev_loader.set_batch_generator(dev_reader, places=place)
+
+ if use_data_parallel:
+ strategy = fluid.dygraph.parallel.prepare_context()
+ model = fluid.dygraph.parallel.DataParallel(model, strategy)
+
+ best_valid_acc = 0
+ for epoch_id in range(epoch):
+ train_one_epoch(model, train_loader, optimizer, epoch_id,
+ use_data_parallel, log_freq)
+ loss, acc = valid_one_epoch(model, dev_loader, epoch_id, log_freq)
+ if acc > best_valid_acc:
+ best_valid_acc = acc
+ logger.info(
+ "dev set, ce_loss {:.6f}; acc {:.6f}, best_acc {:.6f};".format(
+ loss, acc, best_valid_acc))
+
+
+if __name__ == '__main__':
+ main()
diff --git a/demo/bert/train_search.py b/demo/bert/train_search.py
new file mode 100755
index 0000000000000000000000000000000000000000..685c9f995ca2805e6c03cabc68d702ff9600da78
--- /dev/null
+++ b/demo/bert/train_search.py
@@ -0,0 +1,232 @@
+import numpy as np
+try:
+    from itertools import izip  # Python 2
+except ImportError:
+    izip = zip  # Python 3
+import paddle.fluid as fluid
+from paddleslim.teachers.bert.reader.cls import *
+from paddleslim.nas.darts.search_space import AdaBERTClassifier
+from paddle.fluid.dygraph.base import to_variable
+from tqdm import tqdm
+import os
+import pickle
+
+import logging
+from paddleslim.common import AvgrageMeter, get_logger
+logger = get_logger(__name__, level=logging.INFO)
+
+
+def valid_one_epoch(model, valid_loader, epoch, log_freq):
+ accs = AvgrageMeter()
+ ce_losses = AvgrageMeter()
+ model.student.eval()
+
+ step_id = 0
+ for valid_data in valid_loader():
+ try:
+ loss, acc, ce_loss, _, _ = model._layers.loss(valid_data, epoch)
+        except AttributeError:  # model is not wrapped by DataParallel
+ loss, acc, ce_loss, _, _ = model.loss(valid_data, epoch)
+
+ batch_size = valid_data[0].shape[0]
+ ce_losses.update(ce_loss.numpy(), batch_size)
+ accs.update(acc.numpy(), batch_size)
+ step_id += 1
+ return ce_losses.avg[0], accs.avg[0]
+
+
+def train_one_epoch(model, train_loader, valid_loader, optimizer,
+ arch_optimizer, epoch, use_data_parallel, log_freq):
+ total_losses = AvgrageMeter()
+ accs = AvgrageMeter()
+ ce_losses = AvgrageMeter()
+ kd_losses = AvgrageMeter()
+ val_accs = AvgrageMeter()
+ model.student.train()
+
+ step_id = 0
+ for train_data, valid_data in izip(train_loader(), valid_loader()):
+ batch_size = train_data[0].shape[0]
+        # make sure the arch on every gpu is the same; otherwise an error occurs
+ np.random.seed(step_id * 2 * (epoch + 1))
+ if use_data_parallel:
+ total_loss, acc, ce_loss, kd_loss, _ = model._layers.loss(
+ train_data, epoch)
+ else:
+ total_loss, acc, ce_loss, kd_loss, _ = model.loss(train_data,
+ epoch)
+
+ if use_data_parallel:
+ total_loss = model.scale_loss(total_loss)
+ total_loss.backward()
+ model.apply_collective_grads()
+ else:
+ total_loss.backward()
+ optimizer.minimize(total_loss)
+ model.clear_gradients()
+ total_losses.update(total_loss.numpy(), batch_size)
+ accs.update(acc.numpy(), batch_size)
+ ce_losses.update(ce_loss.numpy(), batch_size)
+ kd_losses.update(kd_loss.numpy(), batch_size)
+
+        # make sure the arch on every gpu is the same; otherwise an error occurs
+ np.random.seed(step_id * 2 * (epoch + 1) + 1)
+ if use_data_parallel:
+ arch_loss, _, _, _, arch_logits = model._layers.loss(valid_data,
+ epoch)
+ else:
+ arch_loss, _, _, _, arch_logits = model.loss(valid_data, epoch)
+
+ if use_data_parallel:
+ arch_loss = model.scale_loss(arch_loss)
+ arch_loss.backward()
+ model.apply_collective_grads()
+ else:
+ arch_loss.backward()
+ arch_optimizer.minimize(arch_loss)
+ model.clear_gradients()
+ probs = fluid.layers.softmax(arch_logits[-1])
+ val_acc = fluid.layers.accuracy(input=probs, label=valid_data[4])
+ val_accs.update(val_acc.numpy(), batch_size)
+
+ if step_id % log_freq == 0:
+ logger.info(
+ "Train Epoch {}, Step {}, Lr {:.6f} total_loss {:.6f}; ce_loss {:.6f}, kd_loss {:.6f}, train_acc {:.6f}, search_valid_acc {:.6f};".
+ format(epoch, step_id,
+ optimizer.current_step_lr(), total_losses.avg[
+ 0], ce_losses.avg[0], kd_losses.avg[0], accs.avg[0],
+ val_accs.avg[0]))
+
+ step_id += 1
+
+
+def main():
+ # whether use multi-gpus
+ use_data_parallel = False
+ place = fluid.CUDAPlace(fluid.dygraph.parallel.Env(
+ ).dev_id) if use_data_parallel else fluid.CUDAPlace(0)
+
+ BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12"
+ vocab_path = BERT_BASE_PATH + "/vocab.txt"
+ data_dir = "./data/glue_data/MNLI/"
+ teacher_model_dir = "./data/teacher_model/steps_23000"
+ do_lower_case = True
+ num_samples = 392702
+ # augmented dataset nums
+ # num_samples = 8016987
+ max_seq_len = 128
+ batch_size = 128
+ hidden_size = 768
+ emb_size = 768
+ max_layer = 8
+ epoch = 80
+ log_freq = 10
+ device_num = fluid.dygraph.parallel.Env().nranks
+
+ use_fixed_gumbel = False
+ train_phase = "search_train"
+ val_phase = "search_valid"
+ step_per_epoch = int(num_samples * 0.5 / ((batch_size) * device_num))
+
+ with fluid.dygraph.guard(place):
+ model = AdaBERTClassifier(
+ 3,
+ n_layer=max_layer,
+ hidden_size=hidden_size,
+ emb_size=emb_size,
+ teacher_model=teacher_model_dir,
+ data_dir=data_dir,
+ use_fixed_gumbel=use_fixed_gumbel)
+
+ learning_rate = fluid.dygraph.CosineDecay(2e-2, step_per_epoch, epoch)
+
+ model_parameters = []
+ for p in model.parameters():
+ if (p.name not in [a.name for a in model.arch_parameters()] and
+ p.name not in
+ [a.name for a in model.teacher.parameters()]):
+ model_parameters.append(p)
+
+ optimizer = fluid.optimizer.MomentumOptimizer(
+ learning_rate,
+ 0.9,
+ regularization=fluid.regularizer.L2DecayRegularizer(3e-4),
+ parameter_list=model_parameters)
+
+ arch_optimizer = fluid.optimizer.Adam(
+ 3e-4,
+ 0.5,
+ 0.999,
+ regularization=fluid.regularizer.L2Decay(1e-3),
+ parameter_list=model.arch_parameters())
+
+ processor = MnliProcessor(
+ data_dir=data_dir,
+ vocab_path=vocab_path,
+ max_seq_len=max_seq_len,
+ do_lower_case=do_lower_case,
+ in_tokens=False)
+
+ train_reader = processor.data_generator(
+ batch_size=batch_size,
+ phase=train_phase,
+ epoch=1,
+ dev_count=1,
+ shuffle=True)
+ valid_reader = processor.data_generator(
+ batch_size=batch_size,
+ phase=val_phase,
+ epoch=1,
+ dev_count=1,
+ shuffle=True)
+ dev_reader = processor.data_generator(
+ batch_size=batch_size,
+ phase="dev",
+ epoch=1,
+ dev_count=1,
+ shuffle=False)
+
+ if use_data_parallel:
+ train_reader = fluid.contrib.reader.distributed_batch_reader(
+ train_reader)
+ valid_reader = fluid.contrib.reader.distributed_batch_reader(
+ valid_reader)
+
+ train_loader = fluid.io.DataLoader.from_generator(
+ capacity=128,
+ use_double_buffer=True,
+ iterable=True,
+ return_list=True)
+ valid_loader = fluid.io.DataLoader.from_generator(
+ capacity=128,
+ use_double_buffer=True,
+ iterable=True,
+ return_list=True)
+ dev_loader = fluid.io.DataLoader.from_generator(
+ capacity=128,
+ use_double_buffer=True,
+ iterable=True,
+ return_list=True)
+
+ train_loader.set_batch_generator(train_reader, places=place)
+ valid_loader.set_batch_generator(valid_reader, places=place)
+ dev_loader.set_batch_generator(dev_reader, places=place)
+
+ if use_data_parallel:
+ strategy = fluid.dygraph.parallel.prepare_context()
+ model = fluid.dygraph.parallel.DataParallel(model, strategy)
+
+ for epoch_id in range(epoch):
+ train_one_epoch(model, train_loader, valid_loader, optimizer,
+ arch_optimizer, epoch_id, use_data_parallel,
+ log_freq)
+ loss, acc = valid_one_epoch(model, dev_loader, epoch_id, log_freq)
+ logger.info("dev set, ce_loss {:.6f}; acc: {:.6f};".format(loss,
+ acc))
+
+ if use_data_parallel:
+ print(model._layers.student._encoder.alphas.numpy())
+ else:
+ print(model.student._encoder.alphas.numpy())
+ print("=" * 100)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/demo/bert/train_teacher.py b/demo/bert/train_teacher.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bacdc3fef6e0d5514ba313e463f00064ebccffe
--- /dev/null
+++ b/demo/bert/train_teacher.py
@@ -0,0 +1,14 @@
+import paddle.fluid as fluid
+from paddleslim.teachers.bert import BERTClassifier
+
+place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
+
+with fluid.dygraph.guard(place):
+
+ bert = BERTClassifier(3)
+ bert.fit("./data/glue_data/MNLI/",
+ 5,
+ batch_size=32,
+ use_data_parallel=True,
+ learning_rate=0.00005,
+ save_steps=1000)
diff --git a/demo/darts/README.md b/demo/darts/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..64f6bf6024c4f16b737fce9e706985024c1ea0b7
--- /dev/null
+++ b/demo/darts/README.md
@@ -0,0 +1,129 @@
+# DARTS (Differentiable Architecture Search) usage example
+
+This demo shows how to run differentiable architecture search with PaddlePaddle. [DARTS](https://arxiv.org/abs/1806.09055) and [PC-DARTS](https://arxiv.org/abs/1907.05737) can be used directly, and other differentiable architecture search algorithms are supported through custom modifications.
+
+The directory layout of this demo:
+```
+├── genotypes.py       Genotypes of architectures found during search
+│
+├── model.py           Builds the sub-network found by search
+│
+├── model_search.py    Builds the super-network used during search
+│
+├── operations.py      Candidate operations used during search
+│
+├── reader.py          Data reading and augmentation
+│
+├── search.py          Entry point for architecture search
+│
+├── train.py           Entry point for evaluation training on CIFAR10
+│
+├── train_imagenet.py  Entry point for evaluation training on ImageNet
+│
+├── visualize.py       Entry point for architecture visualization
+
+```
+
+## Dependencies
+
+PaddlePaddle >= 1.8.0, PaddleSlim >= 1.1.0, graphviz >= 0.11.1
+
+## Datasets
+
+This example searches architectures on the `CIFAR10` dataset; architecture evaluation can be done on either `CIFAR10` or `ImageNet`.
+`CIFAR10` is downloaded automatically during search or evaluation, while `ImageNet` must be downloaded manually; see this [tutorial](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification#%E6%95%B0%E6%8D%AE%E5%87%86%E5%A4%87).
+
+
+## Architecture Search
+
+The search supports DARTS with first- and second-order approximation as well as PC-DARTS:
+``` bash
+python search.py # DARTS, first-order approximation
+python search.py --unrolled=True # DARTS, second-order approximation
+python search.py --method='PC-DARTS' --batch_size=256 --learning_rate=0.1 --arch_learning_rate=6e-4 --epochs_no_archopt=15 # PC-DARTS
+```
+If you are running in a Docker environment, make sure shared memory is large enough for the multiprocess dataloader; if you hit shared-memory problems, set `--use_multiprocess=False`.
+
+Search can also run on multiple GPUs. Taking 4 GPUs (GPU ids 0-3) as an example, the launch command is:
+
+```bash
+python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog search.py --use_data_parallel 1
+```
+
+Because multi-GPU training multiplies the total batch size by n, where n is the number of GPUs, scale the initial learning rate by n accordingly to obtain accuracy comparable to single-GPU training.
+
+Figure 1 shows how the architecture changes over the course of the search. Note that the accuracy (Acc) in the figure is not the final accuracy of that architecture; to get the best accuracy of a given architecture, run evaluation training on the resulting genotype.
+
+![networks](images/networks.gif)
+
+
+Figure 1: Architecture evolution while searching on CIFAR10; the upper half is the reduction cell, the lower half the normal cell
+
+
+The Genotypes found by the three search methods have been added to genotypes.py: `DARTS_V1`, `DARTS_V2`, and `PC_DARTS` are the architectures found with first-order DARTS, second-order DARTS, and PC-DARTS, respectively.
+
+## Evaluation Training
+
+After obtaining a searched Genotype, you can run evaluation training on it to measure its true performance on a specific dataset:
+
+```bash
+python train.py --arch='PC_DARTS' # evaluation training of the searched architecture on CIFAR10
+python train_imagenet.py --arch='PC_DARTS' # evaluation training of the searched architecture on ImageNet
+```
+
+Likewise, evaluation training supports multiple GPUs. Taking 4 GPUs (GPU ids 0-3) as an example, the launch commands are:
+
+```bash
+python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog train.py --use_data_parallel 1 --arch='DARTS_V2'
+python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog train_imagenet.py --use_data_parallel 1 --arch='DARTS_V2'
+```
+
+As before, multi-GPU training multiplies the total batch size by n, where n is the number of GPUs; scale the initial learning rate by n accordingly to obtain accuracy comparable to single-GPU training.
+
+Evaluation training results for the searched `DARTS_V1`, `DARTS_V2`, and `PC-DARTS` architectures:
+
+| Architecture | Dataset | Accuracy |
+| ----------------------------------- | -------- | ----------- |
+| DARTS_V1 | CIFAR10 | 97.01% |
+| DARTS (first-order search, paper) | CIFAR10 | 97.00±0.14% |
+| DARTS_V2 | CIFAR10 | 97.26% |
+| DARTS (second-order search, paper) | CIFAR10 | 97.24±0.09% |
+| DARTS_V2 | ImageNet | 74.12% |
+| DARTS (second-order search, paper) | ImageNet | 73.30% |
+| PC-DARTS | CIFAR10 | 97.41% |
+| PC-DARTS (paper) | CIFAR10 | 97.43±0.07% |
+
+## Custom Datasets and Search Spaces
+
+### Changing the dataset
+
+This example searches on the CIFAR10 dataset by default. Switching to another, custom dataset only requires a small change to reader.py:
+
+```python
+def train_search(batch_size, train_portion, is_shuffle, args):
+ datasets = cifar10_reader( # replace this reader
+ paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+ 'data_batch', is_shuffle, args)
+```
+
+Replace the default `cifar10_reader` with a reader for your dataset, as sketched below.
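+
+A minimal sketch of such a reader follows. It assumes a hypothetical dataset stored, like CIFAR10, as pickled batches with `data` and `labels` keys; the directory layout and key names are illustrative assumptions, not part of this repo:
+
+```python
+import os
+import random
+import _pickle as cPickle
+
+
+def custom_reader(data_dir, is_shuffle, args):
+    """Return a list of (flattened image array, label) pairs,
+    the same format that cifar10_reader produces."""
+    datasets = []
+    for name in sorted(os.listdir(data_dir)):  # assumed: one pickle file per batch
+        with open(os.path.join(data_dir, name), 'rb') as f:
+            batch = cPickle.load(f, encoding='iso-8859-1')
+        datasets.extend(zip(batch['data'], batch['labels']))
+    if is_shuffle:
+        random.shuffle(datasets)
+    return datasets
+```
+
+The returned list is split and wrapped by `reader_generator` exactly as in `train_search`, so each image must have the flattened shape that `preprocess` expects.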
+
+### Changing the search space
+
+This example provides the DARTS and PC-DARTS methods, both defined in model_search.py.
+
+You can customize the search space by directly modifying the `class Network` defined in model_search.py, then search over that structure with paddleslim.nas.DARTSearch.
+
+After the search finishes, make the corresponding changes to model.py for evaluation training. A sketch of driving the search over a customized supernetwork follows.
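+
+As a minimal sketch (mirroring the call in search.py, with illustrative hyperparameter values only), a customized supernetwork can be handed to the searcher like this; `args` just needs to carry the fields reader.py reads (`cutout`, `cutout_length`):
+
+```python
+import argparse
+
+import paddle.fluid as fluid
+
+import reader
+from model_search import Network  # your customized supernetwork
+from paddleslim.nas.darts import DARTSearch
+
+args = argparse.Namespace(cutout=False, cutout_length=16)
+train_reader, valid_reader = reader.train_search(
+    batch_size=64, train_portion=0.5, is_shuffle=True, args=args)
+
+place = fluid.CUDAPlace(0)
+with fluid.dygraph.guard(place):
+    model = Network(c_in=16, num_classes=10, layers=8, method='DARTS')
+    searcher = DARTSearch(
+        model, train_reader, valid_reader, place,
+        learning_rate=0.025, num_epochs=50)
+    searcher.train()
+```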
+
+
+
+## Visualizing Searched Architectures
+
+Use the following command to visualize a searched Genotype:
+
+```bash
+python visualize.py PC_DARTS
+```
+
+`PC_DARTS` here names a Genotype, which must be added to genotypes.py beforehand.
diff --git a/demo/darts/genotypes.py b/demo/darts/genotypes.py
new file mode 100644
index 0000000000000000000000000000000000000000..344c42bb09f88f07cabc0a3540bdea74780d655e
--- /dev/null
+++ b/demo/darts/genotypes.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import namedtuple
+
+Genotype = namedtuple('Genotype', 'normal normal_concat reduce reduce_concat')
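+# Each cell is described by a list of (operation name, input node index) pairs,
+# two per intermediate node; *_concat lists the nodes whose outputs are
+# concatenated to form the cell output.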
+
+PRIMITIVES = [
+ 'none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3',
+ 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5'
+]
+
+DARTS_V1 = Genotype(
+ normal=[('sep_conv_5x5', 0), ('dil_conv_3x3', 1), ('sep_conv_3x3', 2),
+ ('sep_conv_5x5', 0), ('sep_conv_5x5', 0), ('dil_conv_3x3', 3),
+ ('sep_conv_3x3', 0), ('max_pool_3x3', 1)],
+ normal_concat=range(2, 6),
+ reduce=[('max_pool_3x3', 1), ('max_pool_3x3', 0), ('dil_conv_3x3', 2),
+ ('sep_conv_5x5', 0), ('max_pool_3x3', 0), ('dil_conv_3x3', 3),
+ ('avg_pool_3x3', 3), ('avg_pool_3x3', 4)],
+ reduce_concat=range(2, 6))
+
+DARTS_V2 = Genotype(
+ normal=[('dil_conv_3x3', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 0),
+ ('sep_conv_3x3', 1), ('sep_conv_3x3', 1), ('sep_conv_3x3', 0),
+ ('skip_connect', 0), ('sep_conv_3x3', 1)],
+ normal_concat=range(2, 6),
+ reduce=[('skip_connect', 1), ('max_pool_3x3', 0), ('max_pool_3x3', 1),
+ ('skip_connect', 2), ('skip_connect', 2), ('dil_conv_5x5', 3),
+ ('skip_connect', 2), ('max_pool_3x3', 1)],
+ reduce_concat=range(2, 6))
+
+PC_DARTS = Genotype(
+ normal=[('sep_conv_3x3', 1), ('skip_connect', 0), ('sep_conv_5x5', 0),
+ ('dil_conv_5x5', 2), ('sep_conv_5x5', 0), ('sep_conv_3x3', 2),
+ ('sep_conv_3x3', 0), ('dil_conv_3x3', 1)],
+ normal_concat=range(2, 6),
+ reduce=[('avg_pool_3x3', 0), ('sep_conv_3x3', 1), ('skip_connect', 2),
+ ('avg_pool_3x3', 0), ('dil_conv_5x5', 3), ('skip_connect', 2),
+ ('skip_connect', 2), ('avg_pool_3x3', 0)],
+ reduce_concat=range(2, 6))
diff --git a/demo/darts/images/networks.gif b/demo/darts/images/networks.gif
new file mode 100755
index 0000000000000000000000000000000000000000..41a1145c6c185312d519bbf93578e75c98fcab0a
Binary files /dev/null and b/demo/darts/images/networks.gif differ
diff --git a/demo/darts/model.py b/demo/darts/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0ceb198ea129aab10e3faf65a688991cb019c9f
--- /dev/null
+++ b/demo/darts/model.py
@@ -0,0 +1,312 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.initializer import ConstantInitializer, MSRAInitializer
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
+from paddle.fluid.dygraph.base import to_variable
+from genotypes import PRIMITIVES
+from genotypes import Genotype
+from operations import *
+
+
+class ConvBN(fluid.dygraph.Layer):
+ def __init__(self, c_curr, c_out, kernel_size, padding, stride, name=None):
+ super(ConvBN, self).__init__()
+ self.conv = Conv2D(
+ num_channels=c_curr,
+ num_filters=c_out,
+ filter_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ param_attr=fluid.ParamAttr(
+ name=name + "_conv" if name is not None else None,
+ initializer=MSRAInitializer()),
+ bias_attr=False)
+ self.bn = BatchNorm(
+ num_channels=c_out,
+ param_attr=fluid.ParamAttr(
+ name=name + "_bn_scale" if name is not None else None,
+ initializer=ConstantInitializer(value=1)),
+ bias_attr=fluid.ParamAttr(
+ name=name + "_bn_offset" if name is not None else None,
+ initializer=ConstantInitializer(value=0)),
+ moving_mean_name=name + "_bn_mean" if name is not None else None,
+ moving_variance_name=name + "_bn_variance"
+ if name is not None else None)
+
+ def forward(self, x):
+ conv = self.conv(x)
+ bn = self.bn(conv)
+ return bn
+
+
+class Classifier(fluid.dygraph.Layer):
+ def __init__(self, input_dim, num_classes, name=None):
+ super(Classifier, self).__init__()
+ self.pool2d = Pool2D(pool_type='avg', global_pooling=True)
+ self.fc = Linear(
+ input_dim=input_dim,
+ output_dim=num_classes,
+ param_attr=fluid.ParamAttr(
+ name=name + "_fc_weights" if name is not None else None,
+ initializer=MSRAInitializer()),
+ bias_attr=fluid.ParamAttr(
+ name=name + "_fc_bias" if name is not None else None,
+ initializer=MSRAInitializer()))
+
+ def forward(self, x):
+ x = self.pool2d(x)
+ x = fluid.layers.squeeze(x, axes=[2, 3])
+ out = self.fc(x)
+ return out
+
+
+def drop_path(x, drop_prob):
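+ # Drop-path regularization: with probability drop_prob, zero out a sample's
+ # entire path through this op, and rescale survivors by 1/keep_prob so the
+ # expected activation is unchanged.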
+ if drop_prob > 0:
+ keep_prob = 1. - drop_prob
+ mask = 1 - np.random.binomial(
+ 1, drop_prob, size=[x.shape[0]]).astype(np.float32)
+ mask = to_variable(mask)
+ x = fluid.layers.elementwise_mul(x / keep_prob, mask, axis=0)
+ return x
+
+
+class Cell(fluid.dygraph.Layer):
+ def __init__(self, genotype, c_prev_prev, c_prev, c_curr, reduction,
+ reduction_prev):
+ super(Cell, self).__init__()
+ print(c_prev_prev, c_prev, c_curr)
+
+ if reduction_prev:
+ self.preprocess0 = FactorizedReduce(c_prev_prev, c_curr)
+ else:
+ self.preprocess0 = ReLUConvBN(c_prev_prev, c_curr, 1, 1, 0)
+ self.preprocess1 = ReLUConvBN(c_prev, c_curr, 1, 1, 0)
+
+ if reduction:
+ op_names, indices = zip(*genotype.reduce)
+ concat = genotype.reduce_concat
+ else:
+ op_names, indices = zip(*genotype.normal)
+ concat = genotype.normal_concat
+
+ multiplier = len(concat)
+ self._multiplier = multiplier
+ self._compile(c_curr, op_names, indices, multiplier, reduction)
+
+ def _compile(self, c_curr, op_names, indices, multiplier, reduction):
+ assert len(op_names) == len(indices)
+ self._steps = len(op_names) // 2
+ ops = []
+ for op_name, index in zip(op_names, indices):
+ stride = 2 if reduction and index < 2 else 1
+ op = OPS[op_name](c_curr, stride, True)
+ ops += [op]
+ self._ops = fluid.dygraph.LayerList(ops)
+ self._indices = indices
+
+ def forward(self, s0, s1, drop_prob, training):
+ s0 = self.preprocess0(s0)
+ s1 = self.preprocess1(s1)
+
+ states = [s0, s1]
+ for i in range(self._steps):
+ h1 = states[self._indices[2 * i]]
+ h2 = states[self._indices[2 * i + 1]]
+ op1 = self._ops[2 * i]
+ op2 = self._ops[2 * i + 1]
+ h1 = op1(h1)
+ h2 = op2(h2)
+ if training and drop_prob > 0.:
+ if not isinstance(op1, Identity):
+ h1 = drop_path(h1, drop_prob)
+ if not isinstance(op2, Identity):
+ h2 = drop_path(h2, drop_prob)
+ states += [h1 + h2]
+ out = fluid.layers.concat(input=states[-self._multiplier:], axis=1)
+ return out
+
+
+class AuxiliaryHeadCIFAR(fluid.dygraph.Layer):
+ def __init__(self, C, num_classes):
+ super(AuxiliaryHeadCIFAR, self).__init__()
+ self.avgpool = Pool2D(
+ pool_size=5, pool_stride=3, pool_padding=0, pool_type='avg')
+ self.conv_bn1 = ConvBN(
+ c_curr=C,
+ c_out=128,
+ kernel_size=1,
+ padding=0,
+ stride=1,
+ name='aux_conv_bn1')
+ self.conv_bn2 = ConvBN(
+ c_curr=128,
+ c_out=768,
+ kernel_size=2,
+ padding=0,
+ stride=1,
+ name='aux_conv_bn2')
+ self.classifier = Classifier(768, num_classes, 'aux')
+
+ def forward(self, x):
+ x = fluid.layers.relu(x)
+ x = self.avgpool(x)
+ conv1 = self.conv_bn1(x)
+ conv1 = fluid.layers.relu(conv1)
+ conv2 = self.conv_bn2(conv1)
+ conv2 = fluid.layers.relu(conv2)
+ out = self.classifier(conv2)
+ return out
+
+
+class NetworkCIFAR(fluid.dygraph.Layer):
+ def __init__(self, C, num_classes, layers, auxiliary, genotype):
+ super(NetworkCIFAR, self).__init__()
+ self._layers = layers
+ self._auxiliary = auxiliary
+
+ stem_multiplier = 3
+ c_curr = stem_multiplier * C
+ self.stem = ConvBN(
+ c_curr=3, c_out=c_curr, kernel_size=3, padding=1, stride=1)
+
+ c_prev_prev, c_prev, c_curr = c_curr, c_curr, C
+ cells = []
+ reduction_prev = False
+ for i in range(layers):
+ if i in [layers // 3, 2 * layers // 3]:
+ c_curr *= 2
+ reduction = True
+ else:
+ reduction = False
+ cell = Cell(genotype, c_prev_prev, c_prev, c_curr, reduction,
+ reduction_prev)
+ reduction_prev = reduction
+ cells += [cell]
+ c_prev_prev, c_prev = c_prev, cell._multiplier * c_curr
+ if i == 2 * layers // 3:
+ c_to_auxiliary = c_prev
+ self.cells = fluid.dygraph.LayerList(cells)
+
+ if auxiliary:
+ self.auxiliary_head = AuxiliaryHeadCIFAR(c_to_auxiliary,
+ num_classes)
+ self.classifier = Classifier(c_prev, num_classes)
+
+ def forward(self, input, drop_path_prob, training):
+ logits_aux = None
+ s0 = s1 = self.stem(input)
+ for i, cell in enumerate(self.cells):
+ s0, s1 = s1, cell(s0, s1, drop_path_prob, training)
+ if i == 2 * self._layers // 3:
+ if self._auxiliary and training:
+ logits_aux = self.auxiliary_head(s1)
+ logits = self.classifier(s1)
+ return logits, logits_aux
+
+
+class AuxiliaryHeadImageNet(fluid.dygraph.Layer):
+ def __init__(self, C, num_classes):
+ super(AuxiliaryHeadImageNet, self).__init__()
+ self.avgpool = Pool2D(
+ pool_size=5, pool_stride=2, pool_padding=0, pool_type='avg')
+ self.conv_bn1 = ConvBN(
+ c_curr=C,
+ c_out=128,
+ kernel_size=1,
+ padding=0,
+ stride=1,
+ name='aux_conv_bn1')
+ self.conv_bn2 = ConvBN(
+ c_curr=128,
+ c_out=768,
+ kernel_size=2,
+ padding=0,
+ stride=1,
+ name='aux_conv_bn2')
+ self.classifier = Classifier(768, num_classes, 'aux')
+
+ def forward(self, x):
+ x = fluid.layers.relu(x)
+ x = self.avgpool(x)
+ conv1 = self.conv_bn1(x)
+ conv1 = fluid.layers.relu(conv1)
+ conv2 = self.conv_bn2(conv1)
+ conv2 = fluid.layers.relu(conv2)
+ out = self.classifier(conv2)
+ return out
+
+
+class NetworkImageNet(fluid.dygraph.Layer):
+ def __init__(self, C, num_classes, layers, auxiliary, genotype):
+ super(NetworkImageNet, self).__init__()
+ self._layers = layers
+ self._auxiliary = auxiliary
+
+ self.stem_a0 = ConvBN(
+ c_curr=3, c_out=C // 2, kernel_size=3, padding=1, stride=2)
+
+ self.stem_a1 = ConvBN(
+ c_curr=C // 2, c_out=C, kernel_size=3, padding=1, stride=2)
+
+ self.stem_b = ConvBN(
+ c_curr=C, c_out=C, kernel_size=3, padding=1, stride=2)
+
+ c_prev_prev, c_prev, c_curr = C, C, C
+ cells = []
+ reduction_prev = True
+ for i in range(layers):
+ if i in [layers // 3, 2 * layers // 3]:
+ c_curr *= 2
+ reduction = True
+ else:
+ reduction = False
+ cell = Cell(genotype, c_prev_prev, c_prev, c_curr, reduction,
+ reduction_prev)
+ reduction_prev = reduction
+ cells += [cell]
+ c_prev_prev, c_prev = c_prev, cell._multiplier * c_curr
+ if i == 2 * layers // 3:
+ c_to_auxiliary = c_prev
+ self.cells = fluid.dygraph.LayerList(cells)
+
+ if auxiliary:
+ self.auxiliary_head = AuxiliaryHeadImageNet(c_to_auxiliary,
+ num_classes)
+ self.classifier = Classifier(c_prev, num_classes)
+
+ def forward(self, input, training):
+ logits_aux = None
+ s0 = self.stem_a0(input)
+ s0 = fluid.layers.relu(s0)
+ s0 = self.stem_a1(s0)
+ s1 = fluid.layers.relu(s0)
+ s1 = self.stem_b(s1)
+
+ for i, cell in enumerate(self.cells):
+ s0, s1 = s1, cell(s0, s1, 0, training)
+ if i == 2 * self._layers // 3:
+ if self._auxiliary and training:
+ logits_aux = self.auxiliary_head(s1)
+ logits = self.classifier(s1)
+ return logits, logits_aux
diff --git a/demo/darts/model_search.py b/demo/darts/model_search.py
new file mode 100644
index 0000000000000000000000000000000000000000..9596f2ea67b2c266fb39a2b71022aa96853d80e6
--- /dev/null
+++ b/demo/darts/model_search.py
@@ -0,0 +1,270 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle.fluid as fluid
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.initializer import NormalInitializer, MSRAInitializer, ConstantInitializer
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
+from paddle.fluid.dygraph.base import to_variable
+from genotypes import PRIMITIVES
+from operations import *
+
+
+def channel_shuffle(x, groups):
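+ # ShuffleNet-style channel shuffle: interleave channels across groups so
+ # information mixes between the sampled and bypassed channels used by
+ # PC-DARTS partial channel connections.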
+ batchsize, num_channels, height, width = x.shape
+ channels_per_group = num_channels // groups
+
+ # reshape
+ x = fluid.layers.reshape(
+ x, [batchsize, groups, channels_per_group, height, width])
+ x = fluid.layers.transpose(x, [0, 2, 1, 3, 4])
+
+ # flatten
+ x = fluid.layers.reshape(x, [batchsize, num_channels, height, width])
+ return x
+
+
+class MixedOp(fluid.dygraph.Layer):
+ def __init__(self, c_cur, stride, method):
+ super(MixedOp, self).__init__()
+ self._method = method
+ self._k = 4 if self._method == "PC-DARTS" else 1
+ self.mp = Pool2D(
+ pool_size=2,
+ pool_stride=2,
+ pool_type='max', )
+ ops = []
+ for primitive in PRIMITIVES:
+ op = OPS[primitive](c_cur // self._k, stride, False)
+ if 'pool' in primitive:
+ gama = ParamAttr(
+ initializer=fluid.initializer.Constant(value=1),
+ trainable=False)
+ beta = ParamAttr(
+ initializer=fluid.initializer.Constant(value=0),
+ trainable=False)
+ BN = BatchNorm(
+ c_cur // self._k, param_attr=gama, bias_attr=beta)
+ op = fluid.dygraph.Sequential(op, BN)
+ ops.append(op)
+ self._ops = fluid.dygraph.LayerList(ops)
+
+ def forward(self, x, weights):
+ if self._method == "PC-DARTS":
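+ # Partial channel connections: only 1/k of the channels (xtemp) pass
+ # through the candidate operations; the rest (xtemp2) bypass them and
+ # are concatenated back afterwards.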
+ dim_2 = x.shape[1]
+ xtemp = x[:, :dim_2 // self._k, :, :]
+ xtemp2 = x[:, dim_2 // self._k:, :, :]
+
+ temp1 = fluid.layers.sums(
+ [weights[i] * op(xtemp) for i, op in enumerate(self._ops)])
+
+ if temp1.shape[2] == x.shape[2]:
+ out = fluid.layers.concat([temp1, xtemp2], axis=1)
+ else:
+ out = fluid.layers.concat([temp1, self.mp(xtemp2)], axis=1)
+ out = channel_shuffle(out, self._k)
+ else:
+ out = fluid.layers.sums(
+ [weights[i] * op(x) for i, op in enumerate(self._ops)])
+ return out
+
+
+class Cell(fluid.dygraph.Layer):
+ def __init__(self, steps, multiplier, c_prev_prev, c_prev, c_cur,
+ reduction, reduction_prev, method):
+ super(Cell, self).__init__()
+ self.reduction = reduction
+
+ if reduction_prev:
+ self.preprocess0 = FactorizedReduce(c_prev_prev, c_cur, False)
+ else:
+ self.preprocess0 = ReLUConvBN(c_prev_prev, c_cur, 1, 1, 0, False)
+ self.preprocess1 = ReLUConvBN(c_prev, c_cur, 1, 1, 0, affine=False)
+ self._steps = steps
+ self._multiplier = multiplier
+ self._method = method
+
+ ops = []
+ for i in range(self._steps):
+ for j in range(2 + i):
+ stride = 2 if reduction and j < 2 else 1
+ op = MixedOp(c_cur, stride, method)
+ ops.append(op)
+ self._ops = fluid.dygraph.LayerList(ops)
+
+ def forward(self, s0, s1, weights, weights2=None):
+ s0 = self.preprocess0(s0)
+ s1 = self.preprocess1(s1)
+
+ states = [s0, s1]
+ offset = 0
+ for i in range(self._steps):
+ if self._method == "PC-DARTS":
+ s = fluid.layers.sums([
+ weights2[offset + j] *
+ self._ops[offset + j](h, weights[offset + j])
+ for j, h in enumerate(states)
+ ])
+ else:
+ s = fluid.layers.sums([
+ self._ops[offset + j](h, weights[offset + j])
+ for j, h in enumerate(states)
+ ])
+ offset += len(states)
+ states.append(s)
+ out = fluid.layers.concat(input=states[-self._multiplier:], axis=1)
+ return out
+
+
+class Network(fluid.dygraph.Layer):
+ def __init__(self,
+ c_in,
+ num_classes,
+ layers,
+ method,
+ steps=4,
+ multiplier=4,
+ stem_multiplier=3):
+ super(Network, self).__init__()
+ self._c_in = c_in
+ self._num_classes = num_classes
+ self._layers = layers
+ self._steps = steps
+ self._multiplier = multiplier
+ self._primitives = PRIMITIVES
+ self._method = method
+
+ c_cur = stem_multiplier * c_in
+ self.stem = fluid.dygraph.Sequential(
+ Conv2D(
+ num_channels=3,
+ num_filters=c_cur,
+ filter_size=3,
+ padding=1,
+ param_attr=fluid.ParamAttr(initializer=MSRAInitializer()),
+ bias_attr=False),
+ BatchNorm(
+ num_channels=c_cur,
+ param_attr=fluid.ParamAttr(
+ initializer=ConstantInitializer(value=1)),
+ bias_attr=fluid.ParamAttr(
+ initializer=ConstantInitializer(value=0))))
+
+ c_prev_prev, c_prev, c_cur = c_cur, c_cur, c_in
+ cells = []
+ reduction_prev = False
+ for i in range(layers):
+ if i in [layers // 3, 2 * layers // 3]:
+ c_cur *= 2
+ reduction = True
+ else:
+ reduction = False
+ cell = Cell(steps, multiplier, c_prev_prev, c_prev, c_cur,
+ reduction, reduction_prev, method)
+ reduction_prev = reduction
+ cells.append(cell)
+ c_prev_prev, c_prev = c_prev, multiplier * c_cur
+ self.cells = fluid.dygraph.LayerList(cells)
+ self.global_pooling = Pool2D(pool_type='avg', global_pooling=True)
+ self.classifier = Linear(
+ input_dim=c_prev,
+ output_dim=num_classes,
+ param_attr=ParamAttr(initializer=MSRAInitializer()),
+ bias_attr=ParamAttr(initializer=MSRAInitializer()))
+
+ self._initialize_alphas()
+
+ def forward(self, input):
+ s0 = s1 = self.stem(input)
+ weights2 = None
+ for i, cell in enumerate(self.cells):
+ if cell.reduction:
+ weights = fluid.layers.softmax(self.alphas_reduce)
+ if self._method == "PC-DARTS":
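+ # Edge normalization: the betas weight each incoming edge; the softmax
+ # is taken per destination node, over groups of 2, 3, 4, ... edges.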
+ n = 3
+ start = 2
+ weights2 = fluid.layers.softmax(self.betas_reduce[0:2])
+ for _ in range(self._steps - 1):
+ end = start + n
+ tw2 = fluid.layers.softmax(self.betas_reduce[start:
+ end])
+ start = end
+ n += 1
+ weights2 = fluid.layers.concat([weights2, tw2])
+ else:
+ weights = fluid.layers.softmax(self.alphas_normal)
+ if self._method == "PC-DARTS":
+ n = 3
+ start = 2
+ weights2 = fluid.layers.softmax(self.betas_normal[0:2])
+ for _ in range(self._steps - 1):
+ end = start + n
+ tw2 = fluid.layers.softmax(self.betas_normal[start:
+ end])
+ start = end
+ n += 1
+ weights2 = fluid.layers.concat([weights2, tw2])
+ s0, s1 = s1, cell(s0, s1, weights, weights2)
+ out = self.global_pooling(s1)
+ out = fluid.layers.squeeze(out, axes=[2, 3])
+ logits = self.classifier(out)
+ return logits
+
+ def _loss(self, input, target):
+ logits = self(input)
+ loss = fluid.layers.reduce_mean(
+ fluid.layers.softmax_with_cross_entropy(logits, target))
+ return loss
+
+ def new(self):
+ model_new = Network(self._c_in, self._num_classes, self._layers,
+ self._method)
+ return model_new
+
+ def _initialize_alphas(self):
+ k = sum(1 for i in range(self._steps) for n in range(2 + i))
+ num_ops = len(self._primitives)
+ self.alphas_normal = fluid.layers.create_parameter(
+ shape=[k, num_ops],
+ dtype="float32",
+ default_initializer=NormalInitializer(
+ loc=0.0, scale=1e-3))
+ self.alphas_reduce = fluid.layers.create_parameter(
+ shape=[k, num_ops],
+ dtype="float32",
+ default_initializer=NormalInitializer(
+ loc=0.0, scale=1e-3))
+ self._arch_parameters = [
+ self.alphas_normal,
+ self.alphas_reduce,
+ ]
+ if self._method == "PC-DARTS":
+ self.betas_normal = fluid.layers.create_parameter(
+ shape=[k],
+ dtype="float32",
+ default_initializer=NormalInitializer(
+ loc=0.0, scale=1e-3))
+ self.betas_reduce = fluid.layers.create_parameter(
+ shape=[k],
+ dtype="float32",
+ default_initializer=NormalInitializer(
+ loc=0.0, scale=1e-3))
+ self._arch_parameters += [self.betas_normal, self.betas_reduce]
+
+ def arch_parameters(self):
+ return self._arch_parameters
diff --git a/demo/darts/operations.py b/demo/darts/operations.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf362c1e00049ccdb732340b53316dd5ce7a3192
--- /dev/null
+++ b/demo/darts/operations.py
@@ -0,0 +1,267 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.initializer import ConstantInitializer, MSRAInitializer
+
+
+OPS = {
+ 'none':
+ lambda C, stride, affine: Zero(stride),
+ 'avg_pool_3x3':
+ lambda C, stride, affine: Pool2D(
+ pool_size=3,
+ pool_type="avg",
+ pool_stride=stride,
+ pool_padding=1),
+ 'max_pool_3x3':
+ lambda C, stride, affine: Pool2D(
+ pool_size=3,
+ pool_type="max",
+ pool_stride=stride,
+ pool_padding=1),
+ 'skip_connect':
+ lambda C, stride, affine: Identity()
+ if stride == 1 else FactorizedReduce(C, C, affine),
+ 'sep_conv_3x3':
+ lambda C, stride, affine: SepConv(C, C, 3, stride, 1,
+ affine),
+ 'sep_conv_5x5':
+ lambda C, stride, affine: SepConv(C, C, 5, stride, 2,
+ affine),
+ 'sep_conv_7x7':
+ lambda C, stride, affine: SepConv(C, C, 7, stride, 3,
+ affine),
+ 'dil_conv_3x3':
+ lambda C, stride, affine: DilConv(C, C, 3, stride, 2,
+ 2, affine),
+ 'dil_conv_5x5':
+ lambda C, stride, affine: DilConv(C, C, 5, stride, 4,
+ 2, affine),
+ 'conv_7x1_1x7':
+ lambda C, stride, affine: Conv_7x1_1x7(
+ C, C, stride, affine),
+}
+
+
+def bn_param_config(affine=False):
+ gama = ParamAttr(
+ initializer=ConstantInitializer(value=1), trainable=affine)
+ beta = ParamAttr(
+ initializer=ConstantInitializer(value=0), trainable=affine)
+ return gama, beta
+
+
+class Zero(fluid.dygraph.Layer):
+ def __init__(self, stride):
+ super(Zero, self).__init__()
+ self.stride = stride
+ self.pool = Pool2D(pool_size=1, pool_stride=2)
+
+ def forward(self, x):
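+ # The 'none' op outputs zeros; for stride 2, a stride-2 pooling first
+ # produces the correctly shaped tensor whose values zeros_like discards.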
+ pooled = self.pool(x)
+ x = fluid.layers.zeros_like(
+ x) if self.stride == 1 else fluid.layers.zeros_like(pooled)
+ return x
+
+
+class Identity(fluid.dygraph.Layer):
+ def __init__(self):
+ super(Identity, self).__init__()
+
+ def forward(self, x):
+ return x
+
+
+class FactorizedReduce(fluid.dygraph.Layer):
+ def __init__(self, c_in, c_out, affine=True):
+ super(FactorizedReduce, self).__init__()
+ assert c_out % 2 == 0
+ self.conv1 = Conv2D(
+ num_channels=c_in,
+ num_filters=c_out // 2,
+ filter_size=1,
+ stride=2,
+ padding=0,
+ param_attr=fluid.ParamAttr(initializer=MSRAInitializer()),
+ bias_attr=False)
+ self.conv2 = Conv2D(
+ num_channels=c_in,
+ num_filters=c_out // 2,
+ filter_size=1,
+ stride=2,
+ padding=0,
+ param_attr=fluid.ParamAttr(initializer=MSRAInitializer()),
+ bias_attr=False)
+ gama, beta = bn_param_config(affine)
+ self.bn = BatchNorm(
+ num_channels=c_out, param_attr=gama, bias_attr=beta)
+
+ def forward(self, x):
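+ # Halve the spatial size without discarding pixels: two stride-2 1x1
+ # convs, the second applied to the input shifted by one pixel, are
+ # concatenated along channels.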
+ x = fluid.layers.relu(x)
+ out = fluid.layers.concat(
+ input=[self.conv1(x), self.conv2(x[:, :, 1:, 1:])], axis=1)
+ out = self.bn(out)
+ return out
+
+
+class SepConv(fluid.dygraph.Layer):
+ def __init__(self, c_in, c_out, kernel_size, stride, padding, affine=True):
+ super(SepConv, self).__init__()
+ self.conv1 = Conv2D(
+ num_channels=c_in,
+ num_filters=c_in,
+ filter_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ groups=c_in,
+ use_cudnn=False,
+ param_attr=fluid.ParamAttr(initializer=MSRAInitializer()),
+ bias_attr=False)
+ self.conv2 = Conv2D(
+ num_channels=c_in,
+ num_filters=c_in,
+ filter_size=1,
+ stride=1,
+ padding=0,
+ param_attr=fluid.ParamAttr(initializer=MSRAInitializer()),
+ bias_attr=False)
+ gama, beta = bn_param_config(affine)
+ self.bn1 = BatchNorm(
+ num_channels=c_in, param_attr=gama, bias_attr=beta)
+ self.conv3 = Conv2D(
+ num_channels=c_in,
+ num_filters=c_in,
+ filter_size=kernel_size,
+ stride=1,
+ padding=padding,
+ groups=c_in,
+ use_cudnn=False,
+ param_attr=fluid.ParamAttr(initializer=MSRAInitializer()),
+ bias_attr=False)
+ self.conv4 = Conv2D(
+ num_channels=c_in,
+ num_filters=c_out,
+ filter_size=1,
+ stride=1,
+ padding=0,
+ param_attr=fluid.ParamAttr(initializer=MSRAInitializer()),
+ bias_attr=False)
+ gama, beta = bn_param_config(affine)
+ self.bn2 = BatchNorm(
+ num_channels=c_out, param_attr=gama, bias_attr=beta)
+
+ def forward(self, x):
+ x = fluid.layers.relu(x)
+ x = self.conv1(x)
+ x = self.conv2(x)
+ bn1 = self.bn1(x)
+ x = fluid.layers.relu(bn1)
+ x = self.conv3(x)
+ x = self.conv4(x)
+ bn2 = self.bn2(x)
+ return bn2
+
+
+class DilConv(fluid.dygraph.Layer):
+ def __init__(self,
+ c_in,
+ c_out,
+ kernel_size,
+ stride,
+ padding,
+ dilation,
+ affine=True):
+ super(DilConv, self).__init__()
+ self.conv1 = Conv2D(
+ num_channels=c_in,
+ num_filters=c_in,
+ filter_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ groups=c_in,
+ use_cudnn=False,
+ param_attr=fluid.ParamAttr(initializer=MSRAInitializer()),
+ bias_attr=False)
+ self.conv2 = Conv2D(
+ num_channels=c_in,
+ num_filters=c_out,
+ filter_size=1,
+ padding=0,
+ param_attr=fluid.ParamAttr(initializer=MSRAInitializer()),
+ bias_attr=False)
+ gama, beta = bn_param_config(affine)
+ self.bn1 = BatchNorm(
+ num_channels=c_out, param_attr=gama, bias_attr=beta)
+
+ def forward(self, x):
+ x = fluid.layers.relu(x)
+ x = self.conv1(x)
+ x = self.conv2(x)
+ out = self.bn1(x)
+ return out
+
+
+class Conv_7x1_1x7(fluid.dygraph.Layer):
+ def __init__(self, c_in, c_out, stride, affine=True):
+ super(Conv_7x1_1x7, self).__init__()
+ self.conv1 = Conv2D(
+ num_channels=c_in,
+ num_filters=c_out,
+ filter_size=(1, 7),
+ padding=(0, 3),
+ param_attr=fluid.ParamAttr(initializer=MSRAInitializer()),
+ bias_attr=False)
+ self.conv2 = Conv2D(
+ num_channels=c_in,
+ num_filters=c_out,
+ filter_size=(7, 1),
+ padding=(3, 0),
+ param_attr=fluid.ParamAttr(initializer=MSRAInitializer()),
+ bias_attr=False)
+ gama, beta = bn_param_config(affine)
+ self.bn1 = BatchNorm(
+ num_channels=c_out, param_attr=gama, bias_attr=beta)
+
+ def forward(self, x):
+ x = fluid.layers.relu(x)
+ x = self.conv1(x)
+ x = self.conv2(x)
+ out = self.bn1(x)
+ return out
+
+
+class ReLUConvBN(fluid.dygraph.Layer):
+ def __init__(self, c_in, c_out, kernel_size, stride, padding, affine=True):
+ super(ReLUConvBN, self).__init__()
+ self.conv = Conv2D(
+ num_channels=c_in,
+ num_filters=c_out,
+ filter_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ param_attr=fluid.ParamAttr(initializer=MSRAInitializer()),
+ bias_attr=False)
+ gama, beta = bn_param_config(affine)
+ self.bn = BatchNorm(
+ num_channels=c_out, param_attr=gama, bias_attr=beta)
+
+ def forward(self, x):
+ x = fluid.layers.relu(x)
+ x = self.conv(x)
+ out = self.bn(x)
+ return out
diff --git a/demo/darts/reader.py b/demo/darts/reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a760f4447e10cf3d0a91a3567d192b68febd074
--- /dev/null
+++ b/demo/darts/reader.py
@@ -0,0 +1,295 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from PIL import Image
+from PIL import ImageOps
+import os
+import math
+import random
+import tarfile
+import functools
+import numpy as np
+from PIL import Image, ImageEnhance
+import paddle
+# for python2/python3 compatibility
+try:
+ import cPickle
+except ImportError:
+ import _pickle as cPickle
+
+IMAGE_SIZE = 32
+IMAGE_DEPTH = 3
+CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
+CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]
+
+CIFAR10_URL = 'https://dataset.bj.bcebos.com/cifar%2Fcifar-10-python.tar.gz'
+CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
+
+paddle.dataset.common.DATA_HOME = "dataset/"
+
+THREAD = 16
+BUF_SIZE = 10240
+
+IMAGENET_MEAN = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
+IMAGENET_STD = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
+IMAGENET_DIM = 224
+
+
+def preprocess(sample, is_training, args):
+ image_array = sample.reshape(IMAGE_DEPTH, IMAGE_SIZE, IMAGE_SIZE)
+ rgb_array = np.transpose(image_array, (1, 2, 0))
+ img = Image.fromarray(rgb_array, 'RGB')
+
+ if is_training:
+ # pad, random crop, random flip left/right
+ img = ImageOps.expand(img, (4, 4, 4, 4), fill=0)
+ left_top = np.random.randint(8, size=2)
+ img = img.crop((left_top[1], left_top[0], left_top[1] + IMAGE_SIZE,
+ left_top[0] + IMAGE_SIZE))
+ if np.random.randint(2):
+ img = img.transpose(Image.FLIP_LEFT_RIGHT)
+ img = np.array(img).astype(np.float32)
+
+ img_float = img / 255.0
+ img = (img_float - CIFAR_MEAN) / CIFAR_STD
+
+ if is_training and args.cutout:
+ center = np.random.randint(IMAGE_SIZE, size=2)
+ offset_width = max(0, center[0] - args.cutout_length // 2)
+ offset_height = max(0, center[1] - args.cutout_length // 2)
+ target_width = min(center[0] + args.cutout_length // 2, IMAGE_SIZE)
+ target_height = min(center[1] + args.cutout_length // 2, IMAGE_SIZE)
+
+ for i in range(offset_height, target_height):
+ for j in range(offset_width, target_width):
+ img[i][j][:] = 0.0
+
+ img = np.transpose(img, (2, 0, 1))
+ return img
+
+
+def reader_generator(datasets, batch_size, is_training, is_shuffle, args):
+ def read_batch(datasets, args):
+ if is_shuffle:
+ random.shuffle(datasets)
+ for im, label in datasets:
+ im = preprocess(im, is_training, args)
+ yield im, [int(label)]
+
+ def reader():
+ batch_data = []
+ batch_label = []
+ for data in read_batch(datasets, args):
+ batch_data.append(data[0])
+ batch_label.append(data[1])
+ if len(batch_data) == batch_size:
+ batch_data = np.array(batch_data, dtype='float32')
+ batch_label = np.array(batch_label, dtype='int64')
+ batch_out = [batch_data, batch_label]
+ yield batch_out
+ batch_data = []
+ batch_label = []
+
+ return reader
+
+
+def cifar10_reader(file_name, data_name, is_shuffle, args):
+ with tarfile.open(file_name, mode='r') as f:
+ names = [
+ each_item.name for each_item in f if data_name in each_item.name
+ ]
+ names.sort()
+ datasets = []
+ for name in names:
+ print("Reading file " + name)
+ try:
+ batch = cPickle.load(
+ f.extractfile(name), encoding='iso-8859-1')
+ except TypeError: # Python 2's cPickle.load() has no encoding argument
+ batch = cPickle.load(f.extractfile(name))
+ data = batch['data']
+ labels = batch.get('labels', batch.get('fine_labels', None))
+ assert labels is not None
+ dataset = zip(data, labels)
+ datasets.extend(dataset)
+ if is_shuffle:
+ random.shuffle(datasets)
+ return datasets
+
+
+def train_search(batch_size, train_portion, is_shuffle, args):
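+ # DARTS bi-level optimization splits the training set: the first
+ # train_portion trains the weights, the remainder trains the
+ # architecture parameters.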
+ datasets = cifar10_reader(
+ paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+ 'data_batch', is_shuffle, args)
+ split_point = int(np.floor(train_portion * len(datasets)))
+ train_datasets = datasets[:split_point]
+ val_datasets = datasets[split_point:]
+ reader = [
+ reader_generator(train_datasets, batch_size, True, True, args),
+ reader_generator(val_datasets, batch_size, True, True, args)
+ ]
+ return reader
+
+
+def train_valid(batch_size, is_train, is_shuffle, args):
+ name = 'data_batch' if is_train else 'test_batch'
+ datasets = cifar10_reader(
+ paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+ name, is_shuffle, args)
+
+ reader = reader_generator(datasets, batch_size, is_train, is_shuffle, args)
+ return reader
+
+
+def crop_image(img, target_size, center):
+ width, height = img.size
+ size = target_size
+ if center:
+ w_start = (width - size) // 2
+ h_start = (height - size) // 2
+ else:
+ w_start = np.random.randint(0, width - size + 1)
+ h_start = np.random.randint(0, height - size + 1)
+ w_end = w_start + size
+ h_end = h_start + size
+ img = img.crop((w_start, h_start, w_end, h_end))
+ return img
+
+
+def resize_short(img, target_size):
+ percent = float(target_size) / min(img.size[0], img.size[1])
+ resized_width = int(round(img.size[0] * percent))
+ resized_height = int(round(img.size[1] * percent))
+ img = img.resize((resized_width, resized_height), Image.LANCZOS)
+ return img
+
+
+def random_crop(img, size, scale=[0.08, 1.0], ratio=[3. / 4., 4. / 3.]):
+ aspect_ratio = math.sqrt(np.random.uniform(*ratio))
+ w = 1. * aspect_ratio
+ h = 1. / aspect_ratio
+
+ bound = min((float(img.size[0]) / img.size[1]) / (w**2),
+ (float(img.size[1]) / img.size[0]) / (h**2))
+ scale_max = min(scale[1], bound)
+ scale_min = min(scale[0], bound)
+
+ target_area = img.size[0] * img.size[1] * np.random.uniform(scale_min,
+ scale_max)
+ target_size = math.sqrt(target_area)
+ w = int(target_size * w)
+ h = int(target_size * h)
+
+ i = np.random.randint(0, img.size[0] - w + 1)
+ j = np.random.randint(0, img.size[1] - h + 1)
+
+ img = img.crop((i, j, i + w, j + h))
+ img = img.resize((size, size), Image.BILINEAR)
+ return img
+
+
+def distort_color(img):
+ def random_brightness(img, lower=0.5, upper=1.5):
+ e = np.random.uniform(lower, upper)
+ return ImageEnhance.Brightness(img).enhance(e)
+
+ def random_contrast(img, lower=0.5, upper=1.5):
+ e = np.random.uniform(lower, upper)
+ return ImageEnhance.Contrast(img).enhance(e)
+
+ def random_color(img, lower=0.5, upper=1.5):
+ e = np.random.uniform(lower, upper)
+ return ImageEnhance.Color(img).enhance(e)
+
+ ops = [random_brightness, random_contrast, random_color]
+ np.random.shuffle(ops)
+
+ img = ops[0](img)
+ img = ops[1](img)
+ img = ops[2](img)
+
+ return img
+
+
+def process_image(sample, mode, color_jitter, rotate):
+ img_path = sample[0]
+
+ img = Image.open(img_path)
+ if mode == 'train':
+ img = random_crop(img, IMAGENET_DIM)
+ if np.random.randint(0, 2) == 1:
+ img = img.transpose(Image.FLIP_LEFT_RIGHT)
+ if color_jitter:
+ img = distort_color(img)
+
+ else:
+ img = resize_short(img, target_size=256)
+ img = crop_image(img, target_size=IMAGENET_DIM, center=True)
+
+ if img.mode != 'RGB':
+ img = img.convert('RGB')
+
+ img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
+ img -= IMAGENET_MEAN
+ img /= IMAGENET_STD
+
+ if mode == 'train' or mode == 'val':
+ return img, np.array([sample[1]], dtype='int64')
+ elif mode == 'test':
+ return [img]
+
+
+def _reader_creator(file_list,
+ mode,
+ shuffle=False,
+ color_jitter=False,
+ rotate=False,
+ data_dir=None):
+ def reader():
+ try:
+ with open(file_list) as flist:
+ full_lines = [line.strip() for line in flist]
+ if shuffle:
+ np.random.shuffle(full_lines)
+ lines = full_lines
+ for line in lines:
+ if mode == 'train' or mode == 'val':
+ img_path, label = line.split()
+ img_path = os.path.join(data_dir, img_path)
+ yield img_path, int(label)
+ elif mode == 'test':
+ img_path = os.path.join(data_dir, line)
+ yield [img_path]
+ except Exception as e:
+ print("Reader failed!\n{}".format(str(e)))
+ os._exit(1)
+
+ mapper = functools.partial(
+ process_image, mode=mode, color_jitter=color_jitter, rotate=rotate)
+ return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
+
+
+def imagenet_reader(data_dir, mode):
+ if mode == 'train':
+ shuffle = True
+ suffix = 'train_list.txt'
+ elif mode == 'val':
+ shuffle = False
+ suffix = 'val_list.txt'
+ file_list = os.path.join(data_dir, suffix)
+ return _reader_creator(file_list, mode, shuffle=shuffle, data_dir=data_dir)
diff --git a/demo/darts/search.py b/demo/darts/search.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b4dcce2e6c6e8b0bc83f255fd45beb620c89c57
--- /dev/null
+++ b/demo/darts/search.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import ast
+import argparse
+import functools
+
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.base import to_variable
+import reader
+from model_search import Network
+from paddleslim.nas.darts import DARTSearch
+sys.path[0] = os.path.join(os.path.dirname(__file__), os.path.pardir)
+from utility import add_arguments, print_arguments
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+
+# yapf: disable
+add_arg('log_freq', int, 50, "Log frequency.")
+add_arg('use_multiprocess', bool, True, "Whether to use the multiprocess reader.")
+add_arg('batch_size', int, 64, "Minibatch size.")
+add_arg('learning_rate', float, 0.025, "The start learning rate.")
+add_arg('momentum', float, 0.9, "Momentum.")
+add_arg('use_gpu', bool, True, "Whether to use GPU.")
+add_arg('epochs', int, 50, "Epoch number.")
+add_arg('init_channels', int, 16, "Init channel number.")
+add_arg('layers', int, 8, "Total number of layers.")
+add_arg('class_num', int, 10, "Class number of dataset.")
+add_arg('trainset_num', int, 50000, "Number of images in the trainset.")
+add_arg('model_save_dir', str, 'search_cifar', "The path to save model.")
+add_arg('grad_clip', float, 5, "Gradient clipping.")
+add_arg('arch_learning_rate',float, 3e-4, "Learning rate for arch encoding.")
+add_arg('method', str, 'DARTS', "The search method you would like to use")
+add_arg('epochs_no_archopt', int, 0, "Number of initial epochs without architecture optimization.")
+add_arg('cutout_length', int, 16, "Cutout length.")
+add_arg('cutout', ast.literal_eval, False, "Whether use cutout.")
+add_arg('unrolled', ast.literal_eval, False, "Use one-step unrolled validation loss")
+add_arg('use_data_parallel', ast.literal_eval, False, "The flag indicating whether to use data parallel mode to train the model.")
+# yapf: enable
+
+
+def main(args):
+ if not args.use_gpu:
+ place = fluid.CPUPlace()
+ elif not args.use_data_parallel:
+ place = fluid.CUDAPlace(0)
+ else:
+ place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
+
+ train_reader, valid_reader = reader.train_search(
+ batch_size=args.batch_size,
+ train_portion=0.5,
+ is_shuffle=True,
+ args=args)
+
+ with fluid.dygraph.guard(place):
+ model = Network(args.init_channels, args.class_num, args.layers,
+ args.method)
+ searcher = DARTSearch(
+ model,
+ train_reader,
+ valid_reader,
+ place,
+ learning_rate=args.learning_rate,
+ batchsize=args.batch_size,
+ num_imgs=args.trainset_num,
+ arch_learning_rate=args.arch_learning_rate,
+ unrolled=args.unrolled,
+ num_epochs=args.epochs,
+ epochs_no_archopt=args.epochs_no_archopt,
+ use_multiprocess=args.use_multiprocess,
+ use_data_parallel=args.use_data_parallel,
+ save_dir=args.model_save_dir,
+ log_freq=args.log_freq)
+ searcher.train()
+
+
+if __name__ == '__main__':
+ args = parser.parse_args()
+ print_arguments(args)
+
+ main(args)
diff --git a/demo/darts/train.py b/demo/darts/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..39b31633dc27cc6999586c0a9db061da60d4cf5b
--- /dev/null
+++ b/demo/darts/train.py
@@ -0,0 +1,225 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import ast
+import logging
+import argparse
+import functools
+
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.base import to_variable
+from paddleslim.common import AvgrageMeter, get_logger
+from paddleslim.nas.darts import count_parameters_in_MB
+
+import genotypes
+import reader
+from model import NetworkCIFAR as Network
+sys.path[0] = os.path.join(os.path.dirname(__file__), os.path.pardir)
+from utility import add_arguments, print_arguments
+logger = get_logger(__name__, level=logging.INFO)
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+
+# yapf: disable
+add_arg('use_multiprocess', bool, True, "Whether to use the multiprocess reader.")
+add_arg('data', str, 'dataset/cifar10',"The dir of dataset.")
+add_arg('batch_size', int, 96, "Minibatch size.")
+add_arg('learning_rate', float, 0.025, "The start learning rate.")
+add_arg('momentum', float, 0.9, "Momentum.")
+add_arg('weight_decay', float, 3e-4, "Weight_decay.")
+add_arg('use_gpu', bool, True, "Whether to use GPU.")
+add_arg('epochs', int, 600, "Epoch number.")
+add_arg('init_channels', int, 36, "Init channel number.")
+add_arg('layers', int, 20, "Total number of layers.")
+add_arg('class_num', int, 10, "Class number of dataset.")
+add_arg('trainset_num', int, 50000, "Number of images in the trainset.")
+add_arg('model_save_dir', str, 'eval_cifar', "The path to save model.")
+add_arg('cutout', bool, True, "Whether to use cutout.")
+add_arg('cutout_length', int, 16, "Cutout length.")
+add_arg('auxiliary', bool, True, 'Use auxiliary tower.')
+add_arg('auxiliary_weight', float, 0.4, "Weight for auxiliary loss.")
+add_arg('drop_path_prob', float, 0.2, "Drop path probability.")
+add_arg('grad_clip', float, 5, "Gradient clipping.")
+add_arg('arch', str, 'DARTS_V2', "Which architecture to use")
+add_arg('log_freq', int, 50, 'Report frequency')
+add_arg('use_data_parallel', ast.literal_eval, False, "The flag indicating whether to use data parallel mode to train the model.")
+# yapf: enable
+
+
+def train(model, train_reader, optimizer, epoch, drop_path_prob, args):
+ objs = AvgrageMeter()
+ top1 = AvgrageMeter()
+ top5 = AvgrageMeter()
+ model.train()
+
+ for step_id, data in enumerate(train_reader()):
+ image_np, label_np = data
+ image = to_variable(image_np)
+ label = to_variable(label_np)
+ label.stop_gradient = True
+ logits, logits_aux = model(image, drop_path_prob, True)
+
+ prec1 = fluid.layers.accuracy(input=logits, label=label, k=1)
+ prec5 = fluid.layers.accuracy(input=logits, label=label, k=5)
+ loss = fluid.layers.reduce_mean(
+ fluid.layers.softmax_with_cross_entropy(logits, label))
+ if args.auxiliary:
+ loss_aux = fluid.layers.reduce_mean(
+ fluid.layers.softmax_with_cross_entropy(logits_aux, label))
+ loss = loss + args.auxiliary_weight * loss_aux
+
+ if args.use_data_parallel:
+ loss = model.scale_loss(loss)
+ loss.backward()
+ model.apply_collective_grads()
+ else:
+ loss.backward()
+
+ optimizer.minimize(loss)
+ model.clear_gradients()
+
+ n = image.shape[0]
+ objs.update(loss.numpy(), n)
+ top1.update(prec1.numpy(), n)
+ top5.update(prec5.numpy(), n)
+
+ if step_id % args.log_freq == 0:
+ logger.info(
+ "Train Epoch {}, Step {}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}".
+ format(epoch, step_id, objs.avg[0], top1.avg[0], top5.avg[0]))
+ return top1.avg[0]
+
+
+def valid(model, valid_reader, epoch, args):
+ objs = AvgrageMeter()
+ top1 = AvgrageMeter()
+ top5 = AvgrageMeter()
+ model.eval()
+
+ for step_id, data in enumerate(valid_reader()):
+ image_np, label_np = data
+ image = to_variable(image_np)
+ label = to_variable(label_np)
+ logits, _ = model(image, 0, False)
+ prec1 = fluid.layers.accuracy(input=logits, label=label, k=1)
+ prec5 = fluid.layers.accuracy(input=logits, label=label, k=5)
+ loss = fluid.layers.reduce_mean(
+ fluid.layers.softmax_with_cross_entropy(logits, label))
+
+ n = image.shape[0]
+ objs.update(loss.numpy(), n)
+ top1.update(prec1.numpy(), n)
+ top5.update(prec5.numpy(), n)
+ if step_id % args.log_freq == 0:
+ logger.info(
+ "Valid Epoch {}, Step {}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}".
+ format(epoch, step_id, objs.avg[0], top1.avg[0], top5.avg[0]))
+ return top1.avg[0]
+
+
+def main(args):
+ if not args.use_gpu:
+ place = fluid.CPUPlace()
+ elif not args.use_data_parallel:
+ place = fluid.CUDAPlace(0)
+ else:
+ place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
+
+ with fluid.dygraph.guard(place):
+ genotype = getattr(genotypes, args.arch)
+ model = Network(
+ C=args.init_channels,
+ num_classes=args.class_num,
+ layers=args.layers,
+ auxiliary=args.auxiliary,
+ genotype=genotype)
+
+ logger.info("param size = {:.6f}MB".format(
+ count_parameters_in_MB(model.parameters())))
+
+ device_num = fluid.dygraph.parallel.Env().nranks
+ step_per_epoch = int(args.trainset_num /
+ (args.batch_size * device_num))
+ learning_rate = fluid.dygraph.CosineDecay(args.learning_rate,
+ step_per_epoch, args.epochs)
+ clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=args.grad_clip)
+ optimizer = fluid.optimizer.MomentumOptimizer(
+ learning_rate,
+ momentum=args.momentum,
+ regularization=fluid.regularizer.L2Decay(args.weight_decay),
+ parameter_list=model.parameters(),
+ grad_clip=clip)
+
+ if args.use_data_parallel:
+ strategy = fluid.dygraph.parallel.prepare_context()
+ model = fluid.dygraph.parallel.DataParallel(model, strategy)
+
+ train_loader = fluid.io.DataLoader.from_generator(
+ capacity=64,
+ use_double_buffer=True,
+ iterable=True,
+ return_list=True,
+ use_multiprocess=args.use_multiprocess)
+ valid_loader = fluid.io.DataLoader.from_generator(
+ capacity=64,
+ use_double_buffer=True,
+ iterable=True,
+ return_list=True,
+ use_multiprocess=args.use_multiprocess)
+
+ train_reader = reader.train_valid(
+ batch_size=args.batch_size,
+ is_train=True,
+ is_shuffle=True,
+ args=args)
+ valid_reader = reader.train_valid(
+ batch_size=args.batch_size,
+ is_train=False,
+ is_shuffle=False,
+ args=args)
+ if args.use_data_parallel:
+ train_reader = fluid.contrib.reader.distributed_batch_reader(
+ train_reader)
+
+ train_loader.set_batch_generator(train_reader, places=place)
+ valid_loader.set_batch_generator(valid_reader, places=place)
+
+ save_parameters = (not args.use_data_parallel) or (
+ args.use_data_parallel and
+ fluid.dygraph.parallel.Env().local_rank == 0)
+ best_acc = 0
+ for epoch in range(args.epochs):
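+ # Linearly ramp the drop-path probability from 0 to its final value
+ # over training, as in the DARTS paper.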
+ drop_path_prob = args.drop_path_prob * epoch / args.epochs
+ logger.info('Epoch {}, lr {:.6f}'.format(
+ epoch, optimizer.current_step_lr()))
+ train_top1 = train(model, train_loader, optimizer, epoch,
+ drop_path_prob, args)
+ logger.info("Epoch {}, train_acc {:.6f}".format(epoch, train_top1))
+ valid_top1 = valid(model, valid_loader, epoch, args)
+ if valid_top1 > best_acc:
+ best_acc = valid_top1
+ if save_parameters:
+ fluid.save_dygraph(model.state_dict(),
+ args.model_save_dir + "/best_model")
+ logger.info("Epoch {}, valid_acc {:.6f}, best_valid_acc {:.6f}".
+ format(epoch, valid_top1, best_acc))
+
+
+if __name__ == '__main__':
+ args = parser.parse_args()
+ print_arguments(args)
+ main(args)
diff --git a/demo/darts/train_imagenet.py b/demo/darts/train_imagenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..09a90a96ba02173fba695aef030e04c88af6f5f9
--- /dev/null
+++ b/demo/darts/train_imagenet.py
@@ -0,0 +1,240 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import ast
+import logging
+import argparse
+import functools
+
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.base import to_variable
+from paddleslim.common import AvgrageMeter, get_logger
+from paddleslim.nas.darts import count_parameters_in_MB
+
+import genotypes
+import reader
+from model import NetworkImageNet as Network
+sys.path[0] = os.path.join(os.path.dirname(__file__), os.path.pardir)
+from utility import add_arguments, print_arguments
+logger = get_logger(__name__, level=logging.INFO)
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+
+# yapf: disable
+add_arg('use_multiprocess', bool, True, "Whether to use the multiprocess reader.")
+add_arg('num_workers', int, 4, "The multiprocess reader number.")
+add_arg('data_dir', str, 'dataset/ILSVRC2012',"The dir of dataset.")
+add_arg('batch_size', int, 128, "Minibatch size.")
+add_arg('learning_rate', float, 0.1, "The start learning rate.")
+add_arg('decay_rate', float, 0.97, "The lr decay rate.")
+add_arg('momentum', float, 0.9, "Momentum.")
+add_arg('weight_decay', float, 3e-5, "Weight_decay.")
+add_arg('use_gpu', bool, True, "Whether to use GPU.")
+add_arg('epochs', int, 250, "Epoch number.")
+add_arg('init_channels', int, 48, "Init channel number.")
+add_arg('layers', int, 14, "Total number of layers.")
+add_arg('class_num', int, 1000, "Class number of dataset.")
+add_arg('trainset_num', int, 1281167, "Number of images in the trainset.")
+add_arg('model_save_dir', str, 'eval_imagenet', "The path to save model.")
+add_arg('auxiliary', bool, True, 'Use auxiliary tower.')
+add_arg('auxiliary_weight', float, 0.4, "Weight for auxiliary loss.")
+add_arg('drop_path_prob', float, 0.0, "Drop path probability.")
+add_arg('dropout', float, 0.0, "Dropout probability.")
+add_arg('grad_clip', float, 5, "Gradient clipping.")
+add_arg('label_smooth', float, 0.1, "Label smoothing.")
+add_arg('arch', str, 'DARTS_V2', "Which architecture to use")
+add_arg('log_freq', int, 100, 'Report frequency')
+add_arg('use_data_parallel', ast.literal_eval, False, "The flag indicating whether to use data parallel mode to train the model.")
+# yapf: enable
+
+
+def cross_entropy_label_smooth(preds, targets, epsilon):
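+ # Label smoothing: mix the one-hot target with a uniform distribution
+ # controlled by epsilon. Note this reads the module-level `args` for
+ # the class count.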
+ preds = fluid.layers.softmax(preds)
+ targets_one_hot = fluid.one_hot(input=targets, depth=args.class_num)
+ targets_smooth = fluid.layers.label_smooth(
+ targets_one_hot, epsilon=epsilon, dtype="float32")
+ loss = fluid.layers.cross_entropy(
+ input=preds, label=targets_smooth, soft_label=True)
+ return loss
+
+
+def train(model, train_reader, optimizer, epoch, args):
+ objs = AvgrageMeter()
+ top1 = AvgrageMeter()
+ top5 = AvgrageMeter()
+ model.train()
+
+ for step_id, data in enumerate(train_reader()):
+ image_np, label_np = data
+ image = to_variable(image_np)
+ label = to_variable(label_np)
+ label.stop_gradient = True
+ logits, logits_aux = model(image, True)
+
+ prec1 = fluid.layers.accuracy(input=logits, label=label, k=1)
+ prec5 = fluid.layers.accuracy(input=logits, label=label, k=5)
+ loss = fluid.layers.reduce_mean(
+ cross_entropy_label_smooth(logits, label, args.label_smooth))
+
+ if args.auxiliary:
+ loss_aux = fluid.layers.reduce_mean(
+ cross_entropy_label_smooth(logits_aux, label,
+ args.label_smooth))
+ loss = loss + args.auxiliary_weight * loss_aux
+
+ if args.use_data_parallel:
+ loss = model.scale_loss(loss)
+ loss.backward()
+ model.apply_collective_grads()
+ else:
+ loss.backward()
+
+ optimizer.minimize(loss)
+ model.clear_gradients()
+
+ n = image.shape[0]
+ objs.update(loss.numpy(), n)
+ top1.update(prec1.numpy(), n)
+ top5.update(prec5.numpy(), n)
+
+ if step_id % args.log_freq == 0:
+ logger.info(
+ "Train Epoch {}, Step {}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}".
+ format(epoch, step_id, objs.avg[0], top1.avg[0], top5.avg[0]))
+ return top1.avg[0], top5.avg[0]
+
+
+def valid(model, valid_reader, epoch, args):
+ objs = AvgrageMeter()
+ top1 = AvgrageMeter()
+ top5 = AvgrageMeter()
+ model.eval()
+
+ for step_id, data in enumerate(valid_reader()):
+ image_np, label_np = data
+ image = to_variable(image_np)
+ label = to_variable(label_np)
+ logits, _ = model(image, False)
+ prec1 = fluid.layers.accuracy(input=logits, label=label, k=1)
+ prec5 = fluid.layers.accuracy(input=logits, label=label, k=5)
+ loss = fluid.layers.reduce_mean(
+ cross_entropy_label_smooth(logits, label, args.label_smooth))
+
+ n = image.shape[0]
+ objs.update(loss.numpy(), n)
+ top1.update(prec1.numpy(), n)
+ top5.update(prec5.numpy(), n)
+ if step_id % args.log_freq == 0:
+ logger.info(
+ "Valid Epoch {}, Step {}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}".
+ format(epoch, step_id, objs.avg[0], top1.avg[0], top5.avg[0]))
+ return top1.avg[0], top5.avg[0]
+
+
+def main(args):
+    if not args.use_gpu:
+        place = fluid.CPUPlace()
+    else:
+        place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id
+                                if args.use_data_parallel else 0)
+
+ with fluid.dygraph.guard(place):
+        genotype = getattr(genotypes, args.arch)
+ model = Network(
+ C=args.init_channels,
+ num_classes=args.class_num,
+ layers=args.layers,
+ auxiliary=args.auxiliary,
+ genotype=genotype)
+
+ logger.info("param size = {:.6f}MB".format(
+ count_parameters_in_MB(model.parameters())))
+
+ device_num = fluid.dygraph.parallel.Env().nranks
+ step_per_epoch = int(args.trainset_num /
+ (args.batch_size * device_num))
+ learning_rate = fluid.dygraph.ExponentialDecay(
+ args.learning_rate,
+ step_per_epoch,
+ args.decay_rate,
+ staircase=True)
+
+ clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=args.grad_clip)
+ optimizer = fluid.optimizer.MomentumOptimizer(
+ learning_rate,
+ momentum=args.momentum,
+ regularization=fluid.regularizer.L2Decay(args.weight_decay),
+ parameter_list=model.parameters(),
+ grad_clip=clip)
+
+ if args.use_data_parallel:
+ strategy = fluid.dygraph.parallel.prepare_context()
+ model = fluid.dygraph.parallel.DataParallel(model, strategy)
+
+ train_loader = fluid.io.DataLoader.from_generator(
+ capacity=64,
+ use_double_buffer=True,
+ iterable=True,
+ return_list=True)
+ valid_loader = fluid.io.DataLoader.from_generator(
+ capacity=64,
+ use_double_buffer=True,
+ iterable=True,
+ return_list=True)
+
+ train_reader = fluid.io.batch(
+ reader.imagenet_reader(args.data_dir, 'train'),
+ batch_size=args.batch_size,
+ drop_last=True)
+ valid_reader = fluid.io.batch(
+ reader.imagenet_reader(args.data_dir, 'val'),
+ batch_size=args.batch_size)
+ if args.use_data_parallel:
+ train_reader = fluid.contrib.reader.distributed_batch_reader(
+ train_reader)
+
+ train_loader.set_sample_list_generator(train_reader, places=place)
+ valid_loader.set_sample_list_generator(valid_reader, places=place)
+
+ save_parameters = (not args.use_data_parallel) or (
+ args.use_data_parallel and
+ fluid.dygraph.parallel.Env().local_rank == 0)
+ best_top1 = 0
+ for epoch in range(args.epochs):
+ logger.info('Epoch {}, lr {:.6f}'.format(
+ epoch, optimizer.current_step_lr()))
+ train_top1, train_top5 = train(model, train_loader, optimizer,
+ epoch, args)
+ logger.info("Epoch {}, train_top1 {:.6f}, train_top5 {:.6f}".
+ format(epoch, train_top1, train_top5))
+ valid_top1, valid_top5 = valid(model, valid_loader, epoch, args)
+ if valid_top1 > best_top1:
+ best_top1 = valid_top1
+ if save_parameters:
+ fluid.save_dygraph(model.state_dict(),
+ args.model_save_dir + "/best_model")
+            logger.info(
+                "Epoch {}, valid_top1 {:.6f}, valid_top5 {:.6f}, best_valid_top1 {:.6f}".
+                format(epoch, valid_top1, valid_top5, best_top1))
+
+
+if __name__ == '__main__':
+ args = parser.parse_args()
+ print_arguments(args)
+
+ main(args)
diff --git a/demo/darts/visualize.py b/demo/darts/visualize.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c12a0d2b3061dcc417d1151b045aa04b1ebba34
--- /dev/null
+++ b/demo/darts/visualize.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+import genotypes
+from graphviz import Digraph
+
+
+def plot(genotype_normal, genotype_reduce, filename):
+ g = Digraph(
+ format='png',
+ edge_attr=dict(fontname="times"),
+ node_attr=dict(
+ style='filled',
+ shape='ellipse',
+ align='center',
+ height='0.5',
+ width='0.5',
+ penwidth='2',
+ fontname="times"),
+ engine='dot')
+
+ g.body.extend(['rankdir=LR'])
+
+ g.node("reduce_c_{k-2}", fillcolor='darkseagreen2')
+ g.node("reduce_c_{k-1}", fillcolor='darkseagreen2')
+ g.node("normal_c_{k-2}", fillcolor='darkseagreen2')
+ g.node("normal_c_{k-1}", fillcolor='darkseagreen2')
+ assert len(genotype_normal) % 2 == 0
+ steps = len(genotype_normal) // 2
+
+ for i in range(steps):
+ g.node('n_' + str(i), fillcolor='lightblue')
+ for i in range(steps):
+ g.node('r_' + str(i), fillcolor='lightblue')
+
+ for i in range(steps):
+ for k in [2 * i, 2 * i + 1]:
+ op, j = genotype_normal[k]
+ if j == 0:
+ u = "normal_c_{k-2}"
+ elif j == 1:
+ u = "normal_c_{k-1}"
+ else:
+ u = 'n_' + str(j - 2)
+ v = 'n_' + str(i)
+ g.edge(u, v, label=op, fillcolor="gray")
+
+ for i in range(steps):
+ for k in [2 * i, 2 * i + 1]:
+ op, j = genotype_reduce[k]
+ if j == 0:
+ u = "reduce_c_{k-2}"
+ elif j == 1:
+ u = "reduce_c_{k-1}"
+ else:
+ u = 'r_' + str(j - 2)
+ v = 'r_' + str(i)
+ g.edge(u, v, label=op, fillcolor="gray")
+
+ g.node("r_c_{k}", fillcolor='palegoldenrod')
+ for i in range(steps):
+ g.edge('r_' + str(i), "r_c_{k}", fillcolor="gray")
+ g.node("n_c_{k}", fillcolor='palegoldenrod')
+ for i in range(steps):
+ g.edge('n_' + str(i), "n_c_{k}", fillcolor="gray")
+ g.render(filename, view=False)
+
+
+if __name__ == '__main__':
+ if len(sys.argv) != 2:
+ print("usage:\n python {} ARCH_NAME".format(sys.argv[0]))
+ sys.exit(1)
+ genotype_name = sys.argv[1]
+
+ try:
+        genotype = getattr(genotypes, genotype_name)
+ except AttributeError:
+ print("{} is not specified in genotypes.py".format(genotype_name))
+ sys.exit(1)
+
+ plot(genotype.normal, genotype.reduce, genotype_name)
diff --git a/demo/deep_mutual_learning/README.md b/demo/deep_mutual_learning/README.md
new file mode 100755
index 0000000000000000000000000000000000000000..160fb3d0a3ad2ef4c121becca794402d6ef8e397
--- /dev/null
+++ b/demo/deep_mutual_learning/README.md
@@ -0,0 +1,39 @@
+# Deep Mutual Learning (DML)
+This example shows how to train models with PaddleSlim's Deep Mutual Learning (DML) method. For the algorithm itself, please refer to the paper [Deep Mutual Learning](https://arxiv.org/abs/1706.00384).
+
+![dml_architect](./images/dml_architect.png)
+
+## Data
+
+The example trains on the CIFAR-100 dataset. You can either wait for it to be downloaded automatically when training starts,
+or download the [dataset](https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz) yourself and place it under `./dataset/cifar` in the current directory.
+
+## Commands
+
+### Training a MobileNet-MobileNet pair
+
+Using GPU 0 as an example:
+
+```bash
+CUDA_VISIBLE_DEVICES=0 python dml_train.py
+```
+### Training a MobileNet-ResNet50 pair
+
+Using GPU 0 as an example:
+
+```bash
+CUDA_VISIBLE_DEVICES=0 python dml_train.py --models='mobilenet-resnet50'
+```
+
+
+## Results
+
+The results below are obtained with the default configuration (learning rate, optimizer, etc.); only the model combination used for DML training was changed.
+
+To improve the results further, try [more training tricks](https://arxiv.org/abs/1812.01187) or increase the number of models trained together in one DML run. A minimal usage sketch follows the table.
+
+| Dataset | Models | Independently trained accuracy | DML accuracy |
+| ------ | ------ | ------ | ------ |
+| CIFAR100 | MobileNet X 2 | 73.65% | 76.34% (+2.69%) |
+| CIFAR100 | MobileNet X 4 | 73.65% | 76.56% (+2.91%) |
+| CIFAR100 | MobileNet + ResNet50 | 73.65%/76.52% | 76.00%/77.80% (+2.35%/+1.28%) |
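+
+The core usage, condensed from `dml_train.py` in this directory (`models`, `optimizers` and `train_loader` are assumed to be built as in the demo script):
+
+```python
+from paddleslim.dist import DML
+
+dml_model = DML(models, False)              # second argument: use_parallel
+dml_optimizer = dml_model.opt(optimizers)   # wrap the per-model optimizers
+
+dml_model.train()
+for images, labels in train_loader:
+    logits = dml_model.forward(images)       # one logits tensor per model
+    losses = dml_model.loss(logits, labels)  # CE plus mutual-learning loss per model
+    dml_optimizer.minimize(losses)           # update every model in the cohort
+```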
diff --git a/demo/deep_mutual_learning/cifar100_reader.py b/demo/deep_mutual_learning/cifar100_reader.py
new file mode 100755
index 0000000000000000000000000000000000000000..6009e5fb468314b21e324d1a7d10d8f94e3ec08f
--- /dev/null
+++ b/demo/deep_mutual_learning/cifar100_reader.py
@@ -0,0 +1,124 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import math
+import random
+import tarfile
+import functools
+import numpy as np
+from PIL import Image, ImageOps, ImageEnhance
+import paddle
+# for python2/python3 compatibility
+try:
+    import cPickle
+except ImportError:
+    import _pickle as cPickle
+
+IMAGE_SIZE = 32
+IMAGE_DEPTH = 3
+CIFAR_MEAN = [0.5070751592371323, 0.48654887331495095, 0.4409178433670343]
+CIFAR_STD = [0.2673342858792401, 0.2564384629170883, 0.27615047132568404]
+
+URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
+CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
+CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
+paddle.dataset.common.DATA_HOME = "dataset/"
+
+
+def preprocess(sample, is_training):
+ image_array = sample.reshape(IMAGE_DEPTH, IMAGE_SIZE, IMAGE_SIZE)
+ rgb_array = np.transpose(image_array, (1, 2, 0))
+ img = Image.fromarray(rgb_array, 'RGB')
+
+ if is_training:
+        # pad, random crop, random_flip_left_right, random_rotation
+ img = ImageOps.expand(img, (4, 4, 4, 4), fill=0)
+ left_top = np.random.randint(8, size=2)
+ img = img.crop((left_top[1], left_top[0], left_top[1] + IMAGE_SIZE,
+ left_top[0] + IMAGE_SIZE))
+ if np.random.randint(2):
+ img = img.transpose(Image.FLIP_LEFT_RIGHT)
+ random_angle = np.random.randint(-15, 15)
+ img = img.rotate(random_angle, Image.NEAREST)
+ img = np.array(img).astype(np.float32)
+
+ img_float = img / 255.0
+ img = (img_float - CIFAR_MEAN) / CIFAR_STD
+
+ img = np.transpose(img, (2, 0, 1))
+ return img
+
+
+def reader_generator(datasets, batch_size, is_training, is_shuffle):
+ def read_batch(datasets):
+ if is_shuffle:
+ random.shuffle(datasets)
+ for im, label in datasets:
+ im = preprocess(im, is_training)
+ yield im, [int(label)]
+
+ def reader():
+ batch_data = []
+ batch_label = []
+ for data in read_batch(datasets):
+ batch_data.append(data[0])
+ batch_label.append(data[1])
+ if len(batch_data) == batch_size:
+ batch_data = np.array(batch_data, dtype='float32')
+ batch_label = np.array(batch_label, dtype='int64')
+ batch_out = [batch_data, batch_label]
+ yield batch_out
+ batch_data = []
+ batch_label = []
+
+ return reader
+
+
+def cifar100_reader(file_name, data_name, is_shuffle):
+ with tarfile.open(file_name, mode='r') as f:
+ names = [
+ each_item.name for each_item in f if data_name in each_item.name
+ ]
+ names.sort()
+ datasets = []
+ for name in names:
+ print("Reading file " + name)
+            try:
+                # Python 3: decode the Python 2 pickle using latin-1
+                batch = cPickle.load(f.extractfile(name), encoding='iso-8859-1')
+            except TypeError:
+                # Python 2's cPickle.load takes no encoding argument
+                batch = cPickle.load(f.extractfile(name))
+ data = batch['data']
+ labels = batch.get('labels', batch.get('fine_labels', None))
+ assert labels is not None
+ dataset = zip(data, labels)
+ datasets.extend(dataset)
+ if is_shuffle:
+ random.shuffle(datasets)
+ return datasets
+
+
+def train_valid(batch_size, is_train, is_shuffle):
+ name = 'train' if is_train else 'test'
+ datasets = cifar100_reader(
+ paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+ name, is_shuffle)
+ reader = reader_generator(datasets, batch_size, is_train, is_shuffle)
+ return reader
diff --git a/demo/deep_mutual_learning/dml_train.py b/demo/deep_mutual_learning/dml_train.py
new file mode 100755
index 0000000000000000000000000000000000000000..f6d8dffb182372390d1842054311f0c09567aff5
--- /dev/null
+++ b/demo/deep_mutual_learning/dml_train.py
@@ -0,0 +1,213 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import argparse
+import functools
+import logging
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.base import to_variable
+from paddleslim.common import AvgrageMeter, get_logger
+from paddleslim.dist import DML
+from paddleslim.models.dygraph import MobileNetV1
+from paddleslim.models.dygraph import ResNet
+import cifar100_reader as reader
+sys.path[0] = os.path.join(os.path.dirname(__file__), os.path.pardir)
+from utility import add_arguments, print_arguments
+
+logger = get_logger(__name__, level=logging.INFO)
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+
+# yapf: disable
+add_arg('log_freq', int, 100, "Log frequency.")
+add_arg('models', str, "mobilenet-mobilenet", "Which model pair to train, e.g. 'mobilenet-resnet50'.")
+add_arg('batch_size', int, 256, "Minibatch size.")
+add_arg('init_lr', float, 0.1, "The initial learning rate.")
+add_arg('use_gpu', bool, True, "Whether to use GPU.")
+add_arg('epochs', int, 200, "Epoch number.")
+add_arg('class_num', int, 100, "Class number of dataset.")
+add_arg('trainset_num', int, 50000, "Number of images in the training set.")
+add_arg('model_save_dir', str, 'saved_models', "The path to save model.")
+add_arg('use_parallel', bool, False, "Whether to use data parallel mode to train the model.")
+# yapf: enable
+
+
+def create_optimizer(models, args):
+ device_num = fluid.dygraph.parallel.Env().nranks
+ step = int(args.trainset_num / (args.batch_size * device_num))
+ epochs = [60, 120, 180]
+ bd = [step * e for e in epochs]
+ lr = [args.init_lr * (0.1**i) for i in range(len(bd) + 1)]
+
+ optimizers = []
+ for cur_model in models:
+ learning_rate = fluid.dygraph.PiecewiseDecay(bd, lr, 0)
+ opt = fluid.optimizer.MomentumOptimizer(
+ learning_rate,
+ 0.9,
+ parameter_list=cur_model.parameters(),
+ use_nesterov=True,
+ regularization=fluid.regularizer.L2DecayRegularizer(5e-4))
+ optimizers.append(opt)
+ return optimizers
+
+
+def create_reader(place, args):
+ train_reader = reader.train_valid(
+ batch_size=args.batch_size, is_train=True, is_shuffle=True)
+ valid_reader = reader.train_valid(
+ batch_size=args.batch_size, is_train=False, is_shuffle=False)
+ if args.use_parallel:
+ train_reader = fluid.contrib.reader.distributed_batch_reader(
+ train_reader)
+ train_loader = fluid.io.DataLoader.from_generator(
+ capacity=1024, return_list=True)
+ valid_loader = fluid.io.DataLoader.from_generator(
+ capacity=1024, return_list=True)
+ train_loader.set_batch_generator(train_reader, places=place)
+ valid_loader.set_batch_generator(valid_reader, places=place)
+ return train_loader, valid_loader
+
+
+def train(train_loader, dml_model, dml_optimizer, args):
+ dml_model.train()
+ costs = [AvgrageMeter() for i in range(dml_model.model_num)]
+ accs = [AvgrageMeter() for i in range(dml_model.model_num)]
+ for step_id, (images, labels) in enumerate(train_loader):
+ images, labels = to_variable(images), to_variable(labels)
+ batch_size = images.shape[0]
+
+ logits = dml_model.forward(images)
+ precs = [
+ fluid.layers.accuracy(
+ input=l, label=labels, k=1) for l in logits
+ ]
+ losses = dml_model.loss(logits, labels)
+ dml_optimizer.minimize(losses)
+
+ for i in range(dml_model.model_num):
+ accs[i].update(precs[i].numpy(), batch_size)
+ costs[i].update(losses[i].numpy(), batch_size)
+ model_names = dml_model.full_name()
+ if step_id % args.log_freq == 0:
+ log_msg = "Train Step {}".format(step_id)
+ for model_id, (cost, acc) in enumerate(zip(costs, accs)):
+ log_msg += ", {} loss: {:.6f} acc: {:.6f}".format(
+ model_names[model_id], cost.avg[0], acc.avg[0])
+ logger.info(log_msg)
+ return costs, accs
+
+
+def valid(valid_loader, dml_model, args):
+ dml_model.eval()
+ costs = [AvgrageMeter() for i in range(dml_model.model_num)]
+ accs = [AvgrageMeter() for i in range(dml_model.model_num)]
+ for step_id, (images, labels) in enumerate(valid_loader):
+ images, labels = to_variable(images), to_variable(labels)
+ batch_size = images.shape[0]
+
+ logits = dml_model.forward(images)
+ precs = [
+ fluid.layers.accuracy(
+ input=l, label=labels, k=1) for l in logits
+ ]
+ losses = dml_model.loss(logits, labels)
+
+ for i in range(dml_model.model_num):
+ accs[i].update(precs[i].numpy(), batch_size)
+ costs[i].update(losses[i].numpy(), batch_size)
+ model_names = dml_model.full_name()
+ if step_id % args.log_freq == 0:
+            log_msg = "Valid Step {}".format(step_id)
+ for model_id, (cost, acc) in enumerate(zip(costs, accs)):
+ log_msg += ", {} loss: {:.6f} acc: {:.6f}".format(
+ model_names[model_id], cost.avg[0], acc.avg[0])
+ logger.info(log_msg)
+ return costs, accs
+
+
+def main(args):
+ if not args.use_gpu:
+ place = fluid.CPUPlace()
+ elif not args.use_parallel:
+ place = fluid.CUDAPlace(0)
+ else:
+ place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
+
+ with fluid.dygraph.guard(place):
+ # 1. Define data reader
+ train_loader, valid_loader = create_reader(place, args)
+
+ # 2. Define neural network
+ if args.models == "mobilenet-mobilenet":
+ models = [
+ MobileNetV1(class_dim=args.class_num),
+ MobileNetV1(class_dim=args.class_num)
+ ]
+ elif args.models == "mobilenet-resnet50":
+ models = [
+ MobileNetV1(class_dim=args.class_num),
+ ResNet(class_dim=args.class_num)
+ ]
+        else:
+            logger.info("Unsupported models: {}; define your own "
+                        "combination here.".format(args.models))
+            return
+ optimizers = create_optimizer(models, args)
+
+ # 3. Use PaddleSlim DML strategy
+ dml_model = DML(models, args.use_parallel)
+ dml_optimizer = dml_model.opt(optimizers)
+
+ # 4. Train your network
+ save_parameters = (not args.use_parallel) or (
+ args.use_parallel and fluid.dygraph.parallel.Env().local_rank == 0)
+ best_valid_acc = [0] * dml_model.model_num
+ for epoch_id in range(args.epochs):
+ current_step_lr = dml_optimizer.get_lr()
+ lr_msg = "Epoch {}".format(epoch_id)
+ for model_id, lr in enumerate(current_step_lr):
+ lr_msg += ", {} lr: {:.6f}".format(
+ dml_model.full_name()[model_id], lr)
+ logger.info(lr_msg)
+ train_losses, train_accs = train(train_loader, dml_model,
+ dml_optimizer, args)
+ valid_losses, valid_accs = valid(valid_loader, dml_model, args)
+ for i in range(dml_model.model_num):
+ if valid_accs[i].avg[0] > best_valid_acc[i]:
+ best_valid_acc[i] = valid_accs[i].avg[0]
+ if save_parameters:
+ fluid.save_dygraph(
+ models[i].state_dict(),
+ os.path.join(args.model_save_dir,
+ dml_model.full_name()[i],
+ "best_model"))
+                summary_msg = "Epoch {} {}: valid_loss {:.6f}, valid_acc {:.6f}, best_valid_acc {:.6f}"
+                logger.info(
+                    summary_msg.format(epoch_id,
+                                       dml_model.full_name()[i],
+                                       valid_losses[i].avg[0],
+                                       valid_accs[i].avg[0],
+                                       best_valid_acc[i]))
+
+
+if __name__ == '__main__':
+ args = parser.parse_args()
+ print_arguments(args)
+ main(args)
diff --git a/demo/deep_mutual_learning/images/dml_architect.png b/demo/deep_mutual_learning/images/dml_architect.png
new file mode 100755
index 0000000000000000000000000000000000000000..24f257aa81b3d95ac9e78c1508315ed90992e9f4
Binary files /dev/null and b/demo/deep_mutual_learning/images/dml_architect.png differ
diff --git a/demo/detection/README.md b/demo/detection/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f99d87154ecea4987a1f1621ce24080e04227a3c
--- /dev/null
+++ b/demo/detection/README.md
@@ -0,0 +1,231 @@
+# SlimDet Model Zoo
+
+PaddleDetection provides complete tutorials and benchmark results for model compression based on PaddleSlim. For the detailed tutorials, please refer to:
+
+- [Quantization](https://github.com/PaddlePaddle/PaddleDetection/tree/release/0.4/slim/quantization)
+- [Pruning](https://github.com/PaddlePaddle/PaddleDetection/tree/release/0.4/slim/prune)
+- [Distillation](https://github.com/PaddlePaddle/PaddleDetection/tree/release/0.4/slim/distillation)
+- [NAS](https://github.com/PaddlePaddle/PaddleDetection/tree/release/0.4/slim/nas)
+
+The compression benchmark results are given below.
+
+## Test environment
+
+- Python 2.7.1
+- PaddlePaddle >=1.6
+- CUDA 9.0
+- cuDNN >=7.4
+- NCCL 2.1.2
+
+## Pruned model zoo
+
+### Training strategy
+
+- Pruned models use the weights released in the [PaddleDetection model zoo](https://paddledetection.readthedocs.io/MODEL_ZOO_cn.html) as pretrained weights.
+- Pruned training keeps each model's default configuration; nothing changes except `pretrained_weights`.
+- All pruned models use sensitivity-based convolution channel pruning.
+- YOLOv3 models are pruned mainly in the `yolo_head` part, i.e. the pruned parameters are as follows.
+
+```
+--pruned_params="yolo_block.0.0.0.conv.weights,yolo_block.0.0.1.conv.weights,yolo_block.0.1.0.conv.weights,yolo_block.0.1.1.conv.weights,yolo_block.0.2.conv.weights,yolo_block.0.tip.conv.weights,yolo_block.1.0.0.conv.weights,yolo_block.1.0.1.conv.weights,yolo_block.1.1.0.conv.weights,yolo_block.1.1.1.conv.weights,yolo_block.1.2.conv.weights,yolo_block.1.tip.conv.weights,yolo_block.2.0.0.conv.weights,yolo_block.2.0.1.conv.weights,yolo_block.2.1.0.conv.weights,yolo_block.2.1.1.conv.weights,yolo_block.2.2.conv.weights,yolo_block.2.tip.conv.weights"
+```
+- In YOLOv3 pruning, the strategy `r578` prunes the three output branches of `yolo_head` with pruned ratios `0.5, 0.7, 0.8` respectively, i.e. the pruned ratios are as follows.
+
+```
+--pruned_ratios="0.5,0.5,0.5,0.5,0.5,0.5,0.7,0.7,0.7,0.7,0.7,0.7,0.8,0.8,0.8,0.8,0.8,0.8"
+```
+
+- In YOLOv3 pruning, the strategy `sensity` uses the per-parameter pruned ratios below, obtained from a sensitivity analysis of the `yolov3_mobilenet_v1` model on the COCO dataset (a sketch of such an analysis follows the code block below).
+
+```
+--pruned_ratios="0.1,0.2,0.2,0.2,0.2,0.1,0.2,0.3,0.3,0.3,0.2,0.1,0.3,0.4,0.4,0.4,0.4,0.3"
+```
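+
+A minimal sketch of deriving such per-parameter ratios with PaddleSlim's sensitivity tools (`eval_program`, `place`, `pruned_params` and `eval_func` are assumed to be prepared beforehand; the file name is illustrative):
+
+```python
+from paddleslim.prune import sensitivity, get_ratios_by_loss
+
+# eval_func(program) should return the evaluation metric (e.g. mAP);
+# pruned_params lists the yolo_head convolution weights shown above
+sens = sensitivity(
+    eval_program,
+    place,
+    pruned_params,
+    eval_func,
+    sensitivities_file="yolov3_mobilenet_v1.sensitivities")
+
+# pick per-parameter ratios whose estimated metric drop stays under 5%
+ratios = get_ratios_by_loss(sens, 0.05)
+```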
+
+### YOLOv3 on COCO
+
+| Backbone | Pruning strategy | GFLOPs | Model size (MB) | Input size | Box AP | Download |
+| :----------------| :-------: | :------------: | :-------------: | :------: | :--------: | :-----------------------------------------------------: |
+| ResNet50-vd-dcn | baseline | 44.71 | 176.82 | 608 | 39.1 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r50vd_dcn.tar) |
+| ResNet50-vd-dcn | sensity | 37.53(-16.06%) | 149.49(-15.46%) | 608 | 39.8(+0.7) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_r50vd_dcn_prune1x.tar) |
+| ResNet50-vd-dcn | r578 | 29.98(-32.94%) | 112.08(-36.61%) | 608 | 38.3(-0.8) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_r50vd_dcn_prune578.tar) |
+| MobileNetV1 | baseline | 20.64 | 94.60 | 608 | 29.3 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) |
+| MobileNetV1 | baseline | 9.66 | 94.60 | 416 | 29.3 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) |
+| MobileNetV1 | baseline | 5.72 | 94.60 | 320 | 27.1 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) |
+| MobileNetV1 | sensity | 13.57(-34.27%) | 67.60(-28.54%) | 608 | 30.2(+0.9) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_mobilenet_v1_prune1x.tar) |
+| MobileNetV1 | sensity | 6.35(-34.27%) | 67.60(-28.54%) | 416 | 29.7(+0.4) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_mobilenet_v1_prune1x.tar) |
+| MobileNetV1 | sensity | 3.76(-34.27%) | 67.60(-28.54%) | 320 | 27.2(+0.1) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_mobilenet_v1_prune1x.tar) |
+| MobileNetV1 | r578 | 6.27(-69.64%) | 31.30(-66.90%) | 608 | 27.8(-1.5) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_mobilenet_v1_prune578.tar) |
+| MobileNetV1 | r578 | 2.93(-69.64%) | 31.30(-66.90%) | 416 | 26.8(-2.5) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_mobilenet_v1_prune578.tar) |
+| MobileNetV1 | r578 | 1.74(-69.64%) | 31.30(-66.90%) | 320 | 24.0(-3.1) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_mobilenet_v1_prune578.tar) |
+| MobileNetV3 | r578 | - | 17.0(-81.11%) | 320 | 24.6(-2.5) | [Download](https://paddlemodels.bj.bcebos.com/object_detection/mobile_models/yolov3_mobilenet_v3_prune75875_FPGM_distillby_r34.pdparams) |
+
+- YOLOv3 randomly reshapes minibatches during training, so the same weights can run at different input sizes; the `YOLOv3-MobileNetV1` rows report accuracy at the three input sizes `608/416/320`.
+- With the `sensity` pruning strategy, `YOLOv3-ResNet50-vd-dcn` and `YOLOv3-MobileNetV1` reduce FLOPs by `16.06%` and `34.27%` respectively, while accuracy at input size 608 improves by `0.7` and `0.9`.
+- With the `r578` pruning strategy, `YOLOv3-ResNet50-vd-dcn` and `YOLOv3-MobileNetV1` reduce FLOPs by `32.94%` and `69.64%` respectively, while accuracy at input size 608 drops by `0.8` and `1.5`.
+- For the MobileNetV3-YOLOv3 pruning strategy, please refer to the [MV3-YOLOv3 pruning notes](https://github.com/PaddlePaddle/PaddleDetection/tree/release/0.4/configs/mobile#yolov3%E5%89%AA%E8%A3%81%E8%AF%B4%E6%98%8E)
+
+### YOLOv3 on Pascal VOC
+
+| Backbone | Pruning strategy | GFLOPs | Model size (MB) | Input size | Box AP | Download |
+| :----------------| :-------: | :------------: | :-------------: | :------: | :--------: | :-----------------------------------------------------: |
+| MobileNetV1 | baseline | 20.20 | 93.37 | 608 | 76.2 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) |
+| MobileNetV1 | baseline | 9.46 | 93.37 | 416 | 76.7 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) |
+| MobileNetV1 | baseline | 5.60 | 93.37 | 320 | 75.3 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) |
+| MobileNetV1 | sensity | 13.22(-34.55%) | 66.53(-28.74%) | 608 | 78.4(+2.2) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_mobilenet_v1_voc_prune1x.tar) |
+| MobileNetV1 | sensity | 6.19(-34.55%) | 66.53(-28.74%) | 416 | 78.7(+2.0) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_mobilenet_v1_voc_prune1x.tar) |
+| MobileNetV1 | sensity | 3.66(-34.55%) | 66.53(-28.74%) | 320 | 76.1(+0.8) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_mobilenet_v1_voc_prune1x.tar) |
+| MobileNetV1 | r578 | 6.15(-69.57%) | 30.81(-67.00%) | 608 | 77.6(+1.4) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_mobilenet_v1_voc_prune578.tar) |
+| MobileNetV1 | r578 | 2.88(-69.57%) | 30.81(-67.00%) | 416 | 77.7(+1.0) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_mobilenet_v1_voc_prune578.tar) |
+| MobileNetV1 | prune+distill | 1.70(-69.57%) | 30.81(-67.00%) | 320 | 75.5(+0.2) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_mobilenet_v1_voc_prune578.tar) |
+
+- YOLOv3 randomly reshapes minibatches during training, so the same weights can run at different input sizes; the `YOLOv3-MobileNetV1` rows report accuracy at the three input sizes `608/416/320`.
+- With the `sensity` and `r578` pruning strategies, `YOLOv3-MobileNetV1` reduces FLOPs by `34.55%` and `69.57%` respectively, while accuracy at input size 608 improves by `2.2` and `1.4`.
+
+### Channel-pruned models with distillation
+
+A channel-pruned model can be further trained by distilling it with a high-accuracy teacher model; for the training method and examples see [distilling pruned models](https://github.com/PaddlePaddle/PaddleDetection/blob/master/slim/extensions/distill_pruned_model/distill_pruned_model_demo.ipynb).
+
+The distilled channel-pruned models on the COCO dataset are listed below.
+
+| Backbone | Pruning strategy | GFLOPs | Model size (MB) | Input size | Teacher model | Box AP | Download |
+| :----------------| :-------: | :------------: | :-------------: | :------: | :--------------------------: | :--------: | :-----------------------------------------------------: |
+| ResNet50-vd-dcn | baseline | 44.71 | 176.82 | 608 | - | 39.1 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r50vd_dcn.tar) |
+| ResNet50-vd-dcn | r578 | 29.98(-32.94%) | 112.08(-36.61%) | 608 | YOLOv3-ResNet50-vd-dcn(39.1) | 39.7(+0.6) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_r50vd_dcn_prune578_distill.tar) |
+| MobileNetV1 | baseline | 20.64 | 94.60 | 608 | - | 29.3 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) |
+| MobileNetV1 | baseline | 9.66 | 94.60 | 416 | - | 29.3 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) |
+| MobileNetV1 | baseline | 5.72 | 94.60 | 320 | - | 27.1 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) |
+| MobileNetV1 | r578 | 6.27(-69.64%) | 31.30(-66.90%) | 608 | YOLOv3-ResNet34(36.2) | 29.0(-0.3) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_mobilenet_v1_prune578_distillby_r34.tar) |
+| MobileNetV1 | r578 | 2.93(-69.64%) | 31.30(-66.90%) | 416 | YOLOv3-ResNet34(34.3) | 28.0(-1.3) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_mobilenet_v1_prune578_distillby_r34.tar) |
+| MobileNetV1 | r578 | 1.74(-69.64%) | 31.30(-66.90%) | 320 | YOLOv3-ResNet34(31.4) | 25.1(-2.0) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_mobilenet_v1_prune578_distillby_r34.tar) |
+
+- YOLOv3 randomly reshapes minibatches during training, so the same weights can run at different input sizes; the `YOLOv3-MobileNetV1` rows report accuracy at the three input sizes `608/416/320`.
+- With the `r578` pruning strategy and `YOLOv3-ResNet50-vd-dcn` as the distillation teacher, the `YOLOv3-ResNet50-vd-dcn` model reduces FLOPs by `32.94%` while accuracy at input size 608 improves by `0.6`.
+- With the `r578` pruning strategy and `YOLOv3-ResNet34` as the distillation teacher, the `YOLOv3-MobileNetV1` model reduces FLOPs by `69.64%` while accuracy at input size 608 drops by `0.3`.
+
+The distilled channel-pruned models on the Pascal VOC dataset are listed below.
+
+| Backbone | Pruning strategy | GFLOPs | Model size (MB) | Input size | Teacher model | Box AP | Download |
+| :----------------| :-------: | :------------: | :-------------: | :------: | :--------------------: | :--------: | :-----------------------------------------------------: |
+| MobileNetV1 | baseline | 20.20 | 93.37 | 608 | - | 76.2 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) |
+| MobileNetV1 | baseline | 9.46 | 93.37 | 416 | - | 76.7 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) |
+| MobileNetV1 | baseline | 5.60 | 93.37 | 320 | - | 75.3 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) |
+| MobileNetV1 | r578 | 6.15(-69.57%) | 30.81(-67.00%) | 608 | YOLOv3-ResNet34(82.6) | 78.8(+2.6) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_mobilenet_v1_voc_prune578_distillby_r34.tar) |
+| MobileNetV1 | r578 | 2.88(-69.57%) | 30.81(-67.00%) | 416 | YOLOv3-ResNet34(81.9) | 78.7(+2.0) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_mobilenet_v1_voc_prune578_distillby_r34.tar) |
+| MobileNetV1 | r578 | 1.70(-69.57%) | 30.81(-67.00%) | 320 | YOLOv3-ResNet34(80.1) | 76.3(+2.0) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/prune/yolov3_mobilenet_v1_voc_prune578_distillby_r34.tar) |
+
+- YOLOv3 randomly reshapes minibatches during training, so the same weights can run at different input sizes; the `YOLOv3-MobileNetV1` rows report accuracy at the three input sizes `608/416/320`.
+- With the `r578` pruning strategy and `YOLOv3-ResNet34` as the distillation teacher, the `YOLOv3-MobileNetV1` model reduces FLOPs by `69.57%` while accuracy at input size 608 improves by `2.6`.
+
+### Inference latency of pruned YOLOv3 models
+
+- All latencies are in `ms/image`.
+- Tesla P4 latency is measured on a single card with TensorRT enabled.
+- Qualcomm 835/Qualcomm 855/Kirin 970 latencies are measured with Paddle Lite on the `armv8` architecture using 4 threads.
+
+| Backbone | Dataset | Pruning strategy | GFLOPs | Model size (MB) | Input size | Tesla P4 | Kirin 970 | Qualcomm 835 | Qualcomm 855 |
+| :--------------- | :----: | :------: | :------------: | :-------------: | :------: | :-------------: | :--------------: | :--------------: | :--------------: |
+| MobileNetV1 | VOC | baseline | 20.20 | 93.37 | 608 | 16.556 | 748.404 | 734.970 | 289.878 |
+| MobileNetV1 | VOC | baseline | 9.46 | 93.37 | 416 | 9.031 | 371.214 | 349.065 | 140.877 |
+| MobileNetV1 | VOC | baseline | 5.60 | 93.37 | 320 | 6.235 | 221.705 | 200.498 | 80.515 |
+| MobileNetV1 | VOC | r578 | 6.15(-69.57%) | 30.81(-67.00%) | 608 | 10.064(-39.21%) | 314.531(-57.97%) | 323.537(-55.98%) | 123.414(-57.43%) |
+| MobileNetV1 | VOC | r578 | 2.88(-69.57%) | 30.81(-67.00%) | 416 | 5.478(-39.34%) | 151.562(-59.17%) | 146.014(-58.17%) | 56.420(-59.95%) |
+| MobileNetV1 | VOC | r578 | 1.70(-69.57%) | 30.81(-67.00%) | 320 | 3.880(-37.77%) | 91.132(-58.90%) | 87.440(-56.39%) | 31.470(-60.91%) |
+| MobileNetV3 | COCO | prune+distill | - | 17.0(-81.11%) | 320 | - | - | - | Qualcomm 845: 91(-71.47%) |
+| ResNet50-vd-dcn | COCO | baseline | 44.71 | 176.82 | 608 | 36.127 | - | - | - |
+| ResNet50-vd-dcn | COCO | sensity | 37.53(-16.06%) | 149.49(-15.46%) | 608 | 33.245(-7.98%) | - | - | - |
+| ResNet50-vd-dcn | COCO | r578 | 29.98(-32.94%) | 112.08(-36.61%) | 608 | 29.138(-19.35%) | - | - | - |
+
+- With the `r578` pruning strategy, the `YOLOv3-MobileNetV1` model reduces FLOPs by `69.57%`; at input size 608 its inference time drops by `39.21%` on a single Tesla P4 (TensorRT), and by `57.97%`, `55.98%` and `57.43%` on Kirin 970, Qualcomm 835 and Qualcomm 855 respectively.
+- With the `sensity` and `r578` pruning strategies, the `YOLOv3-ResNet50-vd-dcn` model reduces FLOPs by `16.06%` and `32.94%` respectively; at input size 608 its inference time on a single Tesla P4 (TensorRT) drops by `7.98%` and `19.35%`.
+
+## Distilled model zoo
+
+### Training strategy
+
+- For distillation training, the teacher model uses the weights released in the [PaddleDetection model zoo](https://paddledetection.readthedocs.io/zh/latest/MODEL_ZOO_cn.html) as pretrained weights.
+- The student model uses its backbone's pretrained weights.
+- The distillation strategy `l2_distill` distills with an L2 loss between teacher and student feature maps and is the default strategy of `slim/distillation/distill.py`; a sketch follows this list.
+- The distillation strategy `split_distill` distills with the YOLOv3 fine-grained loss, enabled with `-o use_fine_grained_loss=true`.
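+
+A condensed sketch of the `l2_distill` wiring with PaddleSlim's distillation API (`teacher_program`, `student_program`, `student_startup`, `place`, `task_loss` and the two variable names are placeholders taken from the real programs):
+
+```python
+import paddle.fluid as fluid
+from paddleslim.dist import merge, l2_loss
+
+# put teacher and student into one program, sharing the 'image' input
+merge(teacher_program, student_program, {'image': 'image'}, place)
+
+with fluid.program_guard(student_program, student_startup):
+    # L2 distance between one teacher feature map and one student feature map
+    distill_loss = l2_loss('teacher_feature_var', 'student_feature_var',
+                           student_program)
+    loss = task_loss + distill_loss
+```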
+
+### YOLOv3 on COCO
+
+| Backbone | Distillation strategy | Input size | Teacher model | Box AP | Download |
+| :----------------| :-----------: | :------: | :--------------------: | :----------: | :-----------------------------------------------------: |
+| MobileNetV1 | baseline | 608 | - | 29.3 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) |
+| MobileNetV1 | baseline | 416 | - | 29.3 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) |
+| MobileNetV1 | baseline | 320 | - | 27.1 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) |
+| MobileNetV1 | split_distill | 608 | YOLOv3-ResNet34(36.2) | 31.4(+2.1) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_coco_distilled.tar) |
+| MobileNetV1 | split_distill | 416 | YOLOv3-ResNet34(34.3) | 30.0(+0.7) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_coco_distilled.tar) |
+| MobileNetV1 | split_distill | 320 | YOLOv3-ResNet34(31.4) | 27.1(+0.0) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_coco_distilled.tar) |
+
+- YOLOv3 randomly reshapes minibatches during training, so the same weights can run at different input sizes; the `YOLOv3-MobileNetV1` rows report accuracy at the three input sizes `608/416/320`.
+- Distilled by the `YOLOv3-ResNet34` model with the `split_distill` strategy, `YOLOv3-MobileNetV1` accuracy at input size 608 improves by `2.1`.
+
+### YOLOv3 on Pascal VOC
+
+| Backbone | Distillation strategy | Input size | Teacher model | Box AP | Download |
+| :----------------| :-----------: | :------: | :--------------------: | :--------: | :-----------------------------------------------------: |
+| MobileNetV1 | baseline | 608 | - | 76.2 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) |
+| MobileNetV1 | baseline | 416 | - | 76.7 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) |
+| MobileNetV1 | baseline | 320 | - | 75.3 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) |
+| MobileNetV1 | l2_distill | 608 | YOLOv3-ResNet34(82.6) | 79.0(+2.8) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_voc_distilled.tar) |
+| MobileNetV1 | l2_distill | 416 | YOLOv3-ResNet34(81.9) | 78.2(+1.5) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_voc_distilled.tar) |
+| MobileNetV1 | l2_distill | 320 | YOLOv3-ResNet34(80.1) | 75.5(+0.2) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_voc_distilled.tar) |
+
+- YOLOv3 randomly reshapes minibatches during training, so the same weights can run at different input sizes; the `YOLOv3-MobileNetV1` rows report accuracy at the three input sizes `608/416/320`.
+- Distilled by the `YOLOv3-ResNet34` model with the `l2_distill` strategy, `YOLOv3-MobileNetV1` accuracy at input size 608 improves by `2.8`.
+
+## Quantized model zoo
+
+### Training strategy
+
+- The quantization strategy `post` denotes models obtained by post-training quantization, and `aware` denotes models obtained by quant-aware training; a usage sketch follows.
+
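+A condensed sketch of producing both kinds of models with PaddleSlim (`val_reader`, `train_program` and `place` are placeholders; see the PaddleSlim quantization API docs for the full argument lists):
+
+```python
+import paddle.fluid as fluid
+import paddleslim.quant as quant
+
+# post: calibrate a trained FP32 inference model offline
+quant.quant_post(
+    executor=fluid.Executor(fluid.CPUPlace()),
+    model_dir='./fp32_model',             # trained FP32 inference model
+    quantize_model_path='./quant_model',  # output path
+    sample_generator=val_reader)          # calibration data reader
+
+# aware: insert fake quant/dequant ops into the training program,
+# then fine-tune before exporting the quantized model
+quant_program = quant.quant_aware(train_program, place, for_test=False)
+```
+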
+### YOLOv3 on COCO
+
+| Backbone | Pretrained weights | Quantization strategy | Input size | Box AP | Download |
+| :----------------| :--------: | :------: | :------: | :--------: | :-----------------------------------------------------: |
+| MobileNetV1 | ImageNet | baseline | 608 | 29.3 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) |
+| MobileNetV1 | ImageNet | baseline | 416 | 29.3 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) |
+| MobileNetV1 | ImageNet | baseline | 320 | 27.1 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) |
+| MobileNetV1 | ImageNet | post | 608 | 27.9(-1.4) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_coco_quant_post.tar) |
+| MobileNetV1 | ImageNet | post | 416 | 28.0(-1.3) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_coco_quant_post.tar) |
+| MobileNetV1 | ImageNet | post | 320 | 26.0(-1.1) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_coco_quant_post.tar) |
+| MobileNetV1 | ImageNet | aware | 608 | 28.1(-1.2) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenet_coco_quant_aware.tar) |
+| MobileNetV1 | ImageNet | aware | 416 | 28.2(-1.1) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenet_coco_quant_aware.tar) |
+| MobileNetV1 | ImageNet | aware | 320 | 25.8(-1.3) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenet_coco_quant_aware.tar) |
+| ResNet34 | ImageNet | baseline | 608 | 36.2 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34.tar) |
+| ResNet34 | ImageNet | baseline | 416 | 34.3 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34.tar) |
+| ResNet34 | ImageNet | baseline | 320 | 31.4 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34.tar) |
+| ResNet34 | ImageNet | post | 608 | 35.7(-0.5) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r34_coco_quant_post.tar) |
+| ResNet34 | ImageNet | aware | 608 | 35.2(-1.1) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r34_coco_quant_aware.tar) |
+| ResNet34 | ImageNet | aware | 416 | 33.3(-1.0) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r34_coco_quant_aware.tar) |
+| ResNet34 | ImageNet | aware | 320 | 30.3(-1.1) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r34_coco_quant_aware.tar) |
+| R50vd-dcn | object365 | baseline | 608 | 41.4 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r50vd_dcn_obj365_pretrained_coco.tar) |
+| R50vd-dcn | object365 | aware | 608 | 40.6(-0.8) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r50vd_dcn_obj365_pretrained_coco_quant_aware.tar) |
+| R50vd-dcn | object365 | aware | 416 | 37.5(-) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r50vd_dcn_obj365_pretrained_coco_quant_aware.tar) |
+| R50vd-dcn | object365 | aware | 320 | 34.1(-) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r50vd_dcn_obj365_pretrained_coco_quant_aware.tar) |
+
+- YOLOv3 randomly reshapes minibatches during training, so the same weights can run at different input sizes; some rows report accuracy at the three input sizes `608/416/320`.
+- With post-training (post) and quant-aware (aware) quantization, `YOLOv3-MobileNetV1` accuracy at input size 608 drops by `1.4` and `1.2` respectively.
+- With post-training (post) and quant-aware (aware) quantization, `YOLOv3-ResNet34` accuracy at input size 608 drops by `0.5` and `1.1` respectively.
+- With quant-aware (aware) quantization, `YOLOv3-R50vd-dcn` accuracy at input size 608 drops by `0.8`.
+
+### BlazeFace on WIDER FACE
+
+| Model | Quantization strategy | Input size | Easy Set | Medium Set | Hard Set | Download |
+| :--------------- | :------: | :------: | :--------: | :--------: | :--------: | :-----------------------------------------------------: |
+| BlazeFace | baseline | 640 | 91.5 | 89.2 | 79.7 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/blazeface_original.tar) |
+| BlazeFace | post | 640 | 87.8(-3.7) | 85.1(-3.9) | 74.9(-4.8) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/blazeface_origin_quant_post.tar) |
+| BlazeFace | aware | 640 | 90.5(-1.0) | 87.9(-1.3) | 77.6(-2.1) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/blazeface_origin_quant_aware.tar) |
+| BlazeFace-Lite | baseline | 640 | 90.9 | 88.5 | 78.1 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/blazeface_lite.tar) |
+| BlazeFace-Lite | post | 640 | 89.4(-1.5) | 86.7(-1.8) | 75.7(-2.4) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/blazeface_lite_quant_post.tar) |
+| BlazeFace-Lite | aware | 640 | 89.7(-1.2) | 87.3(-1.2) | 77.0(-1.1) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/blazeface_lite_quant_aware.tar) |
+| BlazeFace-NAS | baseline | 640 | 83.7 | 80.7 | 65.8 | [Download](https://paddlemodels.bj.bcebos.com/object_detection/blazeface_nas.tar) |
+| BlazeFace-NAS | post | 640 | 81.6(-2.1) | 78.3(-2.4) | 63.6(-2.2) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/blazeface_nas_quant_post.tar) |
+| BlazeFace-NAS | aware | 640 | 83.1(-0.6) | 79.7(-1.0) | 64.2(-1.6) | [Download](https://paddlemodels.bj.bcebos.com/PaddleSlim/blazeface_nas_quant_aware.tar) |
+
+- For the `BlazeFace` family, quant-aware (aware) quantization clearly outperforms post-training (post) quantization.
+- With quant-aware (aware) quantization, `BlazeFace` accuracy on the `Easy/Medium/Hard` sets drops by `1.0`, `1.3` and `2.1` respectively.
+- With quant-aware (aware) quantization, `BlazeFace-Lite` accuracy on the `Easy/Medium/Hard` sets drops by `1.2`, `1.2` and `1.1` respectively.
+- With quant-aware (aware) quantization, `BlazeFace-NAS` accuracy on the `Easy/Medium/Hard` sets drops by `0.6`, `1.0` and `1.6` respectively.
diff --git a/demo/distillation/README.md b/demo/distillation/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ce3bc6fa71a82af4b9ec3fdfe5006da7b2719d64
--- /dev/null
+++ b/demo/distillation/README.md
@@ -0,0 +1,40 @@
+# Knowledge distillation example
+
+This example shows how to train a model with the knowledge distillation API; the distilled model gains some accuracy over a baseline model trained without distillation.
+
+## API
+
+Please refer to the [knowledge distillation API docs](https://paddlepaddle.github.io/PaddleSlim/api/single_distiller_api/).
+
+### 1. Distillation training configuration
+
+The example uses ResNet50_vd as the teacher model to distill a student network with the MobileNet architecture.
+
+Default configuration:
+
+```yaml
+batch_size: 256
+init_lr: 0.1
+lr_strategy: piecewise_decay
+l2_decay: 3e-5
+momentum_rate: 0.9
+num_epochs: 120
+data: imagenet
+```
+Training can be launched with the default configuration; the core wiring is sketched below.
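+
+The heart of `distill.py` merges the teacher and student programs and adds a soft-label distillation loss (condensed from the script in this directory; the surrounding setup is omitted):
+
+```python
+from paddleslim.dist import merge, soft_label_loss
+
+# map the teacher's input data name to the student's
+data_name_map = {'image': 'image'}
+merge(teacher_program, student_program, data_name_map, place)
+
+with fluid.program_guard(student_program, s_startup):
+    # distill the teacher's logits into the student's logits
+    distill_loss = soft_label_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0",
+                                   student_program)
+    loss = avg_cost + distill_loss
+```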
+
+### 2. Start training
+
+After preparing the ImageNet dataset, launch training with:
+
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 python distill.py
+```
+
+### 3. Results
+
+Compared with the baseline model trained without distillation (Top-1/Top-5: 70.99%/89.68%),
+
+after 120 epochs of distillation training the MobileNet model reaches 72.77%/90.68% Top-1/Top-5 accuracy, a gain of +1.78%/+1.00%.
+
+For detailed experimental data, see the [distillation section of the PaddleSlim model zoo](https://paddlepaddle.github.io/PaddleSlim/model_zoo/#13).
diff --git a/demo/distillation/distillation_demo.py b/demo/distillation/distill.py
similarity index 73%
rename from demo/distillation/distillation_demo.py
rename to demo/distillation/distill.py
index 3f47553e541ff86ae0a6f4d86c046a1dee66a03f..f1b2be691ff34f4765568de342d008e7d2657ff6 100644
--- a/demo/distillation/distillation_demo.py
+++ b/demo/distillation/distill.py
@@ -11,7 +11,7 @@ import argparse
import functools
import numpy as np
import paddle.fluid as fluid
-sys.path.append(sys.path[0] + "/../")
+sys.path[0] = os.path.join(os.path.dirname(__file__), os.path.pardir)
import models
from utility import add_arguments, print_arguments, _download, _decompress
from paddleslim.dist import merge, l2_loss, soft_label_loss, fsp_loss
@@ -23,8 +23,9 @@ _logger.setLevel(logging.INFO)
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
-add_arg('batch_size', int, 64*4, "Minibatch size.")
+add_arg('batch_size', int, 64, "Minibatch size.")
add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
+add_arg('save_inference', bool, False, "Whether to save inference model.")
add_arg('total_images', int, 1281167, "Training image number.")
add_arg('image_shape', str, "3,224,224", "Input image size")
add_arg('lr', float, 0.1, "The learning rate used to fine-tune pruned model.")
@@ -32,12 +33,12 @@ add_arg('lr_strategy', str, "piecewise_decay", "The learning rate decay
add_arg('l2_decay', float, 3e-5, "The l2_decay parameter.")
add_arg('momentum_rate', float, 0.9, "The value of momentum_rate.")
add_arg('num_epochs', int, 120, "The number of total epochs.")
-add_arg('data', str, "cifar10", "Which data to use. 'cifar10' or 'imagenet'")
+add_arg('data', str, "imagenet", "Which data to use. 'cifar10' or 'imagenet'")
add_arg('log_period', int, 20, "Log period in batches.")
add_arg('model', str, "MobileNet", "Set the network to use.")
add_arg('pretrained_model', str, None, "Whether to use pretrained model.")
-add_arg('teacher_model', str, "ResNet50", "Set the teacher network to use.")
-add_arg('teacher_pretrained_model', str, "./ResNet50_pretrained", "Whether to use pretrained model.")
+add_arg('teacher_model', str, "ResNet50_vd", "Set the teacher network to use.")
+add_arg('teacher_pretrained_model', str, "./ResNet50_vd_pretrained", "Whether to use pretrained model.")
parser.add_argument('--step_epochs', nargs='+', type=int, default=[30, 60, 90], help="piecewise decay step")
# yapf: enable
@@ -45,7 +46,12 @@ model_list = [m for m in dir(models) if "__" not in m]
def piecewise_decay(args):
- step = int(math.ceil(float(args.total_images) / args.batch_size))
+ if args.use_gpu:
+ devices_num = fluid.core.get_cuda_device_count()
+ else:
+ devices_num = int(os.environ.get('CPU_NUM', 1))
+ step = int(
+ math.ceil(float(args.total_images) / args.batch_size) / devices_num)
bd = [step * e for e in args.step_epochs]
lr = [args.lr * (0.1**i) for i in range(len(bd) + 1)]
learning_rate = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
@@ -53,18 +59,23 @@ def piecewise_decay(args):
learning_rate=learning_rate,
momentum=args.momentum_rate,
regularization=fluid.regularizer.L2Decay(args.l2_decay))
- return optimizer
+ return learning_rate, optimizer
def cosine_decay(args):
- step = int(math.ceil(float(args.total_images) / args.batch_size))
+    if args.use_gpu:
+ devices_num = fluid.core.get_cuda_device_count()
+ else:
+ devices_num = int(os.environ.get('CPU_NUM', 1))
+ step = int(
+ math.ceil(float(args.total_images) / args.batch_size) / devices_num)
learning_rate = fluid.layers.cosine_decay(
learning_rate=args.lr, step_each_epoch=step, epochs=args.num_epochs)
optimizer = fluid.optimizer.Momentum(
learning_rate=learning_rate,
momentum=args.momentum_rate,
regularization=fluid.regularizer.L2Decay(args.l2_decay))
- return optimizer
+ return learning_rate, optimizer
def create_optimizer(args):
@@ -118,16 +129,13 @@ def compress(args):
avg_cost = fluid.layers.mean(x=cost)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
- #print("="*50+"student_model_params"+"="*50)
- #for v in student_program.list_vars():
- # print(v.name, v.shape)
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
- train_reader = paddle.batch(
+ train_reader = paddle.fluid.io.batch(
train_reader, batch_size=args.batch_size, drop_last=True)
- val_reader = paddle.batch(
+ val_reader = paddle.fluid.io.batch(
val_reader, batch_size=args.batch_size, drop_last=True)
val_program = student_program.clone(for_test=True)
@@ -145,21 +153,23 @@ def compress(args):
name='image', shape=image_shape, dtype='float32')
predict = teacher_model.net(image, class_dim=class_dim)
- #print("="*50+"teacher_model_params"+"="*50)
- #for v in teacher_program.list_vars():
- # print(v.name, v.shape)
-
exe.run(t_startup)
- _download('http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_pretrained.tar', '.')
- _decompress('./ResNet50_pretrained.tar')
+ if not os.path.exists(args.teacher_pretrained_model):
+ _download(
+ 'http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_pretrained.tar',
+ '.')
+ _decompress('./ResNet50_vd_pretrained.tar')
assert args.teacher_pretrained_model and os.path.exists(
args.teacher_pretrained_model
), "teacher_pretrained_model should be set when teacher_model is not None."
def if_exist(var):
- return os.path.exists(
- os.path.join(args.teacher_pretrained_model, var.name)
- ) and var.name != 'fc_0.w_0' and var.name != 'fc_0.b_0'
+ exist = os.path.exists(
+ os.path.join(args.teacher_pretrained_model, var.name))
+ if args.data == "cifar10" and (var.name == 'fc_0.w_0' or
+ var.name == 'fc_0.b_0'):
+ exist = False
+ return exist
fluid.io.load_vars(
exe,
@@ -168,35 +178,33 @@ def compress(args):
predicate=if_exist)
data_name_map = {'image': 'image'}
- main = merge(
- teacher_program,
- student_program,
- data_name_map,
- place)
-
- with fluid.program_guard(main, s_startup):
- l2_loss = l2_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0", main)
- loss = avg_cost + l2_loss
- opt = create_optimizer(args)
+ merge(teacher_program, student_program, data_name_map, place)
+
+ with fluid.program_guard(student_program, s_startup):
+ distill_loss = soft_label_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0",
+ student_program)
+ loss = avg_cost + distill_loss
+ lr, opt = create_optimizer(args)
opt.minimize(loss)
exe.run(s_startup)
build_strategy = fluid.BuildStrategy()
build_strategy.fuse_all_reduce_ops = False
- parallel_main = fluid.CompiledProgram(main).with_data_parallel(
+ parallel_main = fluid.CompiledProgram(student_program).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy)
for epoch_id in range(args.num_epochs):
for step_id, data in enumerate(train_loader):
- loss_1, loss_2, loss_3 = exe.run(
+ lr_np, loss_1, loss_2, loss_3 = exe.run(
parallel_main,
feed=data,
fetch_list=[
- loss.name, avg_cost.name, l2_loss.name
+ lr.name, loss.name, avg_cost.name, distill_loss.name
])
if step_id % args.log_period == 0:
_logger.info(
- "train_epoch {} step {} loss {:.6f}, class loss {:.6f}, l2 loss {:.6f}".
- format(epoch_id, step_id, loss_1[0], loss_2[0], loss_3[0]))
+ "train_epoch {} step {} lr {:.6f}, loss {:.6f}, class loss {:.6f}, distill loss {:.6f}".
+ format(epoch_id, step_id, lr_np[0], loss_1[0], loss_2[0],
+ loss_3[0]))
val_acc1s = []
val_acc5s = []
for step_id, data in enumerate(valid_loader):
@@ -211,6 +219,10 @@ def compress(args):
"valid_epoch {} step {} loss {:.6f}, top1 {:.6f}, top5 {:.6f}".
format(epoch_id, step_id, val_loss[0], val_acc1[0],
val_acc5[0]))
+ if args.save_inference:
+ fluid.io.save_inference_model(
+ os.path.join("./saved_models", str(epoch_id)), ["image"],
+ [out], exe, student_program)
_logger.info("epoch {} top1 {:.6f}, top5 {:.6f}".format(
epoch_id, np.mean(val_acc1s), np.mean(val_acc5s)))
diff --git a/demo/distillation/image_classification_distillation_tutorial.ipynb b/demo/distillation/image_classification_distillation_tutorial.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..e1d679cdabaafc3eb71a5382dff1da0946b0b799
--- /dev/null
+++ b/demo/distillation/image_classification_distillation_tutorial.ipynb
@@ -0,0 +1,206 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "# PaddleSlim knowledge distillation: introduction and experiment\n",
+    "\n",
+    "In general, the more parameters a model has and the more complex its structure is, the better it performs, but its parameters are also more redundant and it consumes more compute and resources. **Knowledge distillation** compresses the useful information (dark knowledge) learned by a large model into a smaller, faster model that can match the large model's results.\n",
+    "\n",
+    "In this notebook the strong large model is called the teacher, and the smaller model with slightly lower performance is called the student. The example covers the following steps:\n",
+    "\n",
+    "1. Import dependencies\n",
+    "2. Define student_program and teacher_program\n",
+    "3. Select feature maps\n",
+    "4. Merge the programs and add the distillation loss\n",
+    "5. Train the model\n",
+    "\n",
+    "\n",
+    "## 1. Import dependencies\n",
+    "PaddleSlim depends on Paddle 1.7. Make sure Paddle is installed correctly, then import Paddle, PaddleSlim and the other dependencies as follows:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import paddle\n",
+ "import paddle.fluid as fluid\n",
+ "import paddleslim as slim\n",
+ "import sys\n",
+ "sys.path.append(\"../\")\n",
+ "import models"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "## 2. Define student_program and teacher_program\n",
+    "\n",
+    "This tutorial trains and validates knowledge distillation on the MNIST dataset, with input images of shape `[1, 28, 28]` and 10 output classes.\n",
+    "`ResNet50` is chosen as the teacher to distill a student with the `MobileNet` architecture."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "model = models.__dict__['MobileNet']()\n",
+ "student_program = fluid.Program()\n",
+ "student_startup = fluid.Program()\n",
+ "with fluid.program_guard(student_program, student_startup):\n",
+ " image = fluid.data(\n",
+ " name='image', shape=[None] + [1, 28, 28], dtype='float32')\n",
+ " label = fluid.data(name='label', shape=[None, 1], dtype='int64')\n",
+ " out = model.net(input=image, class_dim=10)\n",
+ " cost = fluid.layers.cross_entropy(input=out, label=label)\n",
+ " avg_cost = fluid.layers.mean(x=cost)\n",
+ " acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)\n",
+ " acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "teacher_model = models.__dict__['ResNet50']()\n",
+ "teacher_program = fluid.Program()\n",
+ "teacher_startup = fluid.Program()\n",
+ "with fluid.program_guard(teacher_program, teacher_startup):\n",
+ " with fluid.unique_name.guard():\n",
+ " image = fluid.data(\n",
+ " name='image', shape=[None] + [1, 28, 28], dtype='float32')\n",
+ " predict = teacher_model.net(image, class_dim=10)\n",
+ "exe = fluid.Executor(fluid.CPUPlace())\n",
+ "exe.run(teacher_startup)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "## 3. Select feature maps\n",
+    "We can use student_program's list_vars method to inspect all of its Variables, and pick one or more Variables from them to fit the teacher's corresponding variables."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# get all student variables\n",
+ "student_vars = []\n",
+ "for v in student_program.list_vars():\n",
+ " student_vars.append((v.name, v.shape))\n",
+ "#uncomment the following lines to observe student's variables for distillation\n",
+ "#print(\"=\"*50+\"student_model_vars\"+\"=\"*50)\n",
+ "#print(student_vars)\n",
+ "\n",
+ "# get all teacher variables\n",
+ "teacher_vars = []\n",
+ "for v in teacher_program.list_vars():\n",
+ " teacher_vars.append((v.name, v.shape))\n",
+ "#uncomment the following lines to observe teacher's variables for distillation\n",
+ "#print(\"=\"*50+\"teacher_model_vars\"+\"=\"*50)\n",
+ "#print(teacher_vars)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "After filtering we can see that 'bn5c_branch2b.output.1.tmp_3' in teacher_program and 'depthwise_conv2d_11.tmp_0' in student_program have matching shapes and can form a distillation loss."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "## 4. Merge the programs and add the distillation loss\n",
+    "The merge operation adds all Variables and Ops of student_program and teacher_program into one Program. To avoid naming conflicts between identically named variables in the two programs, merge also prefixes the Variables of teacher_program with a uniform name_prefix, whose default value is 'teacher_'.\n",
+    "\n",
+    "To guarantee that the teacher and student networks are fed the same data, merge also merges the input data layers of the two programs, so a data_name_map must be given that maps data layer names: its keys are the teacher's input data names and its values are the student's."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data_name_map = {'image': 'image'}\n",
+    "slim.dist.merge(teacher_program, student_program, data_name_map, fluid.CPUPlace())\n",
+ "with fluid.program_guard(student_program, student_startup):\n",
+ " l2_loss = slim.dist.l2_loss('teacher_bn5c_branch2b.output.1.tmp_3', 'depthwise_conv2d_11.tmp_0', student_program)\n",
+ " loss = l2_loss + avg_cost\n",
+ " opt = fluid.optimizer.Momentum(0.01, 0.9)\n",
+ " opt.minimize(loss)\n",
+ "exe.run(student_startup)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "## 5. Train the model\n",
+    "\n",
+    "To run this example quickly, we use the simple MNIST dataset; Paddle's `paddle.dataset.mnist` package provides the download and reading of MNIST data.\n",
+    "The code is as follows:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_reader = paddle.fluid.io.batch(\n",
+ " paddle.dataset.mnist.train(), batch_size=128, drop_last=True)\n",
+ "train_feeder = fluid.DataFeeder(['image', 'label'], fluid.CPUPlace(), student_program)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for data in train_reader():\n",
+ " acc1, acc5, loss_np = exe.run(student_program, feed=train_feeder.feed(data), fetch_list=[acc_top1.name, acc_top5.name, loss.name])\n",
+ " print(\"Acc1: {:.6f}, Acc5: {:.6f}, Loss: {:.6f}\".format(acc1.mean(), acc5.mean(), loss_np.mean()))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/demo/imagenet_reader.py b/demo/imagenet_reader.py
index 25bc756e93db829f3566754e079ba7711074e577..947fd023b9c6eea5f4d6d0a5d52337b1ba97cc3f 100644
--- a/demo/imagenet_reader.py
+++ b/demo/imagenet_reader.py
@@ -14,8 +14,7 @@ DATA_DIM = 224
THREAD = 16
BUF_SIZE = 10240
-#DATA_DIR = './data/ILSVRC2012/'
-DATA_DIR = './data/'
+DATA_DIR = './data/ILSVRC2012/'
DATA_DIR = os.path.join(os.path.split(os.path.realpath(__file__))[0], DATA_DIR)
img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
@@ -101,7 +100,11 @@ def distort_color(img):
def process_image(sample, mode, color_jitter, rotate):
img_path = sample[0]
- img = Image.open(img_path)
+    try:
+        img = Image.open(img_path)
+    except Exception:
+        print(img_path, "cannot be opened!")
+        return None
if mode == 'train':
if rotate: img = rotate_image(img)
img = random_crop(img, DATA_DIM)
@@ -157,8 +160,7 @@ def _reader_creator(file_list,
for line in lines:
if mode == 'train' or mode == 'val':
img_path, label = line.split()
- img_path = os.path.join(data_dir + "/" + mode,
- img_path)
+ img_path = os.path.join(data_dir, img_path)
yield img_path, int(label)
elif mode == 'test':
img_path = os.path.join(data_dir, line)
diff --git a/demo/mkldnn_quant/CMakeLists.txt b/demo/mkldnn_quant/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..07483b707b74610a5fcdb4b762df77b1925e7b6d
--- /dev/null
+++ b/demo/mkldnn_quant/CMakeLists.txt
@@ -0,0 +1,40 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 3.2)
+
+project(mkldnn_quantaware_demo CXX C)
+set(DEMO_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+set(DEMO_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+
+option(USE_GPU "Compile the inference code with CUDA GPU support" OFF)
+option(USE_PROFILER "Whether to enable Paddle's profiler." OFF)
+
+set(USE_SHARED OFF)
+
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+if(NOT PADDLE_ROOT)
+ set(PADDLE_ROOT ${DEMO_SOURCE_DIR}/fluid_inference)
+endif()
+find_package(Fluid)
+
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -std=c++11")
+
+if(USE_PROFILER)
+ find_package(Gperftools REQUIRED)
+ include_directories(${GPERFTOOLS_INCLUDE_DIR})
+ add_definitions(-DWITH_GPERFTOOLS)
+endif()
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+if(PADDLE_FOUND)
+ add_executable(inference sample_tester.cc)
+ target_link_libraries(inference
+ ${PADDLE_LIBRARIES}
+ ${PADDLE_THIRD_PARTY_LIBRARIES}
+ rt dl pthread)
+ if (mklml_FOUND)
+ target_link_libraries(inference "-L${THIRD_PARTY_ROOT}/install/mklml/lib -liomp5 -Wl,--as-needed")
+ endif()
+else()
+ message(FATAL_ERROR "Cannot find PaddlePaddle Fluid under ${PADDLE_ROOT}")
+endif()
diff --git a/demo/mkldnn_quant/README.md b/demo/mkldnn_quant/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2f49e45e26f9fb5ed827249fb42f837b09b9b509
--- /dev/null
+++ b/demo/mkldnn_quant/README.md
@@ -0,0 +1,176 @@
+# Deployment and Inference of Image Classification INT8 Quantized Models on CPU
+
+## Overview
+
+This document describes how to convert a quantized model produced by PaddleSlim into an INT8 model on CPU, and how to deploy it and run inference. On Cascade Lake machines (e.g. Intel(R) Xeon(R) Gold 6271, 6248, X2XX etc.), inference with the INT8 model is usually 3-3.7 times faster than with the FP32 model; on SkyLake machines (e.g. Intel(R) Xeon(R) Gold 6148, 8180, X1XX etc.), inference with the INT8 model is usually about 1.5 times faster than with the FP32 model.
+
+The process consists of the following steps:
+- Produce the quantized model: use PaddleSlim to train and produce the quantized model. Note that the parameters of the quantized operators should be within the INT8 range, but their type is still float.
+- Convert the quantized model on CPU: use the DNNL library to convert the quantized model into an INT8 model on CPU.
+- Deploy and run inference on CPU: deploy the demo on CPU and run inference.
+
+## 1. Preparation
+
+#### Install PaddleSlim
+
+To install PaddleSlim, please follow the [official installation documentation](https://paddlepaddle.github.io/PaddleSlim/install.html)
+```
+git clone https://github.com/PaddlePaddle/PaddleSlim.git
+cd PaddleSlim
+python setup.py install
+```
+#### Use it in your code
+In your own test code, import Paddle and PaddleSlim as follows:
+```
+import paddle
+import paddle.fluid as fluid
+import paddleslim as slim
+import numpy as np
+```
+
+## 2. Produce a quantized model with PaddleSlim
+
+You can use PaddleSlim to produce a quant-aware trained model or a post-training quantized model. If you only want to verify the deployment and inference flow, you can skip 2.1 and 2.2 and directly download the [mobilenetv2 post-training quant model](https://paddle-inference-dist.cdn.bcebos.com/quantizaiton/quant_post_models/mobilenetv2_quant_post.tgz) and its corresponding original FP32 model [mobilenetv2 fp32](https://paddle-inference-dist.cdn.bcebos.com/quantizaiton/fp32_models/mobilenetv2.tgz). To convert and deploy your own model, follow steps 2.1 and 2.2 below to produce the quantized model.
+
+#### 2.1 Quant-aware training
+
+For the quant-aware training flow, see the [quant-aware training tutorial for classification models](https://paddlepaddle.github.io/PaddleSlim/tutorials/quant_aware_demo/)
+
+**config parameters during quant-aware training:**
+- **quantize_op_types:** The operators currently supported for quantization on CPU are `depthwise_conv2d`, `conv2d`, `fc`, `matmul`, `transpose2`, `reshape2`, `pool2d`, `scale`, `concat`. However, fake_quantize/fake_dequantize operators only need to be inserted around the first four op types during quant-aware training; the scales of the remaining five operators `transpose2`, `reshape2`, `pool2d`, `scale`, `concat` are obtained from the `out_threshold` attributes of other ops. It is therefore enough to set `quantize_op_types` to `depthwise_conv2d`, `conv2d`, `fc`, `matmul`, as in the sketch below.
+- **Other parameters:** see the [PaddleSlim quant_aware API](https://paddlepaddle.github.io/PaddleSlim/api/quantization_api/#quant_aware)
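+
+As a minimal sketch (the config values and `train_program`/`place` names are assumptions, not part of this demo), `quantize_op_types` is passed to `paddleslim.quant.quant_aware` roughly as follows:
+```
+import paddleslim as slim
+
+# Only the four op types that need fake_quantize/fake_dequantize ops are
+# listed; the other quantizable ops get their scales from `out_threshold`.
+quant_config = {
+    'weight_quantize_type': 'channel_wise_abs_max',
+    'activation_quantize_type': 'moving_average_abs_max',
+    'quantize_op_types': ['depthwise_conv2d', 'conv2d', 'fc', 'matmul'],
+}
+quant_program = slim.quant.quant_aware(
+    train_program, place, config=quant_config, for_test=False)
+```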
+
+#### 2.2 Post-training quantization
+
+For producing a post-training quantized model, see the [static post-training quantization tutorial for classification models](https://paddlepaddle.github.io/PaddleSlim/tutorials/quant_post_demo/#_1). A minimal sketch follows.
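+
+Below is a minimal sketch (the paths and the `sample_generator` reader are assumptions, not part of this demo) of calling `paddleslim.quant.quant_post` to produce a post-training quantized model:
+```
+import paddle.fluid as fluid
+import paddleslim as slim
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+# `sample_generator` is a hypothetical reader yielding calibration samples.
+slim.quant.quant_post(
+    executor=exe,
+    model_dir='./mobilenetv2_fp32',  # assumed FP32 model directory
+    quantize_model_path='./mobilenetv2_quant_post',
+    sample_generator=sample_generator,
+    batch_size=16,
+    batch_nums=10)
+```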
+
+## 3. Convert the quantized model into a DNNL-optimized INT8 model
+To deploy on CPU, the saved quant model is passed through a conversion script that removes the fake_quantize/fake_dequantize ops, performs operator fusion and optimization, and converts the model to INT8. The script is located in the Paddle repository at [save_quant_model.py](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/contrib/slim/tests/save_quant_model.py). Copy the script into the directory of this demo (`/PATH_TO_PaddleSlim/demo/mkldnn_quant/`) and run the following command:
+```
+python save_quant_model.py --quant_model_path=/PATH/TO/SAVE/FLOAT32/QUANT/MODEL --int8_model_save_path=/PATH/TO/SAVE/INT8/MODEL
+```
+**Parameter description:**
+- **quant_model_path:** input parameter, required. The quant model produced by quant-aware training.
+- **int8_model_save_path:** the output path of the final INT8 model obtained after the quant model is optimized and quantized by DNNL. Note: quant_model_path must point to a quant model produced by PaddleSlim that contains fake_quant/fake_dequant ops.
+- **ops_to_quantize:** a comma-separated list of the op types to be quantized. Optional; the default is empty, which means all quantizable ops are quantized. Currently, for the image classification and NLP models listed in the Benchmark, quantizing all quantizable ops gives the best accuracy and performance, so it is recommended not to set this parameter.
+- **--op_ids_to_skip:** a comma-separated list of op ID numbers. Optional; the default is empty. The ops in this list will not be quantized and will keep the FP32 type. To get the ID of a specific op, first run the script with the `--debug` option, open the generated file `int8__cpu_quantize_placement_pass.dot`, and find the op that should not be quantized; its ID is in parentheses after the op name.
+- **--debug:** add this option to generate a series of *.dot files containing the model graph after each conversion step. For a description of the DOT format, see [DOT](https://graphviz.gitlab.io/_pages/doc/info/lang.html). To open a `*.dot` file, use any Graphviz tool available on your system (such as the `xdot` tool on Linux or the `dot` tool on Windows). For documentation, see [Graphviz](http://www.graphviz.org/documentation/).
+- **Note:**
+ - The ops currently supported for DNNL quantization are `conv2d`, `depthwise_conv2d`, `fc`, `matmul`, `pool2d`, `reshape2`, `transpose2`, `scale`, `concat`.
+ - If you set `--op_ids_to_skip`, you only need to pass the IDs of those quantizable ops that you want to keep in FP32.
+ - Quantizing all ops does not always give the best performance. For example, if an op would be a single INT8 op with float32 ops before and after it, quantizing it requires a quantize step first, then the INT8 op, then a dequantize step, which may end up slower than keeping that op in FP32. If the default setting performs poorly for your model, check whether the model contains such isolated INT8 ops, try different `ops_to_quantize` combinations, or exclude some quantizable op IDs with `--op_ids_to_skip`, and run a few times to find the best setting.
+
+## 4. Inference
+
+### 4.1 Data preprocessing
+For accuracy and performance measurement, the data first needs to be converted into a binary format. Running the script below converts the full ILSVRC2012 val dataset. Use `--local` to convert your own data. Run the script from the directory containing the Paddle repository. The script is located in the repository at [full_ILSVRC2012_val_preprocess.py](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py)
+```
+python Paddle/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py --local --data_dir=/PATH/TO/USER/DATASET/ --output_file=/PATH/TO/SAVE/BINARY/FILE
+```
+
+Optional parameters:
+- If no parameters are set, the script downloads the ILSVRC2012_img_val dataset and converts it into a binary file.
+- **local:** set to true to indicate that you will provide your own data.
+- **data_dir:** your own data directory.
+- **label_list:** a file listing image paths and image classes, similar to `val_list.txt`.
+- **output_file:** the path of the generated binary file.
+- **data_dim:** the height and width of the preprocessed images. The default value is 224.
+
+The directory structure of your own dataset should be as follows:
+```
+imagenet_user
+├── val
+│ ├── ILSVRC2012_val_00000001.jpg
+│ ├── ILSVRC2012_val_00000002.jpg
+| |── ...
+└── val_list.txt
+```
+The contents of val_list.txt should look like this:
+```
+val/ILSVRC2012_val_00000001.jpg 0
+val/ILSVRC2012_val_00000002.jpg 0
+```
+
+Note:
+- Why convert the dataset to a binary file? Data preprocessing in Paddle (resize, crop, etc.) is done with Python's Image module (PIL), and the trained models are based on images preprocessed in Python, but we found that the Python test has a large performance overhead that degrades inference performance. To get good performance, we decided to use a C++ test for the quantized-model inference stage, and C++ only supports libraries such as OpenCV, which Paddle prefers not to depend on. We therefore preprocess the images in Python, write them into a binary file, and read them back in the C++ test. Depending on your needs, you can modify the C++ test to read and preprocess the data directly; the accuracy will not drop much. We also provide the Python test `sample_tester.py` as a reference; compared with the C++ test `sample_tester.cc`, it shows the larger performance overhead of the Python test.
+
+### 4.2 Deployment and inference
+
+#### Deployment prerequisites
+- You can check the instruction sets supported by your machine by running `lscpu` on the command line.
+- On CPU servers that support `avx512_vnni`, INT8 accuracy and performance are highest, e.g. Cascade Lake, model name: Intel(R) Xeon(R) Gold X2XX; INT8 performance is 3-3.7 times that of the FP32 model.
+- On CPU servers that support `avx512` but not `avx512_vnni`, e.g. SkyLake, model name: Intel(R) Xeon(R) Gold X1XX, INT8 performance is about 1.5 times that of FP32.
+
+#### Prepare the inference library
+
+You can compile the Paddle inference library from source or download it directly.
+- To compile the Paddle inference library from source, see [Compile from source](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/advanced_guide/inference_deployment/inference/build_and_install_lib_cn.html#id12) and use release/2.0 or later.
+
+- You can also download the published [inference library](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/advanced_guide/inference_deployment/inference/build_and_install_lib_cn.html) from the Paddle website. Please choose the latest release or develop version of `ubuntu14.04_cpu_avx_mkl`.
+
+Decompress the prepared inference library, rename it to fluid_inference, and place it in the current directory (`/PATH_TO_PaddleSlim/demo/mkldnn_quant/`), or set PADDLE_ROOT at cmake time to specify the location of the Paddle inference library.
+
+#### Compile the application
+The demo lives in `demo/mkldnn_quant/` under PaddleSlim; the sample `sample_tester.cc` and the `cmake` folder needed for compilation are both in this directory.
+```
+cd /PATH/TO/PaddleSlim
+cd demo/mkldnn_quant/
+mkdir build
+cd build
+cmake -DPADDLE_ROOT=$PADDLE_ROOT ..
+make -j
+```
+If you downloaded and decompressed the [inference library](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/advanced_guide/inference_deployment/inference/build_and_install_lib_cn.html) from the website into the current directory, `-DPADDLE_ROOT` can be omitted here, because it defaults to `demo/mkldnn_quant/fluid_inference`.
+
+#### Run the test
+```
+# Bind threads to cores
+export KMP_AFFINITY=granularity=fine,compact,1,0
+export KMP_BLOCKTIME=1
+# Turbo Boost could be set to OFF using the command
+echo 1 | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo
+# In the file run.sh, set `MODEL_DIR` to `/PATH/TO/FLOAT32/MODEL` or `/PATH/TO/SAVE/INT8/MODEL`
+# In the file run.sh, set `DATA_FILE` to `/PATH/TO/SAVE/BINARY/FILE`
+# For 1 thread performance:
+./run.sh
+# For 20 thread performance:
+./run.sh -1 20
+```
+
+The following parameters need to be configured at runtime:
+- **infer_model:** the model directory. Note that the model parameters currently must be saved as multiple separate files. Can be set to `PATH/TO/SAVE/INT8/MODEL` or `PATH/TO/SAVE/FLOAT32/MODEL`. No default value.
+- **infer_data:** the path of the test data file. Note that it must be a binary file converted by `full_ILSVRC2012_val_preprocess`.
+- **batch_size:** the inference batch size. The default value is 50.
+- **iterations:** the number of batch iterations. The default is 0, which means running inference on all batches in infer_data (image count / batch_size).
+- **num_threads:** the number of CPU threads used for inference. The default is a single thread on one core.
+- **with_accuracy_layer:** whether the model is a test model that contains an accuracy layer or an inference model without one. The default is true.
+- **use_analysis:** whether to use `paddle::AnalysisConfig` to optimize, fuse, and accelerate the model. The default is false.
+
+You can directly modify MODEL_DIR and DATA_DIR in `run.sh` under `/PATH_TO_PaddleSlim/demo/mkldnn_quant/` and then execute `./run.sh` for CPU inference.
+
+### 4.3 Writing your own test:
+If you write your own test:
+1. Testing the INT8 model
+ To test the converted INT8 model, using `paddle::NativeConfig` is sufficient. In the demo, set `use_analysis` to `false`.
+2. Testing the FP32 model
+ To test the FP32 model, use `paddle::AnalysisConfig` to optimize the original FP32 model (fusions, etc.) before testing. In the demo, simply set `use_analysis` to `true`. AnalysisConfig is set up as follows:
+```
+static void SetConfig(paddle::AnalysisConfig *cfg) {
+ cfg->SetModel(FLAGS_infer_model); // Required. The model to be tested
+ cfg->DisableGpu(); // Required. For inference on CPU, the GPU must be disabled
+ cfg->EnableMKLDNN(); // Required. Use MKLDNN operators, which are faster than native ones
+ cfg->SwitchIrOptim(); // If an original FP32 model is passed in, setting this to true optimizes and accelerates it
+ cfg->SetCpuMathLibraryNumThreads(FLAGS_num_threads); // Optional. Defaults to 1. Runs with multiple threads
+}
+```
+- In the provided demo, as long as `use_analysis` is set to true and `infer_model` points to an original FP32 model, the above AnalysisConfig settings are applied and the FP32 model is optimized and accelerated by DNNL (including fusions).
+- If infer_model points to an INT8 model, `use_analysis` has no effect, because the INT8 model has already been optimized and quantized.
+- If infer_model points to a quant model produced by PaddleSlim, `use_analysis` has no effect even if set to true, because the quant model contains fake_quantize/fake_dequantize ops that cannot be fused or optimized.
+
+## 5. Accuracy and performance data
+For the accuracy and performance results of INT8 models, see [Accuracy and performance of INT8 models deployed on CPU](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/docs/zh_cn/tutorials/image_classification_mkldnn_quant_tutorial.md)
+
+## FAQ
+
+- For deploying and running NLP models on CPU, see the sample [ERNIE model QUANT INT8 accuracy and performance reproduction](https://github.com/PaddlePaddle/benchmark/tree/master/Inference/c++/ernie/mkldnn)
+- For details of the DNNL quantization approach, see [SLIM Quant for INT8 DNNL](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/contrib/slim/tests/README.md).
diff --git a/demo/mkldnn_quant/README_en.md b/demo/mkldnn_quant/README_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..ea7d0d74f6993c7a46ed94a4c356ad8fc9d022d9
--- /dev/null
+++ b/demo/mkldnn_quant/README_en.md
@@ -0,0 +1,183 @@
+# Image classification INT8 model deployment and Inference on CPU
+
+## Overview
+
+This document describes the process of converting, deploying and executing the DNNL INT8 model using a fake-quantized (quant) model generated by PaddleSlim. On Cascade Lake machines (eg. Intel(R) Xeon(R) Gold 6271, 6248, X2XX etc), inference using an INT8 model is usually 3-3.7 times faster than with the FP32 model. On SkyLake machines (eg. Intel(R) Xeon(R) Gold 6148, 8180, X1XX etc.), inference using an INT8 model is ~1.5 times faster than with the FP32 model.
+
+The process comprises the following steps:
+- Generating a fakely quantized model: Use PaddleSlim to generate fakely quantized model with `quant-aware` or `post-training` training strategy. Note that the parameters of the quantized ops will be in the range of `INT8`, but their type remains `float32`.
+- Converting fakely quantized model into the final DNNL INT8 model: Use provided python script to convert the quant model into DNNL-based and CPU-optimized INT8 model.
+- Deployment and inference on CPU: Deploy the demo on CPUs and run inference.
+
+## 1. Preparation
+
+#### Install PaddleSlim
+
+For PaddleSlim installation, please see [Paddle Installation Document](https://paddlepaddle.github.io/PaddleSlim/install.html)
+```
+git clone https://github.com/PaddlePaddle/PaddleSlim.git
+cd PaddleSlim
+python setup.py install
+```
+#### Use it in examples
+In sample tests, import Paddle and PaddleSlim as follows:
+```
+import paddle
+import paddle.fluid as fluid
+import paddleslim as slim
+import numpy as np
+```
+
+## 2. Use PaddleSlim to generate a fake quantized model
+
+One can generate a fake-quantized model with the post-training or quant-aware strategy. If you would like to skip the step of generating a fake-quantized model and check the quantization speedup directly, download the [mobilenetv2 post-training quant model](https://paddle-inference-dist.cdn.bcebos.com/quantizaiton/quant_post_models/mobilenetv2_quant_post.tgz) and its original FP32 model [mobilenetv2 fp32 model](https://paddle-inference-dist.cdn.bcebos.com/quantizaiton/fp32_models/mobilenetv2.tgz), then skip this paragraph and go directly to point 3.
+
+#### 2.1 Quant-aware training
+
+To generate fake quantized model with quant-aware strategy, see [Quant-aware training tutorial](https://paddlepaddle.github.io/PaddleSlim/tutorials/quant_aware_demo/)
+
+**The parameters during quant-aware training:**
+- **quantize_op_types:** A list of operators around which to insert `fake_quantize` and `fake_dequantize` ops. In PaddlePaddle, quantization of the following operators is supported for CPU: `depthwise_conv2d`, `conv2d`, `fc`, `matmul`, `transpose2`, `reshape2`, `pool2d`, `scale`, `concat`. However, inserting fake_quantize/fake_dequantize operators during training is needed only for the first four of them (`depthwise_conv2d`, `conv2d`, `fc`, `matmul`), so setting the `quantize_op_types` parameter to the list of those four ops is enough. Scale data needed for quantization of the other five operators is reused from the fake ops or gathered from the `out_threshold` attributes of the operators.
+- **Other parameters:** Please read [PaddleSlim quant_aware API](https://paddlepaddle.github.io/PaddleSlim/api/quantization_api/#quant_aware)
+
+#### 2.2 Post-training quantization
+
+To generate post-training fake quantized model, see [Offline post-training quantization tutorial](https://paddlepaddle.github.io/PaddleSlim/tutorials/quant_post_demo/#_1)
+
+## 3. Convert the fake quantized model to DNNL INT8 model
+In order to deploy an INT8 model on the CPU, we need to collect scales, remove all fake_quantize/fake_dequantize operators, optimize the graph and quantize it, turning it into the final DNNL INT8 model. This is done by the script [save_quant_model.py](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/contrib/slim/tests/save_quant_model.py). Copy the script to the directory where the demo is located: `/PATH_TO_PaddleSlim/demo/mkldnn_quant/` and run it as follows:
+```
+python save_quant_model.py --quant_model_path=/PATH/TO/SAVE/FLOAT32/quant/MODEL --int8_model_save_path=/PATH/TO/SAVE/INT8/MODEL
+```
+
+**Available options in the above command and their descriptions are as follows:**
+- **quant_model_path:** input model path, required. The quant model produced by quant-aware training.
+- **int8_model_save_path:** The final INT8 model output path after the quant model is optimized and quantized by DNNL.
+- **ops_to_quantize:** A comma-separated list of specified op types to be quantized. It is optional. If the option is skipped, all quantizable operators will be quantized. Skipping the option is recommended at first, as it usually yields the best performance and accuracy for the image classification and NLP models listed in the Benchmark.
+- **--op_ids_to_skip:** A comma-separated list of operator ID numbers. It is optional. The default value is empty. The ops with IDs in this list will not be quantized and will keep the FP32 type. To get the ID of a specific op, first run the script using the `--debug` option, open the generated file `int8__cpu_quantize_placement_pass.dot`, and find the op that should not be quantized; its ID number is in parentheses after the op name.
+- **--debug:** Generate model graphs or not. If this option is present, .dot files with graphs of the model will be generated after each optimization step that modifies the graph. For the description of the DOT format, please read [DOT](https://graphviz.gitlab.io/_pages/doc/info/lang.html). To open a `*.dot` file, please use any Graphviz tool available on the system (such as the `xdot` tool on Linux or the `dot` tool on Windows). For Graphviz documentation, see [Graphviz](http://www.graphviz.org/documentation/).
+
+- **Note:**
+ - The DNNL supported quantizable ops are `conv2d`, `depthwise_conv2d`, `fc`, `matmul`, `pool2d`, `reshape2`, `transpose2`, `scale`, `concat`.
+ - If you want to skip quantization of particular operators, use the `--op_ids_to_skip` option and set it exactly to the list of ids of the operators you want to keep as FP32 operators.
+ - Quantization yields the best performance and accuracy when long sequences of consecutive quantized operators are present in the model. When a model contains quantizable operators surrounded by non-quantizable operators, quantizing the single operators (or very short sequences of them) can give no speedup or even cause a drop in performance because of frequent quantizing and dequantizing of the data. In that case the user can tweak the quantization process by limiting it to particular types of operators (using the `--ops_to_quantize` option) or by disabling quantization of particular operators, e.g. the single ones or those in short sequences (using the `--op_ids_to_skip` option).
+
+## 4. Inference
+
+### 4.1 Data preprocessing
+To deploy the model on the CPU, the validation dataset needs to be converted into the binary format. Run the following command in the root directory of the Paddle repository to convert the complete ILSVRC2012 val dataset. Use the `--local` option to provide your own image classification dataset for conversion. The script is located in the repository at [full_ILSVRC2012_val_preprocess.py](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py)
+```
+python python/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py --output_file=/PATH/TO/SAVE/BINARY/FILE
+
+```
+or
+```
+python python/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py --local --data_dir=/PATH/TO/USER/DATASET/ --output_file=/PATH/TO/SAVE/BINARY/FILE
+```
+
+**Available options in the above command and their descriptions are as follows:**
+- If no parameters are set, the script will download the ILSVRC2012_img_val dataset and convert it into a binary file.
+- **local:** If used, a user's dataset is expected in the `data_dir` option.
+- **data_dir:** user's own data directory.
+- **label_list:** A file listing image paths and their labels, similar to `val_list.txt`.
+- **output_file:** A path for the generated binary file.
+- **data_dim:** The length and width for the pre-processed pictures in the resulting binary. The default value is 224.
+
+The structure of the directory with the user's dataset should be as follows:
+```
+imagenet_user
+├── val
+│ ├── ILSVRC2012_val_00000001.jpg
+│ ├── ILSVRC2012_val_00000002.jpg
+| |── ...
+└── val_list.txt
+```
+Then, the contents of val_list.txt should be as follows:
+```
+val/ILSVRC2012_val_00000001.jpg 0
+val/ILSVRC2012_val_00000002.jpg 0
+```
+
+Note:
+- Performance measurement is recommended to be done using the C++ test rather than the Python test, because the Python test incurs the large overhead of Python itself. However, testing requires the dataset images to be preprocessed first. This can be done easily using native tools in Python, but in C++ it requires additional libraries. To avoid introducing C++ external dependencies on image processing libraries like OpenCV, we preprocess the dataset using the Python script and save the result in a binary format, ready to use by the C++ test. The user can modify the C++ test code to enable image preprocessing with OpenCV or any other library and read the image data directly from the original dataset; the accuracy result should differ only slightly from the accuracy obtained using the preprocessed binary dataset. The Python test `sample_tester.py` is provided as a reference, to show the difference in performance between it and the C++ test `sample_tester.cc`. A sketch of the binary file layout is shown below.
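+
+As a minimal sketch, the binary layout produced by the converter is an int64 image count, followed by all float32 CHW images, followed by all int64 labels, as read back in `sample_tester.py`; the file path below is a placeholder:
+```
+import struct
+
+import numpy as np
+
+with open('/PATH/TO/SAVE/BINARY/FILE', 'rb') as f:
+    num = struct.unpack('q', f.read(8))[0]  # int64 image count
+    img_size = 3 * 224 * 224 * 4            # one float32 CHW image
+    # Read the first image ...
+    img = np.frombuffer(f.read(img_size), dtype='float32').reshape(3, 224, 224)
+    # ... then seek past all images to read its label (stored as int64).
+    f.seek(8 + num * img_size)
+    label = struct.unpack('q', f.read(8))[0]
+print(num, img.shape, label)
+```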
+
+### 4.2 Deploying Inference demo
+
+#### Deployment premises
+- Users can check which instruction sets are supported by their machines' CPUs by issuing the command `lscpu`.
+- INT8 performance and accuracy is best on CPU servers which support `avx512_vnni` instruction set (e.g. Intel Cascade Lake CPUs: Intel(R) Xeon(R) Gold 6271, 6248 or other X2XX). INT8 inference performance is then 3-3.7 times better than for FP32.
+- On CPU servers that support `avx512` but do not support `avx512_vnni` instructions (SkyLake, Model name: Intel(R) Xeon(R) Gold X1XX, such as 6148), the performance of INT8 models is around 1.5 times faster than FP32 models.
+
+#### Prepare Paddle inference library
+
+Users can compile the Paddle inference library from the source code or download the inference library directly.
+
+- For instructions on how to compile the Paddle inference library from source, see [Compile from Source](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/advanced_guide/inference_deployment/inference/build_and_install_lib_cn.html#id12), checkout release/2.0 or develop branch and compile it.
+
+- Users can also download the published [inference Library](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/advanced_guide/inference_deployment/inference/build_and_install_lib_cn.html). Please select `ubuntu14.04_cpu_avx_mkl` latest release or develop version. The downloaded library has to be decompressed and renamed into `fluid_inference` directory and placed in current directory (`/PATH_TO_PaddleSlim/demo/mkldnn_quant/`) for the library to be available. Another option is to set the `PADDLE_ROOT` cmake variable to the `fluid_inference` directory location to link the tests with the Paddle inference library properly.
+
+#### Compile the application
+The source code file of the sample test (`sample_tester.cc`) and the `cmake` files are all located in the `demo/mkldnn_quant/` directory.
+
+```
+cd /PATH/TO/PaddleSlim
+cd demo/mkldnn_quant/
+mkdir build
+cd build
+cmake -DPADDLE_ROOT=$PADDLE_ROOT ..
+make -j
+```
+- `-DPADDLE_ROOT` default value is `demo/mkldnn_quant/fluid_inference`. If users download and unzip the library [Inference library from the official website](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/advanced_guide/inference_deployment/inference/build_and_install_lib_cn.html) in current directory `demo/mkldnn_quant/`, users could skip this option.
+
+#### Run the test
+```
+# Bind threads to cores
+export KMP_AFFINITY=granularity=fine,compact,1,0
+export KMP_BLOCKTIME=1
+# Turbo Boost could be set to OFF using the command
+echo 1 | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo
+# In the file run.sh, set `MODEL_DIR` to `/PATH/TO/FLOAT32/MODEL` or `/PATH/TO/SAVE/INT8/MODEL`
+# In the file run.sh, set `DATA_FILE` to `/PATH/TO/SAVE/BINARY/FILE`
+# For 1 thread performance:
+./run.sh
+# For 20 thread performance:
+./run.sh -1 20
+```
+
+**Available options in the above command and their descriptions are as follows:**
+- **infer_model:** Required. The path of the model to be tested. Note that the model parameters currently need to be saved as multiple separate files.
+- **infer_data:** Required. The path of the tested data file. Note that it needs to be a binary file converted by `full_ILSVRC2012_val_preprocess`.
+- **batch_size:** Batch size. The default value is 50.
+- **iterations:** Number of batch iterations. The default is 0, which means predicting all batches (image count / batch size) in infer_data.
+- **num_threads:** Number of CPU threads used. The default value is 1.
+- **with_accuracy_layer:** Whether the model contains an accuracy layer. The default value is true.
+- **use_analysis:** Whether to use paddle::AnalysisConfig to optimize the model. The default value is false.
+
+One can directly modify MODEL_DIR and DATA_DIR in `run.sh` under `/PATH_TO_PaddleSlim/demo/mkldnn_quant/` directory, then execute `./run.sh` for CPU inference.
+
+### 4.3 Writing your own tests:
+When writing their own test, users can:
+1. Test the resulting INT8 model - then paddle::NativeConfig should be used (without applying additional optimizations) and the option `use_analysis` should be set to `false` in the demo.
+2. Test the original FP32 model - then paddle::AnalysisConfig should be used (applying FP32 fuses and optimizations) and the option `use_analysis` should be set to `true` in the demo.
+The AnalysisConfig configuration in this demo is set as follows:
+```
+static void SetConfig(paddle::AnalysisConfig *cfg) {
+ cfg->SetModel(FLAGS_infer_model); // Required. The model to be tested
+ cfg->DisableGpu(); // Required. To run inference on the CPU, the GPU must be disabled
+ cfg->EnableMKLDNN(); // Required. Enabling MKLDNN makes inference faster than the native configuration
+ cfg->SwitchIrOptim(); // Required. SwitchIrOptim fuses many ops and improves performance
+ cfg->SetCpuMathLibraryNumThreads(FLAGS_num_threads); // The default setting is 1
+}
+```
+**Notes:**
+- If `infer_model` is a path to an FP32 model and `use_analysis` is set to true, paddle::AnalysisConfig will be used. Hence the FP32 model will be fused and optimized, and the performance should be better than FP32 inference using paddle::NativeConfig.
+- If `infer_model` is a path to converted DNNL INT8 model, the `use_analysis` option will make no difference, because INT8 model has been fused, optimized and quantized.
+- If `infer_model` is a path to a fakely quantized model generated by PaddleSlim, `use_analysis` will not work even if it is set to true, because the fake quantized model contains fake quantize/dequantize ops, which cannot be fused or optimized.
+
+## 5. Accuracy and performance benchmark
+For INT8 model accuracy and performance results, see [Accuracy and performance of INT8 models deployed on CPU](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/docs/zh_cn/tutorials/image_classification_mkldnn_quant_tutorial.md)
+
+## FAQ
+
+- For deploying INT8 NLP models on CPU, see [ERNIE model quant INT8 accuracy and performance reproduction](https://github.com/PaddlePaddle/benchmark/tree/master/Inference/c++/ernie/mkldnn)
+- The detailed DNNL quantization process is described in [SLIM quant for INT8 DNNL](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/contrib/slim/tests/README.md)
diff --git a/demo/mkldnn_quant/cmake/FindFluid.cmake b/demo/mkldnn_quant/cmake/FindFluid.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..49120d623720b545996b001cf870d720e1e00d3e
--- /dev/null
+++ b/demo/mkldnn_quant/cmake/FindFluid.cmake
@@ -0,0 +1,149 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+set(PADDLE_FOUND OFF)
+
+if(NOT PADDLE_ROOT)
+ set(PADDLE_ROOT $ENV{PADDLE_ROOT} CACHE PATH "Paddle Path")
+endif()
+if(NOT PADDLE_ROOT)
+ message(FATAL_ERROR "Set PADDLE_ROOT as your root directory installed PaddlePaddle")
+endif()
+set(THIRD_PARTY_ROOT ${PADDLE_ROOT}/third_party)
+
+if(USE_GPU)
+ set(CUDA_ROOT $ENV{CUDA_ROOT} CACHE PATH "CUDA root Path")
+ set(CUDNN_ROOT $ENV{CUDNN_ROOT} CACHE PATH "CUDNN root Path")
+endif()
+
+# Supported directory organizations
+find_path(PADDLE_INC_DIR NAMES paddle_inference_api.h PATHS ${PADDLE_ROOT}/paddle/include)
+if(PADDLE_INC_DIR)
+ set(LIB_PATH "paddle/lib")
+else()
+ find_path(PADDLE_INC_DIR NAMES paddle/fluid/inference/paddle_inference_api.h PATHS ${PADDLE_ROOT})
+ if(PADDLE_INC_DIR)
+ include_directories(${PADDLE_ROOT}/paddle/fluid/inference)
+ endif()
+ set(LIB_PATH "paddle/fluid/inference")
+endif()
+
+include_directories(${PADDLE_INC_DIR})
+
+find_library(PADDLE_FLUID_SHARED_LIB NAMES "libpaddle_fluid.so" PATHS
+ ${PADDLE_ROOT}/${LIB_PATH})
+find_library(PADDLE_FLUID_STATIC_LIB NAMES "libpaddle_fluid.a" PATHS
+ ${PADDLE_ROOT}/${LIB_PATH})
+
+if(USE_SHARED AND PADDLE_INC_DIR AND PADDLE_FLUID_SHARED_LIB)
+ set(PADDLE_FOUND ON)
+ add_library(paddle_fluid_shared SHARED IMPORTED)
+ set_target_properties(paddle_fluid_shared PROPERTIES IMPORTED_LOCATION
+ ${PADDLE_FLUID_SHARED_LIB})
+ set(PADDLE_LIBRARIES paddle_fluid_shared)
+ message(STATUS "Found PaddlePaddle Fluid (include: ${PADDLE_INC_DIR}; "
+ "library: ${PADDLE_FLUID_SHARED_LIB}")
+elseif(PADDLE_INC_DIR AND PADDLE_FLUID_STATIC_LIB)
+ set(PADDLE_FOUND ON)
+ add_library(paddle_fluid_static STATIC IMPORTED)
+ set_target_properties(paddle_fluid_static PROPERTIES IMPORTED_LOCATION
+ ${PADDLE_FLUID_STATIC_LIB})
+ set(PADDLE_LIBRARIES paddle_fluid_static)
+ message(STATUS "Found PaddlePaddle Fluid (include: ${PADDLE_INC_DIR}; "
+ "library: ${PADDLE_FLUID_STATIC_LIB}")
+else()
+ set(PADDLE_FOUND OFF)
+ message(WARNING "Cannot find PaddlePaddle Fluid under ${PADDLE_ROOT}")
+ return()
+endif()
+
+
+# including directory of third_party libraries
+set(PADDLE_THIRD_PARTY_INC_DIRS)
+function(third_party_include TARGET_NAME HEADER_NAME TARGET_DIRNAME)
+ find_path(PADDLE_${TARGET_NAME}_INC_DIR NAMES ${HEADER_NAME} PATHS
+ ${TARGET_DIRNAME}
+ NO_DEFAULT_PATH)
+ if(PADDLE_${TARGET_NAME}_INC_DIR)
+ message(STATUS "Found PaddlePaddle third_party including directory: " ${PADDLE_${TARGET_NAME}_INC_DIR})
+ set(PADDLE_THIRD_PARTY_INC_DIRS ${PADDLE_THIRD_PARTY_INC_DIRS} ${PADDLE_${TARGET_NAME}_INC_DIR} PARENT_SCOPE)
+ endif()
+endfunction()
+
+third_party_include(glog glog/logging.h ${THIRD_PARTY_ROOT}/install/glog/include)
+third_party_include(protobuf google/protobuf/message.h ${THIRD_PARTY_ROOT}/install/protobuf/include)
+third_party_include(gflags gflags/gflags.h ${THIRD_PARTY_ROOT}/install/gflags/include)
+third_party_include(eigen unsupported/Eigen/CXX11/Tensor ${THIRD_PARTY_ROOT}/eigen3)
+third_party_include(boost boost/config.hpp ${THIRD_PARTY_ROOT}/boost)
+if(USE_GPU)
+ third_party_include(cuda cuda.h ${CUDA_ROOT}/include)
+ third_party_include(cudnn cudnn.h ${CUDNN_ROOT}/include)
+endif()
+
+message(STATUS "PaddlePaddle need to include these third party directories: ${PADDLE_THIRD_PARTY_INC_DIRS}")
+include_directories(${PADDLE_THIRD_PARTY_INC_DIRS})
+
+set(PADDLE_THIRD_PARTY_LIBRARIES)
+function(third_party_library TARGET_NAME TARGET_DIRNAME)
+ set(library_names ${ARGN})
+ set(local_third_party_libraries)
+ foreach(lib ${library_names})
+ string(REGEX REPLACE "^lib" "" lib_noprefix ${lib})
+ if(${lib} MATCHES "${CMAKE_STATIC_LIBRARY_SUFFIX}$")
+ set(libtype STATIC)
+ string(REGEX REPLACE "${CMAKE_STATIC_LIBRARY_SUFFIX}$" "" libname ${lib_noprefix})
+ elseif(${lib} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}(\\.[0-9]+)?$")
+ set(libtype SHARED)
+ string(REGEX REPLACE "${CMAKE_SHARED_LIBRARY_SUFFIX}(\\.[0-9]+)?$" "" libname ${lib_noprefix})
+ else()
+ message(FATAL_ERROR "Unknown library type: ${lib}")
+ endif()
+ #message(STATUS "libname: ${libname}")
+ find_library(${libname}_LIBRARY NAMES "${lib}" PATHS
+ ${TARGET_DIRNAME}
+ NO_DEFAULT_PATH)
+ if(${libname}_LIBRARY)
+ set(${TARGET_NAME}_FOUND ON PARENT_SCOPE)
+ add_library(${libname} ${libtype} IMPORTED)
+ set_target_properties(${libname} PROPERTIES IMPORTED_LOCATION ${${libname}_LIBRARY})
+ set(local_third_party_libraries ${local_third_party_libraries} ${libname})
+ message(STATUS "Found PaddlePaddle third_party library: " ${${libname}_LIBRARY})
+ else()
+ set(${TARGET_NAME}_FOUND OFF PARENT_SCOPE)
+ message(WARNING "Cannot find ${lib} under ${THIRD_PARTY_ROOT}")
+ endif()
+ endforeach()
+ set(PADDLE_THIRD_PARTY_LIBRARIES ${PADDLE_THIRD_PARTY_LIBRARIES} ${local_third_party_libraries} PARENT_SCOPE)
+endfunction()
+
+third_party_library(mklml ${THIRD_PARTY_ROOT}/install/mklml/lib libiomp5.so libmklml_intel.so)
+third_party_library(mkldnn ${THIRD_PARTY_ROOT}/install/mkldnn/lib libmkldnn.so)
+if(NOT mkldnn_FOUND)
+ third_party_library(mkldnn ${THIRD_PARTY_ROOT}/install/mkldnn/lib libmkldnn.so.0)
+endif()
+if(NOT USE_SHARED)
+ third_party_library(glog ${THIRD_PARTY_ROOT}/install/glog/lib libglog.a)
+ third_party_library(protobuf ${THIRD_PARTY_ROOT}/install/protobuf/lib libprotobuf.a)
+ third_party_library(gflags ${THIRD_PARTY_ROOT}/install/gflags/lib libgflags.a)
+ if(NOT mklml_FOUND)
+ third_party_library(openblas ${THIRD_PARTY_ROOT}/install/openblas/lib libopenblas.a)
+ endif()
+ third_party_library(zlib ${THIRD_PARTY_ROOT}/install/zlib/lib libz.a)
+ third_party_library(snappystream ${THIRD_PARTY_ROOT}/install/snappystream/lib libsnappystream.a)
+ third_party_library(snappy ${THIRD_PARTY_ROOT}/install/snappy/lib libsnappy.a)
+ third_party_library(xxhash ${THIRD_PARTY_ROOT}/install/xxhash/lib libxxhash.a)
+ if(USE_GPU)
+ third_party_library(cudart ${CUDA_ROOT}/lib64 libcudart.so)
+ endif()
+endif()
\ No newline at end of file
diff --git a/demo/mkldnn_quant/cmake/FindGperftools.cmake b/demo/mkldnn_quant/cmake/FindGperftools.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..7f6be835cc5b08240c28f065c7a2ff6718cc68dd
--- /dev/null
+++ b/demo/mkldnn_quant/cmake/FindGperftools.cmake
@@ -0,0 +1,77 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+# Tries to find Gperftools.
+#
+# Usage of this module as follows:
+#
+# find_package(Gperftools)
+#
+# Variables used by this module, they can change the default behaviour and need
+# to be set before calling find_package:
+#
+# Gperftools_ROOT_DIR Set this variable to the root installation of
+# Gperftools if the module has problems finding
+# the proper installation path.
+#
+# Variables defined by this module:
+#
+# GPERFTOOLS_FOUND System has Gperftools libs/headers
+# GPERFTOOLS_LIBRARIES The Gperftools libraries (tcmalloc & profiler)
+# GPERFTOOLS_INCLUDE_DIR The location of Gperftools headers
+
+find_library(GPERFTOOLS_TCMALLOC
+ NAMES tcmalloc
+ HINTS ${Gperftools_ROOT_DIR}/lib)
+
+find_library(GPERFTOOLS_PROFILER
+ NAMES profiler
+ HINTS ${Gperftools_ROOT_DIR}/lib)
+
+find_library(GPERFTOOLS_TCMALLOC_AND_PROFILER
+ NAMES tcmalloc_and_profiler
+ HINTS ${Gperftools_ROOT_DIR}/lib)
+
+find_path(GPERFTOOLS_INCLUDE_DIR
+ NAMES gperftools/heap-profiler.h
+ HINTS ${Gperftools_ROOT_DIR}/include)
+
+set(GPERFTOOLS_LIBRARIES ${GPERFTOOLS_TCMALLOC_AND_PROFILER})
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(
+ Gperftools
+ DEFAULT_MSG
+ GPERFTOOLS_LIBRARIES
+ GPERFTOOLS_INCLUDE_DIR)
+
+mark_as_advanced(
+ Gperftools_ROOT_DIR
+ GPERFTOOLS_TCMALLOC
+ GPERFTOOLS_PROFILER
+ GPERFTOOLS_TCMALLOC_AND_PROFILER
+ GPERFTOOLS_LIBRARIES
+ GPERFTOOLS_INCLUDE_DIR)
+
+# create IMPORTED targets
+if (Gperftools_FOUND AND NOT TARGET gperftools::tcmalloc)
+ add_library(gperftools::tcmalloc UNKNOWN IMPORTED)
+ set_target_properties(gperftools::tcmalloc PROPERTIES
+ IMPORTED_LOCATION ${GPERFTOOLS_TCMALLOC}
+ INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")
+ add_library(gperftools::profiler UNKNOWN IMPORTED)
+ set_target_properties(gperftools::profiler PROPERTIES
+ IMPORTED_LOCATION ${GPERFTOOLS_PROFILER}
+ INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")
+endif()
diff --git a/demo/mkldnn_quant/run.sh b/demo/mkldnn_quant/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4e28e44d0c797c291a6ac60328dc44a5884e7bcd
--- /dev/null
+++ b/demo/mkldnn_quant/run.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+MODEL_DIR=./mobilenetv2_INT8
+DATA_FILE=/data/datasets/ImageNet_py/val.bin
+num_threads=1
+with_accuracy_layer=false
+use_profile=true
+ITERATIONS=0
+
+./build/inference --logtostderr=1 \
+ --infer_model=${MODEL_DIR} \
+ --infer_data=${DATA_FILE} \
+ --batch_size=1 \
+ --num_threads=${num_threads} \
+ --iterations=${ITERATIONS} \
+ --with_accuracy_layer=${with_accuracy_layer} \
+ --use_profile=${use_profile} \
+ --use_analysis=false
diff --git a/demo/mkldnn_quant/sample_tester.cc b/demo/mkldnn_quant/sample_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8386df94b33a9e6667d5139d1add1fee968f8c6e
--- /dev/null
+++ b/demo/mkldnn_quant/sample_tester.cc
@@ -0,0 +1,325 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <chrono>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <utility>
+#include <vector>
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include "paddle_inference_api.h"
+#ifdef WITH_GPERFTOOLS
+#include <gperftools/profiler.h>
+#endif
+
+DEFINE_string(infer_model, "", "path to the model");
+DEFINE_string(infer_data, "", "path to the input data");
+DEFINE_int32(batch_size, 50, "inference batch size");
+DEFINE_int32(iterations,
+ 0,
+ "number of batches to process. 0 means testing whole dataset");
+DEFINE_int32(num_threads, 1, "num of threads to run in parallel");
+DEFINE_bool(with_accuracy_layer,
+ true,
+ "Set with_accuracy_layer to true if provided model has accuracy "
+ "layer and requires label input");
+DEFINE_bool(use_profile,
+ false,
+ "Set use_profile to true to get profile information");
+DEFINE_bool(use_analysis,
+ false,
+ "If use_analysis is set to true, the model will be optimized");
+
+struct Timer {
+ std::chrono::high_resolution_clock::time_point start;
+ std::chrono::high_resolution_clock::time_point startu;
+
+ void tic() { start = std::chrono::high_resolution_clock::now(); }
+ double toc() {
+ startu = std::chrono::high_resolution_clock::now();
+ std::chrono::duration<double> time_span =
+ std::chrono::duration_cast<std::chrono::duration<double>>(startu -
+ start);
+ double used_time_ms = static_cast<double>(time_span.count()) * 1000.0;
+ return used_time_ms;
+ }
+};
+
+template <typename T>
+constexpr paddle::PaddleDType GetPaddleDType();
+
+template <>
+constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
+ return paddle::PaddleDType::INT64;
+}
+
+template <>
+constexpr paddle::PaddleDType GetPaddleDType<float>() {
+ return paddle::PaddleDType::FLOAT32;
+}
+
+template <typename T>
+class TensorReader {
+ public:
+ TensorReader(std::ifstream &file,
+ size_t beginning_offset,
+ std::vector<int> shape,
+ std::string name)
+ : file_(file), position_(beginning_offset), shape_(shape), name_(name) {
+ numel_ = std::accumulate(
+ shape_.begin(), shape_.end(), size_t{1}, std::multiplies<size_t>());
+ }
+
+ paddle::PaddleTensor NextBatch() {
+ paddle::PaddleTensor tensor;
+ tensor.name = name_;
+ tensor.shape = shape_;
+ tensor.dtype = GetPaddleDType<T>();
+ tensor.data.Resize(numel_ * sizeof(T));
+ file_.seekg(position_);
+ file_.read(static_cast<char *>(tensor.data.data()), numel_ * sizeof(T));
+ position_ = file_.tellg();
+ if (file_.eof()) LOG(ERROR) << name_ << ": reached end of stream";
+ if (file_.bad()) LOG(ERROR) << name_ << ": ERROR: badbit is true";
+ if (file_.fail())
+ throw std::runtime_error(name_ + ": failed reading file.");
+ return tensor;
+ }
+
+ protected:
+ std::ifstream &file_;
+ size_t position_;
+ std::vector<int> shape_;
+ std::string name_;
+ size_t numel_;
+};
+
+void SetInput(std::vector<std::vector<paddle::PaddleTensor>> *inputs,
+ std::vector<paddle::PaddleTensor> *labels_gt,
+ bool with_accuracy_layer = FLAGS_with_accuracy_layer,
+ int32_t batch_size = FLAGS_batch_size) {
+ std::ifstream file(FLAGS_infer_data, std::ios::binary);
+ if (!file) {
+ throw std::runtime_error("Couldn't open file: " + FLAGS_infer_data);
+ }
+
+ int64_t total_images{0};
+ file.seekg(0, std::ios::beg);
+ file.read(reinterpret_cast<char *>(&total_images), sizeof(total_images));
+ LOG(INFO) << "Total images in file: " << total_images;
+
+ std::vector<int> image_batch_shape{batch_size, 3, 224, 224};
+ std::vector<int> label_batch_shape{batch_size, 1};
+ auto images_offset_in_file = static_cast<size_t>(file.tellg());
+
+ TensorReader<float> image_reader(
+ file, images_offset_in_file, image_batch_shape, "image");
+
+ auto iterations_max = total_images / batch_size;
+ auto iterations = iterations_max;
+ if (FLAGS_iterations > 0 && FLAGS_iterations < iterations_max) {
+ iterations = FLAGS_iterations;
+ }
+
+ auto labels_offset_in_file =
+ images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224;
+
+ TensorReader<int64_t> label_reader(
+ file, labels_offset_in_file, label_batch_shape, "label");
+ for (auto i = 0; i < iterations; i++) {
+ auto images = image_reader.NextBatch();
+ std::vector<paddle::PaddleTensor> tmp_vec;
+ tmp_vec.push_back(std::move(images));
+ auto labels = label_reader.NextBatch();
+ if (with_accuracy_layer) {
+ tmp_vec.push_back(std::move(labels));
+ } else {
+ labels_gt->push_back(std::move(labels));
+ }
+ inputs->push_back(std::move(tmp_vec));
+ if (i > 0 && i % 100 == 0) {
+ LOG(INFO) << "Read " << i * FLAGS_batch_size << " samples";
+ }
+ }
+}
+
+static void PrintTime(int batch_size,
+ int num_threads,
+ double batch_latency,
+ int epoch = 1) {
+ double sample_latency = batch_latency / batch_size;
+ LOG(INFO) << "Model: " << FLAGS_infer_model;
+ LOG(INFO) << "====== num of threads: " << num_threads << " ======";
+ LOG(INFO) << "====== batch size: " << batch_size << ", iterations: " << epoch;
+ LOG(INFO) << "====== batch latency: " << batch_latency
+ << "ms, number of samples: " << batch_size * epoch;
+ LOG(INFO) << ", sample latency: " << sample_latency
+ << "ms, fps: " << 1000.f / sample_latency << " ======";
+}
+
+void PredictionRun(paddle::PaddlePredictor *predictor,
+ const std::vector<std::vector<paddle::PaddleTensor>> &inputs,
+ std::vector<std::vector<paddle::PaddleTensor>> *outputs,
+ int num_threads,
+ float *sample_latency = nullptr) {
+ int iterations = inputs.size(); // process the whole dataset ...
+ if (FLAGS_iterations > 0 &&
+ FLAGS_iterations < static_cast<int>(inputs.size()))
+ iterations =
+ FLAGS_iterations; // ... unless the number of iterations is set
+ outputs->resize(iterations);
+ Timer run_timer;
+ double elapsed_time = 0;
+#ifdef WITH_GPERFTOOLS
+ ResetProfiler();
+ ProfilerStart("paddle_inference.prof");
+#endif
+ int predicted_num = 0;
+
+ for (int i = 0; i < iterations; i++) {
+ run_timer.tic();
+ predictor->Run(inputs[i], &(*outputs)[i], FLAGS_batch_size);
+ elapsed_time += run_timer.toc();
+
+ predicted_num += FLAGS_batch_size;
+ if (predicted_num % 100 == 0) {
+ LOG(INFO) << "Infer " << predicted_num << " samples";
+ }
+ }
+
+#ifdef WITH_GPERFTOOLS
+ ProfilerStop();
+#endif
+
+ auto batch_latency = elapsed_time / iterations;
+ PrintTime(FLAGS_batch_size, num_threads, batch_latency, iterations);
+
+ if (sample_latency != nullptr)
+ *sample_latency = batch_latency / FLAGS_batch_size;
+}
+
+std::pair<float, float> CalculateAccuracy(
+ const std::vector<std::vector<paddle::PaddleTensor>> &outputs,
+ const std::vector<paddle::PaddleTensor> &labels_gt,
+ bool with_accuracy = FLAGS_with_accuracy_layer) {
+ LOG_IF(ERROR, !with_accuracy && labels_gt.size() == 0)
+ << "if with_accuracy set to false, labels_gt must be not empty";
+ std::vector<float> acc1_ss;
+ std::vector<float> acc5_ss;
+ if (!with_accuracy) { // model with_accuracy_layer = false
+ float *result_array; // for one batch 50*1000
+ int64_t *batch_labels; // 50*1
+ LOG_IF(ERROR, outputs.size() != labels_gt.size())
+ << "outputs first dimension must be equal to labels_gt first dimension";
+ for (auto i = 0; i < outputs.size();
+ ++i) { // same as labels first dimension
+ result_array = static_cast<float *>(outputs[i][0].data.data());
+ batch_labels = static_cast<int64_t *>(labels_gt[i].data.data());
+ int correct_1 = 0, correct_5 = 0, total = FLAGS_batch_size;
+ for (auto j = 0; j < FLAGS_batch_size; j++) { // batch_size
+ std::vector<float> v(result_array + j * 1000,
+ result_array + (j + 1) * 1000);
+ std::vector<std::pair<float, int>> vx;
+ for (int k = 0; k < 1000; k++) {
+ vx.push_back(std::make_pair(v[k], k));
+ }
+ std::partial_sort(vx.begin(),
+ vx.begin() + 5,
+ vx.end(),
+ [](std::pair<float, int> a, std::pair<float, int> b) {
+ return a.first > b.first;
+ });
+ if (static_cast<int>(batch_labels[j]) == vx[0].second) correct_1 += 1;
+ if (std::find_if(vx.begin(),
+ vx.begin() + 5,
+ [batch_labels, j](std::pair<float, int> a) {
+ return static_cast<int>(batch_labels[j]) == a.second;
+ }) != vx.begin() + 5)
+ correct_5 += 1;
+ }
+ acc1_ss.push_back(static_cast<float>(correct_1) /
+ static_cast<float>(total));
+ acc5_ss.push_back(static_cast<float>(correct_5) /
+ static_cast<float>(total));
+ }
+ } else { // model with_accuracy_layer = true
+ for (auto i = 0; i < outputs.size(); ++i) {
+ LOG_IF(ERROR, outputs[i].size() < 3UL) << "To get top1 and top5 "
+ "accuracy, output[i] size must "
+ "be bigger than or equal to 3";
+ acc1_ss.push_back(
+ *static_cast<float *>(outputs[i][1].data.data())); // 1 is top1 acc
+ acc5_ss.push_back(*static_cast<float *>(
+ outputs[i][2].data.data())); // 2 is top5 acc or mAP
+ }
+ }
+ auto acc1_ss_avg =
+ std::accumulate(acc1_ss.begin(), acc1_ss.end(), 0.0) / acc1_ss.size();
+ auto acc5_ss_avg =
+ std::accumulate(acc5_ss.begin(), acc5_ss.end(), 0.0) / acc5_ss.size();
+ return std::make_pair(acc1_ss_avg, acc5_ss_avg);
+}
+
+static void SetIrOptimConfig(paddle::AnalysisConfig *cfg) {
+ cfg->DisableGpu();
+ cfg->SwitchIrOptim();
+ cfg->EnableMKLDNN();
+ if (FLAGS_use_profile) {
+ cfg->EnableProfile();
+ }
+}
+
+std::unique_ptr<paddle::PaddlePredictor> CreatePredictor(
+ const paddle::PaddlePredictor::Config *config, bool use_analysis = true) {
+ const auto *analysis_config =
+ reinterpret_cast<const paddle::AnalysisConfig *>(config);
+ if (use_analysis) {
+ return paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(
+ *analysis_config);
+ }
+ auto native_config = analysis_config->ToNativeConfig();
+ return paddle::CreatePaddlePredictor<paddle::NativeConfig>(native_config);
+}
+
+int main(int argc, char *argv[]) {
+ // InitFLAGS(argc, argv);
+ google::InitGoogleLogging(*argv);
+ gflags::ParseCommandLineFlags(&argc, &argv, true);
+ paddle::AnalysisConfig cfg;
+ cfg.SetModel(FLAGS_infer_model);
+ cfg.SetCpuMathLibraryNumThreads(FLAGS_num_threads);
+ if (FLAGS_use_analysis) {
+ SetIrOptimConfig(&cfg);
+ }
+
+ std::vector<std::vector<paddle::PaddleTensor>> input_slots_all;
+ std::vector<std::vector<paddle::PaddleTensor>> outputs;
+ std::vector<paddle::PaddleTensor> labels_gt; // optional
+ SetInput(&input_slots_all, &labels_gt); // iterations*batch_size
+ auto predictor = CreatePredictor(
+ reinterpret_cast<const paddle::PaddlePredictor::Config *>(&cfg),
+ FLAGS_use_analysis);
+ PredictionRun(predictor.get(), input_slots_all, &outputs, FLAGS_num_threads);
+ auto acc_pair = CalculateAccuracy(outputs, labels_gt);
+ LOG(INFO) << "Top1 accuracy: " << std::fixed << std::setw(6)
+ << std::setprecision(4) << acc_pair.first;
+ LOG(INFO) << "Top5 accuracy: " << std::fixed << std::setw(6)
+ << std::setprecision(4) << acc_pair.second;
+}
diff --git a/demo/mkldnn_quant/sample_tester.py b/demo/mkldnn_quant/sample_tester.py
new file mode 100644
index 0000000000000000000000000000000000000000..c43df11df11516ac34a3c3b068d65edb9e74789a
--- /dev/null
+++ b/demo/mkldnn_quant/sample_tester.py
@@ -0,0 +1,280 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import sys
+import argparse
+import logging
+import struct
+import six
+import numpy as np
+import time
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.framework import IrGraph
+from paddle.fluid import core
+
+logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
+_logger = logging.getLogger(__name__)
+_logger.setLevel(logging.INFO)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--batch_size', type=int, default=1, help='Batch size.')
+ parser.add_argument(
+ '--skip_batch_num',
+ type=int,
+ default=0,
+ help='Number of the first minibatches to skip in performance statistics.'
+ )
+ parser.add_argument(
+ '--infer_model',
+ type=str,
+ default='',
+ help='A path to an Inference model.')
+ parser.add_argument('--infer_data', type=str, default='', help='Data file.')
+ parser.add_argument(
+ '--batch_num',
+ type=int,
+ default=0,
+ help='Number of batches to process. 0 or less means whole dataset. Default: 0.'
+ )
+ parser.add_argument(
+ '--with_accuracy_layer',
+ type=lambda s: str(s).lower() in ('true', 't', '1'), # plain bool() would treat any non-empty string as True
+ default=False,
+ help='The model is with an accuracy layer or without one')
+ test_args, args = parser.parse_known_args(namespace=unittest)
+ return test_args, sys.argv[:1] + args
+
+
+class SampleTester(unittest.TestCase):
+ def _reader_creator(self, data_file='data.bin'):
+ def reader():
+ with open(data_file, 'rb') as fp:
+ num = fp.read(8)
+ num = struct.unpack('q', num)[0]
+ imgs_offset = 8
+ img_ch = 3
+ img_w = 224
+ img_h = 224
+ img_pixel_size = 4
+ img_size = img_ch * img_h * img_w * img_pixel_size
+ label_size = 8
+ labels_offset = imgs_offset + num * img_size
+
+ step = 0
+ while step < num:
+ fp.seek(imgs_offset + img_size * step)
+ img = fp.read(img_size)
+ img = struct.unpack_from(
+ '{}f'.format(img_ch * img_w * img_h), img)
+ img = np.array(img)
+ img.shape = (img_ch, img_w, img_h)
+ fp.seek(labels_offset + label_size * step)
+ label = fp.read(label_size)
+ label = struct.unpack('q', label)[0]
+ yield img, int(label)
+ step += 1
+
+ return reader
+
+ def _get_batch_accuracy(self, batch_output=None, labels=None):
+ total = 0
+ correct = 0
+ correct_5 = 0
+ for n, result in enumerate(batch_output):
+ index = result.argsort()
+ top_1_index = index[-1]
+ top_5_index = index[-5:]
+ total += 1
+ if top_1_index == labels[n]:
+ correct += 1
+ if labels[n] in top_5_index:
+ correct_5 += 1
+ acc1 = float(correct) / float(total)
+ acc5 = float(correct_5) / float(total)
+ return acc1, acc5
+
+ def _prepare_for_fp32_mkldnn(self, graph):
+ ops = graph.all_op_nodes()
+ for op_node in ops:
+ name = op_node.name()
+ if name in ['depthwise_conv2d']:
+ input_var_node = graph._find_node_by_name(
+ op_node.inputs, op_node.input("Input")[0])
+ weight_var_node = graph._find_node_by_name(
+ op_node.inputs, op_node.input("Filter")[0])
+ output_var_node = graph._find_node_by_name(
+ graph.all_var_nodes(), op_node.output("Output")[0])
+ attrs = {
+ name: op_node.op().attr(name)
+ for name in op_node.op().attr_names()
+ }
+
+ conv_op_node = graph.create_op_node(
+ op_type='conv2d',
+ attrs=attrs,
+ inputs={
+ 'Input': input_var_node,
+ 'Filter': weight_var_node
+ },
+ outputs={'Output': output_var_node})
+
+ graph.link_to(input_var_node, conv_op_node)
+ graph.link_to(weight_var_node, conv_op_node)
+ graph.link_to(conv_op_node, output_var_node)
+ graph.safe_remove_nodes(op_node)
+
+ return graph
+
+ def _predict(self,
+ test_reader=None,
+ model_path=None,
+ with_accuracy_layer=False,
+ batch_size=1,
+ batch_num=1,
+ skip_batch_num=0):
+ place = fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ inference_scope = fluid.executor.global_scope()
+ with fluid.scope_guard(inference_scope):
+ if os.path.exists(os.path.join(model_path, '__model__')):
+ [inference_program, feed_target_names,
+ fetch_targets] = fluid.io.load_inference_model(model_path, exe)
+ else:
+ [inference_program, feed_target_names,
+ fetch_targets] = fluid.io.load_inference_model(
+ model_path, exe, 'model', 'params')
+
+ graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
+
+ graph = self._prepare_for_fp32_mkldnn(graph)
+
+ inference_program = graph.to_program()
+
+ dshape = [3, 224, 224]
+ outputs = []
+ infer_accs1 = []
+ infer_accs5 = []
+ batch_acc1 = 0.0
+ batch_acc5 = 0.0
+ fpses = []
+ batch_times = []
+ batch_time = 0.0
+ total_samples = 0
+ iters = 0
+ infer_start_time = time.time()
+ for data in test_reader():
+ if batch_num > 0 and iters >= batch_num:
+ break
+ if iters == skip_batch_num:
+ total_samples = 0
+ infer_start_time = time.time()
+ if six.PY2:
+ images = map(lambda x: x[0].reshape(dshape), data)
+ if six.PY3:
+ images = list(map(lambda x: x[0].reshape(dshape), data))
+ images = np.array(images).astype('float32')
+ labels = np.array([x[1] for x in data]).astype('int64')
+
+ if not with_accuracy_layer:
+ # models that do not have accuracy measuring layers
+ start = time.time()
+ out = exe.run(inference_program,
+ feed={feed_target_names[0]: images},
+ fetch_list=fetch_targets)
+ batch_time = (time.time() - start) * 1000 # in milliseconds
+ outputs.append(out[0])
+ # Calculate accuracy result
+ batch_acc1, batch_acc5 = self._get_batch_accuracy(out[0],
+ labels)
+ else:
+ # models have accuracy measuring layers
+ labels = labels.reshape([-1, 1])
+ start = time.time()
+ out = exe.run(inference_program,
+ feed={
+ feed_target_names[0]: images,
+ feed_target_names[1]: labels
+ },
+ fetch_list=fetch_targets)
+ batch_time = (time.time() - start) * 1000 # in milliseconds
+ batch_acc1, batch_acc5 = out[1][0], out[2][0]
+ outputs.append(batch_acc1)
+ infer_accs1.append(batch_acc1)
+ infer_accs5.append(batch_acc5)
+ samples = len(data)
+ total_samples += samples
+ batch_times.append(batch_time)
+ fps = samples / batch_time * 1000
+ fpses.append(fps)
+ iters += 1
+ appx = ' (warm-up)' if iters <= skip_batch_num else ''
+ _logger.info('batch {0}{5}, acc1: {1:.4f}, acc5: {2:.4f}, '
+ 'latency: {3:.4f} ms, fps: {4:.2f}'.format(
+ iters, batch_acc1, batch_acc5, batch_time /
+ batch_size, fps, appx))
+
+ # Postprocess benchmark data
+ batch_latencies = batch_times[skip_batch_num:]
+ batch_latency_avg = np.average(batch_latencies)
+ latency_avg = batch_latency_avg / batch_size
+ fpses = fpses[skip_batch_num:]
+ fps_avg = np.average(fpses)
+ infer_total_time = time.time() - infer_start_time
+ acc1_avg = np.mean(infer_accs1)
+ acc5_avg = np.mean(infer_accs5)
+ _logger.info('Total inference run time: {:.2f} s'.format(
+ infer_total_time))
+
+ return outputs, acc1_avg, acc5_avg, fps_avg, latency_avg
+
+ def test_graph_transformation(self):
+ if not fluid.core.is_compiled_with_mkldnn():
+ return
+
+ infer_model_path = test_case_args.infer_model
+ assert infer_model_path, 'The model path cannot be empty. Please, use the --infer_model option.'
+ data_path = test_case_args.infer_data
+ assert data_path, 'The dataset path cannot be empty. Please, use the --infer_data option.'
+ batch_size = test_case_args.batch_size
+ batch_num = test_case_args.batch_num
+ skip_batch_num = test_case_args.skip_batch_num
+ with_accuracy_layer = test_case_args.with_accuracy_layer
+
+ _logger.info('Inference model: {0}'.format(infer_model_path))
+ _logger.info('Dataset: {0}'.format(data_path))
+ _logger.info('Batch size: {0}'.format(batch_size))
+ _logger.info('Batch number: {0}'.format(batch_num))
+
+ _logger.info('--- Inference prediction start ---')
+ val_reader = paddle.batch(
+ self._reader_creator(data_path), batch_size=batch_size)
+ fp32_output, fp32_acc1, fp32_acc5, fp32_fps, fp32_lat = self._predict(
+ val_reader, infer_model_path, with_accuracy_layer, batch_size,
+ batch_num, skip_batch_num)
+ _logger.info(
+ 'Inference: avg top1 accuracy: {0:.4f}, avg top5 accuracy: {1:.4f}'.
+ format(fp32_acc1, fp32_acc5))
+ _logger.info('Inference: avg fps: {0:.2f}, avg latency: {1:.4f} ms'.
+ format(fp32_fps, fp32_lat))
+
+
+if __name__ == '__main__':
+ global test_case_args
+ test_case_args, remaining_args = parse_args()
+ unittest.main(argv=remaining_args)
diff --git a/demo/models/__init__.py b/demo/models/__init__.py
index e843697407850c049a5427d2b6533c417e59c228..b6771d7086bb150742c4a7198f2224f63d603e8e 100644
--- a/demo/models/__init__.py
+++ b/demo/models/__init__.py
@@ -1,5 +1,19 @@
+from __future__ import absolute_import
from .mobilenet import MobileNet
from .resnet import ResNet34, ResNet50
-from .mobilenet_v2 import MobileNetV2
+from .resnet_vd import ResNet50_vd, ResNet101_vd
+from .mobilenet_v2 import MobileNetV2_x0_25, MobileNetV2
+from .pvanet import PVANet
+from .slimfacenet import SlimFaceNet_A_x0_60, SlimFaceNet_B_x0_75, SlimFaceNet_C_x0_75
+from .mobilenet_v3 import *
+__all__ = [
+ "model_list", "MobileNet", "ResNet34", "ResNet50", "MobileNetV2", "PVANet",
+ "ResNet50_vd", "ResNet101_vd", "MobileNetV2_x0_25"
+]
+model_list = [
+ 'MobileNet', 'ResNet34', 'ResNet50', 'MobileNetV2', 'PVANet',
+ 'ResNet50_vd', "ResNet101_vd", "MobileNetV2_x0_25"
+]
-__all__ = ['MobileNet', 'ResNet34', 'ResNet50', 'MobileNetV2']
+__all__ += mobilenet_v3.__all__
+model_list += mobilenet_v3.__all__
diff --git a/demo/models/mobilenet.py b/demo/models/mobilenet.py
index 921d6226ca2a65d5c9b57e27bf6607c7376c51f6..9ae095e6bd54209a8144d73bdee07a45470855ce 100644
--- a/demo/models/mobilenet.py
+++ b/demo/models/mobilenet.py
@@ -127,13 +127,14 @@ class MobileNet():
pool_stride=1,
pool_type='avg',
global_pooling=True)
-
- output = fluid.layers.fc(input=input,
- size=class_dim,
- act='softmax',
- param_attr=ParamAttr(
- initializer=MSRA(), name="fc7_weights"),
- bias_attr=ParamAttr(name="fc7_offset"))
+ with fluid.name_scope('last_fc'):
+ output = fluid.layers.fc(input=input,
+ size=class_dim,
+ act='softmax',
+ param_attr=ParamAttr(
+ initializer=MSRA(),
+ name="fc7_weights"),
+ bias_attr=ParamAttr(name="fc7_offset"))
return output
diff --git a/demo/models/mobilenet_v3.py b/demo/models/mobilenet_v3.py
new file mode 100644
index 0000000000000000000000000000000000000000..3276b352744a199ee858d193cb46e1b5ce36bca7
--- /dev/null
+++ b/demo/models/mobilenet_v3.py
@@ -0,0 +1,303 @@
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+import math
+
+__all__ = [
+ 'MobileNetV3', 'MobileNetV3_small_x0_25', 'MobileNetV3_small_x0_5',
+ 'MobileNetV3_small_x0_75', 'MobileNetV3_small_x1_0',
+ 'MobileNetV3_small_x1_25', 'MobileNetV3_large_x0_25',
+ 'MobileNetV3_large_x0_5', 'MobileNetV3_large_x0_75',
+ 'MobileNetV3_large_x1_0', 'MobileNetV3_large_x1_25',
+ 'MobileNetV3_large_x2_0'
+]
+
+
+class MobileNetV3():
+ def __init__(self, scale=1.0, model_name='small'):
+ self.scale = scale
+ self.inplanes = 16
+ if model_name == "large":
+ self.cfg = [
+ # k, exp, c, se, nl, s,
+ [3, 16, 16, False, 'relu', 1],
+ [3, 64, 24, False, 'relu', 2],
+ [3, 72, 24, False, 'relu', 1],
+ [5, 72, 40, True, 'relu', 2],
+ [5, 120, 40, True, 'relu', 1],
+ [5, 120, 40, True, 'relu', 1],
+ [3, 240, 80, False, 'hard_swish', 2],
+ [3, 200, 80, False, 'hard_swish', 1],
+ [3, 184, 80, False, 'hard_swish', 1],
+ [3, 184, 80, False, 'hard_swish', 1],
+ [3, 480, 112, True, 'hard_swish', 1],
+ [3, 672, 112, True, 'hard_swish', 1],
+ [5, 672, 160, True, 'hard_swish', 2],
+ [5, 960, 160, True, 'hard_swish', 1],
+ [5, 960, 160, True, 'hard_swish', 1],
+ ]
+ self.cls_ch_squeeze = 960
+ self.cls_ch_expand = 1280
+ elif model_name == "small":
+ self.cfg = [
+ # k, exp, c, se, nl, s,
+ [3, 16, 16, True, 'relu', 2],
+ [3, 72, 24, False, 'relu', 2],
+ [3, 88, 24, False, 'relu', 1],
+ [5, 96, 40, True, 'hard_swish', 2],
+ [5, 240, 40, True, 'hard_swish', 1],
+ [5, 240, 40, True, 'hard_swish', 1],
+ [5, 120, 48, True, 'hard_swish', 1],
+ [5, 144, 48, True, 'hard_swish', 1],
+ [5, 288, 96, True, 'hard_swish', 2],
+ [5, 576, 96, True, 'hard_swish', 1],
+ [5, 576, 96, True, 'hard_swish', 1],
+ ]
+ self.cls_ch_squeeze = 576
+ self.cls_ch_expand = 1280
+ else:
+ raise NotImplementedError
+
+ def net(self, input, class_dim=1000):
+ scale = self.scale
+ inplanes = self.inplanes
+ cfg = self.cfg
+ cls_ch_squeeze = self.cls_ch_squeeze
+ cls_ch_expand = self.cls_ch_expand
+
+ #conv1
+ conv = self.conv_bn_layer(
+ input,
+ filter_size=3,
+ num_filters=inplanes if scale <= 1.0 else int(inplanes * scale),
+ stride=2,
+ padding=1,
+ num_groups=1,
+ if_act=True,
+ act='hard_swish',
+ name='conv1')
+ for i, layer_cfg in enumerate(cfg):
+ conv = self.residual_unit(
+ input=conv,
+ num_in_filter=inplanes,
+ num_mid_filter=int(scale * layer_cfg[1]),
+ num_out_filter=int(scale * layer_cfg[2]),
+ act=layer_cfg[4],
+ stride=layer_cfg[5],
+ filter_size=layer_cfg[0],
+ use_se=layer_cfg[3],
+ name='conv' + str(i + 2))
+
+ inplanes = int(scale * layer_cfg[2])
+
+ conv = self.conv_bn_layer(
+ input=conv,
+ filter_size=1,
+ num_filters=int(scale * cls_ch_squeeze),
+ stride=1,
+ padding=0,
+ num_groups=1,
+ if_act=True,
+ act='hard_swish',
+ name='conv_last')
+ conv = fluid.layers.pool2d(
+ input=conv, pool_type='avg', global_pooling=True, use_cudnn=False)
+ conv = fluid.layers.conv2d(
+ input=conv,
+ num_filters=cls_ch_expand,
+ filter_size=1,
+ stride=1,
+ padding=0,
+ act=None,
+ param_attr=ParamAttr(name='last_1x1_conv_weights'),
+ bias_attr=False)
+ conv = self.hard_swish(conv)
+ out = fluid.layers.fc(input=conv,
+ size=class_dim,
+ act='softmax',
+ param_attr=ParamAttr(name='fc_weights'),
+ bias_attr=ParamAttr(name='fc_offset'))
+ return out
+
+ def conv_bn_layer(self,
+ input,
+ filter_size,
+ num_filters,
+ stride,
+ padding,
+ num_groups=1,
+ if_act=True,
+ act=None,
+ name=None,
+ use_cudnn=True):
+ conv = fluid.layers.conv2d(
+ input=input,
+ num_filters=num_filters,
+ filter_size=filter_size,
+ stride=stride,
+ padding=padding,
+ groups=num_groups,
+ act=None,
+ use_cudnn=use_cudnn,
+ param_attr=ParamAttr(name=name + '_weights'),
+ bias_attr=False)
+ bn_name = name + '_bn'
+ bn = fluid.layers.batch_norm(
+ input=conv,
+ param_attr=ParamAttr(
+ name=bn_name + "_scale",
+ regularizer=fluid.regularizer.L2DecayRegularizer(
+ regularization_coeff=0.0)),
+ bias_attr=ParamAttr(
+ name=bn_name + "_offset",
+ regularizer=fluid.regularizer.L2DecayRegularizer(
+ regularization_coeff=0.0)),
+ moving_mean_name=bn_name + '_mean',
+ moving_variance_name=bn_name + '_variance')
+ if if_act:
+ if act == 'relu':
+ bn = fluid.layers.relu(bn)
+ elif act == 'hard_swish':
+ bn = self.hard_swish(bn)
+ return bn
+
+ def hard_swish(self, x):
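+ # hard_swish(x) = x * ReLU6(x + 3) / 6, the piecewise-linear
+ # approximation of swish used throughout MobileNetV3.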
+ return x * fluid.layers.relu6(x + 3) / 6.
+
+ def se_block(self, input, num_out_filter, ratio=4, name=None):
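+ # Squeeze-and-excitation: global-average-pool to a channel descriptor,
+ # squeeze it by `ratio`, expand back, and rescale the input channels
+ # with the resulting hard-sigmoid gates.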
+ num_mid_filter = int(num_out_filter // ratio)
+ pool = fluid.layers.pool2d(
+ input=input, pool_type='avg', global_pooling=True, use_cudnn=False)
+ conv1 = fluid.layers.conv2d(
+ input=pool,
+ filter_size=1,
+ num_filters=num_mid_filter,
+ act='relu',
+ param_attr=ParamAttr(name=name + '_1_weights'),
+ bias_attr=ParamAttr(name=name + '_1_offset'))
+ conv2 = fluid.layers.conv2d(
+ input=conv1,
+ filter_size=1,
+ num_filters=num_out_filter,
+ act='hard_sigmoid',
+ param_attr=ParamAttr(name=name + '_2_weights'),
+ bias_attr=ParamAttr(name=name + '_2_offset'))
+
+ scale = fluid.layers.elementwise_mul(x=input, y=conv2, axis=0)
+ return scale
+
+ def residual_unit(self,
+ input,
+ num_in_filter,
+ num_mid_filter,
+ num_out_filter,
+ stride,
+ filter_size,
+ act=None,
+ use_se=False,
+ name=None):
+
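+ # Inverted residual block: 1x1 expansion -> depthwise conv ->
+ # optional SE -> 1x1 linear projection, with an identity shortcut
+ # when the input and output shapes match.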
+ input_data = input
+ conv0 = self.conv_bn_layer(
+ input=input,
+ filter_size=1,
+ num_filters=num_mid_filter,
+ stride=1,
+ padding=0,
+ if_act=True,
+ act=act,
+ name=name + '_expand')
+
+ conv1 = self.conv_bn_layer(
+ input=conv0,
+ filter_size=filter_size,
+ num_filters=num_mid_filter,
+ stride=stride,
+ padding=int((filter_size - 1) // 2),
+ if_act=True,
+ act=act,
+ num_groups=num_mid_filter,
+ use_cudnn=False,
+ name=name + '_depthwise')
+
+ if use_se:
+ with fluid.name_scope('se_block_skip'):
+ conv1 = self.se_block(
+ input=conv1,
+ num_out_filter=num_mid_filter,
+ name=name + '_se')
+
+ conv2 = self.conv_bn_layer(
+ input=conv1,
+ filter_size=1,
+ num_filters=num_out_filter,
+ stride=1,
+ padding=0,
+ if_act=False,
+ name=name + '_linear')
+ if num_in_filter != num_out_filter or stride != 1:
+ return conv2
+ else:
+ return fluid.layers.elementwise_add(
+ x=input_data, y=conv2, act=None)
+
+
+def MobileNetV3_small_x0_25():
+ model = MobileNetV3(model_name='small', scale=0.25)
+ return model
+
+
+def MobileNetV3_small_x0_5():
+ model = MobileNetV3(model_name='small', scale=0.5)
+ return model
+
+
+def MobileNetV3_small_x0_75():
+ model = MobileNetV3(model_name='small', scale=0.75)
+ return model
+
+
+def MobileNetV3_small_x1_0():
+ model = MobileNetV3(model_name='small', scale=1.0)
+ return model
+
+
+def MobileNetV3_small_x1_25():
+ model = MobileNetV3(model_name='small', scale=1.25)
+ return model
+
+
+def MobileNetV3_large_x0_25():
+ model = MobileNetV3(model_name='large', scale=0.25)
+ return model
+
+
+def MobileNetV3_large_x0_5():
+ model = MobileNetV3(model_name='large', scale=0.5)
+ return model
+
+
+def MobileNetV3_large_x0_75():
+ model = MobileNetV3(model_name='large', scale=0.75)
+ return model
+
+
+def MobileNetV3_large_x1_0():
+ model = MobileNetV3(model_name='large', scale=1.0)
+ return model
+
+
+def MobileNetV3_large_x1_25():
+ model = MobileNetV3(model_name='large', scale=1.25)
+ return model
+
+
+def MobileNetV3_large_x2_0():
+ model = MobileNetV3(model_name='large', scale=2.0)
+ return model
diff --git a/demo/models/pvanet.py b/demo/models/pvanet.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f5024c94f33424b7a55474431cbf48d68133093
--- /dev/null
+++ b/demo/models/pvanet.py
@@ -0,0 +1,505 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.initializer import MSRA
+import os, sys, time, math
+import numpy as np
+from collections import namedtuple
+
+BLOCK_TYPE_MCRELU = 'BLOCK_TYPE_MCRELU'
+BLOCK_TYPE_INCEP = 'BLOCK_TYPE_INCEP'
+BlockConfig = namedtuple('BlockConfig',
+ 'stride, num_outputs, preact_bn, block_type')
+
+__all__ = ['PVANet']
+
+
+class PVANet():
+ def __init__(self):
+ pass
+
+ def net(self, input, include_last_bn_relu=True, class_dim=1000):
+ conv1 = self._conv_bn_crelu(input, 16, 7, stride=2, name="conv1_1")
+ pool1 = fluid.layers.pool2d(
+ input=conv1,
+ pool_size=3,
+ pool_stride=2,
+ pool_padding=1,
+ pool_type='max',
+ name='pool1')
+
+ end_points = {}
+ conv2 = self._conv_stage(
+ pool1,
+ block_configs=[
+ BlockConfig(1, (24, 24, 48), False, BLOCK_TYPE_MCRELU),
+ BlockConfig(1, (24, 24, 48), True, BLOCK_TYPE_MCRELU),
+ BlockConfig(1, (24, 24, 48), True, BLOCK_TYPE_MCRELU)
+ ],
+ name='conv2',
+ end_points=end_points)
+
+ conv3 = self._conv_stage(
+ conv2,
+ block_configs=[
+ BlockConfig(2, (48, 48, 96), True, BLOCK_TYPE_MCRELU),
+ BlockConfig(1, (48, 48, 96), True, BLOCK_TYPE_MCRELU),
+ BlockConfig(1, (48, 48, 96), True, BLOCK_TYPE_MCRELU),
+ BlockConfig(1, (48, 48, 96), True, BLOCK_TYPE_MCRELU)
+ ],
+ name='conv3',
+ end_points=end_points)
+
+ conv4 = self._conv_stage(
+ conv3,
+ block_configs=[
+ BlockConfig(2, '64 48-96 24-48-48 96 128', True,
+ BLOCK_TYPE_INCEP),
+ BlockConfig(1, '64 64-96 24-48-48 128', True,
+ BLOCK_TYPE_INCEP),
+ BlockConfig(1, '64 64-96 24-48-48 128', True,
+ BLOCK_TYPE_INCEP),
+ BlockConfig(1, '64 64-96 24-48-48 128', True, BLOCK_TYPE_INCEP)
+ ],
+ name='conv4',
+ end_points=end_points)
+
+ conv5 = self._conv_stage(
+ conv4,
+ block_configs=[
+ BlockConfig(2, '64 96-128 32-64-64 128 196', True,
+ BLOCK_TYPE_INCEP),
+ BlockConfig(1, '64 96-128 32-64-64 196', True,
+ BLOCK_TYPE_INCEP),
+ BlockConfig(1, '64 96-128 32-64-64 196', True,
+ BLOCK_TYPE_INCEP), BlockConfig(
+ 1, '64 96-128 32-64-64 196', True,
+ BLOCK_TYPE_INCEP)
+ ],
+ name='conv5',
+ end_points=end_points)
+
+ if include_last_bn_relu:
+ conv5 = self._bn(conv5, 'relu', 'conv5_4_last_bn')
+ end_points['conv5'] = conv5
+ # expose the collected feature maps so detection heads (e.g. east)
+ # can reuse the backbone stages
+ self.end_points = end_points
+
+ output = fluid.layers.fc(input=conv5,
+ size=class_dim,
+ act='softmax',
+ param_attr=ParamAttr(
+ initializer=MSRA(), name="fc_weights"),
+ bias_attr=ParamAttr(name="fc_offset"))
+
+ return output
+
+ def _conv_stage(self, input, block_configs, name, end_points):
+ net = input
+ for idx, bc in enumerate(block_configs):
+ if bc.block_type == BLOCK_TYPE_MCRELU:
+ block_scope = '{}_{}'.format(name, idx + 1)
+ fn = self._mCReLU
+ elif bc.block_type == BLOCK_TYPE_INCEP:
+ block_scope = '{}_{}_incep'.format(name, idx + 1)
+ fn = self._inception_block
+ net = fn(net, bc, block_scope)
+ end_points[block_scope] = net
+ end_points[name] = net
+ return net
+
+ def _mCReLU(self, input, mc_config, name):
+ """
+ every cReLU has at least three conv steps:
+ conv_bn_relu, conv_bn_crelu, conv_bn_relu
+ if the inputs has a different number of channels as crelu output,
+ an extra 1x1 conv is added before sum.
+ """
+ if mc_config.preact_bn:
+ conv1_fn = self._bn_relu_conv
+ conv1_scope = name + '_1'
+ else:
+ conv1_fn = self._conv
+ conv1_scope = name + '_1_conv'
+
+ sub_conv1 = conv1_fn(input, mc_config.num_outputs[0], 1, conv1_scope,
+ mc_config.stride)
+
+ sub_conv2 = self._bn_relu_conv(sub_conv1, mc_config.num_outputs[1], 3,
+ name + '_2')
+
+ sub_conv3 = self._bn_crelu_conv(sub_conv2, mc_config.num_outputs[2], 1,
+ name + '_3')
+
+ if int(input.shape[1]) == mc_config.num_outputs[2]:
+ conv_proj = input
+ else:
+ conv_proj = self._conv(input, mc_config.num_outputs[2], 1,
+ name + '_proj', mc_config.stride)
+
+ conv = sub_conv3 + conv_proj
+ return conv
+
+ def _inception_block(self, input, block_config, name):
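+ # The config string encodes the inception paths, e.g.
+ # '64 48-96 24-48-48 96 128': space-separated conv paths with
+ # hyphen-separated channels for consecutive convs in one path; the
+ # last number sizes the 1x1 output conv and, when stride > 1, the
+ # one before it sizes the extra pooling path.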
+ num_outputs = block_config.num_outputs.split() # e.g. 64 24-48-48 128
+ num_outputs = [list(map(int, s.split('-'))) for s in num_outputs]
+ inception_outputs = num_outputs[-1][0]
+ num_outputs = num_outputs[:-1]
+ stride = block_config.stride
+ pool_path_outputs = None
+ if stride > 1:
+ pool_path_outputs = num_outputs[-1][0]
+ num_outputs = num_outputs[:-1]
+
+ scopes = [['_0']] # follow the name style of caffe pva
+ kernel_sizes = [[1]]
+ for path_idx, path_outputs in enumerate(num_outputs[1:], start=1):
+ path_scopes = ['_{}_reduce'.format(path_idx)]
+ path_scopes.extend([
+ '_{}_{}'.format(path_idx, i - 1)
+ for i in range(1, len(path_outputs))
+ ])
+ scopes.append(path_scopes)
+
+ path_kernel_sizes = [1, 3, 3][:len(path_outputs)]
+ kernel_sizes.append(path_kernel_sizes)
+
+ paths = []
+ if block_config.preact_bn:
+ preact = self._bn(input, 'relu', name + '_bn')
+ else:
+ preact = input
+
+ path_params = zip(num_outputs, scopes, kernel_sizes)
+ for path_idx, path_param in enumerate(path_params):
+ path_net = preact
+ for conv_idx, (num_output, scope,
+ kernel_size) in enumerate(zip(*path_param)):
+ if conv_idx == 0:
+ conv_stride = stride
+ else:
+ conv_stride = 1
+ path_net = self._conv_bn_relu(path_net, num_output,
+ kernel_size, name + scope,
+ conv_stride)
+ paths.append(path_net)
+
+ if stride > 1:
+ path_net = fluid.layers.pool2d(
+ input,
+ pool_size=3,
+ pool_stride=2,
+ pool_padding=1,
+ pool_type='max',
+ name=name + '_pool')
+ path_net = self._conv_bn_relu(path_net, pool_path_outputs, 1,
+ name + '_poolproj')
+ paths.append(path_net)
+ block_net = fluid.layers.concat(paths, axis=1)
+ block_net = self._conv(block_net, inception_outputs, 1,
+ name + '_out_conv')
+
+ if int(input.shape[1]) == inception_outputs:
+ proj = input
+ else:
+ proj = self._conv(input, inception_outputs, 1, name + '_proj',
+ stride)
+ return block_net + proj
+
+ def _scale(self, input, name, axis=1, num_axes=1):
+ assert num_axes == 1, "the scale layer does not support num_axes=%d yet" % (
+ num_axes)
+
+ prefix = name + '_'
+ scale_shape = input.shape[axis:axis + num_axes]
+ param_attr = fluid.ParamAttr(name=prefix + 'gamma')
+ scale_param = fluid.layers.create_parameter(
+ shape=scale_shape,
+ dtype=input.dtype,
+ name=name,
+ attr=param_attr,
+ is_bias=True,
+ default_initializer=fluid.initializer.Constant(value=1.0))
+
+ offset_attr = fluid.ParamAttr(name=prefix + 'beta')
+ offset_param = fluid.layers.create_parameter(
+ shape=scale_shape,
+ dtype=input.dtype,
+ name=name,
+ attr=offset_attr,
+ is_bias=True,
+ default_initializer=fluid.initializer.Constant(value=0.0))
+
+ output = fluid.layers.elementwise_mul(
+ input, scale_param, axis=axis, name=prefix + 'mul')
+ output = fluid.layers.elementwise_add(
+ output, offset_param, axis=axis, name=prefix + 'add')
+ return output
+
+ def _conv(self,
+ input,
+ num_filters,
+ filter_size,
+ name,
+ stride=1,
+ groups=1,
+ act=None):
+ net = fluid.layers.conv2d(
+ input=input,
+ num_filters=num_filters,
+ filter_size=filter_size,
+ stride=stride,
+ padding=(filter_size - 1) // 2,
+ groups=groups,
+ act=act,
+ use_cudnn=True,
+ param_attr=ParamAttr(name=name + '_weights'),
+ bias_attr=ParamAttr(name=name + '_bias'),
+ name=name)
+ return net
+
+ def _bn(self, input, act, name):
+ net = fluid.layers.batch_norm(
+ input=input,
+ act=act,
+ name=name,
+ moving_mean_name=name + '_mean',
+ moving_variance_name=name + '_variance',
+ param_attr=ParamAttr(name=name + '_scale'),
+ bias_attr=ParamAttr(name=name + '_offset'))
+ return net
+
+ def _bn_relu_conv(self,
+ input,
+ num_filters,
+ filter_size,
+ name,
+ stride=1,
+ groups=1):
+
+ net = self._bn(input, 'relu', name + '_bn')
+ net = self._conv(net, num_filters, filter_size, name + '_conv', stride,
+ groups)
+ return net
+
+ def _conv_bn_relu(self,
+ input,
+ num_filters,
+ filter_size,
+ name,
+ stride=1,
+ groups=1):
+ net = self._conv(input, num_filters, filter_size, name + '_conv',
+ stride, groups)
+ net = self._bn(net, 'relu', name + '_bn')
+ return net
+
+ def _bn_crelu(self, input, name):
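+ # CReLU: concatenate x and -x along the channel axis, apply a learned
+ # per-channel scale and shift, then ReLU, so both signs of the
+ # pre-activation survive at half the conv cost.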
+ net = self._bn(input, None, name + '_bn_1')
+ neg_net = fluid.layers.scale(net, scale=-1.0, name=name + '_neg')
+ net = fluid.layers.concat([net, neg_net], axis=1)
+ net = self._scale(net, name + '_scale')
+ net = fluid.layers.relu(net, name=name + '_relu')
+ return net
+
+ def _conv_bn_crelu(self,
+ input,
+ num_filters,
+ filter_size,
+ name,
+ stride=1,
+ groups=1,
+ act=None):
+ net = self._conv(input, num_filters, filter_size, name + '_conv',
+ stride, groups)
+ net = self._bn_crelu(net, name)
+ return net
+
+ def _bn_crelu_conv(self,
+ input,
+ num_filters,
+ filter_size,
+ name,
+ stride=1,
+ groups=1,
+ act=None):
+ net = self._bn_crelu(input, name)
+ net = self._conv(net, num_filters, filter_size, name + '_conv', stride,
+ groups)
+ return net
+
+ def deconv_bn_layer(self,
+ input,
+ num_filters,
+ filter_size=4,
+ stride=2,
+ padding=1,
+ act='relu',
+ name=None):
+ """Deconv bn layer."""
+ deconv = fluid.layers.conv2d_transpose(
+ input=input,
+ num_filters=num_filters,
+ filter_size=filter_size,
+ stride=stride,
+ padding=padding,
+ act=None,
+ param_attr=ParamAttr(name=name + '_weights'),
+ bias_attr=ParamAttr(name=name + '_bias'),
+ name=name + 'deconv')
+ return self._bn(deconv, act, name + '_bn')
+
+ def conv_bn_layer(self,
+ input,
+ num_filters,
+ filter_size,
+ name,
+ stride=1,
+ groups=1):
+ return self._conv_bn_relu(input, num_filters, filter_size, name,
+ stride, groups)
+
+
+def Fpn_Fusion(blocks, net):
+ f = [blocks['conv5'], blocks['conv4'], blocks['conv3'], blocks['conv2']]
+ num_outputs = [64] * len(f)
+ g = [None] * len(f)
+ h = [None] * len(f)
+ for i in range(len(f)):
+ h[i] = net.conv_bn_layer(f[i], num_outputs[i], 1, 'fpn_pre_' + str(i))
+
+ for i in range(len(f) - 1):
+ if i == 0:
+ g[i] = net.deconv_bn_layer(h[i], num_outputs[i], name='fpn_0')
+ else:
+ out = fluid.layers.elementwise_add(x=g[i - 1], y=h[i])
+ out = net.conv_bn_layer(out, num_outputs[i], 1,
+ 'fpn_trans_' + str(i))
+ g[i] = net.deconv_bn_layer(
+ out, num_outputs[i], name='fpn_' + str(i))
+
+ out = fluid.layers.elementwise_add(x=g[-2], y=h[-1])
+ out = net.conv_bn_layer(out, num_outputs[-1], 1, 'fpn_post_0')
+ out = net.conv_bn_layer(out, num_outputs[-1], 3, 'fpn_post_1')
+
+ return out
+
+
+def Detector_Header(f_common, net, class_num):
+ """Detector header."""
+ f_geo = net.conv_bn_layer(f_common, 64, 1, name='geo_1')
+ f_geo = net.conv_bn_layer(f_geo, 64, 3, name='geo_2')
+ f_geo = net.conv_bn_layer(f_geo, 64, 1, name='geo_3')
+ f_geo = fluid.layers.conv2d(
+ f_geo,
+ 8,
+ 1,
+ use_cudnn=True,
+ param_attr=ParamAttr(name='geo_4_conv_weights'),
+ bias_attr=ParamAttr(name='geo_4_conv_bias'),
+ name='geo_4_conv')
+
+ name = 'score_class_num' + str(class_num + 1)
+ f_score = net.conv_bn_layer(f_common, 64, 1, 'score_1')
+ f_score = net.conv_bn_layer(f_score, 64, 3, 'score_2')
+ f_score = net.conv_bn_layer(f_score, 64, 1, 'score_3')
+ f_score = fluid.layers.conv2d(
+ f_score,
+ class_num + 1,
+ 1,
+ use_cudnn=True,
+ param_attr=ParamAttr(name=name + '_conv_weights'),
+ bias_attr=ParamAttr(name=name + '_conv_bias'),
+ name=name + '_conv')
+
+ f_score = fluid.layers.transpose(f_score, perm=[0, 2, 3, 1])
+ f_score = fluid.layers.reshape(f_score, shape=[-1, class_num + 1])
+ f_score = fluid.layers.softmax(input=f_score)
+
+ return f_score, f_geo
+
+
+def east(input, class_num=31):
+ net = PVANet()
+ net.net(input)
+ out = net.end_points # stage outputs keyed by 'conv2'..'conv5'
+ blocks = []
+ for i, j, k in zip(['conv2', 'conv3', 'conv4', 'conv5'], [1, 2, 4, 8],
+ [64, 64, 64, 64]):
+ if j == 1:
+ conv = net.conv_bn_layer(
+ out[i], k, 1, name='fusion_' + str(len(blocks)))
+ elif j <= 4:
+ conv = net.deconv_bn_layer(
+ out[i], k, 2 * j, j, j // 2,
+ name='fusion_' + str(len(blocks)))
+ else:
+ conv = net.deconv_bn_layer(
+ out[i], 32, 8, 4, 2, name='fusion_' + str(len(blocks)) + '_1')
+ conv = net.deconv_bn_layer(
+ conv,
+ k,
+ j // 2,
+ j // 4,
+ j // 8,
+ name='fusion_' + str(len(blocks)) + '_2')
+ blocks.append(conv)
+ conv = fluid.layers.concat(blocks, axis=1)
+ f_score, f_geo = Detector_Header(conv, net, class_num)
+ return f_score, f_geo
+
+
+def inference(input, class_num=1, nms_thresh=0.2, score_thresh=0.5):
+ f_score, f_geo = east(input, class_num)
+ print("f_geo shape={}".format(f_geo.shape))
+ print("f_score shape={}".format(f_score.shape))
+ f_score = fluid.layers.transpose(f_score, perm=[1, 0])
+ return f_score, f_geo
+
+
+def loss(f_score, f_geo, l_score, l_geo, l_mask, class_num=1):
+ '''
+ predictions: f_score: -1 x 1 x H x W; f_geo: -1 x 8 x H x W
+ targets: l_score: -1 x 1 x H x W; l_geo: -1 x 1 x H x W; l_mask: -1 x 1 x H x W
+ return: dice_loss + smooth_l1_loss
+ '''
+ #smooth_l1_loss
+ channels = 8
+ l_geo_split, l_short_edge = fluid.layers.split(
+ l_geo, num_or_sections=[channels, 1],
+ dim=1) #last channel is short_edge_norm
+ f_geo_split = fluid.layers.split(f_geo, num_or_sections=[channels], dim=1)
+ f_geo_split = f_geo_split[0]
+
+ geo_diff = l_geo_split - f_geo_split
+ abs_geo_diff = fluid.layers.abs(geo_diff)
+ l_flag = l_score >= 1
+ l_flag = fluid.layers.cast(x=l_flag, dtype="float32")
+ l_flag = fluid.layers.expand(x=l_flag, expand_times=[1, channels, 1, 1])
+
+ smooth_l1_sign = abs_geo_diff < l_flag
+ smooth_l1_sign = fluid.layers.cast(x=smooth_l1_sign, dtype="float32")
+
+ in_loss = abs_geo_diff * abs_geo_diff * smooth_l1_sign + (
+ abs_geo_diff - 0.5) * (1.0 - smooth_l1_sign)
+ l_short_edge = fluid.layers.expand(
+ x=l_short_edge, expand_times=[1, channels, 1, 1])
+ out_loss = l_short_edge * in_loss * l_flag
+ out_loss = out_loss * l_flag
+ smooth_l1_loss = fluid.layers.reduce_mean(out_loss)
+
+ ##softmax_loss
+ l_score.stop_gradient = True
+ l_score = fluid.layers.transpose(l_score, perm=[0, 2, 3, 1])
+ l_score.stop_gradient = True
+ l_score = fluid.layers.reshape(l_score, shape=[-1, 1])
+ l_score.stop_gradient = True
+ l_score = fluid.layers.cast(x=l_score, dtype="int64")
+ l_score.stop_gradient = True
+
+ softmax_loss = fluid.layers.cross_entropy(input=f_score, label=l_score)
+ softmax_loss = fluid.layers.reduce_mean(softmax_loss)
+
+ return softmax_loss, smooth_l1_loss
diff --git a/demo/models/resnet_vd.py b/demo/models/resnet_vd.py
new file mode 100644
index 0000000000000000000000000000000000000000..c93606de906b57b9a0a1c7b8faa571a93784c915
--- /dev/null
+++ b/demo/models/resnet_vd.py
@@ -0,0 +1,291 @@
+#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.param_attr import ParamAttr
+
+__all__ = [
+ "ResNet", "ResNet18_vd", "ResNet34_vd", "ResNet50_vd", "ResNet101_vd",
+ "ResNet152_vd", "ResNet200_vd"
+]
+
+
+class ResNet():
+ def __init__(self, layers=50, is_3x3=False):
+ self.layers = layers
+ self.is_3x3 = is_3x3
+
+ def net(self, input, class_dim=1000):
+ is_3x3 = self.is_3x3
+ layers = self.layers
+ supported_layers = [18, 34, 50, 101, 152, 200]
+ assert layers in supported_layers, \
+ "supported layers are {} but input layer is {}".format(supported_layers, layers)
+
+ if layers == 18:
+ depth = [2, 2, 2, 2]
+ elif layers == 34 or layers == 50:
+ depth = [3, 4, 6, 3]
+ elif layers == 101:
+ depth = [3, 4, 23, 3]
+ elif layers == 152:
+ depth = [3, 8, 36, 3]
+ elif layers == 200:
+ depth = [3, 12, 48, 3]
+ num_filters = [64, 128, 256, 512]
+ if not is_3x3:
+ conv = self.conv_bn_layer(
+ input=input,
+ num_filters=64,
+ filter_size=7,
+ stride=2,
+ act='relu')
+ else:
+ conv = self.conv_bn_layer(
+ input=input,
+ num_filters=32,
+ filter_size=3,
+ stride=2,
+ act='relu',
+ name='conv1_1')
+ conv = self.conv_bn_layer(
+ input=conv,
+ num_filters=32,
+ filter_size=3,
+ stride=1,
+ act='relu',
+ name='conv1_2')
+ conv = self.conv_bn_layer(
+ input=conv,
+ num_filters=64,
+ filter_size=3,
+ stride=1,
+ act='relu',
+ name='conv1_3')
+
+ conv = fluid.layers.pool2d(
+ input=conv,
+ pool_size=3,
+ pool_stride=2,
+ pool_padding=1,
+ pool_type='max')
+
+ if layers >= 50:
+ for block in range(len(depth)):
+ for i in range(depth[block]):
+ if layers in [101, 152, 200] and block == 2:
+ if i == 0:
+ conv_name = "res" + str(block + 2) + "a"
+ else:
+ conv_name = "res" + str(block + 2) + "b" + str(i)
+ else:
+ conv_name = "res" + str(block + 2) + chr(97 + i)
+ conv = self.bottleneck_block(
+ input=conv,
+ num_filters=num_filters[block],
+ stride=2 if i == 0 and block != 0 else 1,
+ if_first=block == i == 0,
+ name=conv_name)
+ else:
+ for block in range(len(depth)):
+ for i in range(depth[block]):
+ conv_name = "res" + str(block + 2) + chr(97 + i)
+ conv = self.basic_block(
+ input=conv,
+ num_filters=num_filters[block],
+ stride=2 if i == 0 and block != 0 else 1,
+ if_first=block == i == 0,
+ name=conv_name)
+
+ pool = fluid.layers.pool2d(
+ input=conv, pool_type='avg', global_pooling=True)
+ stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
+
+ out = fluid.layers.fc(
+ input=pool,
+ size=class_dim,
+ param_attr=fluid.param_attr.ParamAttr(
+ initializer=fluid.initializer.Uniform(-stdv, stdv)))
+
+ return out
+
+ def conv_bn_layer(self,
+ input,
+ num_filters,
+ filter_size,
+ stride=1,
+ groups=1,
+ act=None,
+ name=None):
+ conv = fluid.layers.conv2d(
+ input=input,
+ num_filters=num_filters,
+ filter_size=filter_size,
+ stride=stride,
+ padding=(filter_size - 1) // 2,
+ groups=groups,
+ act=None,
+ param_attr=ParamAttr(name=name + "_weights"),
+ bias_attr=False)
+ if name == "conv1":
+ bn_name = "bn_" + name
+ else:
+ bn_name = "bn" + name[3:]
+ return fluid.layers.batch_norm(
+ input=conv,
+ act=act,
+ param_attr=ParamAttr(name=bn_name + '_scale'),
+ bias_attr=ParamAttr(bn_name + '_offset'),
+ moving_mean_name=bn_name + '_mean',
+ moving_variance_name=bn_name + '_variance')
+
+ def conv_bn_layer_new(self,
+ input,
+ num_filters,
+ filter_size,
+ stride=1,
+ groups=1,
+ act=None,
+ name=None):
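+ # ResNet-vd downsampling shortcut: a stride-2 average pool followed
+ # by a stride-1 conv, instead of a stride-2 1x1 conv that would
+ # discard three quarters of the activations.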
+ pool = fluid.layers.pool2d(
+ input=input,
+ pool_size=2,
+ pool_stride=2,
+ pool_padding=0,
+ pool_type='avg',
+ ceil_mode=True)
+
+ conv = fluid.layers.conv2d(
+ input=pool,
+ num_filters=num_filters,
+ filter_size=filter_size,
+ stride=1,
+ padding=(filter_size - 1) // 2,
+ groups=groups,
+ act=None,
+ param_attr=ParamAttr(name=name + "_weights"),
+ bias_attr=False)
+ if name == "conv1":
+ bn_name = "bn_" + name
+ else:
+ bn_name = "bn" + name[3:]
+ return fluid.layers.batch_norm(
+ input=conv,
+ act=act,
+ param_attr=ParamAttr(name=bn_name + '_scale'),
+ bias_attr=ParamAttr(bn_name + '_offset'),
+ moving_mean_name=bn_name + '_mean',
+ moving_variance_name=bn_name + '_variance')
+
+ def shortcut(self, input, ch_out, stride, name, if_first=False):
+ ch_in = input.shape[1]
+ if ch_in != ch_out or stride != 1:
+ if if_first:
+ return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
+ else:
+ return self.conv_bn_layer_new(
+ input, ch_out, 1, stride, name=name)
+ elif if_first:
+ return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
+ else:
+ return input
+
+ def bottleneck_block(self, input, num_filters, stride, name, if_first):
+ conv0 = self.conv_bn_layer(
+ input=input,
+ num_filters=num_filters,
+ filter_size=1,
+ act='relu',
+ name=name + "_branch2a")
+ conv1 = self.conv_bn_layer(
+ input=conv0,
+ num_filters=num_filters,
+ filter_size=3,
+ stride=stride,
+ act='relu',
+ name=name + "_branch2b")
+ conv2 = self.conv_bn_layer(
+ input=conv1,
+ num_filters=num_filters * 4,
+ filter_size=1,
+ act=None,
+ name=name + "_branch2c")
+
+ short = self.shortcut(
+ input,
+ num_filters * 4,
+ stride,
+ if_first=if_first,
+ name=name + "_branch1")
+
+ return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
+
+ def basic_block(self, input, num_filters, stride, name, if_first):
+ conv0 = self.conv_bn_layer(
+ input=input,
+ num_filters=num_filters,
+ filter_size=3,
+ act='relu',
+ stride=stride,
+ name=name + "_branch2a")
+ conv1 = self.conv_bn_layer(
+ input=conv0,
+ num_filters=num_filters,
+ filter_size=3,
+ act=None,
+ name=name + "_branch2b")
+ short = self.shortcut(
+ input,
+ num_filters,
+ stride,
+ if_first=if_first,
+ name=name + "_branch1")
+ return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
+
+
+def ResNet18_vd():
+ model = ResNet(layers=18, is_3x3=True)
+ return model
+
+
+def ResNet34_vd():
+ model = ResNet(layers=34, is_3x3=True)
+ return model
+
+
+def ResNet50_vd():
+ model = ResNet(layers=50, is_3x3=True)
+ return model
+
+
+def ResNet101_vd():
+ model = ResNet(layers=101, is_3x3=True)
+ return model
+
+
+def ResNet152_vd():
+ model = ResNet(layers=152, is_3x3=True)
+ return model
+
+
+def ResNet200_vd():
+ model = ResNet(layers=200, is_3x3=True)
+ return model
diff --git a/demo/models/slimfacenet.py b/demo/models/slimfacenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..33a4deab9340855398c9c9e7d0aa4386a6377030
--- /dev/null
+++ b/demo/models/slimfacenet.py
@@ -0,0 +1,373 @@
+# ================================================================
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import datetime
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+
+
+class SlimFaceNet():
+ def __init__(self, class_dim, scale=0.6, arch=None):
+
+ assert arch is not None
+ self.arch = arch
+ self.class_dim = class_dim
+ kernels = [3]
+ expansions = [2, 4, 6]
+ SE = [0, 1]
+ self.table = []
+ for k in kernels:
+ for e in expansions:
+ for se in SE:
+ self.table.append((k, e, se))
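+ # self.table enumerates the per-block search space: each entry is a
+ # (kernel_size, expansion, use_se) triple, and self.arch indexes into
+ # it to pick the configuration of every residual unit.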
+
+ if scale == 1.0:
+ # 100% - channel
+ self.Slimfacenet_bottleneck_setting = [
+ # t, c , n ,s
+ [2, 64, 5, 2],
+ [4, 128, 1, 2],
+ [2, 128, 6, 1],
+ [4, 128, 1, 2],
+ [2, 128, 2, 1]
+ ]
+ elif scale == 0.9:
+ # 90% - channel
+ self.Slimfacenet_bottleneck_setting = [
+ # t, c , n ,s
+ [2, 56, 5, 2],
+ [4, 116, 1, 2],
+ [2, 116, 6, 1],
+ [4, 116, 1, 2],
+ [2, 116, 2, 1]
+ ]
+ elif scale == 0.75:
+ # 75% - channel
+ self.Slimfacenet_bottleneck_setting = [
+ # t, c , n ,s
+ [2, 48, 5, 2],
+ [4, 96, 1, 2],
+ [2, 96, 6, 1],
+ [4, 96, 1, 2],
+ [2, 96, 2, 1]
+ ]
+ elif scale == 0.6:
+ # 60% - channel
+ self.Slimfacenet_bottleneck_setting = [
+ # t, c , n ,s
+ [2, 40, 5, 2],
+ [4, 76, 1, 2],
+ [2, 76, 6, 1],
+ [4, 76, 1, 2],
+ [2, 76, 2, 1]
+ ]
+ else:
+ raise ValueError('unsupported scale: {}'.format(scale))
+ self.extract_feature = True
+
+ def set_extract_feature_flag(self, flag):
+ self.extract_feature = flag
+
+ def net(self, input, label=None):
+ x = self.conv_bn_layer(
+ input,
+ filter_size=3,
+ num_filters=64,
+ stride=2,
+ padding=1,
+ num_groups=1,
+ if_act=True,
+ name='conv3x3')
+ x = self.conv_bn_layer(
+ x,
+ filter_size=3,
+ num_filters=64,
+ stride=1,
+ padding=1,
+ num_groups=64,
+ if_act=True,
+ name='dw_conv3x3')
+
+ in_c = 64
+ cnt = 0
+ for _exp, out_c, times, _stride in self.Slimfacenet_bottleneck_setting:
+ for i in range(times):
+ stride = _stride if i == 0 else 1
+ filter_size, exp, se = self.table[self.arch[cnt]]
+ se = False if se == 0 else True
+ x = self.residual_unit(
+ x,
+ num_in_filter=in_c,
+ num_out_filter=out_c,
+ stride=stride,
+ filter_size=filter_size,
+ expansion_factor=exp,
+ use_se=se,
+ name='residual_unit' + str(cnt + 1))
+ cnt += 1
+ in_c = out_c
+
+ out_c = 512
+ x = self.conv_bn_layer(
+ x,
+ filter_size=1,
+ num_filters=out_c,
+ stride=1,
+ padding=0,
+ num_groups=1,
+ if_act=True,
+ name='conv1x1')
+ x = self.conv_bn_layer(
+ x,
+ filter_size=(7, 6),
+ num_filters=out_c,
+ stride=1,
+ padding=0,
+ num_groups=out_c,
+ if_act=False,
+ name='global_dw_conv7x7')
+ x = fluid.layers.conv2d(
+ x,
+ num_filters=128,
+ filter_size=1,
+ stride=1,
+ padding=0,
+ groups=1,
+ act=None,
+ use_cudnn=True,
+ param_attr=ParamAttr(
+ name='linear_conv1x1_weights',
+ initializer=MSRA(),
+ regularizer=fluid.regularizer.L2Decay(4e-4)),
+ bias_attr=False)
+ bn_name = 'linear_conv1x1_bn'
+ x = fluid.layers.batch_norm(
+ x,
+ param_attr=ParamAttr(name=bn_name + "_scale"),
+ bias_attr=ParamAttr(name=bn_name + "_offset"),
+ moving_mean_name=bn_name + '_mean',
+ moving_variance_name=bn_name + '_variance')
+
+ x = fluid.layers.reshape(x, shape=[x.shape[0], x.shape[1]])
+
+ if self.extract_feature:
+ return x
+
+ out = self.arc_margin_product(
+ x, label, self.class_dim, s=32.0, m=0.50, mode=2)
+ softmax = fluid.layers.softmax(input=out)
+ cost = fluid.layers.cross_entropy(input=softmax, label=label)
+ loss = fluid.layers.mean(x=cost)
+ acc = fluid.layers.accuracy(input=out, label=label, k=1)
+ return loss, acc
+
+ def residual_unit(self,
+ input,
+ num_in_filter,
+ num_out_filter,
+ stride,
+ filter_size,
+ expansion_factor,
+ use_se=False,
+ name=None):
+
+ num_expfilter = int(round(num_in_filter * expansion_factor))
+ input_data = input
+
+ expand_conv = self.conv_bn_layer(
+ input=input,
+ filter_size=1,
+ num_filters=num_expfilter,
+ stride=1,
+ padding=0,
+ if_act=True,
+ name=name + '_expand')
+
+ depthwise_conv = self.conv_bn_layer(
+ input=expand_conv,
+ filter_size=filter_size,
+ num_filters=num_expfilter,
+ stride=stride,
+ padding=int((filter_size - 1) // 2),
+ if_act=True,
+ num_groups=num_expfilter,
+ use_cudnn=True,
+ name=name + '_depthwise')
+
+ if use_se:
+ depthwise_conv = self.se_block(
+ input=depthwise_conv,
+ num_out_filter=num_expfilter,
+ name=name + '_se')
+
+ linear_conv = self.conv_bn_layer(
+ input=depthwise_conv,
+ filter_size=1,
+ num_filters=num_out_filter,
+ stride=1,
+ padding=0,
+ if_act=False,
+ name=name + '_linear')
+ if num_in_filter != num_out_filter or stride != 1:
+ return linear_conv
+ else:
+ return fluid.layers.elementwise_add(
+ x=input_data, y=linear_conv, act=None)
+
+ def se_block(self, input, num_out_filter, ratio=4, name=None):
+ num_mid_filter = int(num_out_filter // ratio)
+ pool = fluid.layers.pool2d(
+ input=input, pool_type='avg', global_pooling=True, use_cudnn=False)
+ conv1 = fluid.layers.conv2d(
+ input=pool,
+ filter_size=1,
+ num_filters=num_mid_filter,
+ act=None,
+ param_attr=ParamAttr(name=name + '_1_weights'),
+ bias_attr=ParamAttr(name=name + '_1_offset'))
+ conv1 = fluid.layers.prelu(
+ conv1,
+ mode='channel',
+ param_attr=ParamAttr(
+ name=name + '_prelu',
+ regularizer=fluid.regularizer.L2Decay(0.0)))
+ conv2 = fluid.layers.conv2d(
+ input=conv1,
+ filter_size=1,
+ num_filters=num_out_filter,
+ act='hard_sigmoid',
+ param_attr=ParamAttr(name=name + '_2_weights'),
+ bias_attr=ParamAttr(name=name + '_2_offset'))
+ scale = fluid.layers.elementwise_mul(x=input, y=conv2, axis=0)
+ return scale
+
+ def conv_bn_layer(self,
+ input,
+ filter_size,
+ num_filters,
+ stride,
+ padding,
+ num_groups=1,
+ if_act=True,
+ name=None,
+ use_cudnn=True):
+ conv = fluid.layers.conv2d(
+ input=input,
+ num_filters=num_filters,
+ filter_size=filter_size,
+ stride=stride,
+ padding=padding,
+ groups=num_groups,
+ act=None,
+ use_cudnn=use_cudnn,
+ param_attr=ParamAttr(
+ name=name + '_weights', initializer=MSRA()),
+ bias_attr=False)
+ bn_name = name + '_bn'
+ bn = fluid.layers.batch_norm(
+ input=conv,
+ param_attr=ParamAttr(name=bn_name + "_scale"),
+ bias_attr=ParamAttr(name=bn_name + "_offset"),
+ moving_mean_name=bn_name + '_mean',
+ moving_variance_name=bn_name + '_variance')
+ if if_act:
+ return fluid.layers.prelu(
+ bn,
+ mode='channel',
+ param_attr=ParamAttr(
+ name=name + '_prelu',
+ regularizer=fluid.regularizer.L2Decay(0.0)))
+ else:
+ return bn
+
+ def arc_margin_product(self, input, label, out_dim, s=32.0, m=0.50,
+ mode=2):
+ input_norm = fluid.layers.sqrt(
+ fluid.layers.reduce_sum(
+ fluid.layers.square(input), dim=1))
+ input = fluid.layers.elementwise_div(input, input_norm, axis=0)
+
+ weight = fluid.layers.create_parameter(
+ shape=[out_dim, input.shape[1]],
+ dtype='float32',
+ name='weight_norm',
+ attr=fluid.param_attr.ParamAttr(
+ initializer=fluid.initializer.Xavier(),
+ regularizer=fluid.regularizer.L2Decay(4e-4)))
+
+ weight_norm = fluid.layers.sqrt(
+ fluid.layers.reduce_sum(
+ fluid.layers.square(weight), dim=1))
+ weight = fluid.layers.elementwise_div(weight, weight_norm, axis=0)
+ weight = fluid.layers.transpose(weight, perm=[1, 0])
+ cosine = fluid.layers.mul(input, weight)
+ sine = fluid.layers.sqrt(1.0 - fluid.layers.square(cosine))
+
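+ # ArcFace margin: with normalized features and weights, `cosine` holds
+ # cos(theta), and the angle-addition identity gives
+ # phi = cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m).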
+ cos_m = math.cos(m)
+ sin_m = math.sin(m)
+ phi = cosine * cos_m - sine * sin_m
+
+ th = math.cos(math.pi - m)
+ mm = math.sin(math.pi - m) * m
+
+ if mode == 1:
+ phi = self.paddle_where_more_than(cosine, 0, phi, cosine)
+ elif mode == 2:
+ phi = self.paddle_where_more_than(cosine, th, phi, cosine - mm)
+ else:
+ pass
+
+ one_hot = fluid.one_hot(input=label, depth=out_dim)
+ output = fluid.layers.elementwise_mul(
+ one_hot, phi) + fluid.layers.elementwise_mul(
+ (1.0 - one_hot), cosine)
+ output = output * s
+ return output
+
+ def paddle_where_more_than(self, target, limit, x, y):
+ mask = fluid.layers.cast(x=(target > limit), dtype='float32')
+ output = fluid.layers.elementwise_mul(
+ mask, x) + fluid.layers.elementwise_mul((1.0 - mask), y)
+ return output
+
+
+def SlimFaceNet_A_x0_60(class_dim=None, scale=0.6, arch=None):
+ scale = 0.6
+ arch = [0, 1, 5, 1, 0, 2, 1, 2, 0, 1, 2, 1, 1, 0, 1]
+ return SlimFaceNet(class_dim=class_dim, scale=scale, arch=arch)
+
+
+def SlimFaceNet_B_x0_75(class_dim=None, scale=0.75, arch=None):
+ scale = 0.75
+ arch = [1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 3, 2, 2, 3]
+ return SlimFaceNet(class_dim=class_dim, scale=scale, arch=arch)
+
+
+def SlimFaceNet_C_x0_75(class_dim=None, scale=0.75, arch=None):
+ scale = 0.75
+ arch = [1, 1, 2, 1, 0, 2, 1, 0, 1, 0, 1, 1, 2, 2, 3]
+ return SlimFaceNet(class_dim=class_dim, scale=scale, arch=arch)
+
+
+if __name__ == "__main__":
+ x = fluid.data(name='x', shape=[-1, 3, 112, 112], dtype='float32')
+ print(x.shape)
+ model = SlimFaceNet(10000, arch=[1, 3, 3, 1, 1, 0, 0, 1, 0, 1, 1, 0, 5, 5, 3])
+ y = model.net(x)
diff --git a/demo/nas/README.md b/demo/nas/README.md
index f28dbd71dba67703935598699935d9a91fd60c3f..b3ee0d18238f99d3c769c8d463c511efae968eae 100644
--- a/demo/nas/README.md
+++ b/demo/nas/README.md
@@ -1,70 +1,52 @@
-# Network architecture search example
+# SANAS network architecture search example
-This example shows how to use the network architecture search interface to find a smaller or more accurate model. This document only covers how to use SANAS in paddleslim and how to obtain a model architecture with it; for the complete example code, see sa_nas_mobilenetv2.py or block_sa_nas_mobilenetv2.py.
+This example shows how to use the network architecture search interface to find a smaller or more accurate model. It covers how to use SANAS in paddleslim and how to obtain a model architecture with it; for the complete example code, see sa_nas_mobilenetv2.py or block_sa_nas_mobilenetv2.py.
+
+## Data preparation
+This example uses the cifar10 dataset by default; it is downloaded automatically by the paddle API that consumes it, so no extra preparation is needed.
 ## Interface
-Please refer to.
+Please refer to the neural architecture search API documentation.
-### 1. Configure the search space
-For detailed search space configuration, refer to the neural architecture search API documentation.
-```
-config = [('MobileNetV2Space')]
+This example uses SANAS to search the MobileNetV2 search space for a model with fewer FLOPs.
+## 1 Search space configuration
+The default search space is `MobileNetV2`; for detailed configuration, refer to the search space documentation.
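+
+Regardless of which script you launch, the search loop follows one fixed pattern: ask SANAS for a candidate architecture, train and evaluate it under a constraint, and report the score back. A minimal sketch of that loop (`build_program`, `train_and_eval`, `search_steps` and `max_flops` are placeholders for your own code and settings):
+
+```python
+import paddleslim
+from paddleslim.analysis import flops
+
+sa_nas = paddleslim.nas.SANAS(
+    configs=[('MobileNetV2Space')], server_addr=("", 8881), is_server=True)
+for step in range(search_steps):
+    archs = sa_nas.next_archs()        # candidate architecture(s) for this step
+    program = build_program(archs)     # placeholder: build the train program
+    if flops(program) > max_flops:     # enforce the FLOPs constraint
+        continue
+    score = train_and_eval(program)    # placeholder: e.g. top-1 accuracy
+    sa_nas.reward(score)               # feed the score back to the controller
+```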
-```
+## 2 Start training
-### 2. Initialize a SANAS instance with the search space
+### 2.1 Start the experiment whose search space is built from the initial MobileNetV2 architecture
+```shell
+CUDA_VISIBLE_DEVICES=0 python sa_nas_mobilenetv2.py
```
-from paddleslim.nas import SANAS
-sa_nas = SANAS(
- config,
- server_addr=("", 8881),
- init_temperature=10.24,
- reduce_rate=0.85,
- search_steps=300,
- is_server=True)
+### 2.2 Start the experiment whose search space is built from MobileNetV2 blocks
+```shell
+CUDA_VISIBLE_DEVICES=0 python block_sa_nas_mobilenetv2.py
```
-### 3. Get the current network architecture from the instantiated NAS
-```
-archs = sa_nas.next_archs()
-```
+# RLNAS network architecture search example
-### 4. Build the training and test programs from the obtained architecture and inputs
-```
-import paddle.fluid as fluid
-
-train_program = fluid.Program()
-test_program = fluid.Program()
-startup_program = fluid.Program()
-
-with fluid.program_guard(train_program, startup_program):
- data = fluid.data(name='data', shape=[None, 3, 32, 32], dtype='float32')
- label = fluid.data(name='label', shape=[None, 1], dtype='int64')
- for arch in archs:
- data = arch(data)
- output = fluid.layers.fc(data, 10)
- softmax_out = fluid.layers.softmax(input=output, use_cudnn=False)
- cost = fluid.layers.cross_entropy(input=softmax_out, label=label)
- avg_cost = fluid.layers.mean(cost)
- acc_top1 = fluid.layers.accuracy(input=softmax_out, label=label, k=1)
-
- test_program = train_program.clone(for_test=True)
- sgd = fluid.optimizer.SGD(learning_rate=1e-3)
- sgd.minimize(avg_cost)
-
-```
+This example shows how to use the RLNAS interface for network architecture search and how RLNAS is used in paddleslim; for the complete example code, see rl_nas_mobilenetv2.py or parl_nas_mobilenetv2.py.
-### 5. Add constraints based on the constructed training program
-```
-from paddleslim.analysis import flops
+## Data preparation
+This example uses the cifar10 dataset by default; it is downloaded automatically by the paddle API that consumes it, so no extra preparation is needed.
-if flops(train_program) > 321208544:
- continue
-```
+## Interface
+Please refer to the neural architecture search API documentation.
-### 6. Return the score
+This example uses RLNAS to search the MobileNetV2 search space for a model with higher accuracy.
+## 1 Search space configuration
+The default search space is `MobileNetV2`; for detailed configuration, refer to the search space documentation.
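+
+Compared with SANAS, the RLNAS controller is conditioned on an observation vector; in the ddpg example this is the current step followed by the tokens chosen in the previous round. Below is a condensed sketch of the loop from parl_nas_mobilenetv2.py (the MobileNetV2 space is shown for illustration; `search_steps` is a placeholder, and training, evaluation and score reporting are omitted):
+
+```python
+from paddleslim.nas import RLNAS
+
+rl_nas = RLNAS(key='ddpg', configs=[('MobileNetV2Space')], is_sync=False,
+               obs_dim=26, server_addr=("", 8881))
+for step in range(search_steps):
+    if step == 0:
+        action_prev = [1. for _ in rl_nas.range_tables]  # initial tokens
+    else:
+        action_prev = rl_nas.tokens[0]  # tokens chosen in the previous round
+    obs = [step] + list(action_prev)    # observation: step + previous tokens
+    archs = rl_nas.next_archs(obs=obs)[0][0]
+    # ... train and evaluate `archs`, then report the score back to the
+    # controller (see rl_nas_mobilenetv2.py / parl_nas_mobilenetv2.py)
+```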
+
+## 2 Start training
+
+### 2.1 Start the search experiment whose search space is built from the initial MobileNetV2 architecture, using lstm as the reinforcement learning algorithm
+```shell
+CUDA_VISIBLE_DEVICES=0 python rl_nas_mobilenetv2.py
```
-sa_nas.reward(score)
+
+### 2.2 Start the search experiment whose search space is built from the initial MobileNetV2 architecture, using ddpg as the reinforcement learning algorithm
+```shell
+CUDA_VISIBLE_DEVICES=0 python parl_nas_mobilenetv2.py
```
diff --git a/demo/nas/block_sa_nas_mobilenetv2.py b/demo/nas/block_sa_nas_mobilenetv2.py
index 27fb1f4cf9076dfdf6500fc79d4df5697042cb09..a32f97b0b8bb5b7bbe20fa0842e94ad58ff9751d 100644
--- a/demo/nas/block_sa_nas_mobilenetv2.py
+++ b/demo/nas/block_sa_nas_mobilenetv2.py
@@ -16,13 +16,6 @@ import imagenet_reader
_logger = get_logger(__name__, level=logging.INFO)
-reduce_rate = 0.85
-init_temperature = 10.24
-max_flops = 321208544
-server_address = ""
-port = 8979
-retain_epoch = 5
-
def create_data_loader(image_shape):
data_shape = [None] + image_shape
@@ -71,17 +64,13 @@ def search_mobilenetv2_block(config, args, image_size):
if args.is_server:
sa_nas = SANAS(
config,
- server_addr=("", port),
- init_temperature=init_temperature,
- reduce_rate=reduce_rate,
+ server_addr=(args.server_address, args.port),
search_steps=args.search_steps,
is_server=True)
else:
sa_nas = SANAS(
config,
- server_addr=(server_address, port),
- init_temperature=init_temperature,
- reduce_rate=reduce_rate,
+ server_addr=(args.server_address, args.port),
search_steps=args.search_steps,
is_server=False)
@@ -140,7 +129,7 @@ def search_mobilenetv2_block(config, args, image_size):
current_flops = flops(train_program)
print('step: {}, current_flops: {}'.format(step, current_flops))
- if current_flops > max_flops:
+ if current_flops > 321208544:
continue
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
@@ -148,22 +137,22 @@ def search_mobilenetv2_block(config, args, image_size):
exe.run(startup_program)
if args.data == 'cifar10':
- train_reader = paddle.batch(
+ train_reader = paddle.fluid.io.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10(cycle=False), buf_size=1024),
batch_size=args.batch_size,
drop_last=True)
- test_reader = paddle.batch(
+ test_reader = paddle.fluid.io.batch(
paddle.dataset.cifar.test10(cycle=False),
batch_size=args.batch_size,
drop_last=False)
elif args.data == 'imagenet':
- train_reader = paddle.batch(
+ train_reader = paddle.fluid.io.batch(
imagenet_reader.train(),
batch_size=args.batch_size,
drop_last=True)
- test_reader = paddle.batch(
+ test_reader = paddle.fluid.io.batch(
imagenet_reader.val(),
batch_size=args.batch_size,
drop_last=False)
@@ -178,7 +167,7 @@ def search_mobilenetv2_block(config, args, image_size):
train_compiled_program = fluid.CompiledProgram(
train_program).with_data_parallel(
loss_name=avg_cost.name, build_strategy=build_strategy)
- for epoch_id in range(retain_epoch):
+ for epoch_id in range(args.retain_epoch):
for batch_id, data in enumerate(train_loader()):
fetches = [avg_cost.name]
s_time = time.time()
@@ -243,6 +232,11 @@ if __name__ == '__main__':
type=int,
default=100,
help='controller server number.')
+ parser.add_argument(
+ '--server_address', type=str, default="", help='server ip.')
+ parser.add_argument('--port', type=int, default=8881, help='server port')
+ parser.add_argument(
+ '--retain_epoch', type=int, default=5, help='epoch for each token.')
parser.add_argument('--lr', type=float, default=0.1, help='learning rate.')
args = parser.parse_args()
print(args)
@@ -257,7 +251,7 @@ if __name__ == '__main__':
args.data))
 # block_mask encodes the blocks: 1 means the block downsamples, 0 means the feature map size does not change after the block
- config_info = {'block_mask': [0, 1, 1, 1, 1, 0, 1, 0]}
+ config_info = {'block_mask': [0, 1, 1, 1, 0]}
config = [('MobileNetV2BlockSpace', config_info)]
search_mobilenetv2_block(config, args, image_size)
diff --git a/demo/nas/darts_cifar10_reader.py b/demo/nas/darts_cifar10_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..7698c176f7a0eb1c10539f1531d6736bda29e344
--- /dev/null
+++ b/demo/nas/darts_cifar10_reader.py
@@ -0,0 +1,153 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from PIL import Image
+from PIL import ImageOps
+import os
+import math
+import random
+import tarfile
+import functools
+import numpy as np
+from PIL import Image, ImageEnhance
+import paddle
+try:
+ import cPickle
+except ImportError:
+ # Python 3: cPickle was merged into the standard pickle module
+ import _pickle as cPickle
+
+IMAGE_SIZE = 32
+IMAGE_DEPTH = 3
+CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
+CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]
+
+URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
+CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
+CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
+
+paddle.dataset.common.DATA_HOME = "dataset/"
+
+THREAD = 16
+BUF_SIZE = 10240
+
+num_workers = 4
+use_multiprocess = True
+cutout = True
+cutout_length = 16
+
+
+def preprocess(sample, is_training):
+ image_array = sample.reshape(IMAGE_DEPTH, IMAGE_SIZE, IMAGE_SIZE)
+ rgb_array = np.transpose(image_array, (1, 2, 0))
+ img = Image.fromarray(rgb_array, 'RGB')
+
+ if is_training:
+ # pad, random crop, random flip left-right
+ img = ImageOps.expand(img, (4, 4, 4, 4), fill=0)
+ left_top = np.random.randint(8, size=2)
+ img = img.crop((left_top[1], left_top[0], left_top[1] + IMAGE_SIZE,
+ left_top[0] + IMAGE_SIZE))
+ if np.random.randint(2):
+ img = img.transpose(Image.FLIP_LEFT_RIGHT)
+ img = np.array(img).astype(np.float32)
+
+ img_float = img / 255.0
+ img = (img_float - CIFAR_MEAN) / CIFAR_STD
+
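+ # Cutout regularization: zero out one cutout_length x cutout_length
+ # square at a random centre (clipped at the image borders).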
+ if is_training and cutout:
+ center = np.random.randint(IMAGE_SIZE, size=2)
+ offset_width = max(0, center[0] - cutout_length // 2)
+ offset_height = max(0, center[1] - cutout_length // 2)
+ target_width = min(center[0] + cutout_length // 2, IMAGE_SIZE)
+ target_height = min(center[1] + cutout_length // 2, IMAGE_SIZE)
+
+ img[offset_height:target_height, offset_width:target_width, :] = 0.0
+
+ img = np.transpose(img, (2, 0, 1))
+ return img
+
+
+def reader_generator(datasets, batch_size, is_training, is_shuffle):
+ def read_batch(datasets):
+ if is_shuffle:
+ random.shuffle(datasets)
+ for im, label in datasets:
+ im = preprocess(im, is_training)
+ yield im, [int(label)]
+
+ def reader():
+ batch_data = []
+ batch_label = []
+ for data in read_batch(datasets):
+ batch_data.append(data[0])
+ batch_label.append(data[1])
+ if len(batch_data) == batch_size:
+ batch_data = np.array(batch_data, dtype='float32')
+ batch_label = np.array(batch_label, dtype='int64')
+ batch_out = [batch_data, batch_label]
+ yield batch_out
+ batch_data = []
+ batch_label = []
+
+ return reader
+
+
+def cifar10_reader(file_name, data_name, is_shuffle):
+ with tarfile.open(file_name, mode='r') as f:
+ names = [
+ each_item.name for each_item in f if data_name in each_item.name
+ ]
+ names.sort()
+ datasets = []
+ for name in names:
+ print("Reading file " + name)
+ try:
+ batch = cPickle.load(
+ f.extractfile(name), encoding='iso-8859-1')
+ except TypeError:
+ # Python 2 cPickle.load does not accept an encoding argument
+ batch = cPickle.load(f.extractfile(name))
+ data = batch['data']
+ labels = batch.get('labels', batch.get('fine_labels', None))
+ assert labels is not None
+ dataset = zip(data, labels)
+ datasets.extend(dataset)
+ if is_shuffle:
+ random.shuffle(datasets)
+ return datasets
+
+
+def train_valid(batch_size, is_train, is_shuffle):
+ name = 'data_batch' if is_train else 'test_batch'
+ datasets = cifar10_reader(
+ paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+ name, is_shuffle)
+ n = int(math.ceil(len(datasets) /
+ num_workers)) if use_multiprocess else len(datasets)
+ datasets_lists = [datasets[i:i + n] for i in range(0, len(datasets), n)]
+ multi_readers = []
+ for pid in range(len(datasets_lists)):
+ multi_readers.append(
+ reader_generator(datasets_lists[pid], batch_size, is_train,
+ is_shuffle))
+ if use_multiprocess:
+ reader = paddle.reader.multiprocess_reader(multi_readers, False)
+ else:
+ reader = multi_readers[0]
+ return reader
diff --git a/demo/nas/image_classification_nas_quick_start.ipynb b/demo/nas/image_classification_nas_quick_start.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..72f04cf848b8caec65b65b177c99d15ebfa05cc6
--- /dev/null
+++ b/demo/nas/image_classification_nas_quick_start.ipynb
@@ -0,0 +1,163 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 图像分类网络结构搜索-快速开始\n",
+ "\n",
+ "该教程以图像分类模型MobileNetV2为例,说明如何在cifar10数据集上快速使用[网络结构搜索接口](../api/nas_api.md)。\n",
+ "该示例包含以下步骤:\n",
+ "\n",
+ "1. 导入依赖\n",
+ "2. 初始化SANAS搜索实例\n",
+ "3. 构建网络\n",
+ "4. 启动搜索实验\n",
+ "\n",
+ "以下章节依次介绍每个步骤的内容。"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. 导入依赖\n",
+ "请确认已正确安装Paddle,导入需要的依赖包。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import paddle\n",
+ "import paddle.fluid as fluid\n",
+ "import paddleslim as slim\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. 初始化SANAS搜索实例"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sanas = slim.nas.SANAS(configs=[('MobileNetV2Space')], server_addr=(\"\", 8337))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. 构建网络\n",
+ "根据传入的网络结构构造训练program和测试program。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def build_program(archs):\n",
+ " train_program = fluid.Program()\n",
+ " startup_program = fluid.Program()\n",
+ " with fluid.program_guard(train_program, startup_program):\n",
+ " data = fluid.data(name='data', shape=[None, 3, 32, 32], dtype='float32')\n",
+ " label = fluid.data(name='label', shape=[None, 1], dtype='int64')\n",
+ " output = archs(data)\n",
+ " output = fluid.layers.fc(input=output, size=10)\n",
+ "\n",
+ " softmax_out = fluid.layers.softmax(input=output, use_cudnn=False)\n",
+ " cost = fluid.layers.cross_entropy(input=softmax_out, label=label)\n",
+ " avg_cost = fluid.layers.mean(cost)\n",
+ " acc_top1 = fluid.layers.accuracy(input=softmax_out, label=label, k=1)\n",
+ " acc_top5 = fluid.layers.accuracy(input=softmax_out, label=label, k=5)\n",
+ " test_program = fluid.default_main_program().clone(for_test=True)\n",
+ " \n",
+ " optimizer = fluid.optimizer.Adam(learning_rate=0.1)\n",
+ " optimizer.minimize(avg_cost)\n",
+ "\n",
+ " place = fluid.CPUPlace()\n",
+ " exe = fluid.Executor(place)\n",
+ " exe.run(startup_program)\n",
+ " return exe, train_program, test_program, (data, label), avg_cost, acc_top1, acc_top5"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. 启动搜索实验\n",
+ "获取每一轮的模型结构并开始训练。该教程中使用FLOPs作为约束条件,搜索实验一共搜索3个step,表示搜索到3个满足条件的模型结构进行训练,每搜索到一个网络结构训练7个epoch。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for step in range(3):\n",
+ " archs = sanas.next_archs()[0]\n",
+ " exe, train_program, test_progarm, inputs, avg_cost, acc_top1, acc_top5 = build_program(archs)\n",
+ "\n",
+ " current_flops = slim.analysis.flops(train_program)\n",
+ " if current_flops > 321208544:\n",
+ " continue\n",
+ " \n",
+ " train_reader = paddle.fluid.io.batch(paddle.reader.shuffle(paddle.dataset.cifar.train10(cycle=False), buf_size=1024),batch_size=256)\n",
+ " train_feeder = fluid.DataFeeder(inputs, fluid.CPUPlace())\n",
+ " test_reader = paddle.fluid.io.batch(paddle.dataset.cifar.test10(cycle=False),\n",
+ " batch_size=256)\n",
+ " test_feeder = fluid.DataFeeder(inputs, fluid.CPUPlace())\n",
+ "\n",
+ " outputs = [avg_cost.name, acc_top1.name, acc_top5.name]\n",
+ " for epoch in range(7):\n",
+ " for data in train_reader():\n",
+ " loss, acc1, acc5 = exe.run(train_program, feed=train_feeder.feed(data), fetch_list = outputs)\n",
+ " print(\"TRAIN: loss: {}, acc1: {}, acc5:{}\".format(loss, acc1, acc5))\n",
+ "\n",
+ " reward = []\n",
+ " for data in test_reader():\n",
+ " batch_reward = exe.run(test_program, feed=test_feeder.feed(data), fetch_list = outputs)\n",
+ " reward_avg = np.mean(np.array(batch_reward), axis=1)\n",
+ " reward.append(reward_avg)\n",
+ " print(\"TEST: loss: {}, acc1: {}, acc5:{}\".format(batch_reward[0], batch_reward[1], batch_reward[2]))\n",
+ " finally_reward = np.mean(np.array(reward), axis=0)\n",
+ " print(\"FINAL TEST: avg_cost: {}, acc1: {}, acc5: {}\".format(finally_reward[0], finally_reward[1], finally_reward[2]))\n",
+ "\n",
+ " sanas.reward(float(finally_reward[1]))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/demo/nas/parl_nas_mobilenetv2.py b/demo/nas/parl_nas_mobilenetv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef9195fb2dbb8ddf34ce574e4e6c823d1e0b790e
--- /dev/null
+++ b/demo/nas/parl_nas_mobilenetv2.py
@@ -0,0 +1,244 @@
+import sys
+sys.path.append('..')
+import numpy as np
+import argparse
+import ast
+import time
+import logging
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.param_attr import ParamAttr
+from paddleslim.nas import RLNAS
+from paddleslim.common import get_logger
+from optimizer import create_optimizer
+import imagenet_reader
+
+_logger = get_logger(__name__, level=logging.INFO)
+
+
+def create_data_loader(image_shape):
+ data_shape = [None] + image_shape
+ data = fluid.data(name='data', shape=data_shape, dtype='float32')
+ label = fluid.data(name='label', shape=[None, 1], dtype='int64')
+ data_loader = fluid.io.DataLoader.from_generator(
+ feed_list=[data, label],
+ capacity=1024,
+ use_double_buffer=True,
+ iterable=True)
+ return data_loader, data, label
+
+
+def build_program(main_program,
+ startup_program,
+ image_shape,
+ archs,
+ args,
+ is_test=False):
+ with fluid.program_guard(main_program, startup_program):
+ with fluid.unique_name.guard():
+ data_loader, data, label = create_data_loader(image_shape)
+ output = archs(data)
+ output = fluid.layers.fc(input=output, size=args.class_dim)
+
+ softmax_out = fluid.layers.softmax(input=output, use_cudnn=False)
+ cost = fluid.layers.cross_entropy(input=softmax_out, label=label)
+ avg_cost = fluid.layers.mean(cost)
+ acc_top1 = fluid.layers.accuracy(
+ input=softmax_out, label=label, k=1)
+ acc_top5 = fluid.layers.accuracy(
+ input=softmax_out, label=label, k=5)
+
+            if not is_test:
+ optimizer = create_optimizer(args)
+ optimizer.minimize(avg_cost)
+ return data_loader, avg_cost, acc_top1, acc_top5
+
+
+def search_mobilenetv2(config, args, image_size, is_server=True):
+ if is_server:
+ ### start a server and a client
+ rl_nas = RLNAS(
+ key='ddpg',
+ configs=config,
+ is_sync=False,
+ obs_dim=26, ### step + length_of_token
+ server_addr=(args.server_address, args.port))
+ else:
+ ### start a client
+ rl_nas = RLNAS(
+ key='ddpg',
+ configs=config,
+ is_sync=False,
+ obs_dim=26,
+ server_addr=(args.server_address, args.port),
+ is_server=False)
+
+ image_shape = [3, image_size, image_size]
+ for step in range(args.search_steps):
+ if step == 0:
+ action_prev = [1. for _ in rl_nas.range_tables]
+ else:
+ action_prev = rl_nas.tokens[0]
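+        ### the DDPG controller's observation is the current step index
+        ### followed by the previously sampled tokens; its length must
+        ### match the obs_dim (26) passed to RLNAS above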
+ obs = [step]
+ obs.extend(action_prev)
+ archs = rl_nas.next_archs(obs=obs)[0][0]
+
+ train_program = fluid.Program()
+ test_program = fluid.Program()
+ startup_program = fluid.Program()
+ train_loader, avg_cost, acc_top1, acc_top5 = build_program(
+ train_program, startup_program, image_shape, archs, args)
+
+ test_loader, test_avg_cost, test_acc_top1, test_acc_top5 = build_program(
+ test_program,
+ startup_program,
+ image_shape,
+ archs,
+ args,
+ is_test=True)
+ test_program = test_program.clone(for_test=True)
+
+ place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(startup_program)
+
+ if args.data == 'cifar10':
+ train_reader = paddle.fluid.io.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.cifar.train10(cycle=False), buf_size=1024),
+ batch_size=args.batch_size,
+ drop_last=True)
+
+ test_reader = paddle.fluid.io.batch(
+ paddle.dataset.cifar.test10(cycle=False),
+ batch_size=args.batch_size,
+ drop_last=False)
+ elif args.data == 'imagenet':
+ train_reader = paddle.fluid.io.batch(
+ imagenet_reader.train(),
+ batch_size=args.batch_size,
+ drop_last=True)
+ test_reader = paddle.fluid.io.batch(
+ imagenet_reader.val(),
+ batch_size=args.batch_size,
+ drop_last=False)
+
+ train_loader.set_sample_list_generator(
+ train_reader,
+ places=fluid.cuda_places() if args.use_gpu else fluid.cpu_places())
+ test_loader.set_sample_list_generator(test_reader, places=place)
+
+ build_strategy = fluid.BuildStrategy()
+ train_compiled_program = fluid.CompiledProgram(
+ train_program).with_data_parallel(
+ loss_name=avg_cost.name, build_strategy=build_strategy)
+ for epoch_id in range(args.retain_epoch):
+ for batch_id, data in enumerate(train_loader()):
+ fetches = [avg_cost.name]
+ s_time = time.time()
+ outs = exe.run(train_compiled_program,
+ feed=data,
+ fetch_list=fetches)[0]
+ batch_time = time.time() - s_time
+ if batch_id % 10 == 0:
+ _logger.info(
+ 'TRAIN: steps: {}, epoch: {}, batch: {}, cost: {}, batch_time: {}ms'.
+ format(step, epoch_id, batch_id, outs[0], batch_time))
+
+ reward = []
+ for batch_id, data in enumerate(test_loader()):
+ test_fetches = [
+ test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
+ ]
+ batch_reward = exe.run(test_program,
+ feed=data,
+ fetch_list=test_fetches)
+ reward_avg = np.mean(np.array(batch_reward), axis=1)
+ reward.append(reward_avg)
+
+ _logger.info(
+ 'TEST: step: {}, batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'.
+ format(step, batch_id, batch_reward[0], batch_reward[1],
+ batch_reward[2]))
+
+ finally_reward = np.mean(np.array(reward), axis=0)
+ _logger.info(
+ 'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format(
+ finally_reward[0], finally_reward[1], finally_reward[2]))
+
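+        ### pack the (obs, action, reward, obs_next, terminal) transition
+        ### consumed by the DDPG controller; the reward is the averaged
+        ### top-1 accuracy on the test set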
+ obs = np.expand_dims(obs, axis=0).astype('float32')
+ actions = rl_nas.tokens
+ obs_next = [step + 1]
+ obs_next.extend(actions[0])
+ obs_next = np.expand_dims(obs_next, axis=0).astype('float32')
+
+ if step == args.search_steps - 1:
+            terminal = np.expand_dims([True], axis=0).astype('bool')
+        else:
+            terminal = np.expand_dims([False], axis=0).astype('bool')
+ rl_nas.reward(
+ np.expand_dims(
+ np.float32(finally_reward[1]), axis=0),
+ obs=obs,
+ actions=actions.astype('float32'),
+ obs_next=obs_next,
+ terminal=terminal)
+
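+        ### NOTE: the demo stops after three searched architectures even if
+        ### --search_steps is larger; remove this check to search longer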
+ if step == 2:
+ sys.exit(0)
+
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser(
+        description='RL NAS MobileNetV2 cifar10 argparse')
+ parser.add_argument(
+ '--use_gpu',
+ type=ast.literal_eval,
+ default=True,
+ help='Whether to use GPU in train/test model.')
+ parser.add_argument(
+ '--batch_size', type=int, default=256, help='batch size.')
+ parser.add_argument(
+ '--class_dim', type=int, default=10, help='classify number.')
+ parser.add_argument(
+ '--data',
+ type=str,
+ default='cifar10',
+ choices=['cifar10', 'imagenet'],
+        help='dataset name.')
+ parser.add_argument(
+ '--is_server',
+ type=ast.literal_eval,
+ default=True,
+ help='Whether to start a server.')
+ parser.add_argument(
+ '--search_steps',
+ type=int,
+ default=100,
+        help='number of search steps.')
+ parser.add_argument(
+ '--server_address', type=str, default="", help='server ip.')
+ parser.add_argument('--port', type=int, default=8881, help='server port')
+ parser.add_argument(
+ '--retain_epoch', type=int, default=5, help='epoch for each token.')
+ parser.add_argument('--lr', type=float, default=0.1, help='learning rate.')
+ args = parser.parse_args()
+ print(args)
+
+ if args.data == 'cifar10':
+ image_size = 32
+ block_num = 3
+ elif args.data == 'imagenet':
+ image_size = 224
+ block_num = 6
+ else:
+ raise NotImplementedError(
+ 'data must in [cifar10, imagenet], but received: {}'.format(
+ args.data))
+
+config = ['MobileNetV2Space']
+
+ search_mobilenetv2(config, args, image_size, is_server=args.is_server)
diff --git a/demo/nas/rl_nas_mobilenetv2.py b/demo/nas/rl_nas_mobilenetv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..4997a530ea1549f4afb4fb7d7fa51ab7115f2b65
--- /dev/null
+++ b/demo/nas/rl_nas_mobilenetv2.py
@@ -0,0 +1,229 @@
+import sys
+sys.path.append('..')
+import numpy as np
+import argparse
+import ast
+import time
+import logging
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.param_attr import ParamAttr
+from paddleslim.nas import RLNAS
+from paddleslim.common import get_logger
+from optimizer import create_optimizer
+import imagenet_reader
+
+_logger = get_logger(__name__, level=logging.INFO)
+
+
+def create_data_loader(image_shape):
+ data_shape = [None] + image_shape
+ data = fluid.data(name='data', shape=data_shape, dtype='float32')
+ label = fluid.data(name='label', shape=[None, 1], dtype='int64')
+ data_loader = fluid.io.DataLoader.from_generator(
+ feed_list=[data, label],
+ capacity=1024,
+ use_double_buffer=True,
+ iterable=True)
+ return data_loader, data, label
+
+
+def build_program(main_program,
+ startup_program,
+ image_shape,
+ archs,
+ args,
+ is_test=False):
+ with fluid.program_guard(main_program, startup_program):
+ with fluid.unique_name.guard():
+ data_loader, data, label = create_data_loader(image_shape)
+ output = archs(data)
+ output = fluid.layers.fc(input=output, size=args.class_dim)
+
+ softmax_out = fluid.layers.softmax(input=output, use_cudnn=False)
+ cost = fluid.layers.cross_entropy(input=softmax_out, label=label)
+ avg_cost = fluid.layers.mean(cost)
+ acc_top1 = fluid.layers.accuracy(
+ input=softmax_out, label=label, k=1)
+ acc_top5 = fluid.layers.accuracy(
+ input=softmax_out, label=label, k=5)
+
+            if not is_test:
+ optimizer = create_optimizer(args)
+ optimizer.minimize(avg_cost)
+ return data_loader, avg_cost, acc_top1, acc_top5
+
+
+def search_mobilenetv2(config, args, image_size, is_server=True):
+ if is_server:
+ ### start a server and a client
+ rl_nas = RLNAS(
+ key='lstm',
+ configs=config,
+ is_sync=False,
+ server_addr=(args.server_address, args.port),
+ controller_batch_size=1,
+ controller_decay_steps=1000,
+ controller_decay_rate=0.8,
+ lstm_num_layers=1,
+ hidden_size=10,
+ temperature=1.0)
+ else:
+ ### start a client
+ rl_nas = RLNAS(
+ key='lstm',
+ configs=config,
+ is_sync=False,
+ server_addr=(args.server_address, args.port),
+ lstm_num_layers=1,
+ hidden_size=10,
+ temperature=1.0,
+ controller_batch_size=1,
+ controller_decay_steps=1000,
+ controller_decay_rate=0.8,
+ is_server=False)
+
+ image_shape = [3, image_size, image_size]
+ for step in range(args.search_steps):
+ archs = rl_nas.next_archs(1)[0][0]
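+        ### next_archs(1) returns one batch containing a single sampled
+        ### architecture, hence the double indexing [0][0]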
+
+ train_program = fluid.Program()
+ test_program = fluid.Program()
+ startup_program = fluid.Program()
+ train_loader, avg_cost, acc_top1, acc_top5 = build_program(
+ train_program, startup_program, image_shape, archs, args)
+
+ test_loader, test_avg_cost, test_acc_top1, test_acc_top5 = build_program(
+ test_program,
+ startup_program,
+ image_shape,
+ archs,
+ args,
+ is_test=True)
+ test_program = test_program.clone(for_test=True)
+
+ place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(startup_program)
+
+ if args.data == 'cifar10':
+ train_reader = paddle.fluid.io.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.cifar.train10(cycle=False), buf_size=1024),
+ batch_size=args.batch_size,
+ drop_last=True)
+
+ test_reader = paddle.fluid.io.batch(
+ paddle.dataset.cifar.test10(cycle=False),
+ batch_size=args.batch_size,
+ drop_last=False)
+ elif args.data == 'imagenet':
+ train_reader = paddle.fluid.io.batch(
+ imagenet_reader.train(),
+ batch_size=args.batch_size,
+ drop_last=True)
+ test_reader = paddle.fluid.io.batch(
+ imagenet_reader.val(),
+ batch_size=args.batch_size,
+ drop_last=False)
+
+ train_loader.set_sample_list_generator(
+ train_reader,
+ places=fluid.cuda_places() if args.use_gpu else fluid.cpu_places())
+ test_loader.set_sample_list_generator(test_reader, places=place)
+
+ build_strategy = fluid.BuildStrategy()
+ train_compiled_program = fluid.CompiledProgram(
+ train_program).with_data_parallel(
+ loss_name=avg_cost.name, build_strategy=build_strategy)
+ for epoch_id in range(args.retain_epoch):
+ for batch_id, data in enumerate(train_loader()):
+ fetches = [avg_cost.name]
+ s_time = time.time()
+ outs = exe.run(train_compiled_program,
+ feed=data,
+ fetch_list=fetches)[0]
+ batch_time = time.time() - s_time
+ if batch_id % 10 == 0:
+ _logger.info(
+ 'TRAIN: steps: {}, epoch: {}, batch: {}, cost: {}, batch_time: {}ms'.
+ format(step, epoch_id, batch_id, outs[0], batch_time))
+
+ reward = []
+ for batch_id, data in enumerate(test_loader()):
+ test_fetches = [
+ test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
+ ]
+ batch_reward = exe.run(test_program,
+ feed=data,
+ fetch_list=test_fetches)
+ reward_avg = np.mean(np.array(batch_reward), axis=1)
+ reward.append(reward_avg)
+
+ _logger.info(
+ 'TEST: step: {}, batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'.
+ format(step, batch_id, batch_reward[0], batch_reward[1],
+ batch_reward[2]))
+
+ finally_reward = np.mean(np.array(reward), axis=0)
+ _logger.info(
+ 'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format(
+ finally_reward[0], finally_reward[1], finally_reward[2]))
+
+ rl_nas.reward(np.float32(finally_reward[1]))
+
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser(
+        description='RL NAS MobileNetV2 cifar10 argparse')
+ parser.add_argument(
+ '--use_gpu',
+ type=ast.literal_eval,
+ default=True,
+ help='Whether to use GPU in train/test model.')
+ parser.add_argument(
+ '--batch_size', type=int, default=256, help='batch size.')
+ parser.add_argument(
+ '--class_dim', type=int, default=10, help='classify number.')
+ parser.add_argument(
+ '--data',
+ type=str,
+ default='cifar10',
+ choices=['cifar10', 'imagenet'],
+        help='dataset name.')
+ parser.add_argument(
+ '--is_server',
+ type=ast.literal_eval,
+ default=True,
+ help='Whether to start a server.')
+ parser.add_argument(
+ '--search_steps',
+ type=int,
+ default=100,
+        help='number of search steps.')
+ parser.add_argument(
+ '--server_address', type=str, default="", help='server ip.')
+ parser.add_argument('--port', type=int, default=8881, help='server port')
+ parser.add_argument(
+ '--retain_epoch', type=int, default=5, help='epoch for each token.')
+ parser.add_argument('--lr', type=float, default=0.1, help='learning rate.')
+ args = parser.parse_args()
+ print(args)
+
+ if args.data == 'cifar10':
+ image_size = 32
+ block_num = 3
+ elif args.data == 'imagenet':
+ image_size = 224
+ block_num = 6
+ else:
+ raise NotImplementedError(
+ 'data must in [cifar10, imagenet], but received: {}'.format(
+ args.data))
+
+config = ['MobileNetV2Space']
+
+ search_mobilenetv2(config, args, image_size, is_server=args.is_server)
diff --git a/demo/nas/sa_nas_mobilenetv2.py b/demo/nas/sa_nas_mobilenetv2.py
index e6abe115d566f0779cbd8806f702a18b832233f5..da6f17548a0881aa325524c513c048324641210d 100644
--- a/demo/nas/sa_nas_mobilenetv2.py
+++ b/demo/nas/sa_nas_mobilenetv2.py
@@ -18,13 +18,6 @@ import imagenet_reader
_logger = get_logger(__name__, level=logging.INFO)
-reduce_rate = 0.85
-init_temperature = 10.24
-max_flops = 321208544
-server_address = ""
-port = 8989
-retain_epoch = 5
-
def create_data_loader(image_shape):
data_shape = [None] + image_shape
@@ -45,19 +38,22 @@ def build_program(main_program,
args,
is_test=False):
with fluid.program_guard(main_program, startup_program):
- data_loader, data, label = create_data_loader(image_shape)
- output = archs(data)
- output = fluid.layers.fc(input=output, size=args.class_dim)
-
- softmax_out = fluid.layers.softmax(input=output, use_cudnn=False)
- cost = fluid.layers.cross_entropy(input=softmax_out, label=label)
- avg_cost = fluid.layers.mean(cost)
- acc_top1 = fluid.layers.accuracy(input=softmax_out, label=label, k=1)
- acc_top5 = fluid.layers.accuracy(input=softmax_out, label=label, k=5)
-
- if is_test == False:
- optimizer = create_optimizer(args)
- optimizer.minimize(avg_cost)
+ with fluid.unique_name.guard():
+ data_loader, data, label = create_data_loader(image_shape)
+ output = archs(data)
+ output = fluid.layers.fc(input=output, size=args.class_dim)
+
+ softmax_out = fluid.layers.softmax(input=output, use_cudnn=False)
+ cost = fluid.layers.cross_entropy(input=softmax_out, label=label)
+ avg_cost = fluid.layers.mean(cost)
+ acc_top1 = fluid.layers.accuracy(
+ input=softmax_out, label=label, k=1)
+ acc_top5 = fluid.layers.accuracy(
+ input=softmax_out, label=label, k=5)
+
+            if not is_test:
+ optimizer = create_optimizer(args)
+ optimizer.minimize(avg_cost)
return data_loader, avg_cost, acc_top1, acc_top5
@@ -66,18 +62,14 @@ def search_mobilenetv2(config, args, image_size, is_server=True):
### start a server and a client
sa_nas = SANAS(
config,
- server_addr=("", port),
- init_temperature=init_temperature,
- reduce_rate=reduce_rate,
+ server_addr=(args.server_address, args.port),
search_steps=args.search_steps,
is_server=True)
else:
### start a client
sa_nas = SANAS(
config,
- server_addr=(server_address, port),
- init_temperature=init_temperature,
- reduce_rate=reduce_rate,
+ server_addr=(args.server_address, args.port),
search_steps=args.search_steps,
is_server=False)
@@ -93,7 +85,7 @@ def search_mobilenetv2(config, args, image_size, is_server=True):
current_flops = flops(train_program)
print('step: {}, current_flops: {}'.format(step, current_flops))
- if current_flops > max_flops:
+        if current_flops > 321208544:
continue
test_loader, test_avg_cost, test_acc_top1, test_acc_top5 = build_program(
@@ -110,22 +102,22 @@ def search_mobilenetv2(config, args, image_size, is_server=True):
exe.run(startup_program)
if args.data == 'cifar10':
- train_reader = paddle.batch(
+ train_reader = paddle.fluid.io.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10(cycle=False), buf_size=1024),
batch_size=args.batch_size,
drop_last=True)
- test_reader = paddle.batch(
+ test_reader = paddle.fluid.io.batch(
paddle.dataset.cifar.test10(cycle=False),
batch_size=args.batch_size,
drop_last=False)
elif args.data == 'imagenet':
- train_reader = paddle.batch(
+ train_reader = paddle.fluid.io.batch(
imagenet_reader.train(),
batch_size=args.batch_size,
drop_last=True)
- test_reader = paddle.batch(
+ test_reader = paddle.fluid.io.batch(
imagenet_reader.val(),
batch_size=args.batch_size,
drop_last=False)
@@ -139,7 +131,7 @@ def search_mobilenetv2(config, args, image_size, is_server=True):
train_compiled_program = fluid.CompiledProgram(
train_program).with_data_parallel(
loss_name=avg_cost.name, build_strategy=build_strategy)
- for epoch_id in range(retain_epoch):
+ for epoch_id in range(args.retain_epoch):
for batch_id, data in enumerate(train_loader()):
fetches = [avg_cost.name]
s_time = time.time()
@@ -179,15 +171,13 @@ def search_mobilenetv2(config, args, image_size, is_server=True):
def test_search_result(tokens, image_size, args, config):
sa_nas = SANAS(
config,
- server_addr=("", 8887),
- init_temperature=args.init_temperature,
- reduce_rate=args.reduce_rate,
+ server_addr=(args.server_address, args.port),
search_steps=args.search_steps,
is_server=True)
image_shape = [3, image_size, image_size]
- archs = sa_nas.tokens2arch(tokens)
+ archs = sa_nas.tokens2arch(tokens)[0]
train_program = fluid.Program()
test_program = fluid.Program()
@@ -207,22 +197,22 @@ def test_search_result(tokens, image_size, args, config):
exe.run(startup_program)
if args.data == 'cifar10':
- train_reader = paddle.batch(
+ train_reader = paddle.fluid.io.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10(cycle=False), buf_size=1024),
batch_size=args.batch_size,
drop_last=True)
- test_reader = paddle.batch(
+ test_reader = paddle.fluid.io.batch(
paddle.dataset.cifar.test10(cycle=False),
batch_size=args.batch_size,
drop_last=False)
elif args.data == 'imagenet':
- train_reader = paddle.batch(
+ train_reader = paddle.fluid.io.batch(
imagenet_reader.train(),
batch_size=args.batch_size,
drop_last=True)
- test_reader = paddle.batch(
+ test_reader = paddle.fluid.io.batch(
imagenet_reader.val(), batch_size=args.batch_size, drop_last=False)
train_loader.set_sample_list_generator(
@@ -234,7 +224,7 @@ def test_search_result(tokens, image_size, args, config):
train_compiled_program = fluid.CompiledProgram(
train_program).with_data_parallel(
loss_name=avg_cost.name, build_strategy=build_strategy)
- for epoch_id in range(retain_epoch):
+ for epoch_id in range(args.retain_epoch):
for batch_id, data in enumerate(train_loader()):
fetches = [avg_cost.name]
s_time = time.time()
@@ -281,7 +271,7 @@ if __name__ == '__main__':
parser.add_argument(
'--batch_size', type=int, default=256, help='batch size.')
parser.add_argument(
- '--class_dim', type=int, default=1000, help='classify number.')
+ '--class_dim', type=int, default=10, help='classify number.')
parser.add_argument(
'--data',
type=str,
@@ -298,6 +288,11 @@ if __name__ == '__main__':
type=int,
default=100,
help='controller server number.')
+ parser.add_argument(
+ '--server_address', type=str, default="", help='server ip.')
+ parser.add_argument('--port', type=int, default=8881, help='server port')
+ parser.add_argument(
+ '--retain_epoch', type=int, default=5, help='epoch for each token.')
parser.add_argument('--lr', type=float, default=0.1, help='learning rate.')
args = parser.parse_args()
print(args)
diff --git a/demo/nas/sanas_darts_space.py b/demo/nas/sanas_darts_space.py
new file mode 100644
index 0000000000000000000000000000000000000000..43705e8781ab2875e55f7f0b3df12a6123a0f475
--- /dev/null
+++ b/demo/nas/sanas_darts_space.py
@@ -0,0 +1,348 @@
+import os
+import sys
+sys.path.append('..')
+import numpy as np
+import argparse
+import ast
+import time
+import logging
+import paddle.fluid as fluid
+from paddleslim.nas import SANAS
+from paddleslim.common import get_logger
+import darts_cifar10_reader as reader
+
+_logger = get_logger(__name__, level=logging.INFO)
+
+auxiliary = True
+auxiliary_weight = 0.4
+trainset_num = 50000
+lr = 0.025
+momentum = 0.9
+weight_decay = 0.0003
+drop_path_probability = 0.2
+
+
+class AverageMeter(object):
+ def __init__(self):
+ self.reset()
+
+ def reset(self):
+ self.avg = 0
+ self.sum = 0
+ self.cnt = 0
+
+ def update(self, val, n=1):
+ self.sum += val * n
+ self.cnt += n
+ self.avg = self.sum / self.cnt
+
+
+def count_parameters_in_MB(all_params, prefix='model'):
+ parameters_number = 0
+ for param in all_params:
+ if param.name.startswith(
+ prefix) and param.trainable and 'aux' not in param.name:
+ parameters_number += np.prod(param.shape)
+ return parameters_number / 1e6
+
+
+def create_data_loader(image_shape, is_train, args):
+ image = fluid.data(
+ name="image", shape=[None] + image_shape, dtype="float32")
+ label = fluid.data(name="label", shape=[None, 1], dtype="int64")
+ data_loader = fluid.io.DataLoader.from_generator(
+ feed_list=[image, label],
+ capacity=64,
+ use_double_buffer=True,
+ iterable=True)
+ drop_path_prob = ''
+ drop_path_mask = ''
+ if is_train:
+ drop_path_prob = fluid.data(
+ name="drop_path_prob", shape=[args.batch_size, 1], dtype="float32")
+ drop_path_mask = fluid.data(
+ name="drop_path_mask",
+ shape=[args.batch_size, 20, 4, 2],
+ dtype="float32")
+
+ return data_loader, image, label, drop_path_prob, drop_path_mask
+
+
+def build_program(main_program, startup_program, image_shape, archs, args,
+ is_train):
+ with fluid.program_guard(main_program, startup_program):
+ data_loader, data, label, drop_path_prob, drop_path_mask = create_data_loader(
+ image_shape, is_train, args)
+ logits, logits_aux = archs(data, drop_path_prob, drop_path_mask,
+ is_train, 10)
+ top1 = fluid.layers.accuracy(input=logits, label=label, k=1)
+ top5 = fluid.layers.accuracy(input=logits, label=label, k=5)
+ loss = fluid.layers.reduce_mean(
+ fluid.layers.softmax_with_cross_entropy(logits, label))
+
+ if is_train:
+ if auxiliary:
+ loss_aux = fluid.layers.reduce_mean(
+ fluid.layers.softmax_with_cross_entropy(logits_aux, label))
+ loss = loss + auxiliary_weight * loss_aux
+ step_per_epoch = int(trainset_num / args.batch_size)
+ learning_rate = fluid.layers.cosine_decay(lr, step_per_epoch,
+ args.retain_epoch)
+ fluid.clip.set_gradient_clip(
+ clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0))
+ optimizer = fluid.optimizer.MomentumOptimizer(
+ learning_rate,
+ momentum,
+ regularization=fluid.regularizer.L2DecayRegularizer(
+ weight_decay))
+ optimizer.minimize(loss)
+ outs = [loss, top1, top5, learning_rate]
+ else:
+ outs = [loss, top1, top5]
+ return outs, data_loader
+
+
+def train(main_prog, exe, epoch_id, train_loader, fetch_list, args):
+    loss = AverageMeter()
+    top1 = AverageMeter()
+    top5 = AverageMeter()
+ for step_id, data in enumerate(train_loader()):
+ devices_num = len(data)
+        if drop_path_probability > 0:
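+            ### build one feed dict per device: drop_path_prob ramps
+            ### linearly with the epoch, and the Bernoulli keep-mask shape
+            ### [batch_size, 20, 4, 2] must match the drop_path_mask
+            ### placeholder defined in create_data_loader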
+ feed = []
+ for device_id in range(devices_num):
+ image = data[device_id]['image']
+ label = data[device_id]['label']
+                drop_path_prob = np.array(
+                    [[drop_path_probability * epoch_id / args.retain_epoch]
+                     for i in range(args.batch_size)]).astype(np.float32)
+ drop_path_mask = 1 - np.random.binomial(
+ 1, drop_path_prob[0],
+ size=[args.batch_size, 20, 4, 2]).astype(np.float32)
+ feed.append({
+ "image": image,
+ "label": label,
+ "drop_path_prob": drop_path_prob,
+ "drop_path_mask": drop_path_mask
+ })
+ else:
+ feed = data
+ loss_v, top1_v, top5_v, lr = exe.run(
+ main_prog, feed=feed, fetch_list=[v.name for v in fetch_list])
+ loss.update(loss_v, args.batch_size)
+ top1.update(top1_v, args.batch_size)
+ top5.update(top5_v, args.batch_size)
+ if step_id % 10 == 0:
+ _logger.info(
+ "Train Epoch {}, Step {}, Lr {:.8f}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}".
+ format(epoch_id, step_id, lr[0], loss.avg[0], top1.avg[0],
+ top5.avg[0]))
+ return top1.avg[0]
+
+
+def valid(main_prog, exe, epoch_id, valid_loader, fetch_list, args):
+    loss = AverageMeter()
+    top1 = AverageMeter()
+    top5 = AverageMeter()
+ for step_id, data in enumerate(valid_loader()):
+ loss_v, top1_v, top5_v = exe.run(
+ main_prog, feed=data, fetch_list=[v.name for v in fetch_list])
+ loss.update(loss_v, args.batch_size)
+ top1.update(top1_v, args.batch_size)
+ top5.update(top5_v, args.batch_size)
+ if step_id % 10 == 0:
+ _logger.info(
+ "Valid Epoch {}, Step {}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}".
+ format(epoch_id, step_id, loss.avg[0], top1.avg[0], top5.avg[
+ 0]))
+ return top1.avg[0]
+
+
+def search(config, args, image_size, is_server=True):
+ if is_server:
+ ### start a server and a client
+ sa_nas = SANAS(
+ config,
+ server_addr=(args.server_address, args.port),
+ search_steps=args.search_steps,
+ is_server=True)
+ else:
+ ### start a client
+ sa_nas = SANAS(
+ config,
+ server_addr=(args.server_address, args.port),
+            search_steps=args.search_steps,
+ is_server=False)
+
+ image_shape = [3, image_size, image_size]
+ for step in range(args.search_steps):
+ archs = sa_nas.next_archs()[0]
+
+ train_program = fluid.Program()
+ test_program = fluid.Program()
+ startup_program = fluid.Program()
+ train_fetch_list, train_loader = build_program(
+ train_program,
+ startup_program,
+ image_shape,
+ archs,
+ args,
+ is_train=True)
+
+ current_params = count_parameters_in_MB(
+ train_program.global_block().all_parameters(), 'cifar10')
+ _logger.info('step: {}, current_params: {}M'.format(step,
+ current_params))
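+        ### discard sampled architectures above the 3.77M-parameter budget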
+        if current_params > 3.77:
+ continue
+
+ test_fetch_list, test_loader = build_program(
+ test_program,
+ startup_program,
+ image_shape,
+ archs,
+ args,
+ is_train=False)
+ test_program = test_program.clone(for_test=True)
+
+ place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(startup_program)
+
+ train_reader = reader.train_valid(
+ batch_size=args.batch_size, is_train=True, is_shuffle=True)
+ test_reader = reader.train_valid(
+ batch_size=args.batch_size, is_train=False, is_shuffle=False)
+
+ train_loader.set_batch_generator(train_reader, places=place)
+ test_loader.set_batch_generator(test_reader, places=place)
+
+ build_strategy = fluid.BuildStrategy()
+ train_compiled_program = fluid.CompiledProgram(
+ train_program).with_data_parallel(
+ loss_name=train_fetch_list[0].name,
+ build_strategy=build_strategy)
+
+ valid_top1_list = []
+ for epoch_id in range(args.retain_epoch):
+ train_top1 = train(train_compiled_program, exe, epoch_id,
+ train_loader, train_fetch_list, args)
+ _logger.info("TRAIN: step: {}, Epoch {}, train_acc {:.6f}".format(
+ step, epoch_id, train_top1))
+ valid_top1 = valid(test_program, exe, epoch_id, test_loader,
+ test_fetch_list, args)
+ _logger.info("TEST: Epoch {}, valid_acc {:.6f}".format(epoch_id,
+ valid_top1))
+ valid_top1_list.append(valid_top1)
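+        ### the reward is the mean validation top-1 of the last two epochs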
+ sa_nas.reward(float(valid_top1_list[-1] + valid_top1_list[-2]) / 2)
+
+
+def final_test(config, args, image_size, token=None):
+    assert token is not None, "If you want to start a final experiment, you must input a token."
+ sa_nas = SANAS(
+ config, server_addr=(args.server_address, args.port), is_server=True)
+
+ image_shape = [3, image_size, image_size]
+ archs = sa_nas.tokens2arch(token)[0]
+
+ train_program = fluid.Program()
+ test_program = fluid.Program()
+ startup_program = fluid.Program()
+ train_fetch_list, train_loader = build_program(
+ train_program,
+ startup_program,
+ image_shape,
+ archs,
+ args,
+ is_train=True)
+
+ current_params = count_parameters_in_MB(
+ train_program.global_block().all_parameters(), 'cifar10')
+ _logger.info('current_params: {}M'.format(current_params))
+ test_fetch_list, test_loader = build_program(
+ test_program,
+ startup_program,
+ image_shape,
+ archs,
+ args,
+ is_train=False)
+ test_program = test_program.clone(for_test=True)
+
+ place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(startup_program)
+
+ train_reader = reader.train_valid(
+ batch_size=args.batch_size, is_train=True, is_shuffle=True, args=args)
+ test_reader = reader.train_valid(
+ batch_size=args.batch_size,
+ is_train=False,
+ is_shuffle=False,
+ args=args)
+
+ train_loader.set_batch_generator(train_reader, places=place)
+ test_loader.set_batch_generator(test_reader, places=place)
+
+ build_strategy = fluid.BuildStrategy()
+ train_compiled_program = fluid.CompiledProgram(
+ train_program).with_data_parallel(
+ loss_name=train_fetch_list[0].name, build_strategy=build_strategy)
+
+ valid_top1_list = []
+ for epoch_id in range(args.retain_epoch):
+ train_top1 = train(train_compiled_program, exe, epoch_id, train_loader,
+ train_fetch_list, args)
+ _logger.info("TRAIN: Epoch {}, train_acc {:.6f}".format(epoch_id,
+ train_top1))
+ valid_top1 = valid(test_program, exe, epoch_id, test_loader,
+ test_fetch_list, args)
+ _logger.info("TEST: Epoch {}, valid_acc {:.6f}".format(epoch_id,
+ valid_top1))
+ valid_top1_list.append(valid_top1)
+
+ output_dir = os.path.join('darts_output', str(epoch_id))
+ if not os.path.exists(output_dir):
+ os.makedirs(output_dir)
+ fluid.io.save_persistables(exe, output_dir, main_program=train_program)
+
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser(
+        description='SA NAS DartsSpace cifar10 argparse')
+ parser.add_argument(
+ '--use_gpu',
+ type=ast.literal_eval,
+ default=True,
+ help='Whether to use GPU in train/test model.')
+ parser.add_argument(
+ '--batch_size', type=int, default=96, help='batch size.')
+ parser.add_argument(
+ '--is_server',
+ type=ast.literal_eval,
+ default=True,
+ help='Whether to start a server.')
+ parser.add_argument(
+ '--server_address', type=str, default="", help='server ip.')
+ parser.add_argument('--port', type=int, default=8881, help='server port')
+ parser.add_argument(
+ '--retain_epoch', type=int, default=30, help='epoch for each token.')
+ parser.add_argument('--token', type=int, nargs='+', help='final token.')
+ parser.add_argument(
+ '--search_steps',
+ type=int,
+ default=200,
+        help='number of search steps.')
+ args = parser.parse_args()
+ print(args)
+
+ image_size = 32
+
+config = ['DartsSpace']
+
+    if args.token is None:
+ search(config, args, image_size, is_server=args.is_server)
+ else:
+ final_test(config, args, image_size, token=args.token)
diff --git a/demo/nas/search_space_doc.md b/demo/nas/search_space_doc.md
deleted file mode 100644
index 682b0eac801bae4ae59b523475e8fa3c66586190..0000000000000000000000000000000000000000
--- a/demo/nas/search_space_doc.md
+++ /dev/null
@@ -1,116 +0,0 @@
-# paddleslim.nas 提供的搜索空间:
-
-1. 根据原本模型结构构造搜索空间:
-
- 1.1 MobileNetV2Space
-
- 1.2 MobileNetV1Space
-
- 1.3 ResNetSpace
-
-
-2. 根据相应模型的block构造搜索空间
-
- 2.1 MobileNetV1BlockSpace
-
- 2.2 MobileNetV2BlockSpace
-
- 2.3 ResNetBlockSpace
-
- 2.4 InceptionABlockSpace
-
- 2.5 InceptionCBlockSpace
-
-
-##搜索空间的配置介绍:
-
-**input_size(int|None)**:`input_size`表示输入feature map的大小。
-**output_size(int|None)**:`output_size`表示输出feature map的大小。
-**block_num(int|None)**:`block_num`表示搜索空间中block的数量。
-**block_mask(list|None)**:`block_mask`表示当前的block是一个reduction block还是一个normal block,是一组由0、1组成的列表,0表示当前block是normal block,1表示当前block是reduction block。如果设置了`block_mask`,则主要以`block_mask`为主要配置,`input_size`,`output_size`和`block_num`三种配置是无效的。
-
-**Note:**
-1. reduction block表示经过这个block之后的feature map大小下降为之前的一半,normal block表示经过这个block之后feature map大小不变。
-2. `input_size`和`output_size`用来计算整个模型结构中reduction block数量。
-
-
-##搜索空间示例:
-
-1. 使用paddleslim中提供用原本的模型结构来构造搜索空间的话,仅需要指定搜索空间名字即可。例如:如果使用原本的MobileNetV2的搜索空间进行搜索的话,传入SANAS中的config直接指定为[('MobileNetV2Space')]。
-2. 使用paddleslim中提供的block搜索空间构造搜索空间:
- 2.1 使用`input_size`, `output_size`和`block_num`来构造搜索空间。例如:传入SANAS的config可以指定为[('MobileNetV2BlockSpace', {'input_size': 224, 'output_size': 32, 'block_num': 10})]。
- 2.2 使用`block_mask`构造搜索空间。例如:传入SANAS的config可以指定为[('MobileNetV2BlockSpace', {'block_mask': [0, 1, 1, 1, 1, 0, 1, 0]})]。
-
-
-# 自定义搜索空间(search space)
-
-自定义搜索空间类需要继承搜索空间基类并重写以下几部分:
- 1. 初始化的tokens(`init_tokens`函数),可以设置为自己想要的tokens列表, tokens列表中的每个数字指的是当前数字在相应的搜索列表中的索引。例如本示例中若tokens=[0, 3, 5],则代表当前模型结构搜索到的通道数为[8, 40, 128]。
- 2. token中每个数字的搜索列表长度(`range_table`函数),tokens中每个token的索引范围。
- 3. 根据token产生模型结构(`token2arch`函数),根据搜索到的tokens列表产生模型结构。
-
-以新增reset block为例说明如何构造自己的search space。自定义的search space不能和已有的search space同名。
-
-```python
-### 引入搜索空间基类函数和search space的注册类函数
-from .search_space_base import SearchSpaceBase
-from .search_space_registry import SEARCHSPACE
-import numpy as np
-
-### 需要调用注册函数把自定义搜索空间注册到space space中
-@SEARCHSPACE.register
-### 定义一个继承SearchSpaceBase基类的搜索空间的类函数
-class ResNetBlockSpace2(SearchSpaceBase):
- def __init__(self, input_size, output_size, block_num, block_mask):
- ### 定义一些实际想要搜索的内容,例如:通道数、每个卷积的重复次数、卷积核大小等等
- ### self.filter_num 代表通道数的搜索列表
- self.filter_num = np.array([8, 16, 32, 40, 64, 128, 256, 512])
-
- ### 定义初始化token,初始化token的长度根据传入的block_num或者block_mask的长度来得到的
- def init_tokens(self):
- return [0] * 3 * len(self.block_mask)
-
- ### 定义
- def range_table(self):
- return [len(self.filter_num)] * 3 * len(self.block_mask)
-
- def token2arch(self, tokens=None):
- if tokens == None:
- tokens = self.init_tokens()
-
- self.bottleneck_params_list = []
- for i in range(len(self.block_mask)):
- self.bottleneck_params_list.append(self.filter_num[tokens[i * 3 + 0]],
- self.filter_num[tokens[i * 3 + 1]],
- self.filter_num[tokens[i * 3 + 2]],
- 2 if self.block_mask[i] == 1 else 1)
-
- def net_arch(input):
- for i, layer_setting in enumerate(self.bottleneck_params_list):
- channel_num, stride = layer_setting[:-1], layer_setting[-1]
- input = self._resnet_block(input, channel_num, stride, name='resnet_layer{}'.format(i+1))
-
- return input
-
- return net_arch
-
- ### 构造具体block的操作
- def _resnet_block(self, input, channel_num, stride, name=None):
- shortcut_conv = self._shortcut(input, channel_num[2], stride, name=name)
- input = self._conv_bn_layer(input=input, num_filters=channel_num[0], filter_size=1, act='relu', name=name + '_conv0')
- input = self._conv_bn_layer(input=input, num_filters=channel_num[1], filter_size=3, stride=stride, act='relu', name=name + '_conv1')
- input = self._conv_bn_layer(input=input, num_filters=channel_num[2], filter_size=1, name=name + '_conv2')
- return fluid.layers.elementwise_add(x=shortcut_conv, y=input, axis=0, name=name+'_elementwise_add')
-
- def _shortcut(self, input, channel_num, stride, name=None):
- channel_in = input.shape[1]
- if channel_in != channel_num or stride != 1:
- return self.conv_bn_layer(input, num_filters=channel_num, filter_size=1, stride=stride, name=name+'_shortcut')
- else:
- return input
-
- def _conv_bn_layer(self, input, num_filters, filter_size, stride=1, padding='SAME', act=None, name=None):
- conv = fluid.layers.conv2d(input, num_filters, filter_size, stride, name=name+'_conv')
- bn = fluid.layers.batch_norm(conv, act=act, name=name+'_bn')
- return bn
-```
diff --git a/demo/ocr/PaddleOCR b/demo/ocr/PaddleOCR
new file mode 160000
index 0000000000000000000000000000000000000000..56c6c3ae0e5c9ae6b9401a9446c629e513d4617f
--- /dev/null
+++ b/demo/ocr/PaddleOCR
@@ -0,0 +1 @@
+Subproject commit 56c6c3ae0e5c9ae6b9401a9446c629e513d4617f
diff --git a/demo/ocr/README.md b/demo/ocr/README.md
new file mode 100755
index 0000000000000000000000000000000000000000..959c066fd02699e00bba729773c4b62c14208f23
--- /dev/null
+++ b/demo/ocr/README.md
@@ -0,0 +1,183 @@
+[English](README_en.md) | 简体中文
+
+# SlimOCR模型库
+
+
+## 模型
+
+PaddleSlim对[PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)发布的模型进行了压缩,产出了如下一系列小模型:
+
+
+
+
+
+ 序号 |
+ 任务 |
+ 模型 |
+ 压缩策略[3][4] |
+ 精度(自建中文数据集) |
+ 耗时[1](ms) |
+ 整体耗时[2](ms) |
+ 加速比 |
+ 整体模型大小(M) |
+ 压缩比例 |
+ 下载链接 |
+
+
+
+
+ 0 |
+ 检测 |
+ MobileNetV3_DB |
+ 无 |
+ 61.7 |
+ 224 |
+ 375 |
+ - |
+ 8.6 |
+ - |
+ |
+
+
+ 识别 |
+ MobileNetV3_CRNN |
+ 无 |
+ 62.0 |
+ 9.52 |
+ |
+
+
+ 1 |
+ 检测 |
+ SlimTextDet |
+ PACT量化训练 |
+ 62.1 |
+ 195 |
+ 348 |
+ 8% |
+ 2.8 |
+ 67.82% |
+ |
+
+
+ 识别 |
+ SlimTextRec |
+ PACT量化训练 |
+ 61.48 |
+ 8.6 |
+ |
+
+
+ 2 |
+ 检测 |
+ SlimTextDet_quat_pruning |
+ 剪裁+PACT量化训练 |
+ 60.86 |
+ 142 |
+ 288 |
+ 30% |
+ 2.8 |
+ 67.82% |
+ |
+
+
+ 识别 |
+ SlimTextRec |
+ PACT量化训练 |
+ 61.48 |
+ 8.6 |
+ |
+
+
+ 3 |
+ 检测 |
+ SlimTextDet_pruning |
+ 剪裁 |
+ 61.57 |
+ 138 |
+ 295 |
+ 27% |
+ 2.9 |
+ 66.28% |
+ |
+
+
+ 识别 |
+ SlimTextRec |
+ PACT量化训练 |
+ 61.48 |
+ 8.6 |
+ |
+
+
+
+
+
+**注意**:
+
+- [1] 耗时评测环境为:骁龙855芯片+PaddleLite。
+- [2] 整体耗时不等于检测耗时加识别耗时的原因是:识别模型的耗时为单个检测框的耗时,一张图片可能会有多个检测框。
+- [3] 参考下面关于[OCR量化的说明](#OCR量化说明)。
+- [4] 参考下面关于[OCR剪裁的说明](#OCR剪裁说明)。
+
+
+## OCR量化说明
+
+对于OCR模型,普通的量化训练精度损失较大,并且训练不稳定,所以我们选择PACT方法进行量化。
+
+### 文本检测模型
+
+MobileNetV3_DB是一个全卷积模型,我们可以对整个模型进行量化。
+
+整个量化训练的轮数与全精度模型的训练轮数一致,量化的配置如下所示:
+
+```python
+ quant_config = {
+ 'weight_quantize_type': 'channel_wise_abs_max',
+ 'activation_quantize_type': 'moving_average_abs_max',
+ 'weight_bits': 8,
+ 'activation_bits': 8,
+ 'quantize_op_types': ['conv2d', 'depthwise_conv2d', 'mul'],
+ 'dtype': 'int8',
+ 'window_size': 10000,
+ 'moving_rate': 0.9,
+ }
+```
+
+对于PACT参数,我们沿用了论文中的方法,截断阈值$\alpha$的学习率与原模型其他参数保持一致。另外,对其增加一个系数为0.0001的L2正则化,使用`AdamOptimizer`对其进行优化,确保其能快速收敛。
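+
+下面是上述PACT截断函数的一个简化示意(仅作示意:假设通过`paddleslim.quant.quant_aware`的`act_preprocess_func`参数传入,初始阈值等取值均为假设,具体接口以所用版本为准):
+
+```python
+import paddle.fluid as fluid
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.param_attr import ParamAttr
+
+def pact(x, name=None):
+    helper = LayerHelper("pact", **locals())
+    # 可学习的截断阈值alpha:带系数0.0001的L2正则,随其他参数一起训练
+    u_param_attr = ParamAttr(
+        name=x.name + '_pact',
+        initializer=fluid.initializer.ConstantInitializer(value=20.0),
+        regularizer=fluid.regularizer.L2Decay(0.0001),
+        learning_rate=1.0)
+    u_param = helper.create_parameter(
+        attr=u_param_attr, shape=[1], dtype='float32')
+    # 将激活值截断到[-alpha, alpha]后再量化
+    x = fluid.layers.elementwise_sub(
+        x, fluid.layers.relu(fluid.layers.elementwise_sub(x, u_param)))
+    x = fluid.layers.elementwise_add(
+        x, fluid.layers.relu(fluid.layers.elementwise_sub(-u_param, x)))
+    return x
+```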
+
+### 文本识别模型
+
+MobileNetV3_CRNN模型包含一个LSTM组件。由于暂时不支持对LSTM进行量化,我们跳过了这一部分。
+
+通过[scope_guard API](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/executor_cn/scope_guard_cn.html#scope-guard)将LSTM切换到新的作用域`skip_quant`,量化配置中通过`not_quant_pattern`设置不对这一部分进行量化,具体量化配置如下:
+```python
+ quant_config = {
+ 'weight_quantize_type': 'channel_wise_abs_max',
+ 'activation_quantize_type': 'moving_average_abs_max',
+ 'weight_bits': 8,
+ 'activation_bits': 8,
+ 'not_quant_pattern': ['skip_quant'],
+ 'quantize_op_types': ['conv2d', 'depthwise_conv2d', 'mul'],
+ 'dtype': 'int8',
+ 'window_size': 10000,
+ 'moving_rate': 0.9,
+ }
+```
+
+同样地,量化训练的轮数与全精度模型的训练轮数一致,PACT阈值$\alpha$的学习率与原模型其他参数保持一致。我们发现,对$\alpha$使用与原模型其他参数一样的L2正则化系数,量化训练就可以很好地收敛。关于优化器,使用`AdamOptimizer`对其进行优化,确保其能快速收敛。
+
+
+更多量化教程请参考[OCR模型量化压缩教程](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/deploy/slim/quantization/README.md)
+
+
+## OCR剪裁说明
+
+### 敏感度分析
+ 在对OCR文字检测模型进行裁剪敏感度分析时,分析对象为除depthwise convolution外的所有普通卷积层,裁剪的criterion被设置为'geometry_median',pruned_ratios推荐设置为[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]。
+
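+下面是敏感度分析的一个简化示意(`eval_program`、`place`与返回评估指标的`eval_func`均假设已按实际模型构建好,卷积层的筛选规则仅为示意):
+
+```python
+from paddleslim.prune import sensitivity
+
+# 收集除depthwise convolution外的普通卷积层参数
+params = [
+    p.name for p in eval_program.global_block().all_parameters()
+    if 'conv' in p.name and 'depthwise' not in p.name
+]
+
+sens = sensitivity(
+    eval_program,
+    place,
+    params,
+    eval_func,  # 输入裁剪后的program,返回评估指标
+    sensitivities_file='det_sensitivities.data',
+    pruned_ratios=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8])
+```
+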
+### 裁剪与finetune
+ 裁剪时通过之前的敏感度分析文件决定每个网络层的裁剪比例。在具体实现时,为了尽可能多地保留从图像中提取的低阶特征,我们跳过了backbone中靠近输入的4个卷积层。同样,为了减少由于裁剪导致的模型性能损失,我们通过之前敏感度分析所获得的敏感度表,挑选出了一些冗余较少、对裁剪较为敏感的[网络层](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/deploy/slim/prune/pruning_and_finetune.py#L41),并在之后的裁剪过程中选择避开这些网络层。裁剪过后finetune的过程沿用OCR检测模型原始的训练策略。
+
+
+更多OCR剪裁教程请参考[OCR模型剪裁压缩教程](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/deploy/slim/prune/README.md)
diff --git a/demo/one_shot/ofa_train.py b/demo/one_shot/ofa_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a47a219c1096d750757f407cfde4ff37691efb7
--- /dev/null
+++ b/demo/one_shot/ofa_train.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.dygraph.nn as nn
+from paddle.nn import ReLU
+from paddleslim.nas.ofa import OFA, RunConfig, DistillConfig
+from paddleslim.nas.ofa import supernet
+
+
+class Model(fluid.dygraph.Layer):
+ def __init__(self):
+ super(Model, self).__init__()
+ with supernet(
+ kernel_size=(3, 5, 7), expand_ratio=[1, 2, 4]) as ofa_super:
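+            ### layers created in this context are converted by
+            ### ofa_super.convert into elastic variants whose kernel size
+            ### switches among (3, 5, 7) and expand ratio among (1, 2, 4)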
+ models = []
+ models += [nn.Conv2D(1, 6, 3)]
+ models += [ReLU()]
+ models += [nn.Pool2D(2, 'max', 2)]
+ models += [nn.Conv2D(6, 16, 5, padding=0)]
+ models += [ReLU()]
+ models += [nn.Pool2D(2, 'max', 2)]
+ models += [
+ nn.Linear(784, 120), nn.Linear(120, 84), nn.Linear(84, 10)
+ ]
+ models = ofa_super.convert(models)
+ self.models = paddle.nn.Sequential(*models)
+
+ def forward(self, inputs, label, depth=None):
+        if depth is not None:
+ assert isinstance(depth, int)
+ assert depth < len(self.models)
+ models = self.models[:depth]
+ else:
+ depth = len(self.models)
+ models = self.models[:]
+
+ for idx, layer in enumerate(models):
+ if idx == 6:
+ inputs = fluid.layers.flatten(inputs, 1)
+ inputs = layer(inputs)
+
+ inputs = fluid.layers.softmax(inputs)
+ return inputs
+
+
+def test_ofa():
+
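+    ### each sub-list in n_epochs / init_learning_rate configures one
+    ### progressive training phase, and elastic_depth lists the depth
+    ### candidates sampled for sub-networks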
+ default_run_config = {
+ 'train_batch_size': 256,
+ 'eval_batch_size': 64,
+ 'n_epochs': [[1], [2, 3], [4, 5]],
+ 'init_learning_rate': [[0.001], [0.003, 0.001], [0.003, 0.001]],
+ 'dynamic_batch_size': [1, 1, 1],
+        'total_images': 50000,  # 1281167 for ImageNet
+ 'elastic_depth': (2, 5, 8)
+ }
+ run_config = RunConfig(**default_run_config)
+
+ default_distill_config = {
+ 'lambda_distill': 0.01,
+ 'teacher_model': Model,
+ 'mapping_layers': ['models.0.fn']
+ }
+ distill_config = DistillConfig(**default_distill_config)
+
+ fluid.enable_dygraph()
+ model = Model()
+ ofa_model = OFA(model, run_config, distill_config=distill_config)
+
+ train_reader = paddle.fluid.io.batch(
+ paddle.dataset.mnist.train(), batch_size=256, drop_last=True)
+
+ start_epoch = 0
+ for idx in range(len(run_config.n_epochs)):
+ cur_idx = run_config.n_epochs[idx]
+ for ph_idx in range(len(cur_idx)):
+ cur_lr = run_config.init_learning_rate[idx][ph_idx]
+ adam = fluid.optimizer.Adam(
+ learning_rate=cur_lr,
+ parameter_list=(ofa_model.parameters() + ofa_model.netAs_param))
+ for epoch_id in range(start_epoch,
+ run_config.n_epochs[idx][ph_idx]):
+ for batch_id, data in enumerate(train_reader()):
+ dy_x_data = np.array(
+ [x[0].reshape(1, 28, 28)
+ for x in data]).astype('float32')
+ y_data = np.array(
+ [x[1] for x in data]).astype('int64').reshape(-1, 1)
+
+ img = fluid.dygraph.to_variable(dy_x_data)
+ label = fluid.dygraph.to_variable(y_data)
+ label.stop_gradient = True
+
+ for model_no in range(run_config.dynamic_batch_size[idx]):
+ output, _ = ofa_model(img, label)
+ loss = fluid.layers.reduce_mean(output)
+ dis_loss = ofa_model.calc_distill_loss()
+ loss += dis_loss
+ loss.backward()
+
+ if batch_id % 10 == 0:
+ print(
+ 'epoch: {}, batch: {}, loss: {}, distill loss: {}'.
+ format(epoch_id, batch_id,
+ loss.numpy()[0], dis_loss.numpy()[0]))
+                    ### accumulate gradients of the dynamic_batch_size
+                    ### sub-networks sampled for the same batch of data
+                    ### NOTE: gradient accumulation still needs a fix in PaddlePaddle
+ adam.minimize(loss)
+ adam.clear_gradients()
+ start_epoch = run_config.n_epochs[idx][ph_idx]
+
+
+if __name__ == '__main__':
+    test_ofa()
diff --git a/demo/one_shot/train.py b/demo/one_shot/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e8267ff66f37f5b24807eb86c3bdad7182de2b7
--- /dev/null
+++ b/demo/one_shot/train.py
@@ -0,0 +1,207 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import argparse
+import ast
+import numpy as np
+from PIL import Image
+import os
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.optimizer import AdamOptimizer
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
+from paddle.fluid.dygraph.base import to_variable
+
+from paddleslim.nas.one_shot import SuperMnasnet
+from paddleslim.nas.one_shot import OneShotSearch
+
+
+def parse_args():
+ parser = argparse.ArgumentParser("Training for Mnist.")
+ parser.add_argument(
+ "--use_data_parallel",
+ type=ast.literal_eval,
+ default=False,
+ help="The flag indicating whether to use data parallel mode to train the model."
+ )
+ parser.add_argument("-e", "--epoch", default=5, type=int, help="set epoch")
+ parser.add_argument("--ce", action="store_true", help="run ce")
+ args = parser.parse_args()
+ return args
+
+
+class SimpleImgConv(fluid.dygraph.Layer):
+ def __init__(self,
+ num_channels,
+ num_filters,
+ filter_size,
+ conv_stride=1,
+ conv_padding=0,
+ conv_dilation=1,
+ conv_groups=1,
+ act=None,
+ use_cudnn=False,
+ param_attr=None,
+ bias_attr=None):
+ super(SimpleImgConv, self).__init__()
+
+ self._conv2d = Conv2D(
+ num_channels=num_channels,
+ num_filters=num_filters,
+ filter_size=filter_size,
+ stride=conv_stride,
+ padding=conv_padding,
+ dilation=conv_dilation,
+ groups=conv_groups,
+ param_attr=None,
+ bias_attr=None,
+ act=act,
+ use_cudnn=use_cudnn)
+
+ def forward(self, inputs):
+ x = self._conv2d(inputs)
+ return x
+
+
+class MNIST(fluid.dygraph.Layer):
+ def __init__(self):
+ super(MNIST, self).__init__()
+
+ self._simple_img_conv_pool_1 = SimpleImgConv(1, 20, 2, act="relu")
+ self.arch = SuperMnasnet(
+ name_scope="super_net", input_channels=20, out_channels=20)
+ self._simple_img_conv_pool_2 = SimpleImgConv(20, 50, 2, act="relu")
+
+ self.pool_2_shape = 50 * 13 * 13
+ SIZE = 10
+ scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5
+ self._fc = Linear(
+ self.pool_2_shape,
+ 10,
+ param_attr=fluid.param_attr.ParamAttr(
+ initializer=fluid.initializer.NormalInitializer(
+ loc=0.0, scale=scale)),
+ act="softmax")
+
+ def forward(self, inputs, label=None, tokens=None):
+ x = self._simple_img_conv_pool_1(inputs)
+
+        x = self.arch(x, tokens=tokens)
+ x = self._simple_img_conv_pool_2(x)
+ x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape])
+ x = self._fc(x)
+ if label is not None:
+ acc = fluid.layers.accuracy(input=x, label=label)
+ return x, acc
+ else:
+ return x
+
+
+def test_mnist(model, tokens=None):
+ acc_set = []
+ avg_loss_set = []
+ batch_size = 64
+ test_reader = paddle.fluid.io.batch(
+ paddle.dataset.mnist.test(), batch_size=batch_size, drop_last=True)
+ for batch_id, data in enumerate(test_reader()):
+ dy_x_data = np.array([x[0].reshape(1, 28, 28)
+ for x in data]).astype('float32')
+ y_data = np.array(
+ [x[1] for x in data]).astype('int64').reshape(batch_size, 1)
+
+ img = to_variable(dy_x_data)
+ label = to_variable(y_data)
+ label.stop_gradient = True
+ prediction, acc = model.forward(img, label, tokens=tokens)
+ loss = fluid.layers.cross_entropy(input=prediction, label=label)
+ avg_loss = fluid.layers.mean(loss)
+ acc_set.append(float(acc.numpy()))
+ avg_loss_set.append(float(avg_loss.numpy()))
+ if batch_id % 100 == 0:
+ print("Test - batch_id: {}".format(batch_id))
+ # get test acc and loss
+ acc_val_mean = np.array(acc_set).mean()
+ avg_loss_val_mean = np.array(avg_loss_set).mean()
+
+ return acc_val_mean
+
+
+def train_mnist(args, model, tokens=None):
+ epoch_num = args.epoch
+ BATCH_SIZE = 64
+
+ adam = AdamOptimizer(
+ learning_rate=0.001, parameter_list=model.parameters())
+
+ train_reader = paddle.fluid.io.batch(
+ paddle.dataset.mnist.train(), batch_size=BATCH_SIZE, drop_last=True)
+ if args.use_data_parallel:
+ train_reader = fluid.contrib.reader.distributed_batch_reader(
+ train_reader)
+
+ for epoch in range(epoch_num):
+ for batch_id, data in enumerate(train_reader()):
+ dy_x_data = np.array([x[0].reshape(1, 28, 28)
+ for x in data]).astype('float32')
+ y_data = np.array(
+ [x[1] for x in data]).astype('int64').reshape(-1, 1)
+
+ img = to_variable(dy_x_data)
+ label = to_variable(y_data)
+ label.stop_gradient = True
+
+ cost, acc = model.forward(img, label, tokens=tokens)
+
+ loss = fluid.layers.cross_entropy(cost, label)
+ avg_loss = fluid.layers.mean(loss)
+
+ if args.use_data_parallel:
+ avg_loss = model.scale_loss(avg_loss)
+ avg_loss.backward()
+ model.apply_collective_grads()
+ else:
+ avg_loss.backward()
+
+ adam.minimize(avg_loss)
+ model.clear_gradients()
+            if batch_id % 100 == 0:
+ print("Loss at epoch {} step {}: {:}".format(epoch, batch_id,
+ avg_loss.numpy()))
+
+ model.eval()
+ test_acc = test_mnist(model, tokens=tokens)
+ model.train()
+ print("Loss at epoch {} , acc is: {}".format(epoch, test_acc))
+
+ save_parameters = (not args.use_data_parallel) or (
+ args.use_data_parallel and
+ fluid.dygraph.parallel.Env().local_rank == 0)
+ if save_parameters:
+ fluid.save_dygraph(model.state_dict(), "save_temp")
+ print("checkpoint saved")
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ place = fluid.CPUPlace()
+ with fluid.dygraph.guard(place):
+ model = MNIST()
+ # step 1: training super net
+ #train_mnist(args, model)
+ # step 2: search
+ best_tokens = OneShotSearch(model, test_mnist)
+ # step 3: final training
+ # train_mnist(args, model, best_tokens)
diff --git a/demo/optimizer.py b/demo/optimizer.py
index 0f0c57985f839097e9e1ae4643ba2e5a2fb64698..6b8962749b6f5000fadc67356dbb302b57d4c3e7 100644
--- a/demo/optimizer.py
+++ b/demo/optimizer.py
@@ -20,7 +20,6 @@ import math
import paddle.fluid as fluid
import paddle.fluid.layers.ops as ops
-from paddle.fluid.initializer import init_on_cpu
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
lr_strategy = 'cosine_decay'
@@ -40,10 +39,9 @@ def cosine_decay(learning_rate, step_each_epoch, epochs=120):
"""
global_step = _decay_step_counter()
- with init_on_cpu():
- epoch = ops.floor(global_step / step_each_epoch)
- decayed_lr = learning_rate * \
- (ops.cos(epoch * (math.pi / epochs)) + 1)/2
+ epoch = ops.floor(global_step / step_each_epoch)
+ decayed_lr = learning_rate * \
+ (ops.cos(epoch * (math.pi / epochs)) + 1)/2
return decayed_lr
@@ -63,17 +61,16 @@ def cosine_decay_with_warmup(learning_rate, step_each_epoch, epochs=120):
warmup_epoch = fluid.layers.fill_constant(
shape=[1], dtype='float32', value=float(5), force_cpu=True)
- with init_on_cpu():
- epoch = ops.floor(global_step / step_each_epoch)
- with fluid.layers.control_flow.Switch() as switch:
- with switch.case(epoch < warmup_epoch):
- decayed_lr = learning_rate * (global_step /
- (step_each_epoch * warmup_epoch))
- fluid.layers.tensor.assign(input=decayed_lr, output=lr)
- with switch.default():
- decayed_lr = learning_rate * \
- (ops.cos((global_step - warmup_epoch * step_each_epoch) * (math.pi / (epochs * step_each_epoch))) + 1)/2
- fluid.layers.tensor.assign(input=decayed_lr, output=lr)
+ epoch = ops.floor(global_step / step_each_epoch)
+ with fluid.layers.control_flow.Switch() as switch:
+ with switch.case(epoch < warmup_epoch):
+ decayed_lr = learning_rate * (global_step /
+ (step_each_epoch * warmup_epoch))
+ fluid.layers.tensor.assign(input=decayed_lr, output=lr)
+ with switch.default():
+ decayed_lr = learning_rate * \
+ (ops.cos((global_step - warmup_epoch * step_each_epoch) * (math.pi / (epochs * step_each_epoch))) + 1)/2
+ fluid.layers.tensor.assign(input=decayed_lr, output=lr)
return lr
@@ -95,19 +92,18 @@ def exponential_decay_with_warmup(learning_rate,
warmup_epoch = fluid.layers.fill_constant(
shape=[1], dtype='float32', value=float(warm_up_epoch), force_cpu=True)
- with init_on_cpu():
- epoch = ops.floor(global_step / step_each_epoch)
- with fluid.layers.control_flow.Switch() as switch:
- with switch.case(epoch < warmup_epoch):
- decayed_lr = learning_rate * (global_step /
- (step_each_epoch * warmup_epoch))
- fluid.layers.assign(input=decayed_lr, output=lr)
- with switch.default():
- div_res = (global_step - warmup_epoch * step_each_epoch
- ) / decay_epochs
- div_res = ops.floor(div_res)
- decayed_lr = learning_rate * (decay_rate**div_res)
- fluid.layers.assign(input=decayed_lr, output=lr)
+ epoch = ops.floor(global_step / step_each_epoch)
+ with fluid.layers.control_flow.Switch() as switch:
+ with switch.case(epoch < warmup_epoch):
+ decayed_lr = learning_rate * (global_step /
+ (step_each_epoch * warmup_epoch))
+ fluid.layers.assign(input=decayed_lr, output=lr)
+ with switch.default():
+ div_res = (global_step - warmup_epoch * step_each_epoch
+ ) / decay_epochs
+ div_res = ops.floor(div_res)
+ decayed_lr = learning_rate * (decay_rate**div_res)
+ fluid.layers.assign(input=decayed_lr, output=lr)
return lr
diff --git a/demo/pantheon/lexical_anlysis/README.md b/demo/pantheon/lexical_anlysis/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ec3af05d28e42c9d3b0efac962ba9a8d8c283646
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/README.md
@@ -0,0 +1,40 @@
+# Distillation example: Chinese lexical analysis
+This example demonstrates how to use the Pantheon framework for online distillation of a Chinese lexical analysis model on a sample dataset. The results of large-scale online distillation are shown below:
+
+| model | Precision | Recall | F1-score|
+| ------ | ------ | ------ | ------ |
+| BiGRU | 89.2 | 89.4 | 89.3 |
+| BERT fine-tuned | 90.2 | 90.4 | 90.3 |
+| ERNIE fine-tuned | 91.7 | 91.7 | 91.7 |
+| DistillBiGRU | 90.20 | 90.52 | 90.36 |
+
+BiGRU is a BiGRU-based LAC model trained from scratch; BERT fine-tuned fine-tunes the LAC task on the BERT base model; ERNIE fine-tuned fine-tunes the LAC task on the ERNIE base model; DistillBiGRU is trained through large-scale online distillation with ERNIE fine-tuned as the teacher model.
+
+## Introduction
+
+Lexical Analysis of Chinese, or LAC for short, is a joint lexical analysis model that performs Chinese word segmentation, part-of-speech tagging, and named entity recognition in a single model. We evaluate all three tasks jointly on a self-built dataset. The fine-tuned [ERNIE](https://github.com/PaddlePaddle/LARK/tree/develop/ERNIE) model serves as the teacher model and a GRU network as the student model, the two roles required by the Pantheon framework for online distillation.
+
+#### 1. Download the training data set
+
+Download and decompress the dataset file; a `./data/` folder will be created.
+```bash
+python downloads.py dataset
+```
+
+#### 2. Download the Teacher model
+
+```bash
+# download ERNIE finetuned model
+python downloads.py finetuned
+python downloads.py conf
+```
+
+#### 3. Distill the Student model
+```bash
+# start teacher service
+bash run_teacher.sh
+
+# start student service
+bash run_student.sh
+```
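+
+Under the hood, the two scripts wire up a Pantheon teacher service and a student consumer. Below is a minimal sketch of that pattern (the port number, schema keys, and variable names are placeholders for illustration, not the values used by the actual scripts):
+
+```python
+from paddleslim.pantheon import Teacher, Student
+
+# teacher side: serve knowledge (e.g. soft CRF outputs) on a port
+teacher = Teacher(out_path=None, out_port=5002)
+teacher.start()
+teacher.start_knowledge_service(
+    feed_list=[words.name, targets.name],
+    schema={'crf_decode': crf_decode},
+    program=infer_program,
+    reader_config={'batch_generator': batch_generator},
+    exe=exe,
+    times=1)
+
+# student side: connect to the teacher and consume knowledge while training
+student = Student(merge_strategy={'crf_decode': 'mean'})
+student.register_teacher(in_path=None, in_port=5002)
+student.start()
+knowledge_generator = student.get_knowledge_generator(
+    batch_size=32, drop_last=False)
+```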
+
+> If you want to learn more about LAC, you can refer to this repo: https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/lexical_analysis
\ No newline at end of file
diff --git a/demo/pantheon/lexical_anlysis/README_cn.md b/demo/pantheon/lexical_anlysis/README_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..77e4a944012482e8a0b8ca26cbd4c088e6b969a4
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/README_cn.md
@@ -0,0 +1,41 @@
+# 蒸馏样例:中文词法分析
+我们在样例数据集上对中文词法分析模型演示了如何使用Pantheon框架进行在线蒸馏。大规模在线蒸馏的效果如下表所示:
+
+| 模型 | 精度 | 召回率 | F1值|
+| ------ | ------ | ------ | ------ |
+| BiGRU | 89.2 | 89.4 | 89.3 |
+| BERT fine-tuned | 90.2 | 90.4 | 90.3 |
+| ERNIE fine-tuned | 91.7 | 91.7 | 91.7 |
+| DistillBiGRU | 90.20 | 90.52 | 90.36 |
+
+BiGRU 是使用双向GRU网络从头训练LAC任务;BERT fine-tuned 是在BERT base模型上微调LAC任务;ERNIE fine-tuned 是在ERNIE base模型上微调LAC任务;DistillBiGRU 是使用ERNIE fine-tuned模型作为teacher模型,通过大规模蒸馏训练LAC任务。
+
+## 简介
+
+Lexical Analysis of Chinese,简称 LAC,是一个联合的词法分析模型,在单个模型中完成中文分词、词性标注、专名识别任务。我们在自建的数据集上对分词、词性标注、专名识别进行整体的评估效果。我们使用经过finetune的 [ERNIE](https://github.com/PaddlePaddle/LARK/tree/develop/ERNIE) 模型作为Teacher模型,使用GRU作为Student模型,使用Pantheon框架进行在线蒸馏。
+
+#### 1. 下载训练数据集
+
+下载数据集文件,解压后会生成 `./data/` 文件夹
+```bash
+python downloads.py dataset
+```
+
+#### 2. 下载Teacher模型
+
+```bash
+# download ERNIE finetuned model
+python downloads.py finetuned
+python downloads.py conf
+```
+
+#### 3. 蒸馏Student模型
+```bash
+# start teacher service
+bash run_teacher.sh
+
+# start student service
+bash run_student.sh
+```
+
+> 如果你想详细了解LAC的原理可以参照相关repo: https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/lexical_analysis
diff --git a/demo/pantheon/lexical_anlysis/__init__.py b/demo/pantheon/lexical_anlysis/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcc99e781b6d2ae6fa921ef65636817560e98d37
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/__init__.py
@@ -0,0 +1,4 @@
+from .teacher import Teacher
+from .student import Student
+
+__all__ = ['Teacher', 'Student']
diff --git a/demo/pantheon/lexical_anlysis/creator.py b/demo/pantheon/lexical_anlysis/creator.py
new file mode 100644
index 0000000000000000000000000000000000000000..48324091faa87b254a328dac9767744b4f294dbb
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/creator.py
@@ -0,0 +1,260 @@
+# -*- coding: UTF-8 -*-
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Define the function to create lexical analysis model and model's data reader
+"""
+import sys
+import os
+import math
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.initializer import NormalInitializer
+
+from reader import Dataset
+from ernie_reader import SequenceLabelReader
+
+from models.sequence_labeling import nets
+from models.representation.ernie import ernie_encoder, ernie_pyreader
+
+
+def create_model(args, vocab_size, num_labels, mode='train'):
+ """create lac model"""
+
+ # model's input data
+ words = fluid.data(name='words', shape=[-1, 1], dtype='int64', lod_level=1)
+ targets = fluid.data(
+ name='targets', shape=[-1, 1], dtype='int64', lod_level=1)
+ if mode == "train":
+ print("create model mode: ", mode)
+ teacher_crf_decode = fluid.data(
+ name='teacher_crf_decode', shape=[-1, 1], dtype='float32', lod_level=1)
+ else:
+ print("create model mode: ", mode)
+ teacher_crf_decode = None
+
+ feed_list = [words, targets]
+    if teacher_crf_decode is not None:
+ feed_list.append(teacher_crf_decode)
+
+ pyreader = fluid.io.DataLoader.from_generator(
+ feed_list=feed_list,
+ capacity=200,
+ use_double_buffer=True,
+ iterable=False)
+ # for test or train process
+    avg_cost, crf_avg_cost, teacher_cost, crf_decode = nets.lex_net(
+        words, args, vocab_size, num_labels, teacher_crf_decode,
+        for_infer=False, target=targets)
+
+ (precision, recall, f1_score, num_infer_chunks, num_label_chunks,
+ num_correct_chunks) = fluid.layers.chunk_eval(
+ input=crf_decode,
+ label=targets,
+ chunk_scheme="IOB",
+ num_chunk_types=int(math.ceil((num_labels - 1) / 2.0)))
+ chunk_evaluator = fluid.metrics.ChunkEvaluator()
+ chunk_evaluator.reset()
+
+ ret = {
+ "pyreader": pyreader,
+ "words": words,
+ "targets": targets,
+ "avg_cost": avg_cost,
+ "crf_avg_cost": crf_avg_cost,
+ "teacher_cost": teacher_cost,
+ "crf_decode": crf_decode,
+ "precision": precision,
+ "recall": recall,
+ "f1_score": f1_score,
+ "chunk_evaluator": chunk_evaluator,
+ "num_infer_chunks": num_infer_chunks,
+ "num_label_chunks": num_label_chunks,
+ "num_correct_chunks": num_correct_chunks
+ }
+ return ret
+
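+# The generator below assembles LoD-tensor batches for distillation training.
+# In 'train' mode each sample also carries the teacher's per-token emission
+# scores (stacked along the token axis via np.concatenate), so one batch
+# feeds the student its words, the gold labels and the teacher soft targets.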
+def create_lexnet_data_generator(args,
+ reader,
+ file_name,
+ place,
+ mode='train'):
+ if mode == 'train':
+ def wrapper():
+ batch_words, batch_labels, batch_emissions, seq_lens = [], [], None, []
+ emi_lens = []
+ for epoch in range(args.epoch):
+ print("data epoch: {}".format(epoch))
+ for instance in reader.file_reader(file_name, mode="train")():
+ words, labels, emission = instance
+ if len(seq_lens) < args.batch_size:
+ batch_words.append(words)
+ batch_labels.append(labels)
+ if batch_emissions is not None:
+ batch_emissions = np.concatenate((batch_emissions, emission))
+ else:
+ batch_emissions = emission
+ seq_lens.append(len(words))
+ emi_lens.append(emission.shape[0])
+ if len(seq_lens) == args.batch_size:
+ t_words = fluid.create_lod_tensor(batch_words, [seq_lens], place)
+ t_labels = fluid.create_lod_tensor(batch_labels, [seq_lens], place)
+ t_emissions = fluid.create_lod_tensor(batch_emissions, [seq_lens], place)
+ yield t_words, t_labels, t_emissions
+ batch_words, batch_labels, batch_emissions, seq_lens = [], [], None, []
+ emi_lens = []
+
+ if len(seq_lens) > 0:
+ t_words = fluid.create_lod_tensor(batch_words, [seq_lens], place)
+ t_labels = fluid.create_lod_tensor(batch_labels, [seq_lens], place)
+ t_emissions = fluid.create_lod_tensor(batch_emissions, [seq_lens], place)
+ yield t_words, t_labels, t_emissions
+ batch_words, batch_labels, batch_emissions, seq_lens = [], [], None, []
+
+ else:
+ def wrapper():
+ batch_words, batch_labels, seq_lens = [], [], []
+ for instance in reader.file_reader(file_name, mode="test")():
+ words, labels = instance
+ if len(seq_lens) < args.batch_size:
+ batch_words.append(words)
+ batch_labels.append(labels)
+ seq_lens.append(len(words))
+ if len(seq_lens) == args.batch_size:
+ t_words = fluid.create_lod_tensor(batch_words, [seq_lens], place)
+ t_labels = fluid.create_lod_tensor(batch_labels, [seq_lens], place)
+ yield t_words, t_labels
+ batch_words, batch_labels, seq_lens = [], [], []
+
+ if len(seq_lens) > 0:
+ t_words = fluid.create_lod_tensor(batch_words, [seq_lens], place)
+ t_labels = fluid.create_lod_tensor(batch_labels, [seq_lens], place)
+ yield t_words, t_labels
+ batch_words, batch_labels, seq_lens = [], [], []
+ return wrapper
+
+def create_pyreader(args,
+ file_name,
+ feed_list,
+ place,
+ model='lac',
+ reader=None,
+ return_reader=False,
+ mode='train'):
+    # Note: this demo always builds an ERNIE SequenceLabelReader here; the
+    # feed_list, place, model, reader, return_reader and mode arguments are
+    # kept only for interface compatibility with the original LAC code.
+    reader = SequenceLabelReader(
+        vocab_path=args.vocab_path,
+        label_map_config=args.label_map_config,
+        max_seq_len=args.max_seq_len,
+        do_lower_case=args.do_lower_case,
+        random_seed=args.random_seed)
+    return reader.data_generator(
+        file_name, args.batch_size, args.epoch, shuffle=False, phase="train")
+
+
+def create_ernie_model(args, ernie_config):
+ """
+ Create Model for LAC based on ERNIE encoder
+ """
+ # ERNIE's input data
+
+ src_ids = fluid.data(
+ name='src_ids', shape=[-1, args.max_seq_len, 1], dtype='int64')
+ sent_ids = fluid.data(
+ name='sent_ids', shape=[-1, args.max_seq_len, 1], dtype='int64')
+ pos_ids = fluid.data(
+ name='pos_ids', shape=[-1, args.max_seq_len, 1], dtype='int64')
+ input_mask = fluid.data(
+ name='input_mask', shape=[-1, args.max_seq_len, 1], dtype='float32')
+
+ padded_labels = fluid.data(
+ name='padded_labels', shape=[-1, args.max_seq_len, 1], dtype='int64')
+
+ seq_lens = fluid.data(
+ name='seq_lens', shape=[-1], dtype='int64', lod_level=0)
+
+ squeeze_labels = fluid.layers.squeeze(padded_labels, axes=[-1])
+
+ # ernie_pyreader
+ ernie_inputs = {
+ "src_ids": src_ids,
+ "sent_ids": sent_ids,
+ "pos_ids": pos_ids,
+ "input_mask": input_mask,
+ "seq_lens": seq_lens
+ }
+ embeddings = ernie_encoder(ernie_inputs, ernie_config=ernie_config)
+
+ padded_token_embeddings = embeddings["padded_token_embeddings"]
+
+ emission = fluid.layers.fc(
+ size=args.num_labels,
+ input=padded_token_embeddings,
+ param_attr=fluid.ParamAttr(
+ initializer=fluid.initializer.Uniform(
+ low=-args.init_bound, high=args.init_bound),
+ regularizer=fluid.regularizer.L2DecayRegularizer(
+ regularization_coeff=1e-4)),
+ num_flatten_dims=2)
+
+ crf_cost = fluid.layers.linear_chain_crf(
+ input=emission,
+ label=padded_labels,
+ param_attr=fluid.ParamAttr(
+ name='crfw', learning_rate=args.crf_learning_rate),
+ length=seq_lens)
+
+ avg_cost = fluid.layers.mean(x=crf_cost)
+ crf_decode = fluid.layers.crf_decoding(
+ input=emission,
+ param_attr=fluid.ParamAttr(name='crfw'),
+ length=seq_lens)
+
+ (precision, recall, f1_score, num_infer_chunks, num_label_chunks,
+ num_correct_chunks) = fluid.layers.chunk_eval(
+ input=crf_decode,
+ label=squeeze_labels,
+ chunk_scheme="IOB",
+ num_chunk_types=int(math.ceil((args.num_labels - 1) / 2.0)),
+ seq_length=seq_lens)
+ chunk_evaluator = fluid.metrics.ChunkEvaluator()
+ chunk_evaluator.reset()
+
+ ret = {
+ "feed_list":
+ [src_ids, sent_ids, pos_ids, input_mask, padded_labels, seq_lens],
+ "words": src_ids,
+ "pos_ids":pos_ids,
+ "sent_ids":sent_ids,
+ "input_mask":input_mask,
+ "labels": padded_labels,
+ "seq_lens": seq_lens,
+ "avg_cost": avg_cost,
+ "crf_decode": crf_decode,
+ "precision": precision,
+ "recall": recall,
+ "f1_score": f1_score,
+ "chunk_evaluator": chunk_evaluator,
+ "num_infer_chunks": num_infer_chunks,
+ "num_label_chunks": num_label_chunks,
+ "num_correct_chunks": num_correct_chunks,
+ "emission":emission,
+ "alpha": None
+ }
+
+ return ret
diff --git a/demo/pantheon/lexical_anlysis/downloads.py b/demo/pantheon/lexical_anlysis/downloads.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0aae6ece3f12fd66631b256a145a5c1925ed392
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/downloads.py
@@ -0,0 +1,163 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Download script, download dataset and pretrain models.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import io
+import os
+import sys
+import time
+import hashlib
+import tarfile
+import requests
+
+FILE_INFO = {
+ 'BASE_URL': 'https://baidu-nlp.bj.bcebos.com/',
+ 'DATA': {
+ 'name': 'lexical_analysis-dataset-2.0.0.tar.gz',
+ 'md5': '71e4a9a36d0f0177929a1bccedca7dba'
+ },
+    'FINETUNED_MODEL': {
+ 'name': 'lexical_analysis_finetuned-1.0.0.tar.gz',
+ 'md5': "ee2c7614b06dcfd89561fbbdaac34342"
+ },
+ 'CONF': {
+ 'name': 'conf.tar.gz',
+ 'md5': "7a0fe28db46db496fff4361eebaa6515",
+ 'url': 'https://paddlemodels.bj.bcebos.com/PaddleSlim/pantheon/lexical_analysis/',
+ }
+}
+
+
+def usage():
+    desc = ("\nDownload datasets and pretrained models for LAC.\n"
+            "Usage:\n"
+            " 1. python downloads.py all\n"
+            " 2. python downloads.py dataset\n"
+            " 3. python downloads.py finetuned\n"
+            " 4. python downloads.py conf\n")
+    print(desc)
+
+
+def md5file(fname):
+ hash_md5 = hashlib.md5()
+ with io.open(fname, "rb") as fin:
+ for chunk in iter(lambda: fin.read(4096), b""):
+ hash_md5.update(chunk)
+ return hash_md5.hexdigest()
+
+
+def extract(fname, dir_path):
+    """
+    Extract a tar.gz archive into dir_path.
+    """
+    with tarfile.open(fname, "r") as tar:
+        for file_name in tar.getnames():
+            tar.extract(file_name, dir_path)
+            print(file_name)
+
+
+def _download(url, filename, md5sum):
+ """
+ Download file and check md5
+ """
+ retry = 0
+ retry_limit = 3
+ chunk_size = 4096
+ while not (os.path.exists(filename) and md5file(filename) == md5sum):
+ if retry < retry_limit:
+ retry += 1
+ else:
+ raise RuntimeError(
+ "Cannot download dataset ({0}) with retry {1} times.".format(
+ url, retry_limit))
+ try:
+ start = time.time()
+ size = 0
+ res = requests.get(url, stream=True)
+ filesize = int(res.headers['content-length'])
+ if res.status_code == 200:
+ print("[Filesize]: %0.2f MB" % (filesize / 1024 / 1024))
+ # save by chunk
+ with io.open(filename, "wb") as fout:
+ for chunk in res.iter_content(chunk_size=chunk_size):
+ if chunk:
+ fout.write(chunk)
+ size += len(chunk)
+ pr = '>' * int(size * 50 / filesize)
+ print(
+ '\r[Process ]: %s%.2f%%' %
+ (pr, float(size / filesize * 100)),
+ end='')
+ end = time.time()
+ print("\n[CostTime]: %.2f s" % (end - start))
+ except Exception as e:
+ print(e)
+
+
+def download(name, dir_path):
+ if name == 'CONF':
+ url = FILE_INFO[name]['url'] + FILE_INFO[name]['name']
+ else:
+ url = FILE_INFO['BASE_URL'] + FILE_INFO[name]['name']
+ file_path = os.path.join(dir_path, FILE_INFO[name]['name'])
+
+ if not os.path.exists(dir_path):
+ os.makedirs(dir_path)
+
+ # download data
+ print("Downloading : %s" % name)
+ _download(url, file_path, FILE_INFO[name]['md5'])
+
+ # extract data
+ print("Extracting : %s" % file_path)
+ extract(file_path, dir_path)
+ os.remove(file_path)
+
+
+if __name__ == '__main__':
+ if len(sys.argv) != 2:
+ usage()
+ sys.exit(1)
+ pwd = os.path.join(os.path.dirname(__file__), './')
+ ernie_dir = os.path.join(os.path.dirname(__file__), './pretrained')
+
+ if sys.argv[1] == 'all':
+ download('DATA', pwd)
+        download('FINETUNED_MODEL', pwd)
+ download('CONF', pwd)
+
+ if sys.argv[1] == "dataset":
+ download('DATA', pwd)
+
+ elif sys.argv[1] == "finetuned":
+        download('FINETUNED_MODEL', pwd)
+
+ elif sys.argv[1] == "conf":
+ download('CONF', pwd)
+
+ else:
+ usage()
+
diff --git a/demo/pantheon/lexical_anlysis/ernie_reader.py b/demo/pantheon/lexical_anlysis/ernie_reader.py
new file mode 100755
index 0000000000000000000000000000000000000000..5e8b6e4bf1f70b4f2ada8ea3238fa8364b5ba19a
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/ernie_reader.py
@@ -0,0 +1,160 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module provides reader for ernie model
+"""
+
+import sys
+
+from collections import namedtuple
+import numpy as np
+
+sys.path.append("..")
+from preprocess.ernie.task_reader import BaseReader, tokenization
+
+
+def pad_batch_data(insts,
+ pad_idx=0,
+ max_len=128,
+ return_pos=False,
+ return_input_mask=False,
+ return_max_len=False,
+ return_num_token=False,
+ return_seq_lens=False):
+ """
+ Pad the instances to the max sequence length in batch, and generate the
+ corresponding position data and input mask.
+ """
+ return_list = []
+    # all instances are padded to the fixed max_len passed in, not the batch max
+ # Any token included in dict can be used to pad, since the paddings' loss
+ # will be masked out by weights and make no effect on parameter gradients.
+
+ inst_data = np.array(
+ [inst + list([pad_idx] * (max_len - len(inst))) for inst in insts])
+ return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
+
+ # position data
+ if return_pos:
+ inst_pos = np.array([
+ list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
+ for inst in insts
+ ])
+
+ return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
+
+ if return_input_mask:
+ # This is used to avoid attention on paddings.
+ input_mask_data = np.array([[1] * len(inst) + [0] *
+ (max_len - len(inst)) for inst in insts])
+ input_mask_data = np.expand_dims(input_mask_data, axis=-1)
+ return_list += [input_mask_data.astype("float32")]
+
+ if return_max_len:
+ return_list += [max_len]
+
+ if return_num_token:
+ num_token = 0
+ for inst in insts:
+ num_token += len(inst)
+ return_list += [num_token]
+
+ if return_seq_lens:
+ seq_lens = np.array([len(inst) for inst in insts])
+ return_list += [seq_lens.astype("int64").reshape([-1])]
+
+ return return_list if len(return_list) > 1 else return_list[0]
+
+
+class SequenceLabelReader(BaseReader):
+ """SequenceLabelReader"""
+
+ def _pad_batch_records(self, batch_records):
+ batch_token_ids = [record.token_ids for record in batch_records]
+ batch_text_type_ids = [record.text_type_ids for record in batch_records]
+ batch_position_ids = [record.position_ids for record in batch_records]
+ batch_label_ids = [record.label_ids for record in batch_records]
+
+ # padding
+ padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
+ batch_token_ids,
+ max_len=self.max_seq_len,
+ pad_idx=self.pad_id,
+ return_input_mask=True,
+ return_seq_lens=True)
+ padded_text_type_ids = pad_batch_data(
+ batch_text_type_ids, max_len=self.max_seq_len, pad_idx=self.pad_id)
+ padded_position_ids = pad_batch_data(
+ batch_position_ids, max_len=self.max_seq_len, pad_idx=self.pad_id)
+ padded_label_ids = pad_batch_data(
+ batch_label_ids,
+ max_len=self.max_seq_len,
+ pad_idx=len(self.label_map) - 1)
+
+ return_list = [
+ padded_token_ids, padded_text_type_ids, padded_position_ids,
+ input_mask, padded_label_ids, batch_seq_lens
+ ]
+ return return_list
+
+ def _reseg_token_label(self, tokens, labels, tokenizer):
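+        """Re-tokenize each token with the subword tokenizer, expanding its
+        label so that continuation subtokens of a "B-X" token become "I-X"."""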
+ assert len(tokens) == len(labels)
+ ret_tokens = []
+ ret_labels = []
+ for token, label in zip(tokens, labels):
+ sub_token = tokenizer.tokenize(token)
+ if len(sub_token) == 0:
+ continue
+ ret_tokens.extend(sub_token)
+ ret_labels.append(label)
+ if len(sub_token) < 2:
+ continue
+ sub_label = label
+ if label.startswith("B-"):
+ sub_label = "I-" + label[2:]
+ ret_labels.extend([sub_label] * (len(sub_token) - 1))
+
+ assert len(ret_tokens) == len(ret_labels)
+ return ret_tokens, ret_labels
+
+ def _convert_example_to_record(self, example, max_seq_length, tokenizer):
+        # tokens and labels are separated by the "\2" (0x02) control
+        # character in the ERNIE sequence-labeling data format
+        tokens = tokenization.convert_to_unicode(example.text_a).split(u"\2")
+        labels = tokenization.convert_to_unicode(example.label).split(u"\2")
+ tokens, labels = self._reseg_token_label(tokens, labels, tokenizer)
+
+ if len(tokens) > max_seq_length - 2:
+ tokens = tokens[0:(max_seq_length - 2)]
+ labels = labels[0:(max_seq_length - 2)]
+ tokens = ["[CLS]"] + tokens + ["[SEP]"]
+ token_ids = tokenizer.convert_tokens_to_ids(tokens)
+ position_ids = list(range(len(token_ids)))
+ text_type_ids = [0] * len(token_ids)
+ no_entity_id = len(self.label_map) - 1
+ labels = [
+ label if label in self.label_map else u"O" for label in labels
+ ]
+ label_ids = [no_entity_id] + [
+ self.label_map[label] for label in labels
+ ] + [no_entity_id]
+
+ Record = namedtuple(
+ 'Record',
+ ['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
+ record = Record(
+ token_ids=token_ids,
+ text_type_ids=text_type_ids,
+ position_ids=position_ids,
+ label_ids=label_ids)
+ return record
diff --git a/demo/pantheon/lexical_anlysis/eval.py b/demo/pantheon/lexical_anlysis/eval.py
new file mode 100755
index 0000000000000000000000000000000000000000..b7a9072b292322b788d8e06440ce2e87342a47e7
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/eval.py
@@ -0,0 +1,131 @@
+# -*- coding: UTF-8 -*-
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import time
+import sys
+
+import paddle.fluid as fluid
+import paddle
+
+import model_utils
+import reader
+import creator
+sys.path.append('models/')
+from model_check import check_cuda
+from model_check import check_version
+
+parser = argparse.ArgumentParser(__doc__)
+# 1. model parameters
+model_g = model_utils.ArgumentGroup(parser, "model", "model configuration")
+model_g.add_arg("word_emb_dim", int, 128,
+ "The dimension in which a word is embedded.")
+model_g.add_arg("grnn_hidden_dim", int, 128,
+ "The number of hidden nodes in the GRNN layer.")
+model_g.add_arg("bigru_num", int, 2,
+ "The number of bi_gru layers in the network.")
+model_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.")
+
+# 2. data parameters
+data_g = model_utils.ArgumentGroup(parser, "data", "data paths")
+data_g.add_arg("word_dict_path", str, "./conf/word.dic",
+ "The path of the word dictionary.")
+data_g.add_arg("label_dict_path", str, "./conf/tag.dic",
+ "The path of the label dictionary.")
+data_g.add_arg("word_rep_dict_path", str, "./conf/q2b.dic",
+ "The path of the word replacement Dictionary.")
+data_g.add_arg("test_data", str, "./data/test.tsv",
+ "The folder where the training data is located.")
+data_g.add_arg("init_checkpoint", str, "./model_baseline", "Path to init model")
+data_g.add_arg(
+ "batch_size", int, 200,
+ "The number of sequences contained in a mini-batch, "
+ "or the maximum number of tokens (include paddings) contained in a mini-batch."
+)
+
+
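+# Example invocation (hypothetical, using the default paths defined above):
+#   python eval.py --use_cuda false --init_checkpoint ./model_baseline \
+#       --test_data ./data/test.tsv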
+def do_eval(args):
+    print("Running evaluation...")
+ dataset = reader.Dataset(args)
+
+ test_program = fluid.Program()
+ with fluid.program_guard(test_program, fluid.default_startup_program()):
+ with fluid.unique_name.guard():
+ test_ret = creator.create_model(
+ args, dataset.vocab_size, dataset.num_labels, mode='test')
+ test_program = test_program.clone(for_test=True)
+
+ # init executor
+ if args.use_cuda:
+ place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
+ else:
+ place = fluid.CPUPlace()
+
+ pyreader = creator.create_pyreader(
+ args,
+ file_name=args.test_data,
+ feed_list=test_ret['feed_list'],
+ place=place,
+ model='lac',
+ reader=dataset,
+ mode='test')
+
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+
+ # load model
+ model_utils.init_checkpoint(exe, args.init_checkpoint, test_program)
+ test_process(
+ exe=exe, program=test_program, reader=pyreader, test_ret=test_ret)
+
+
+def test_process(exe, program, reader, test_ret):
+    """
+    Run the evaluation loop and print chunk-level metrics.
+    :param exe: the fluid Executor
+    :param program: the test program (cloned for evaluation)
+    :param reader: the data loader feeding the test set
+    :param test_ret: dict holding the fetch targets and the chunk evaluator
+    """
+ test_ret["chunk_evaluator"].reset()
+ start_time = time.time()
+ reader.start()
+ while True:
+ try:
+ nums_infer, nums_label, nums_correct = exe.run(
+ program,
+ fetch_list=[
+ test_ret["num_infer_chunks"],
+ test_ret["num_label_chunks"],
+ test_ret["num_correct_chunks"],
+ ])
+ test_ret["chunk_evaluator"].update(nums_infer, nums_label, nums_correct)
+ except fluid.core.EOFException:
+ reader.reset()
+ break
+
+ precision, recall, f1 = test_ret["chunk_evaluator"].eval()
+ end_time = time.time()
+ print("[test] P: %.5f, R: %.5f, F1: %.5f, elapsed time: %.3f s" %
+ (precision, recall, f1, end_time - start_time))
+
+
+if __name__ == '__main__':
+ args = parser.parse_args()
+ check_cuda(args.use_cuda)
+ check_version()
+ do_eval(args)
diff --git a/demo/pantheon/lexical_anlysis/model_utils.py b/demo/pantheon/lexical_anlysis/model_utils.py
new file mode 100755
index 0000000000000000000000000000000000000000..d9f10b178d8ffc7833b2bbf2580b312bcfc8ff1b
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/model_utils.py
@@ -0,0 +1,248 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+util tools
+"""
+from __future__ import print_function
+import os
+import sys
+import numpy as np
+import paddle.fluid as fluid
+import yaml
+import io
+
+
+def str2bool(v):
+ """
+    argparse cannot parse booleans directly; treat "true"/"t"/"1" as True.
+ """
+ return v.lower() in ("true", "t", "1")
+
+
+class ArgumentGroup(object):
+ """
+ Put arguments to one group
+ """
+
+ def __init__(self, parser, title, des):
+ """none"""
+ self._group = parser.add_argument_group(title=title, description=des)
+
+ def add_arg(self, name, type, default, help, **kwargs):
+ """ Add argument """
+ type = str2bool if type == bool else type
+ self._group.add_argument(
+ "--" + name,
+ default=default,
+ type=type,
+ help=help + ' Default: %(default)s.',
+ **kwargs)
+
+
+def load_yaml(parser, file_name, **kwargs):
+ with io.open(file_name, 'r', encoding='utf8') as f:
+        args = yaml.safe_load(f)
+ for title in args:
+ group = parser.add_argument_group(title=title, description='')
+ for name in args[title]:
+ _type = type(args[title][name]['val'])
+ _type = str2bool if _type == bool else _type
+ group.add_argument(
+ "--" + name,
+ default=args[title][name]['val'],
+ type=_type,
+ help=args[title][name]['meaning'] +
+ ' Default: %(default)s.',
+ **kwargs)
+
+
+def print_arguments(args):
+ """none"""
+ print('----------- Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).items()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+def to_str(string, encoding="utf-8"):
+ """convert to str for print"""
+ if sys.version_info.major == 3:
+ if isinstance(string, bytes):
+ return string.decode(encoding)
+ elif sys.version_info.major == 2:
+ if isinstance(string, unicode):
+ if os.name == 'nt':
+ return string
+ else:
+ return string.encode(encoding)
+ return string
+
+
+def to_lodtensor(data, place):
+ """
+ Convert data in list into lodtensor.
+ """
+ seq_lens = [len(seq) for seq in data]
+ cur_len = 0
+ lod = [cur_len]
+ for l in seq_lens:
+ cur_len += l
+ lod.append(cur_len)
+ flattened_data = np.concatenate(data, axis=0).astype("int64")
+ flattened_data = flattened_data.reshape([len(flattened_data), 1])
+ res = fluid.Tensor()
+ res.set(flattened_data, place)
+ res.set_lod([lod])
+ return res
+
+
+def parse_result(words, crf_decode, dataset):
+ """ parse result """
+ offset_list = (crf_decode.lod())[0]
+ words = np.array(words)
+ crf_decode = np.array(crf_decode)
+ batch_size = len(offset_list) - 1
+
+ batch_out = []
+ for sent_index in range(batch_size):
+ begin, end = offset_list[sent_index], offset_list[sent_index + 1]
+ sent = [dataset.id2word_dict[str(id[0])] for id in words[begin:end]]
+ tags = [
+ dataset.id2label_dict[str(id[0])] for id in crf_decode[begin:end]
+ ]
+
+ sent_out = []
+ tags_out = []
+ parital_word = ""
+ for ind, tag in enumerate(tags):
+ # for the first word
+ if parital_word == "":
+ parital_word = sent[ind]
+ tags_out.append(tag.split('-')[0])
+ continue
+
+ # for the beginning of word
+ if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"):
+ sent_out.append(parital_word)
+ tags_out.append(tag.split('-')[0])
+ parital_word = sent[ind]
+ continue
+
+ parital_word += sent[ind]
+
+ # append the last word, except for len(tags)=0
+ if len(sent_out) < len(tags_out):
+ sent_out.append(parital_word)
+
+ batch_out.append([sent_out, tags_out])
+ return batch_out
+
+
+def parse_padding_result(words, crf_decode, seq_lens, dataset):
+ """ parse padding result """
+ words = np.squeeze(words)
+ batch_size = len(seq_lens)
+
+ batch_out = []
+ for sent_index in range(batch_size):
+
+ sent = [
+ dataset.id2word_dict[str(id)]
+ for id in words[sent_index][1:seq_lens[sent_index] - 1]
+ ]
+ tags = [
+ dataset.id2label_dict[str(id)]
+ for id in crf_decode[sent_index][1:seq_lens[sent_index] - 1]
+ ]
+
+ sent_out = []
+ tags_out = []
+ parital_word = ""
+ for ind, tag in enumerate(tags):
+ # for the first word
+ if parital_word == "":
+ parital_word = sent[ind]
+ tags_out.append(tag.split('-')[0])
+ continue
+
+ # for the beginning of word
+ if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"):
+ sent_out.append(parital_word)
+ tags_out.append(tag.split('-')[0])
+ parital_word = sent[ind]
+ continue
+
+ parital_word += sent[ind]
+
+ # append the last word, except for len(tags)=0
+ if len(sent_out) < len(tags_out):
+ sent_out.append(parital_word)
+
+ batch_out.append([sent_out, tags_out])
+ return batch_out
+
+
+def init_checkpoint(exe, init_checkpoint_path, main_program):
+ """
+ Init CheckPoint
+ """
+    assert os.path.exists(
+        init_checkpoint_path), "[%s] cannot be found." % init_checkpoint_path
+
+    def existed_persistables(var):
+        """
+        Whether var is persistable and exists in the checkpoint path.
+        """
+        if not fluid.io.is_persistable(var):
+            return False
+        if os.path.exists(os.path.join(init_checkpoint_path, var.name)):
+            print("INIT {}".format(var.name))
+            return True
+        else:
+            print("SKIP {}".format(var.name))
+            return False
+
+    fluid.io.load_vars(
+        exe,
+        init_checkpoint_path,
+        main_program=main_program,
+        predicate=existed_persistables)
+ print("Load model from {}".format(init_checkpoint_path))
+
+
+def init_pretraining_params(exe,
+ pretraining_params_path,
+ main_program,
+ use_fp16=False):
+ """load params of pretrained model, NOT including moment, learning_rate"""
+    assert os.path.exists(pretraining_params_path), \
+        "[%s] cannot be found." % pretraining_params_path
+
+ def _existed_params(var):
+ if not isinstance(var, fluid.framework.Parameter):
+ return False
+ if os.path.exists(os.path.join(pretraining_params_path, var.name)):
+ print("INIT {}".format(var.name))
+ return True
+ else:
+ print("SKIP {}".format(var.name))
+ return False
+
+ fluid.io.load_vars(
+ exe,
+ pretraining_params_path,
+ main_program=main_program,
+ predicate=_existed_params)
+ print("Load pretraining parameters from {}.".format(
+ pretraining_params_path))
diff --git a/demo/pantheon/lexical_anlysis/models/__init__.py b/demo/pantheon/lexical_anlysis/models/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/demo/pantheon/lexical_anlysis/models/model_check.py b/demo/pantheon/lexical_anlysis/models/model_check.py
new file mode 100755
index 0000000000000000000000000000000000000000..51713452a7f0b1019c7b8b7d37d24e0c5f15c77c
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/models/model_check.py
@@ -0,0 +1,73 @@
+#encoding=utf8
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import paddle
+import paddle.fluid as fluid
+
+
+def check_cuda(use_cuda, err = \
+ "\nYou can not set use_cuda = True in the model because you are using paddlepaddle-cpu.\n \
+ Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda = False to run models on CPU.\n"
+ ):
+ """
+ Log error and exit when set use_gpu=true in paddlepaddle
+ cpu version.
+ """
+ try:
+        if use_cuda and not fluid.is_compiled_with_cuda():
+ print(err)
+ sys.exit(1)
+ except Exception as e:
+ pass
+
+def check_version():
+ """
+ Log error and exit when the installed version of paddlepaddle is
+ not satisfied.
+ """
+ err = "PaddlePaddle version 1.6 or higher is required, " \
+ "or a suitable develop version is satisfied as well. \n" \
+ "Please make sure the version is good with your code." \
+
+ try:
+ fluid.require_version('1.6.0')
+ except Exception as e:
+ print(err)
+ sys.exit(1)
+
+
+if __name__ == "__main__":
+ check_cuda(True)
+
+ check_cuda(False)
+
+ check_cuda(True, "This is only for testing.")
diff --git a/demo/pantheon/lexical_anlysis/models/representation/__init__.py b/demo/pantheon/lexical_anlysis/models/representation/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/demo/pantheon/lexical_anlysis/models/representation/ernie.py b/demo/pantheon/lexical_anlysis/models/representation/ernie.py
new file mode 100755
index 0000000000000000000000000000000000000000..ced3196f8953b74ca7d7aa67ac72e3ec99cbca84
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/models/representation/ernie.py
@@ -0,0 +1,322 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module provides ErnieModel and ErnieConfig
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+
+import six
+import paddle.fluid as fluid
+
+from models.transformer_encoder import encoder, pre_process_layer
+
+
+def ernie_pyreader(args, pyreader_name):
+ """define standard ernie pyreader"""
+ src_ids = fluid.data(name='1', shape=[-1, args.max_seq_len, 1], dtype='int64')
+ sent_ids = fluid.data(name='2', shape=[-1, args.max_seq_len, 1], dtype='int64')
+ pos_ids = fluid.data(name='3', shape=[-1, args.max_seq_len, 1], dtype='int64')
+ input_mask = fluid.data(name='4', shape=[-1, args.max_seq_len, 1], dtype='float32')
+ labels = fluid.data(name='5', shape=[-1, 1], dtype='int64')
+ seq_lens = fluid.data(name='6', shape=[-1], dtype='int64')
+
+ pyreader = fluid.io.DataLoader.from_generator(
+ feed_list=[src_ids, sent_ids, pos_ids, input_mask, labels, seq_lens],
+ capacity=50,
+ iterable=False,
+ use_double_buffer=True)
+
+ ernie_inputs = {
+ "src_ids": src_ids,
+ "sent_ids": sent_ids,
+ "pos_ids": pos_ids,
+ "input_mask": input_mask,
+ "seq_lens": seq_lens
+ }
+ return pyreader, ernie_inputs, labels
+
+
+def ernie_encoder_with_paddle_hub(ernie_inputs, max_seq_len):
+    """Return sentence/token embeddings from the PaddleHub ERNIE module."""
+    import paddlehub as hub  # lazy import: only needed for the PaddleHub path
+    ernie = hub.Module(name="ernie")
+ inputs, outputs, program = ernie.context(
+ trainable=True, max_seq_len=max_seq_len, learning_rate=1)
+
+ main_program = fluid.default_main_program()
+ input_dict = {
+ inputs["input_ids"].name: ernie_inputs["src_ids"],
+ inputs["segment_ids"].name: ernie_inputs["sent_ids"],
+ inputs["position_ids"].name: ernie_inputs["pos_ids"],
+ inputs["input_mask"].name: ernie_inputs["input_mask"]
+ }
+
+ hub.connect_program(
+ pre_program=main_program,
+ next_program=program,
+ input_dict=input_dict,
+ inplace=True)
+
+ enc_out = outputs["sequence_output"]
+ unpad_enc_out = fluid.layers.sequence_unpad(
+ enc_out, length=ernie_inputs["seq_lens"])
+ cls_feats = outputs["pooled_output"]
+
+ embeddings = {
+ "sentence_embeddings": cls_feats,
+ "token_embeddings": unpad_enc_out,
+ "padded_token_embeddings": enc_out
+ }
+
+ for k, v in embeddings.items():
+ v.persistable = True
+
+ return embeddings
+
+
+def ernie_encoder(ernie_inputs, ernie_config):
+ """return sentence embedding and token embeddings"""
+
+ ernie = ErnieModel(
+ src_ids=ernie_inputs["src_ids"],
+ position_ids=ernie_inputs["pos_ids"],
+ sentence_ids=ernie_inputs["sent_ids"],
+ input_mask=ernie_inputs["input_mask"],
+ config=ernie_config)
+
+ enc_out = ernie.get_sequence_output()
+ unpad_enc_out = fluid.layers.sequence_unpad(
+ enc_out, length=ernie_inputs["seq_lens"])
+ cls_feats = ernie.get_pooled_output()
+
+ embeddings = {
+ "sentence_embeddings": cls_feats,
+ "token_embeddings": unpad_enc_out,
+ "padded_token_embeddings": enc_out
+ }
+
+ for k, v in embeddings.items():
+ v.persistable = True
+
+ return embeddings
+
+
+class ErnieConfig(object):
+ """ErnieConfig"""
+
+ def __init__(self, config_path):
+ self._config_dict = self._parse(config_path)
+
+ def _parse(self, config_path):
+ try:
+ with open(config_path) as json_file:
+ config_dict = json.load(json_file)
+ except Exception:
+ raise IOError("Error in parsing Ernie model config file '%s'" %
+ config_path)
+ else:
+ return config_dict
+
+ def __getitem__(self, key):
+ return self._config_dict[key]
+
+ def print_config(self):
+ """print config"""
+ for arg, value in sorted(six.iteritems(self._config_dict)):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
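+# Typical usage (the config path is illustrative):
+#   ernie_config = ErnieConfig("./pretrained/ernie_config.json")
+#   ernie_config.print_config()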
+
+class ErnieModel(object):
+ """ErnieModel"""
+
+ def __init__(self,
+ src_ids,
+ position_ids,
+ sentence_ids,
+ input_mask,
+ config,
+ weight_sharing=True,
+ use_fp16=False):
+
+ self._emb_size = config['hidden_size']
+ self._n_layer = config['num_hidden_layers']
+ self._n_head = config['num_attention_heads']
+ self._voc_size = config['vocab_size']
+ self._max_position_seq_len = config['max_position_embeddings']
+ self._sent_types = config['type_vocab_size']
+ self._hidden_act = config['hidden_act']
+ self._prepostprocess_dropout = config['hidden_dropout_prob']
+ self._attention_dropout = config['attention_probs_dropout_prob']
+ self._weight_sharing = weight_sharing
+
+ self._word_emb_name = "word_embedding"
+ self._pos_emb_name = "pos_embedding"
+ self._sent_emb_name = "sent_embedding"
+ self._dtype = "float16" if use_fp16 else "float32"
+
+        # Initialize all weights with a truncated normal initializer; biases
+        # are initialized to constant zero by default.
+ self._param_initializer = fluid.initializer.TruncatedNormal(
+ scale=config['initializer_range'])
+
+ self._build_model(src_ids, position_ids, sentence_ids, input_mask)
+
+ def _build_model(self, src_ids, position_ids, sentence_ids, input_mask):
+ # padding id in vocabulary must be set to 0
+ emb_out = fluid.layers.embedding(
+ input=src_ids,
+ size=[self._voc_size, self._emb_size],
+ dtype=self._dtype,
+ param_attr=fluid.ParamAttr(
+ name=self._word_emb_name, initializer=self._param_initializer),
+ is_sparse=False)
+ position_emb_out = fluid.layers.embedding(
+ input=position_ids,
+ size=[self._max_position_seq_len, self._emb_size],
+ dtype=self._dtype,
+ param_attr=fluid.ParamAttr(
+ name=self._pos_emb_name, initializer=self._param_initializer))
+
+ sent_emb_out = fluid.layers.embedding(
+ sentence_ids,
+ size=[self._sent_types, self._emb_size],
+ dtype=self._dtype,
+ param_attr=fluid.ParamAttr(
+ name=self._sent_emb_name, initializer=self._param_initializer))
+
+ emb_out = emb_out + position_emb_out
+ emb_out = emb_out + sent_emb_out
+
+ emb_out = pre_process_layer(
+ emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')
+
+ if self._dtype == "float16":
+ input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)
+ self_attn_mask = fluid.layers.matmul(
+ x=input_mask, y=input_mask, transpose_y=True)
+
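+        # Turn the {0,1} mask outer product into an additive attention bias:
+        # scale(x, 10000, bias=-1, bias_after_scale=False) computes
+        # (x - 1) * 10000, i.e. 0 for valid positions, -10000 for padding.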
+ self_attn_mask = fluid.layers.scale(
+ x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
+ n_head_self_attn_mask = fluid.layers.stack(
+ x=[self_attn_mask] * self._n_head, axis=1)
+ n_head_self_attn_mask.stop_gradient = True
+
+ self._enc_out = encoder(
+ enc_input=emb_out,
+ attn_bias=n_head_self_attn_mask,
+ n_layer=self._n_layer,
+ n_head=self._n_head,
+ d_key=self._emb_size // self._n_head,
+ d_value=self._emb_size // self._n_head,
+ d_model=self._emb_size,
+ d_inner_hid=self._emb_size * 4,
+ prepostprocess_dropout=self._prepostprocess_dropout,
+ attention_dropout=self._attention_dropout,
+ relu_dropout=0,
+ hidden_act=self._hidden_act,
+ preprocess_cmd="",
+ postprocess_cmd="dan",
+ param_initializer=self._param_initializer,
+ name='encoder')
+
+ def get_sequence_output(self):
+ """Get embedding of each token for squence labeling"""
+ return self._enc_out
+
+ def get_pooled_output(self):
+ """Get the first feature of each sequence for classification"""
+ next_sent_feat = fluid.layers.slice(
+ input=self._enc_out, axes=[1], starts=[0], ends=[1])
+ next_sent_feat = fluid.layers.fc(
+ input=next_sent_feat,
+ size=self._emb_size,
+ act="tanh",
+ param_attr=fluid.ParamAttr(
+ name="pooled_fc.w_0", initializer=self._param_initializer),
+ bias_attr="pooled_fc.b_0")
+ return next_sent_feat
+
+ def get_pretraining_output(self, mask_label, mask_pos, labels):
+ """Get the loss & accuracy for pretraining"""
+
+ mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
+
+ # extract the first token feature in each sentence
+ next_sent_feat = self.get_pooled_output()
+ reshaped_emb_out = fluid.layers.reshape(
+ x=self._enc_out, shape=[-1, self._emb_size])
+ # extract masked tokens' feature
+ mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
+
+ # transform: fc
+ mask_trans_feat = fluid.layers.fc(
+ input=mask_feat,
+ size=self._emb_size,
+ act=self._hidden_act,
+ param_attr=fluid.ParamAttr(
+ name='mask_lm_trans_fc.w_0',
+ initializer=self._param_initializer),
+ bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0'))
+ # transform: layer norm
+ mask_trans_feat = pre_process_layer(
+ mask_trans_feat, 'n', name='mask_lm_trans')
+
+ mask_lm_out_bias_attr = fluid.ParamAttr(
+ name="mask_lm_out_fc.b_0",
+ initializer=fluid.initializer.Constant(value=0.0))
+ if self._weight_sharing:
+ fc_out = fluid.layers.matmul(
+ x=mask_trans_feat,
+ y=fluid.default_main_program().global_block().var(
+ self._word_emb_name),
+ transpose_y=True)
+ fc_out += fluid.layers.create_parameter(
+ shape=[self._voc_size],
+ dtype=self._dtype,
+ attr=mask_lm_out_bias_attr,
+ is_bias=True)
+
+ else:
+ fc_out = fluid.layers.fc(input=mask_trans_feat,
+ size=self._voc_size,
+ param_attr=fluid.ParamAttr(
+ name="mask_lm_out_fc.w_0",
+ initializer=self._param_initializer),
+ bias_attr=mask_lm_out_bias_attr)
+
+ mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
+ logits=fc_out, label=mask_label)
+ mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)
+
+ next_sent_fc_out = fluid.layers.fc(
+ input=next_sent_feat,
+ size=2,
+ param_attr=fluid.ParamAttr(
+ name="next_sent_fc.w_0", initializer=self._param_initializer),
+ bias_attr="next_sent_fc.b_0")
+
+ next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(
+ logits=next_sent_fc_out, label=labels, return_softmax=True)
+
+ next_sent_acc = fluid.layers.accuracy(
+ input=next_sent_softmax, label=labels)
+
+ mean_next_sent_loss = fluid.layers.mean(next_sent_loss)
+
+ loss = mean_next_sent_loss + mean_mask_lm_loss
+ return next_sent_acc, mean_mask_lm_loss, loss
diff --git a/demo/pantheon/lexical_anlysis/models/sequence_labeling/__init__.py b/demo/pantheon/lexical_anlysis/models/sequence_labeling/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/demo/pantheon/lexical_anlysis/models/sequence_labeling/nets.py b/demo/pantheon/lexical_anlysis/models/sequence_labeling/nets.py
new file mode 100755
index 0000000000000000000000000000000000000000..414e89b008ed8809396b894a6af1dc6c8fe469ce
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/models/sequence_labeling/nets.py
@@ -0,0 +1,174 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The function lex_net() defines the lexical analysis network structure.
+"""
+import sys
+import os
+import math
+
+import paddle.fluid as fluid
+from paddle.fluid.initializer import NormalInitializer
+
+
+def lex_net(word, args, vocab_size, num_labels, teacher_crf_decode=None,
+            for_infer=True, target=None):
+    """
+    Define the lexical analysis network structure.
+    word: the input variable holding the word ids
+    for_infer: a boolean indicating whether the model is created for
+        prediction (True) or training (False)
+
+    return:
+    for infer: the CRF decoding result
+    otherwise: (avg_cost, crf_avg_cost, teacher_cost, crf_decode)
+    """
+ word_emb_dim = args.word_emb_dim
+ grnn_hidden_dim = args.grnn_hidden_dim
+ emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir(args) else 1.0
+    crf_lr = args.crf_learning_rate if 'crf_learning_rate' in dir(args) else 1.0
+ bigru_num = args.bigru_num
+ init_bound = 0.1
+ IS_SPARSE = True
+
+ def _bigru_layer(input_feature):
+ """
+ define the bidirectional gru layer
+ """
+ pre_gru = fluid.layers.fc(
+ input=input_feature,
+ size=grnn_hidden_dim * 3,
+ param_attr=fluid.ParamAttr(
+ initializer=fluid.initializer.Uniform(
+ low=-init_bound, high=init_bound),
+ regularizer=fluid.regularizer.L2DecayRegularizer(
+ regularization_coeff=1e-4)))
+ gru = fluid.layers.dynamic_gru(
+ input=pre_gru,
+ size=grnn_hidden_dim,
+ param_attr=fluid.ParamAttr(
+ initializer=fluid.initializer.Uniform(
+ low=-init_bound, high=init_bound),
+ regularizer=fluid.regularizer.L2DecayRegularizer(
+ regularization_coeff=1e-4)))
+
+ pre_gru_r = fluid.layers.fc(
+ input=input_feature,
+ size=grnn_hidden_dim * 3,
+ param_attr=fluid.ParamAttr(
+ initializer=fluid.initializer.Uniform(
+ low=-init_bound, high=init_bound),
+ regularizer=fluid.regularizer.L2DecayRegularizer(
+ regularization_coeff=1e-4)))
+ gru_r = fluid.layers.dynamic_gru(
+ input=pre_gru_r,
+ size=grnn_hidden_dim,
+ is_reverse=True,
+ param_attr=fluid.ParamAttr(
+ initializer=fluid.initializer.Uniform(
+ low=-init_bound, high=init_bound),
+ regularizer=fluid.regularizer.L2DecayRegularizer(
+ regularization_coeff=1e-4)))
+
+ bi_merge = fluid.layers.concat(input=[gru, gru_r], axis=1)
+ return bi_merge
+
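+    # Soft-label distillation helpers. `pred` (the loss used below) is the
+    # mean of -softmax(teacher) * log_softmax(student / t) over the emission
+    # scores, with temperature t applied to the student logits;
+    # `cross_entropy` and `kl_div` are alternative sequence-level variants.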
+    def log_softmax(logits, axis=-1):
+        # keep_dim=True keeps the reduced axis so the subtraction broadcasts
+        logsoftmax = logits - fluid.layers.log(
+            fluid.layers.reduce_sum(
+                fluid.layers.exp(logits), dim=axis, keep_dim=True))
+        return logsoftmax
+
+ def cross_entropy(student, teacher):
+ ce_loss = -1.0 * fluid.layers.reduce_sum(teacher*fluid.layers.log(student), dim=1)
+ ce_loss = fluid.layers.sequence_pool(ce_loss, "sum")
+ return ce_loss
+
+ def kl_div(student, teacher):
+ ce_loss = fluid.layers.reduce_sum(teacher*(fluid.layers.log(teacher) - fluid.layers.log(student)), dim=1)
+ ce_loss = fluid.layers.sequence_pool(ce_loss, "sum")
+ return ce_loss
+
+    def pred(student, teacher, t=1.0):
+        return fluid.layers.reduce_mean(
+            -1.0 * fluid.layers.softmax(teacher) * log_softmax(student / t))
+
+ def normalize(alpha):
+ """ alpha shape (-1, 57)
+ """
+ tag_num = alpha.shape[1]
+ sum_alpha = fluid.layers.reduce_sum(alpha, dim=1)
+ sum_alpha = fluid.layers.unsqueeze(sum_alpha, axes=[1])
+ sum_alpha = fluid.layers.expand(sum_alpha, [1, tag_num])
+ norm_alpha = alpha / sum_alpha
+ return norm_alpha
+
+ def _net_conf(word, target=None):
+ """
+ Configure the network
+ """
+ word_embedding = fluid.embedding(
+ input=word,
+ size=[vocab_size, word_emb_dim],
+ dtype='float32',
+ is_sparse=IS_SPARSE,
+ param_attr=fluid.ParamAttr(
+ learning_rate=emb_lr,
+ name="word_emb",
+ initializer=fluid.initializer.Uniform(
+ low=-init_bound, high=init_bound)))
+
+ input_feature = word_embedding
+ for i in range(bigru_num):
+ bigru_output = _bigru_layer(input_feature)
+ input_feature = bigru_output
+
+ emission = fluid.layers.fc(
+ size=num_labels,
+ input=bigru_output,
+ param_attr=fluid.ParamAttr(
+ initializer=fluid.initializer.Uniform(
+ low=-init_bound, high=init_bound),
+ regularizer=fluid.regularizer.L2DecayRegularizer(
+ regularization_coeff=1e-4)))
+
+ if target is not None:
+ crf_cost = fluid.layers.linear_chain_crf(
+ input=emission,
+ label=target,
+ param_attr=fluid.ParamAttr(
+ name='crfw', learning_rate=crf_lr))
+ if teacher_crf_decode is not None:
+                teacher_cost = pred(
+                    student=emission, teacher=teacher_crf_decode, t=1.0)
+            else:
+                teacher_cost = 0
+                print('no teacher emission')
+            crf_avg_cost = fluid.layers.mean(x=crf_cost)
+            # total loss: weighted sum of the CRF loss and distillation loss
+            alpha, beta = 0.5, 0.5
+            print("avg_cost = alpha * crf_avg_cost + beta * teacher_cost, "
+                  "alpha=%s, beta=%s" % (alpha, beta))
+            avg_cost = alpha * crf_avg_cost + beta * teacher_cost
+ crf_decode = fluid.layers.crf_decoding(
+ input=emission, param_attr=fluid.ParamAttr(name='crfw'))
+ return avg_cost, crf_avg_cost, teacher_cost, crf_decode
+
+ else:
+ size = emission.shape[1]
+ fluid.layers.create_parameter(
+ shape=[size + 2, size], dtype=emission.dtype, name='crfw')
+ crf_decode = fluid.layers.crf_decoding(
+ input=emission, param_attr=fluid.ParamAttr(name='crfw'))
+
+ return crf_decode
+
+ if for_infer:
+ return _net_conf(word)
+
+ else:
+        assert target is not None, "target is necessary for training"
+ return _net_conf(word, target)
diff --git a/demo/pantheon/lexical_anlysis/models/transformer_encoder.py b/demo/pantheon/lexical_anlysis/models/transformer_encoder.py
new file mode 100755
index 0000000000000000000000000000000000000000..77908896cd2d5beebecd86cd873fafb22b999407
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/models/transformer_encoder.py
@@ -0,0 +1,342 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Transformer encoder."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from functools import partial
+
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+
+
+def multi_head_attention(queries,
+ keys,
+ values,
+ attn_bias,
+ d_key,
+ d_value,
+ d_model,
+ n_head=1,
+ dropout_rate=0.,
+ cache=None,
+ param_initializer=None,
+ name='multi_head_att'):
+ """
+    Multi-Head Attention. Note that attn_bias is added to the logits before
+    computing the softmax activation, to mask certain selected positions so
+    that they will not be considered in attention weights.
+ """
+ keys = queries if keys is None else keys
+ values = keys if values is None else values
+
+ if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
+ raise ValueError(
+ "Inputs: quries, keys and values should all be 3-D tensors.")
+
+ def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
+ """
+ Add linear projection to queries, keys, and values.
+ """
+ q = layers.fc(input=queries,
+ size=d_key * n_head,
+ num_flatten_dims=2,
+ param_attr=fluid.ParamAttr(
+ name=name + '_query_fc.w_0',
+ initializer=param_initializer),
+ bias_attr=name + '_query_fc.b_0')
+ k = layers.fc(input=keys,
+ size=d_key * n_head,
+ num_flatten_dims=2,
+ param_attr=fluid.ParamAttr(
+ name=name + '_key_fc.w_0',
+ initializer=param_initializer),
+ bias_attr=name + '_key_fc.b_0')
+ v = layers.fc(input=values,
+ size=d_value * n_head,
+ num_flatten_dims=2,
+ param_attr=fluid.ParamAttr(
+ name=name + '_value_fc.w_0',
+ initializer=param_initializer),
+ bias_attr=name + '_value_fc.b_0')
+ return q, k, v
+
+ def __split_heads(x, n_head):
+ """
+        Reshape the last dimension of input tensor x so that it becomes two
+ dimensions and then transpose. Specifically, input a tensor with shape
+ [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
+ with shape [bs, n_head, max_sequence_length, hidden_dim].
+ """
+ hidden_size = x.shape[-1]
+ # The value 0 in shape attr means copying the corresponding dimension
+ # size of the input as the output dimension size.
+ reshaped = layers.reshape(
+ x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
+
+        # permute the dimensions into:
+ # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
+ return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
+
+ def __combine_heads(x):
+ """
+        Transpose and then reshape the last two dimensions of input tensor x
+ so that it becomes one dimension, which is reverse to __split_heads.
+ """
+ if len(x.shape) == 3:
+ return x
+ if len(x.shape) != 4:
+ raise ValueError("Input(x) should be a 4-D Tensor.")
+
+ trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
+ # The value 0 in shape attr means copying the corresponding dimension
+ # size of the input as the output dimension size.
+ return layers.reshape(
+ x=trans_x,
+ shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
+ inplace=True)
+
+ def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
+ """
+ Scaled Dot-Product Attention
+ """
+ scaled_q = layers.scale(x=q, scale=d_key**-0.5)
+ product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
+ if attn_bias:
+ product += attn_bias
+ weights = layers.softmax(product)
+ if dropout_rate:
+ weights = layers.dropout(
+ weights,
+ dropout_prob=dropout_rate,
+ dropout_implementation="upscale_in_train",
+ is_test=False)
+ out = layers.matmul(weights, v)
+ return out
+
+ q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
+
+ if cache is not None: # use cache and concat time steps
+ # Since the inplace reshape in __split_heads changes the shape of k and
+ # v, which is the cache input for next time step, reshape the cache
+ # input from the previous time step first.
+ k = cache["k"] = layers.concat(
+ [layers.reshape(
+ cache["k"], shape=[0, 0, d_model]), k], axis=1)
+ v = cache["v"] = layers.concat(
+ [layers.reshape(
+ cache["v"], shape=[0, 0, d_model]), v], axis=1)
+
+ q = __split_heads(q, n_head)
+ k = __split_heads(k, n_head)
+ v = __split_heads(v, n_head)
+
+ ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
+ dropout_rate)
+
+ out = __combine_heads(ctx_multiheads)
+
+ # Project back to the model size.
+ proj_out = layers.fc(input=out,
+ size=d_model,
+ num_flatten_dims=2,
+ param_attr=fluid.ParamAttr(
+ name=name + '_output_fc.w_0',
+ initializer=param_initializer),
+ bias_attr=name + '_output_fc.b_0')
+ return proj_out
+
+
+def positionwise_feed_forward(x,
+ d_inner_hid,
+ d_hid,
+ dropout_rate,
+ hidden_act,
+ param_initializer=None,
+ name='ffn'):
+ """
+ Position-wise Feed-Forward Networks.
+ This module consists of two linear transformations with a ReLU activation
+ in between, which is applied to each position separately and identically.
+ """
+ hidden = layers.fc(input=x,
+ size=d_inner_hid,
+ num_flatten_dims=2,
+ act=hidden_act,
+ param_attr=fluid.ParamAttr(
+ name=name + '_fc_0.w_0',
+ initializer=param_initializer),
+ bias_attr=name + '_fc_0.b_0')
+ if dropout_rate:
+ hidden = layers.dropout(
+ hidden,
+ dropout_prob=dropout_rate,
+ dropout_implementation="upscale_in_train",
+ is_test=False)
+ out = layers.fc(input=hidden,
+ size=d_hid,
+ num_flatten_dims=2,
+ param_attr=fluid.ParamAttr(
+ name=name + '_fc_1.w_0', initializer=param_initializer),
+ bias_attr=name + '_fc_1.b_0')
+ return out
+
+
+def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,
+ name=''):
+ """
+    Add residual connection, layer normalization and dropout to the out tensor
+ optionally according to the value of process_cmd.
+ This will be used before or after multi-head attention and position-wise
+ feed-forward networks.
+ """
+ for cmd in process_cmd:
+ if cmd == "a": # add residual connection
+ out = out + prev_out if prev_out else out
+ elif cmd == "n": # add layer normalization
+ out_dtype = out.dtype
+ if out_dtype == fluid.core.VarDesc.VarType.FP16:
+ out = layers.cast(x=out, dtype="float32")
+ out = layers.layer_norm(
+ out,
+ begin_norm_axis=len(out.shape) - 1,
+ param_attr=fluid.ParamAttr(
+ name=name + '_layer_norm_scale',
+ initializer=fluid.initializer.Constant(1.)),
+ bias_attr=fluid.ParamAttr(
+ name=name + '_layer_norm_bias',
+ initializer=fluid.initializer.Constant(0.)))
+ if out_dtype == fluid.core.VarDesc.VarType.FP16:
+ out = layers.cast(x=out, dtype="float16")
+ elif cmd == "d": # add dropout
+ if dropout_rate:
+ out = layers.dropout(
+ out,
+ dropout_prob=dropout_rate,
+ dropout_implementation="upscale_in_train",
+ is_test=False)
+ return out
+
+
+pre_process_layer = partial(pre_post_process_layer, None)
+post_process_layer = pre_post_process_layer
+
+
+def encoder_layer(enc_input,
+ attn_bias,
+ n_head,
+ d_key,
+ d_value,
+ d_model,
+ d_inner_hid,
+ prepostprocess_dropout,
+ attention_dropout,
+ relu_dropout,
+ hidden_act,
+ preprocess_cmd="n",
+ postprocess_cmd="da",
+ param_initializer=None,
+ name=''):
+ """The encoder layers that can be stacked to form a deep encoder.
+    This module consists of a multi-head (self) attention sublayer followed
+    by position-wise feed-forward networks, and both components are wrapped
+    with post_process_layer to add residual connection, layer normalization
+    and dropout.
+ """
+ attn_output = multi_head_attention(
+ pre_process_layer(
+ enc_input,
+ preprocess_cmd,
+ prepostprocess_dropout,
+ name=name + '_pre_att'),
+ None,
+ None,
+ attn_bias,
+ d_key,
+ d_value,
+ d_model,
+ n_head,
+ attention_dropout,
+ param_initializer=param_initializer,
+ name=name + '_multi_head_att')
+ attn_output = post_process_layer(
+ enc_input,
+ attn_output,
+ postprocess_cmd,
+ prepostprocess_dropout,
+ name=name + '_post_att')
+ ffd_output = positionwise_feed_forward(
+ pre_process_layer(
+ attn_output,
+ preprocess_cmd,
+ prepostprocess_dropout,
+ name=name + '_pre_ffn'),
+ d_inner_hid,
+ d_model,
+ relu_dropout,
+ hidden_act,
+ param_initializer=param_initializer,
+ name=name + '_ffn')
+ return post_process_layer(
+ attn_output,
+ ffd_output,
+ postprocess_cmd,
+ prepostprocess_dropout,
+ name=name + '_post_ffn')
+
+
+def encoder(enc_input,
+ attn_bias,
+ n_layer,
+ n_head,
+ d_key,
+ d_value,
+ d_model,
+ d_inner_hid,
+ prepostprocess_dropout,
+ attention_dropout,
+ relu_dropout,
+ hidden_act,
+ preprocess_cmd="n",
+ postprocess_cmd="da",
+ param_initializer=None,
+ name=''):
+ """
+ The encoder is composed of a stack of identical layers returned by calling
+ encoder_layer.
+ """
+ for i in range(n_layer):
+ enc_output = encoder_layer(
+ enc_input,
+ attn_bias,
+ n_head,
+ d_key,
+ d_value,
+ d_model,
+ d_inner_hid,
+ prepostprocess_dropout,
+ attention_dropout,
+ relu_dropout,
+ hidden_act,
+ preprocess_cmd,
+ postprocess_cmd,
+ param_initializer=param_initializer,
+ name=name + '_layer_' + str(i))
+ enc_input = enc_output
+ enc_output = pre_process_layer(
+ enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder")
+
+ return enc_output
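+
+
+# A minimal usage sketch (hypothetical hyperparameters, for illustration):
+#
+#     enc_out = encoder(src_emb, attn_bias, n_layer=12, n_head=12, d_key=64,
+#                       d_value=64, d_model=768, d_inner_hid=3072,
+#                       prepostprocess_dropout=0.1, attention_dropout=0.1,
+#                       relu_dropout=0.1, hidden_act="relu")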
diff --git a/demo/pantheon/lexical_anlysis/preprocess/__init__.py b/demo/pantheon/lexical_anlysis/preprocess/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/demo/pantheon/lexical_anlysis/preprocess/ernie/__init__.py b/demo/pantheon/lexical_anlysis/preprocess/ernie/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/demo/pantheon/lexical_anlysis/preprocess/ernie/task_reader.py b/demo/pantheon/lexical_anlysis/preprocess/ernie/task_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3a8a0d790eb8ae592167b129b2c707ba2318b6f
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/preprocess/ernie/task_reader.py
@@ -0,0 +1,392 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module provides readers for classification and sequence labeling tasks.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import namedtuple
+import csv
+import json
+
+import numpy as np
+
+from preprocess.ernie import tokenization
+from preprocess.padding import pad_batch_data
+import io
+
+def csv_reader(fd, delimiter='\t'):
+ def gen():
+ for i in fd:
+ slots = i.rstrip('\n').split(delimiter)
+ if len(slots) == 1:
+ yield slots,
+ else:
+ yield slots
+ return gen()
+
+class BaseReader(object):
+ """BaseReader for classify and sequence labeling task"""
+
+ def __init__(self,
+ vocab_path,
+ label_map_config=None,
+ max_seq_len=512,
+ do_lower_case=True,
+ in_tokens=False,
+ random_seed=None):
+ self.max_seq_len = max_seq_len
+ self.tokenizer = tokenization.FullTokenizer(
+ vocab_file=vocab_path, do_lower_case=do_lower_case)
+ self.vocab = self.tokenizer.vocab
+ self.pad_id = self.vocab["[PAD]"]
+ self.cls_id = self.vocab["[CLS]"]
+ self.sep_id = self.vocab["[SEP]"]
+ self.in_tokens = in_tokens
+
+ np.random.seed(random_seed)
+
+ self.current_example = 0
+ self.current_epoch = 0
+ self.num_examples = 0
+
+ if label_map_config:
+ with open(label_map_config) as f:
+ self.label_map = json.load(f)
+ else:
+ self.label_map = None
+
+ def get_train_progress(self):
+ """Gets progress for training phase."""
+ return self.current_example, self.current_epoch
+
+ def _read_tsv(self, input_file, quotechar=None):
+ """Reads a tab separated value file."""
+ with io.open(input_file, "r", encoding="utf8") as f:
+ reader = csv_reader(f, delimiter="\t")
+ headers = next(reader)
+ Example = namedtuple('Example', headers)
+
+ examples = []
+ for line in reader:
+ example = Example(*line)
+ examples.append(example)
+ return examples
+
+ def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
+ """Truncates a sequence pair in place to the maximum length."""
+
+ # This is a simple heuristic which will always truncate the longer sequence
+ # one token at a time. This makes more sense than truncating an equal percent
+ # of tokens from each, since if one sequence is very short then each token
+ # that's truncated likely contains more information than a longer sequence.
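+        # E.g. with max_length=5, 4 tokens in tokens_a and 2 in tokens_b,
+        # one token is popped from tokens_a, leaving lengths (3, 2).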
+ while True:
+ total_length = len(tokens_a) + len(tokens_b)
+ if total_length <= max_length:
+ break
+ if len(tokens_a) > len(tokens_b):
+ tokens_a.pop()
+ else:
+ tokens_b.pop()
+
+ def _convert_example_to_record(self, example, max_seq_length, tokenizer):
+ """Converts a single `Example` into a single `Record`."""
+
+ text_a = tokenization.convert_to_unicode(example.text_a)
+ tokens_a = tokenizer.tokenize(text_a)
+ tokens_b = None
+ if "text_b" in example._fields:
+ text_b = tokenization.convert_to_unicode(example.text_b)
+ tokens_b = tokenizer.tokenize(text_b)
+
+ if tokens_b:
+ # Modifies `tokens_a` and `tokens_b` in place so that the total
+ # length is less than the specified length.
+ # Account for [CLS], [SEP], [SEP] with "- 3"
+ self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+ else:
+ # Account for [CLS] and [SEP] with "- 2"
+ if len(tokens_a) > max_seq_length - 2:
+ tokens_a = tokens_a[0:(max_seq_length - 2)]
+
+ # The convention in BERT/ERNIE is:
+ # (a) For sequence pairs:
+ # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+ # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
+ # (b) For single sequences:
+ # tokens: [CLS] the dog is hairy . [SEP]
+ # type_ids: 0 0 0 0 0 0 0
+ #
+ # Where "type_ids" are used to indicate whether this is the first
+ # sequence or the second sequence. The embedding vectors for `type=0` and
+ # `type=1` were learned during pre-training and are added to the wordpiece
+ # embedding vector (and position vector). This is not *strictly* necessary
+ # since the [SEP] token unambiguously separates the sequences, but it makes
+ # it easier for the model to learn the concept of sequences.
+ #
+ # For classification tasks, the first vector (corresponding to [CLS]) is
+ # used as as the "sentence vector". Note that this only makes sense because
+ # the entire model is fine-tuned.
+ tokens = []
+ text_type_ids = []
+ tokens.append("[CLS]")
+ text_type_ids.append(0)
+ for token in tokens_a:
+ tokens.append(token)
+ text_type_ids.append(0)
+ tokens.append("[SEP]")
+ text_type_ids.append(0)
+
+ if tokens_b:
+ for token in tokens_b:
+ tokens.append(token)
+ text_type_ids.append(1)
+ tokens.append("[SEP]")
+ text_type_ids.append(1)
+
+ token_ids = tokenizer.convert_tokens_to_ids(tokens)
+ position_ids = list(range(len(token_ids)))
+
+ if self.label_map:
+ label_id = self.label_map[example.label]
+ else:
+ label_id = example.label
+
+ Record = namedtuple(
+ 'Record',
+ ['token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'])
+
+ qid = None
+ if "qid" in example._fields:
+ qid = example.qid
+
+ record = Record(
+ token_ids=token_ids,
+ text_type_ids=text_type_ids,
+ position_ids=position_ids,
+ label_id=label_id,
+ qid=qid)
+ return record
+
+ def _prepare_batch_data(self, examples, batch_size, phase=None):
+ """generate batch records"""
+ batch_records, max_len = [], 0
+ for index, example in enumerate(examples):
+ if phase == "train":
+ self.current_example = index
+ record = self._convert_example_to_record(example, self.max_seq_len,
+ self.tokenizer)
+ max_len = max(max_len, len(record.token_ids))
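+            # When in_tokens is True, batch_size is a token budget: append
+            # only if the padded batch, (num records + 1) * max_len, still
+            # fits; otherwise batch_size counts examples.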
+ if self.in_tokens:
+ to_append = (len(batch_records) + 1) * max_len <= batch_size
+ else:
+ to_append = len(batch_records) < batch_size
+ if to_append:
+ batch_records.append(record)
+ else:
+ yield self._pad_batch_records(batch_records)
+ batch_records, max_len = [record], len(record.token_ids)
+
+ if batch_records:
+ yield self._pad_batch_records(batch_records)
+
+ def get_num_examples(self, input_file):
+ """return total number of examples"""
+ examples = self._read_tsv(input_file)
+ return len(examples)
+
+ def data_generator(self,
+ input_file,
+ batch_size,
+ epoch,
+ shuffle=True,
+ phase=None):
+ """return generator which yields batch data for pyreader"""
+ examples = self._read_tsv(input_file)
+
+ def _wrapper():
+ for epoch_index in range(epoch):
+ if phase == "train":
+ self.current_example = 0
+ self.current_epoch = epoch_index
+ if shuffle:
+ np.random.shuffle(examples)
+
+ for batch_data in self._prepare_batch_data(
+ examples, batch_size, phase=phase):
+ yield batch_data
+
+ return _wrapper
+
+
+class ClassifyReader(BaseReader):
+ """ClassifyReader"""
+
+ def _read_tsv(self, input_file, quotechar=None):
+ """Reads a tab separated value file."""
+ with io.open(input_file, "r", encoding="utf8") as f:
+ reader = csv_reader(f, delimiter="\t")
+ headers = next(reader)
+ text_indices = [
+ index for index, h in enumerate(headers) if h != "label"
+ ]
+ Example = namedtuple('Example', headers)
+
+ examples = []
+ for line in reader:
+ for index, text in enumerate(line):
+ if index in text_indices:
+ line[index] = text.replace(' ', '')
+ example = Example(*line)
+ examples.append(example)
+ return examples
+
+ def _pad_batch_records(self, batch_records):
+ batch_token_ids = [record.token_ids for record in batch_records]
+ batch_text_type_ids = [record.text_type_ids for record in batch_records]
+ batch_position_ids = [record.position_ids for record in batch_records]
+ batch_labels = [record.label_id for record in batch_records]
+ batch_labels = np.array(batch_labels).astype("int64").reshape([-1, 1])
+
+ # padding
+ padded_token_ids, input_mask, seq_lens = pad_batch_data(
+ batch_token_ids,
+ pad_idx=self.pad_id,
+ return_input_mask=True,
+ return_seq_lens=True)
+ padded_text_type_ids = pad_batch_data(
+ batch_text_type_ids, pad_idx=self.pad_id)
+ padded_position_ids = pad_batch_data(
+ batch_position_ids, pad_idx=self.pad_id)
+
+ return_list = [
+ padded_token_ids, padded_text_type_ids, padded_position_ids,
+ input_mask, batch_labels, seq_lens
+ ]
+
+ return return_list
+
+
+class SequenceLabelReader(BaseReader):
+ """SequenceLabelReader"""
+
+ def _pad_batch_records(self, batch_records):
+ batch_token_ids = [record.token_ids for record in batch_records]
+ batch_text_type_ids = [record.text_type_ids for record in batch_records]
+ batch_position_ids = [record.position_ids for record in batch_records]
+ batch_label_ids = [record.label_ids for record in batch_records]
+
+ # padding
+ padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
+ batch_token_ids,
+ pad_idx=self.pad_id,
+ return_input_mask=True,
+ return_seq_lens=True)
+ padded_text_type_ids = pad_batch_data(
+ batch_text_type_ids, pad_idx=self.pad_id)
+ padded_position_ids = pad_batch_data(
+ batch_position_ids, pad_idx=self.pad_id)
+ padded_label_ids = pad_batch_data(
+ batch_label_ids, pad_idx=len(self.label_map) - 1)
+
+ return_list = [
+ padded_token_ids, padded_text_type_ids, padded_position_ids,
+ input_mask, padded_label_ids, batch_seq_lens
+ ]
+ return return_list
+
+ def _reseg_token_label(self, tokens, labels, tokenizer):
+ assert len(tokens) == len(labels)
+ ret_tokens = []
+ ret_labels = []
+ for token, label in zip(tokens, labels):
+ sub_token = tokenizer.tokenize(token)
+ if len(sub_token) == 0:
+ continue
+ ret_tokens.extend(sub_token)
+ ret_labels.append(label)
+ if len(sub_token) < 2:
+ continue
+ sub_label = label
+ if label.startswith("B-"):
+ sub_label = "I-" + label[2:]
+ ret_labels.extend([sub_label] * (len(sub_token) - 1))
+
+ assert len(ret_tokens) == len(ret_labels)
+ return ret_tokens, ret_labels
+
+ def _convert_example_to_record(self, example, max_seq_length, tokenizer):
+        tokens = tokenization.convert_to_unicode(example.text_a).split(u"\2")
+        labels = tokenization.convert_to_unicode(example.label).split(u"\2")
+ tokens, labels = self._reseg_token_label(tokens, labels, tokenizer)
+
+ if len(tokens) > max_seq_length - 2:
+ tokens = tokens[0:(max_seq_length - 2)]
+ labels = labels[0:(max_seq_length - 2)]
+
+ tokens = ["[CLS]"] + tokens + ["[SEP]"]
+ token_ids = tokenizer.convert_tokens_to_ids(tokens)
+ position_ids = list(range(len(token_ids)))
+ text_type_ids = [0] * len(token_ids)
+ no_entity_id = len(self.label_map) - 1
+ labels = [
+ label if label in self.label_map else u"O" for label in labels
+ ]
+ label_ids = [no_entity_id] + [
+ self.label_map[label] for label in labels
+ ] + [no_entity_id]
+
+ Record = namedtuple(
+ 'Record',
+ ['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
+ record = Record(
+ token_ids=token_ids,
+ text_type_ids=text_type_ids,
+ position_ids=position_ids,
+ label_ids=label_ids)
+ return record
+
+
+class ExtractEmbeddingReader(BaseReader):
+ """ExtractEmbeddingReader"""
+
+ def _pad_batch_records(self, batch_records):
+ batch_token_ids = [record.token_ids for record in batch_records]
+ batch_text_type_ids = [record.text_type_ids for record in batch_records]
+ batch_position_ids = [record.position_ids for record in batch_records]
+
+ # padding
+ padded_token_ids, input_mask, seq_lens = pad_batch_data(
+ batch_token_ids,
+ pad_idx=self.pad_id,
+ return_input_mask=True,
+ return_seq_lens=True)
+ padded_text_type_ids = pad_batch_data(
+ batch_text_type_ids, pad_idx=self.pad_id)
+ padded_position_ids = pad_batch_data(
+ batch_position_ids, pad_idx=self.pad_id)
+
+ return_list = [
+ padded_token_ids, padded_text_type_ids, padded_position_ids,
+ input_mask, seq_lens
+ ]
+
+ return return_list
+
+
+if __name__ == '__main__':
+ pass
diff --git a/demo/pantheon/lexical_anlysis/preprocess/ernie/tokenization.py b/demo/pantheon/lexical_anlysis/preprocess/ernie/tokenization.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a06a5818243e4d71aae93fdd1af86c6b14a66b8
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/preprocess/ernie/tokenization.py
@@ -0,0 +1,370 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import unicodedata
+import six
+import io
+
+def convert_to_unicode(text):
+ """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+ if six.PY3:
+ if isinstance(text, str):
+ return text
+ elif isinstance(text, bytes):
+ return text.decode("utf-8", "ignore")
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
+ elif six.PY2:
+ if isinstance(text, str):
+ return text.decode("utf-8", "ignore")
+ elif isinstance(text, unicode):
+ return text
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
+ else:
+ raise ValueError("Not running on Python2 or Python 3?")
+
+
+def printable_text(text):
+ """Returns text encoded in a way suitable for print or `tf.logging`."""
+
+ # These functions want `str` for both Python2 and Python3, but in one case
+ # it's a Unicode string and in the other it's a byte string.
+ if six.PY3:
+ if isinstance(text, str):
+ return text
+ elif isinstance(text, bytes):
+ return text.decode("utf-8", "ignore")
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
+ elif six.PY2:
+ if isinstance(text, str):
+ return text
+ elif isinstance(text, unicode):
+ return text.encode("utf-8")
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
+ else:
+ raise ValueError("Not running on Python2 or Python 3?")
+
+
+def load_vocab(vocab_file):
+ """Loads a vocabulary file into a dictionary."""
+ vocab = collections.OrderedDict()
+ fin = io.open(vocab_file, encoding="utf8")
+ for num, line in enumerate(fin):
+ items = convert_to_unicode(line.strip()).split("\t")
+ if len(items) > 2:
+ break
+ token = items[0]
+ index = items[1] if len(items) == 2 else num
+ token = token.strip()
+ vocab[token] = int(index)
+ return vocab
+
+
+def convert_by_vocab(vocab, items):
+ """Converts a sequence of [tokens|ids] using the vocab."""
+ output = []
+ for item in items:
+ output.append(vocab[item])
+ return output
+
+
+def convert_tokens_to_ids(vocab, tokens):
+ return convert_by_vocab(vocab, tokens)
+
+
+def convert_ids_to_tokens(inv_vocab, ids):
+ return convert_by_vocab(inv_vocab, ids)
+
+
+def whitespace_tokenize(text):
+ """Runs basic whitespace cleaning and splitting on a peice of text."""
+ text = text.strip()
+ if not text:
+ return []
+ tokens = text.split()
+ return tokens
+
+
+class FullTokenizer(object):
+ """Runs end-to-end tokenziation."""
+
+ def __init__(self, vocab_file, do_lower_case=True):
+ self.vocab = load_vocab(vocab_file)
+ self.inv_vocab = {v: k for k, v in self.vocab.items()}
+ self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+ def tokenize(self, text):
+ split_tokens = []
+ for token in self.basic_tokenizer.tokenize(text):
+ for sub_token in self.wordpiece_tokenizer.tokenize(token):
+ split_tokens.append(sub_token)
+
+ return split_tokens
+
+ def convert_tokens_to_ids(self, tokens):
+ return convert_by_vocab(self.vocab, tokens)
+
+ def convert_ids_to_tokens(self, ids):
+ return convert_by_vocab(self.inv_vocab, ids)
+
+
+class CharTokenizer(object):
+ """Runs end-to-end tokenziation."""
+
+ def __init__(self, vocab_file, do_lower_case=True):
+ self.vocab = load_vocab(vocab_file)
+ self.inv_vocab = {v: k for k, v in self.vocab.items()}
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+ def tokenize(self, text):
+ split_tokens = []
+ for token in text.lower().split(" "):
+ for sub_token in self.wordpiece_tokenizer.tokenize(token):
+ split_tokens.append(sub_token)
+
+ return split_tokens
+
+ def convert_tokens_to_ids(self, tokens):
+ return convert_by_vocab(self.vocab, tokens)
+
+ def convert_ids_to_tokens(self, ids):
+ return convert_by_vocab(self.inv_vocab, ids)
+
+
+class BasicTokenizer(object):
+ """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+
+ def __init__(self, do_lower_case=True):
+ """Constructs a BasicTokenizer.
+
+ Args:
+ do_lower_case: Whether to lower case the input.
+ """
+ self.do_lower_case = do_lower_case
+
+ def tokenize(self, text):
+ """Tokenizes a piece of text."""
+ text = convert_to_unicode(text)
+ text = self._clean_text(text)
+
+ # This was added on November 1st, 2018 for the multilingual and Chinese
+ # models. This is also applied to the English models now, but it doesn't
+ # matter since the English models were not trained on any Chinese data
+ # and generally don't have any Chinese data in them (there are Chinese
+ # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia).
+ text = self._tokenize_chinese_chars(text)
+
+ orig_tokens = whitespace_tokenize(text)
+ split_tokens = []
+ for token in orig_tokens:
+ if self.do_lower_case:
+ token = token.lower()
+ token = self._run_strip_accents(token)
+ split_tokens.extend(self._run_split_on_punc(token))
+
+ output_tokens = whitespace_tokenize(" ".join(split_tokens))
+ return output_tokens
+
+ def _run_strip_accents(self, text):
+ """Strips accents from a piece of text."""
+ text = unicodedata.normalize("NFD", text)
+ output = []
+ for char in text:
+ cat = unicodedata.category(char)
+ if cat == "Mn":
+ continue
+ output.append(char)
+ return "".join(output)
+
+ def _run_split_on_punc(self, text):
+ """Splits punctuation on a piece of text."""
+ chars = list(text)
+ i = 0
+ start_new_word = True
+ output = []
+ while i < len(chars):
+ char = chars[i]
+ if _is_punctuation(char):
+ output.append([char])
+ start_new_word = True
+ else:
+ if start_new_word:
+ output.append([])
+ start_new_word = False
+ output[-1].append(char)
+ i += 1
+
+ return ["".join(x) for x in output]
+
+ def _tokenize_chinese_chars(self, text):
+ """Adds whitespace around any CJK character."""
+ output = []
+ for char in text:
+ cp = ord(char)
+ if self._is_chinese_char(cp):
+ output.append(" ")
+ output.append(char)
+ output.append(" ")
+ else:
+ output.append(char)
+ return "".join(output)
+
+ def _is_chinese_char(self, cp):
+ """Checks whether CP is the codepoint of a CJK character."""
+ # This defines a "chinese character" as anything in the CJK Unicode block:
+ # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+ #
+ # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+ # despite its name. The modern Korean Hangul alphabet is a different block,
+ # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and are
+        # handled like all of the other languages.
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
+ (cp >= 0x3400 and cp <= 0x4DBF) or #
+ (cp >= 0x20000 and cp <= 0x2A6DF) or #
+ (cp >= 0x2A700 and cp <= 0x2B73F) or #
+ (cp >= 0x2B740 and cp <= 0x2B81F) or #
+ (cp >= 0x2B820 and cp <= 0x2CEAF) or
+ (cp >= 0xF900 and cp <= 0xFAFF) or #
+ (cp >= 0x2F800 and cp <= 0x2FA1F)): #
+ return True
+
+ return False
+
+ def _clean_text(self, text):
+ """Performs invalid character removal and whitespace cleanup on text."""
+ output = []
+ for char in text:
+ cp = ord(char)
+ if cp == 0 or cp == 0xfffd or _is_control(char):
+ continue
+ if _is_whitespace(char):
+ output.append(" ")
+ else:
+ output.append(char)
+ return "".join(output)
+
+
+class WordpieceTokenizer(object):
+ """Runs WordPiece tokenziation."""
+
+ def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
+ self.vocab = vocab
+ self.unk_token = unk_token
+ self.max_input_chars_per_word = max_input_chars_per_word
+
+ def tokenize(self, text):
+ """Tokenizes a piece of text into its word pieces.
+
+ This uses a greedy longest-match-first algorithm to perform tokenization
+ using the given vocabulary.
+
+ For example:
+ input = "unaffable"
+ output = ["un", "##aff", "##able"]
+
+ Args:
+ text: A single token or whitespace separated tokens. This should have
+                already been passed through `BasicTokenizer`.
+
+ Returns:
+ A list of wordpiece tokens.
+ """
+
+ text = convert_to_unicode(text)
+
+ output_tokens = []
+ for token in whitespace_tokenize(text):
+ chars = list(token)
+ if len(chars) > self.max_input_chars_per_word:
+ output_tokens.append(self.unk_token)
+ continue
+
+ is_bad = False
+ start = 0
+ sub_tokens = []
+ while start < len(chars):
+ end = len(chars)
+ cur_substr = None
+ while start < end:
+ substr = "".join(chars[start:end])
+ if start > 0:
+ substr = "##" + substr
+ if substr in self.vocab:
+ cur_substr = substr
+ break
+ end -= 1
+ if cur_substr is None:
+ is_bad = True
+ break
+ sub_tokens.append(cur_substr)
+ start = end
+
+ if is_bad:
+ output_tokens.append(self.unk_token)
+ else:
+ output_tokens.extend(sub_tokens)
+ return output_tokens
+
+
+def _is_whitespace(char):
+ """Checks whether `chars` is a whitespace character."""
+    # \t, \n, and \r are technically control characters but we treat them
+ # as whitespace since they are generally considered as such.
+ if char == " " or char == "\t" or char == "\n" or char == "\r":
+ return True
+ cat = unicodedata.category(char)
+ if cat == "Zs":
+ return True
+ return False
+
+
+def _is_control(char):
+ """Checks whether `chars` is a control character."""
+ # These are technically control characters but we count them as whitespace
+ # characters.
+ if char == "\t" or char == "\n" or char == "\r":
+ return False
+ cat = unicodedata.category(char)
+ if cat.startswith("C"):
+ return True
+ return False
+
+
+def _is_punctuation(char):
+ """Checks whether `chars` is a punctuation character."""
+ cp = ord(char)
+ # We treat all non-letter/number ASCII as punctuation.
+ # Characters such as "^", "$", and "`" are not in the Unicode
+ # Punctuation class but we treat them as punctuation anyways, for
+ # consistency.
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
+ (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
+ return True
+ cat = unicodedata.category(char)
+ if cat.startswith("P"):
+ return True
+ return False
diff --git a/demo/pantheon/lexical_anlysis/preprocess/padding.py b/demo/pantheon/lexical_anlysis/preprocess/padding.py
new file mode 100644
index 0000000000000000000000000000000000000000..82171e68eb3af3513eaf4655c740a06bb1112d57
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/preprocess/padding.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Mask, padding and batching.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+def pad_batch_data(insts,
+ pad_idx=0,
+ return_pos=False,
+ return_input_mask=False,
+ return_max_len=False,
+ return_num_token=False,
+ return_seq_lens=False):
+ """
+ Pad the instances to the max sequence length in batch, and generate the
+ corresponding position data and input mask.
+ """
+ return_list = []
+ max_len = max(len(inst) for inst in insts)
+ # Any token included in dict can be used to pad, since the paddings' loss
+ # will be masked out by weights and make no effect on parameter gradients.
+
+ inst_data = np.array(
+ [inst + list([pad_idx] * (max_len - len(inst))) for inst in insts])
+ return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
+
+ # position data
+ if return_pos:
+ inst_pos = np.array([
+ list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
+ for inst in insts
+ ])
+
+ return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
+
+ if return_input_mask:
+ # This is used to avoid attention on paddings.
+ input_mask_data = np.array([[1] * len(inst) + [0] *
+ (max_len - len(inst)) for inst in insts])
+ input_mask_data = np.expand_dims(input_mask_data, axis=-1)
+ return_list += [input_mask_data.astype("float32")]
+
+ if return_max_len:
+ return_list += [max_len]
+
+ if return_num_token:
+ num_token = 0
+ for inst in insts:
+ num_token += len(inst)
+ return_list += [num_token]
+
+ if return_seq_lens:
+ seq_lens = np.array([len(inst) for inst in insts])
+ return_list += [seq_lens.astype("int64").reshape([-1])]
+
+ return return_list if len(return_list) > 1 else return_list[0]
+
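+# A minimal usage sketch (hypothetical values, for illustration):
+#
+#     padded, mask, lens = pad_batch_data(
+#         [[5, 6], [7]], pad_idx=0, return_input_mask=True,
+#         return_seq_lens=True)
+#     # padded.shape == (2, 2, 1), mask.shape == (2, 2, 1), lens == [2, 1]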
+
+if __name__ == "__main__":
+ pass
diff --git a/demo/pantheon/lexical_anlysis/reader.py b/demo/pantheon/lexical_anlysis/reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..11958919e81dc667027bad51377252542a2e614a
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/reader.py
@@ -0,0 +1,208 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The file_reader converts raw corpus to input.
+"""
+
+import os
+import argparse
+import __future__
+import io
+import glob
+from paddleslim.pantheon import Student
+import random
+import numpy as np
+import six
+
+def load_kv_dict(dict_path,
+ reverse=False,
+ delimiter="\t",
+ key_func=None,
+ value_func=None):
+ """
+ Load key-value dict from file
+ """
+ result_dict = {}
+ for line in io.open(dict_path, "r", encoding='utf8'):
+ terms = line.strip("\n").split(delimiter)
+ if len(terms) != 2:
+ continue
+ if reverse:
+ value, key = terms
+ else:
+ key, value = terms
+ if key in result_dict:
+ raise KeyError("key duplicated with [%s]" % (key))
+ if key_func:
+ key = key_func(key)
+ if value_func:
+ value = value_func(value)
+ result_dict[key] = value
+ return result_dict
+
+
+class Dataset(object):
+ """data reader"""
+
+ def __init__(self, args, mode="train"):
+ # read dict
+ self.word2id_dict = load_kv_dict(
+ args.word_dict_path, reverse=True, value_func=int)
+ self.id2word_dict = load_kv_dict(args.word_dict_path)
+ self.label2id_dict = load_kv_dict(
+ args.label_dict_path, reverse=True, value_func=int)
+ self.id2label_dict = load_kv_dict(args.label_dict_path)
+ self.word_replace_dict = load_kv_dict(args.word_rep_dict_path)
+ self._student = Student()
+ self._student.register_teacher(in_address=args.in_address)
+ self._student.start()
+ self._know_desc = self._student.get_knowledge_desc()
+        self._know_data_generator = self._student.get_knowledge_generator(
+            batch_size=1, drop_last=False)()
+ self._train_shuffle_buf_size = args.traindata_shuffle_buffer
+
+ @property
+ def vocab_size(self):
+ """vocabuary size"""
+ return max(self.word2id_dict.values()) + 1
+
+ @property
+ def num_labels(self):
+ """num_labels"""
+ return max(self.label2id_dict.values()) + 1
+
+ def get_num_examples(self, filename):
+ """num of line of file"""
+ return sum(1 for line in io.open(filename, "r", encoding='utf8'))
+
+ def word_to_ids(self, words):
+ """convert word to word index"""
+ word_ids = []
+ for word in words:
+ word = self.word_replace_dict.get(word, word)
+ if word not in self.word2id_dict:
+ word = "OOV"
+ word_id = self.word2id_dict[word]
+ word_ids.append(word_id)
+
+ return word_ids
+
+ def label_to_ids(self, labels):
+ """convert label to label index"""
+ label_ids = []
+ for label in labels:
+ if label not in self.label2id_dict:
+ label = "O"
+ label_id = self.label2id_dict[label]
+ label_ids.append(label_id)
+ return label_ids
+
+ def file_reader(self, filename, max_seq_len=126, mode="train"):
+ """
+ yield (word_idx, target_idx, teacher_emission) one by one from file,
+ or yield (word_idx, ) in `infer` mode
+ """
+
+ def wrapper():
+ invalid_samples = 0
+ fread = io.open(filename, "r", encoding="utf-8")
+ if mode == "infer":
+ for line in fread:
+ words = line.strip()
+ word_ids = self.word_to_ids(words)
+ yield (word_ids[0:max_seq_len], )
+ elif mode == "test":
+ headline = next(fread)
+ headline = headline.strip().split('\t')
+                assert len(headline) == 2 and headline[0] == "text_a" \
+                    and headline[1] == "label"
+ for line in fread:
+ words, labels = line.strip("\n").split("\t")
+ if len(words) < 1:
+ continue
+ word_ids = self.word_to_ids(words.split("\002"))
+ label_ids = self.label_to_ids(labels.split("\002"))
+ assert len(word_ids) == len(label_ids)
+ yield word_ids[0:max_seq_len], label_ids[0:max_seq_len]
+ else:
+ headline = next(fread)
+ headline = headline.strip().split('\t')
+                assert len(headline) == 2 and headline[0] == "text_a" \
+                    and headline[1] == "label"
+ buf = []
+ for line in fread:
+ words, labels = line.strip("\n").split("\t")
+ if len(words) < 1:
+ continue
+ word_ids = self.word_to_ids(words.split("\002"))
+ label_ids = self.label_to_ids(labels.split("\002"))
+ if six.PY2:
+ know_data = self._know_data_generator.next()
+ else:
+ know_data = self._know_data_generator.__next__()
+ teacher_crf_decode = know_data["crf_decode"]
+
+ if len(teacher_crf_decode.shape) == 1:
+ teacher_crf_decode = np.reshape(teacher_crf_decode, [-1, 1])
+ teacher_seq_len = know_data["seq_lens"]
+ assert len(word_ids) == len(label_ids)
+
+                    real_len = min(len(word_ids), max_seq_len)
+                    if real_len == teacher_seq_len[0] - 2:
+                        teacher_crf_decode_range = teacher_crf_decode[0][
+                            1:teacher_seq_len[0] - 1]
+                        teacher_crf_decode_range = np.reshape(
+                            teacher_crf_decode_range, [-1, 1])
+                        buf.append([
+                            word_ids[0:max_seq_len],
+                            label_ids[0:max_seq_len],
+                            teacher_crf_decode_range
+                        ])
+ if len(buf) > self._train_shuffle_buf_size:
+                            buf_ids = list(range(len(buf)))
+ random.shuffle(buf_ids)
+ for idx in buf_ids:
+ yield buf[idx]
+ buf = []
+ else:
+ invalid_samples += 1
+ if len(buf) > 0:
+ buf_ids = list(range(len(buf)))
+ random.shuffle(buf_ids)
+ for idx in buf_ids:
+ yield buf[idx]
+
+ print("invalid samples in one epoch: {}".format(invalid_samples))
+ fread.close()
+ return wrapper
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(__doc__)
+ parser.add_argument(
+ "--word_dict_path",
+ type=str,
+ default="./conf/word.dic",
+ help="word dict")
+ parser.add_argument(
+ "--label_dict_path",
+ type=str,
+ default="./conf/tag.dic",
+ help="label dict")
+ parser.add_argument(
+ "--word_rep_dict_path",
+ type=str,
+ default="./conf/q2b.dic",
+ help="word replace dict")
+ args = parser.parse_args()
+ dataset = Dataset(args)
+    # data_generator = dataset.file_reader("data/train.tsv")
+    # for word_idx, target_idx in data_generator():
+    #     print(word_idx, target_idx)
+    #     print(len(word_idx), len(target_idx))
+    #     break
diff --git a/demo/pantheon/lexical_anlysis/run_student.sh b/demo/pantheon/lexical_anlysis/run_student.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a4b0a241786b68bd3fff1c00a1a38ea29c990bec
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/run_student.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+export CUDA_VISIBLE_DEVICES=5,6
+python -u train_student.py \
+ --train_data ./data/train.tsv \
+ --test_data ./data/test.tsv \
+ --model_save_dir ./teacher_ernie_init_lac_1gru_emb128 \
+ --validation_steps 1000 \
+ --save_steps 1000 \
+ --print_steps 100 \
+ --batch_size 32 \
+ --epoch 10 \
+ --traindata_shuffle_buffer 20000 \
+ --word_emb_dim 128 \
+ --grnn_hidden_dim 128 \
+ --bigru_num 1 \
+ --base_learning_rate 1e-3 \
+ --emb_learning_rate 2 \
+ --crf_learning_rate 0.2 \
+ --word_dict_path ./conf/word.dic \
+ --label_dict_path ./conf/tag.dic \
+ --word_rep_dict_path ./conf/q2b.dic \
+ --enable_ce false \
+ --use_cuda true \
+ --in_address "127.0.0.1:5002"
+
diff --git a/demo/pantheon/lexical_anlysis/run_teacher.sh b/demo/pantheon/lexical_anlysis/run_teacher.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d0acc1944de05f7ea6a108406cef3745b685ffe6
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/run_teacher.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+export FLAGS_sync_nccl_allreduce=0
+export FLAGS_eager_delete_tensor_gb=1
+export FLAGS_fraction_of_gpu_memory_to_use=0.99
+
+export CUDA_VISIBLE_DEVICES=5,6 # which GPU to use
+ERNIE_FINETUNED_MODEL_PATH=./model_finetuned
+DATA_PATH=./data/
+
+python -u teacher_ernie.py \
+ --ernie_config_path "conf/ernie_config.json" \
+ --init_checkpoint "${ERNIE_FINETUNED_MODEL_PATH}" \
+ --init_bound 0.1 \
+ --vocab_path "conf/vocab.txt" \
+ --batch_size 32 \
+ --random_seed 0 \
+ --num_labels 57 \
+ --max_seq_len 128 \
+ --test_data "${DATA_PATH}/train.tsv" \
+ --label_map_config "./conf/label_map.json" \
+ --do_lower_case true \
+ --use_cuda true \
+ --out_port=5002
+
diff --git a/demo/pantheon/lexical_anlysis/teacher_ernie.py b/demo/pantheon/lexical_anlysis/teacher_ernie.py
new file mode 100644
index 0000000000000000000000000000000000000000..9235fda162784a88902b8277ac1005f231ff8f24
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/teacher_ernie.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Baidu's open-source Lexical Analysis tool for Chinese, including:
+    1. Word Segmentation
+    2. Part-of-Speech Tagging
+ 3. Named Entity Recognition
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+import argparse
+import numpy as np
+import multiprocessing
+import sys
+from collections import namedtuple
+from paddleslim.pantheon import Teacher
+import paddle.fluid as fluid
+
+import creator
+import model_utils
+print('model representation')
+from models.representation.ernie import ErnieConfig
+print('model check')
+from models.model_check import check_cuda
+from models.model_check import check_version
+
+
+def do_eval(args):
+ # init executor
+ if args.use_cuda:
+ place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
+ else:
+ place = fluid.CPUPlace()
+ print('ernie config')
+ ernie_config = ErnieConfig(args.ernie_config_path)
+ ernie_config.print_config()
+ test_program = fluid.Program()
+ print('test program')
+ with fluid.program_guard(test_program, fluid.default_startup_program()):
+ with fluid.unique_name.guard():
+ test_ret = creator.create_ernie_model(args, ernie_config)
+ test_program = test_program.clone(for_test=True)
+ #print('create pyreader')
+ pyreader = creator.create_pyreader(
+ args,
+ file_name=args.test_data,
+ feed_list=[ret.name for ret in test_ret['feed_list']],
+ model="ernie",
+ place=place,
+ return_reader=True,
+ mode='test')
+
+ #data_inter = reader.data_generator(args.test_data, args.batch_size, 1, shuffle=False, phase="train")
+
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+
+ # load model
+ if not args.init_checkpoint:
+ raise ValueError(
+ "args 'init_checkpoint' should be set if only doing test or infer!")
+ model_utils.init_checkpoint(exe, args.init_checkpoint, test_program)
+
+ teacher = Teacher(out_path=None, out_port=int(args.out_port))
+ teacher.start()
+ print('run teacher......')
+
+ test_ret["chunk_evaluator"].reset()
+
+ reader_config = {"batch_generator": pyreader}
+
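+    # Serve two knowledge items per batch -- the CRF decoding results and
+    # the sequence lengths -- matching the "crf_decode" and "seq_lens" keys
+    # consumed by reader.Dataset on the student side; the reader is replayed
+    # `times` times.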
+ teacher.start_knowledge_service(
+ feed_list=[test_ret["words"].name, test_ret["sent_ids"].name, test_ret["pos_ids"].name, test_ret["input_mask"].name, test_ret["labels"].name, test_ret["seq_lens"].name],
+ schema={"crf_decode":test_ret["crf_decode"],"seq_lens":test_ret["seq_lens"]},
+ program=test_program,
+ reader_config=reader_config,
+ exe=exe,
+ times=10)
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(__doc__)
+ model_utils.load_yaml(parser, './conf/ernie_args.yaml')
+
+ # config for pantheon teacher
+    parser.add_argument(
+        '--out_path',
+        type=str,
+        default=None,
+        help="The path to dump knowledge for offline mode.")
+    parser.add_argument(
+        '--out_port',
+        type=str,
+        default=None,
+        help="The IP port number to send out knowledge for online mode, "
+        "should be unique when launching multiple teachers in the same node.")
+
+ args = parser.parse_args()
+ check_cuda(args.use_cuda)
+ check_version()
+ model_utils.print_arguments(args)
+ do_eval(args)
diff --git a/demo/pantheon/lexical_anlysis/train_student.py b/demo/pantheon/lexical_anlysis/train_student.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a5534313df73a3cc6d8383cb25376fa5515db1c
--- /dev/null
+++ b/demo/pantheon/lexical_anlysis/train_student.py
@@ -0,0 +1,208 @@
+# -*- coding: UTF-8 -*-
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import math
+import time
+import random
+import argparse
+import multiprocessing
+
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+
+import reader
+import model_utils
+import creator
+from eval import test_process
+from models.model_check import check_cuda
+from models.model_check import check_version
+
+# the function to train model
+def do_train(args):
+ # init executor
+ if args.use_cuda:
+ place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
+ dev_count = fluid.core.get_cuda_device_count()
+ else:
+ dev_count = min(multiprocessing.cpu_count(), args.cpu_num)
+ if (dev_count < args.cpu_num):
+ print(
+ "WARNING: The total CPU NUM in this machine is %d, which is less than cpu_num parameter you set. "
+ "Change the cpu_num from %d to %d" %
+ (dev_count, args.cpu_num, dev_count))
+ os.environ['CPU_NUM'] = str(dev_count)
+ place = fluid.CPUPlace()
+
+ train_program = fluid.Program()
+ test_program = fluid.Program()
+ startup_program = fluid.Program()
+
+ dataset = reader.Dataset(args)
+ with fluid.program_guard(train_program, startup_program):
+ #train_program.random_seed = args.random_seed
+ startup_program.random_seed = args.random_seed
+
+ with fluid.unique_name.guard():
+ train_ret = creator.create_model(
+ args, dataset.vocab_size, dataset.num_labels, mode='train')
+
+ optimizer = fluid.optimizer.Adam(
+ learning_rate=args.base_learning_rate)
+ optimizer.minimize(train_ret["avg_cost"])
+
+ with fluid.program_guard(test_program, startup_program):
+ with fluid.unique_name.guard():
+ test_ret = creator.create_model(
+ args, dataset.vocab_size, dataset.num_labels, mode='test')
+
+ test_program = test_program.clone(for_test=True)
+
+ exe = fluid.Executor(place)
+ exe.run(startup_program)
+
+ if args.init_checkpoint:
+ model_utils.init_checkpoint(exe, args.init_checkpoint, train_program)
+ if dev_count > 1:
+ device = "GPU" if args.use_cuda else "CPU"
+ print("%d %s are used to train model" % (dev_count, device))
+ # multi cpu/gpu config
+ exec_strategy = fluid.ExecutionStrategy()
+
+ build_strategy = fluid.compiler.BuildStrategy()
+
+ compiled_prog = fluid.compiler.CompiledProgram(
+ train_program).with_data_parallel(
+ loss_name=train_ret['avg_cost'].name,
+ build_strategy=build_strategy,
+ exec_strategy=exec_strategy)
+ else:
+ compiled_prog = fluid.compiler.CompiledProgram(train_program)
+
+ # start training
+ num_train_examples = dataset.get_num_examples(args.train_data)
+ max_train_steps = args.epoch * num_train_examples // args.batch_size
+ print("Num train examples: %d" % num_train_examples)
+ print("Max train steps: %d" % max_train_steps)
+
+ train_generator = creator.create_lexnet_data_generator(args,
+ reader=dataset,
+ file_name=args.train_data,
+ place=place,
+ mode='train')
+ test_generator = creator.create_lexnet_data_generator(args,
+ reader=dataset,
+ file_name=args.test_data,
+ place=place,
+ mode='test')
+
+ train_reader, test_reader = train_ret['pyreader'], test_ret['pyreader']
+ train_reader.set_batch_generator(train_generator, places=place)
+ test_reader.set_batch_generator(test_generator, places=place)
+
+ ce_info = []
+ step = 0
+ ce_time = 0
+ train_reader.start()
+ while True:
+ try:
+            # Fetch the metric ops only on print steps to minimize fetching
+            # overhead and keep training fast.
+ if step % args.print_steps == 0:
+ fetch_list = [
+ train_ret["avg_cost"], train_ret["precision"],
+ train_ret["recall"], train_ret["f1_score"],
+ train_ret["crf_avg_cost"], train_ret["teacher_cost"]
+ ]
+ else:
+ fetch_list = []
+
+ start_time = time.time()
+ outputs = exe.run(
+ program=compiled_prog,
+ fetch_list=fetch_list)
+
+ end_time = time.time()
+ if step % args.print_steps == 0:
+ avg_cost, precision, recall, f1_score, crf_avg_cost, teacher_cost = [
+ np.mean(x) for x in outputs
+ ]
+ print("Data loader queue size: %d " % train_reader.queue.size())
+ print(
+ "[train] step = %d, loss = %.5f, P: %.5f, R: %.5f, F1: %.5f, crf_avg_cost: %.5f, teacher_cost: %.5f, elapsed time %.5f"
+ % (step, avg_cost, precision, recall, f1_score, crf_avg_cost, teacher_cost,
+ end_time - start_time))
+
+ if step % args.validation_steps == 0:
+ test_process(exe, test_program, test_reader, test_ret)
+
+ ce_time += end_time - start_time
+ ce_info.append([ce_time, avg_cost, precision, recall, f1_score])
+
+ # save checkpoints
+ if step % args.save_steps == 0 and step != 0:
+ save_path = os.path.join(args.model_save_dir,
+ "step_" + str(step))
+ fluid.io.save_persistables(exe, save_path, train_program)
+ step += 1
+ except fluid.core.EOFException:
+ train_reader.reset()
+ break
+
+ if args.enable_ce:
+ card_num = get_cards()
+ ce_cost = 0
+ ce_f1 = 0
+ ce_p = 0
+ ce_r = 0
+ ce_time = 0
+ try:
+ ce_time = ce_info[-2][0]
+ ce_cost = ce_info[-2][1]
+ ce_p = ce_info[-2][2]
+ ce_r = ce_info[-2][3]
+ ce_f1 = ce_info[-2][4]
+        except Exception:
+ print("ce info error")
+ print("kpis\teach_step_duration_card%s\t%s" % (card_num, ce_time))
+ print("kpis\ttrain_cost_card%s\t%f" % (card_num, ce_cost))
+ print("kpis\ttrain_precision_card%s\t%f" % (card_num, ce_p))
+ print("kpis\ttrain_recall_card%s\t%f" % (card_num, ce_r))
+ print("kpis\ttrain_f1_card%s\t%f" % (card_num, ce_f1))
+
+
+def get_cards():
+ num = 0
+ cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
+ if cards != '':
+ num = len(cards.split(","))
+ return num
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser(__doc__)
+ model_utils.load_yaml(parser, 'conf/args.yaml')
+
+ # config for pantheon student
+    parser.add_argument(
+        '--in_path',
+        type=str,
+        default=None,
+        help="The path of dumped knowledge from teacher for offline mode.")
+    parser.add_argument(
+        '--in_address',
+        type=str,
+        default=None,
+        help="The IP port number to receive knowledge from teacher for "
+        "online mode.")
+
+ args = parser.parse_args()
+ check_cuda(args.use_cuda)
+ check_version()
+ do_train(args)
diff --git a/demo/pantheon/toy/README.md b/demo/pantheon/toy/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3cb561b445e41c4fdc08a471b469a593e78d36ee
--- /dev/null
+++ b/demo/pantheon/toy/README.md
@@ -0,0 +1,54 @@
+## Toy example for Pantheon
+
+See more details about Pantheon in [PaddleSlim/Pantheon](../../../paddleslim/pantheon).
+
+This example implements two teacher models (not trainable, just for demo): teacher1 takes an integer **x** as input and predicts **2x-1**, see [run_teacher1.py](run_teacher1.py); teacher2 also takes **x** as input and predicts **2x+1**, see [run_teacher2.py](run_teacher2.py). The two teachers share a data reader that feeds a sequence of increasing natural numbers from zero to some positive integer **max_n**, but they generate different knowledge. The schema keys for the knowledge of teacher1 are [**"x", "2x-1", "result"**] and those of teacher2 are [**"2x+1", "result"**], where **"result"** is the common schema key holding a copy of each teacher's own prediction. When instantiating the **Student** object, the merging strategy for the common schema **"result"** must be specified; the schema keys of the merged knowledge then become [**"x", "2x-1", "2x+1", "result"**], with the merged **"result"** equal to **2x** when the merging strategy is **"mean"** and **4x** when it is **"sum"**. The student model receives the merged knowledge from the teachers and prints it out, see [run_student.py](run_student.py) and the sketch below.
+
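+A minimal sketch of the student side (mirroring [run_student.py](run_student.py); the `"sum"` strategy shown here yields the **4x** result described above):
+
+```python
+from paddleslim.pantheon import Student
+
+# Merge strategy for the common "result" schema: "sum" -> 4x, "mean" -> 2x.
+student = Student(merge_strategy={"result": "sum"})
+student.register_teacher(in_path="teacher1_offline.dat")  # or in_address="127.0.0.1:8080"
+student.register_teacher(in_path="teacher2_offline.dat")  # or in_address="127.0.0.1:8081"
+student.start()
+
+# Each batch is a dict with keys "x", "2x-1", "2x+1" and the merged "result".
+for batch in student.get_knowledge_generator(batch_size=32, drop_last=False)():
+    print(batch)
+```
+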
+The toy "knowledge distillation" system can be launched in three different modes, i.e., offline, online and their hybrid. All three modes should have the same outputs, and the correctness of results can be verified by checking the order and values of outputs.
+
+### Offline
+
+The two teachers work in offline mode; start them with the given local file paths.
+
+```shell
+export PYTHONPATH=../../../:$PYTHONPATH
+export CUDA_VISIBLE_DEVICES=0,1
+export NUM_POSTPROCESS_THREADS=10 # default 8
+nohup python -u run_teacher1.py --use_cuda true --out_path teacher1_offline.dat > teacher1_offline.log 2>&1&
+export CUDA_VISIBLE_DEVICES=2
+nohup python -u run_teacher2.py --use_cuda true --out_path teacher2_offline.dat > teacher2_offline.log 2>&1&
+```
+
+After both executions have finished, start the student model with the two generated knowledge files.
+
+```shell
+export PYTHONPATH=../../../:$PYTHONPATH
+python -u run_student.py \
+    --in_path0 teacher1_offline.dat \
+    --in_path1 teacher2_offline.dat
+```
+
+
+### Online
+
+The two teachers work in online mode; start them with the given TCP/IP ports and make sure those ports are available.
+
+```shell
+export PYTHONPATH=../../../:$PYTHONPATH
+export CUDA_VISIBLE_DEVICES=0
+nohup python -u run_teacher1.py --use_cuda true --out_port 8080 > teacher1_online.log 2>&1&
+export CUDA_VISIBLE_DEVICES=1,2
+nohup python -u run_teacher2.py --use_cuda true --out_port 8081 > teacher2_online.log 2>&1&
+```
+Start the student model with IP addresses that can reach the ports of the two teacher models, e.g., when all run in the same node:
+
+```shell
+export PYTHONPATH=../../../:$PYTHONPATH
+python -u run_student.py \
+    --in_address0 127.0.0.1:8080 \
+    --in_address1 127.0.0.1:8081
+```
+**Note:** in online mode, the starting order of the teachers and the student doesn't matter; they will wait for each other to establish connections.
+
+### Hybrid of offline and online
+
+One teacher works in offline mode and the other in online mode. In this case, start the offline teacher first. Once the offline knowledge file is ready, start the online teacher and the student at the same time.
diff --git a/demo/pantheon/toy/run_student.py b/demo/pantheon/toy/run_student.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2ede92fe3e13bacbd6590845247c89e30e018ff
--- /dev/null
+++ b/demo/pantheon/toy/run_student.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from paddleslim.pantheon import Student
+
+from utils import str2bool
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(__doc__)
+ parser.add_argument(
+ "--in_address0",
+ type=str,
+ default=None,
+ help="Input address for teacher 0. (default: %(default)s)")
+ parser.add_argument(
+ "--in_path0",
+ type=str,
+ default=None,
+ help="Input file path for teacher 0. (default: %(default)s)")
+ parser.add_argument(
+ "--in_address1",
+ type=str,
+ default=None,
+ help="Input address for teacher 1. (default: %(default)s)")
+ parser.add_argument(
+ "--in_path1",
+ type=str,
+ default=None,
+ help="Input file path for teacher 1. (default: %(default)s)")
+ parser.add_argument(
+ "--test_send_recv",
+ type=str2bool,
+ default=False,
+ help="Whether to test send/recv interfaces. (default: %(default)s)")
+ parser.add_argument(
+ "--batch_size",
+ type=int,
+ default=32,
+ help="The batch size of student model. (default: %(default)s)")
+ args = parser.parse_args()
+ return args
+
+
+def run(args):
+ if args.in_address0 and args.in_path0:
+ raise ValueError(
+ "args.in_address0 and args.in_path0 should not be valid "
+ "at the same time!")
+ if not args.in_address0 and not args.in_path0:
+ raise ValueError(
+ "One of args.in_address0 and args.in_path0 must be valid!")
+
+ if args.in_address1 and args.in_path1:
+ raise ValueError(
+ "args.in_address1 and args.in_path1 should not be valid "
+ "at the same time!")
+ if not args.in_address1 and not args.in_path1:
+ raise ValueError(
+ "One of args.in_address1 and args.in_path1 must be valid")
+
+ student = Student(merge_strategy={"result": "sum"})
+
+ student.register_teacher(
+ in_address=args.in_address0, in_path=args.in_path0)
+ student.register_teacher(
+ in_address=args.in_address1, in_path=args.in_path1)
+ student.start()
+
+ if args.test_send_recv:
+ for t in range(2):
+ for i in range(3):
+ print(student.recv(t))
+ student.send("message from student!")
+
+ knowledge_desc = student.get_knowledge_desc()
+ data_generator = student.get_knowledge_generator(
+ batch_size=args.batch_size, drop_last=False)
+ for batch_data in data_generator():
+ batch_size = list(batch_data.values())[0].shape[0]
+ keys = batch_data.keys()
+ for i in range(batch_size):
+ data = {}
+ for key in keys:
+ data[key] = batch_data[key][i]
+ print(data)
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ run(args)
diff --git a/demo/pantheon/toy/run_teacher1.py b/demo/pantheon/toy/run_teacher1.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e0e089877b642a66cf5bf7dd3e171af28a62f91
--- /dev/null
+++ b/demo/pantheon/toy/run_teacher1.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.fluid as fluid
+
+from utils import parse_args, sample_generator, sample_list_generator, batch_generator
+from paddleslim.pantheon import Teacher
+
+
+def run(args):
+ if args.out_path and args.out_port:
+ raise ValueError("args.out_path and args.out_port should not be valid "
+ "at the same time")
+ if not args.out_path and not args.out_port:
+ raise ValueError("One of args.out_path and args.out_port be valid")
+
+ # user-defined program: y = 2*x - 1
+ startup = fluid.Program()
+ program = fluid.Program()
+ with fluid.program_guard(program, startup):
+ inp_x = fluid.layers.data(name='x', shape=[-1, 1], dtype="int64")
+ y = inp_x * 2 - 1
+ result = fluid.layers.assign(y)
+
+ place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(startup)
+
+ teacher = Teacher(out_path=args.out_path, out_port=args.out_port)
+ teacher.start()
+
+ if args.generator_type == "sample_generator":
+ reader_config = {
+ "sample_generator": sample_generator(max_n=1000),
+ "batch_size": args.batch_size,
+ "drop_last": False
+ }
+ elif args.generator_type == "sample_list_generator":
+ reader_config = {
+ "sample_list_generator": sample_list_generator(
+ max_n=1000, batch_size=args.batch_size)
+ }
+ else:
+ reader_config = {
+ "batch_generator": batch_generator(
+ max_n=1000, batch_size=args.batch_size)
+ }
+
+ if args.test_send_recv:
+ teacher.send("greetings from teacher1")
+ teacher.send({"x": 1, "y": 2})
+ teacher.send({3, 5})
+ print("recved {}".format(teacher.recv()))
+
+ teacher.start_knowledge_service(
+ feed_list=[inp_x.name],
+ schema={"x": inp_x,
+ "2x-1": y,
+ "result": result},
+ program=program,
+ reader_config=reader_config,
+ exe=exe,
+ use_fp16=True,
+ times=args.serving_times)
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ run(args)
diff --git a/demo/pantheon/toy/run_teacher2.py b/demo/pantheon/toy/run_teacher2.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d45fec92bbce3655cc11b0737cf1d83daa018d8
--- /dev/null
+++ b/demo/pantheon/toy/run_teacher2.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.fluid as fluid
+
+from utils import parse_args, sample_generator, sample_list_generator, batch_generator
+from paddleslim.pantheon import Teacher
+
+
+def run(args):
+ if args.out_path and args.out_port:
+ raise ValueError("args.out_path and args.out_port should not be valid "
+ "at the same time")
+ if not args.out_path and not args.out_port:
+        raise ValueError(
+            "One of args.out_path and args.out_port must be valid!")
+
+ # user-defined program: y = 2*x + 1
+ startup = fluid.Program()
+ program = fluid.Program()
+ with fluid.program_guard(program, startup):
+ inp_x = fluid.layers.data(name='x', shape=[-1, 1], dtype="int64")
+ y = inp_x * 2 + 1
+ result = fluid.layers.assign(y)
+
+ place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(startup)
+
+ teacher = Teacher(out_path=args.out_path, out_port=args.out_port)
+ teacher.start()
+
+ if args.generator_type == "sample_generator":
+ reader_config = {
+ "sample_generator": sample_generator(max_n=1000),
+ "batch_size": args.batch_size,
+ "drop_last": False
+ }
+ elif args.generator_type == "sample_list_generator":
+ reader_config = {
+ "sample_list_generator": sample_list_generator(
+ max_n=1000, batch_size=args.batch_size)
+ }
+ else:
+ reader_config = {
+ "batch_generator": batch_generator(
+ max_n=1000, batch_size=args.batch_size)
+ }
+
+ if args.test_send_recv:
+ teacher.send("greetings from teacher2")
+ teacher.send([1])
+ teacher.send({1, 2, 3})
+ print("recved {}".format(teacher.recv()))
+
+ teacher.start_knowledge_service(
+ feed_list=[inp_x.name],
+ schema={"2x+1": y,
+ "result": result},
+ program=program,
+ reader_config=reader_config,
+ exe=exe,
+ times=args.serving_times)
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ run(args)
diff --git a/demo/pantheon/toy/utils.py b/demo/pantheon/toy/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..af88d2a699db039dadc1fafa32bdcfa3f6bda491
--- /dev/null
+++ b/demo/pantheon/toy/utils.py
@@ -0,0 +1,91 @@
+import numpy as np
+import argparse
+
+
+def str2bool(v):
+ return v.lower() in ("true", "t", "1")
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(__doc__)
+ parser.add_argument(
+ "--out_port",
+ type=int,
+ default=None,
+ help="IP port number for sending out data. (default: %(default)s)")
+ parser.add_argument(
+ "--out_path",
+ type=str,
+ default=None,
+ help="The file path to dump knowledge data. (default: %(default)s)")
+ parser.add_argument(
+ "--use_cuda",
+ type=str2bool,
+ default=False,
+ help="Whether to use GPU for prediction. (default: %(default)s)")
+ parser.add_argument(
+ "--test_send_recv",
+ type=str2bool,
+ default=False,
+ help="Whether to test send/recv interfaces. (default: %(default)s)")
+ parser.add_argument(
+ "--generator_type",
+ type=str,
+ choices=[
+ "sample_generator", "sample_list_generator", "batch_generator"
+ ],
+ default="batch_generator",
+ help="Which data generator to use. (default: %(default)s)")
+ parser.add_argument(
+ "--batch_size",
+ type=int,
+ default=32,
+ help="The batch size per device for data generators. (default: %(default)s)"
+ )
+ parser.add_argument(
+ "--serving_times",
+ type=int,
+ default=1,
+ help="The maximum times of teacher serving knowledge. (default: %(default)s)"
+ )
+ args = parser.parse_args()
+ return args
+
+
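+# sample_generator: yields one sample at a time, each a single-element list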
+def sample_generator(max_n):
+ def wrapper():
+ for i in range(max_n):
+ yield [i]
+
+ return wrapper
+
+
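+# sample_list_generator: yields lists of samples, each of length batch_size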
+def sample_list_generator(max_n, batch_size=500):
+ def wrapper():
+ sample_list = []
+ for sample in sample_generator(max_n)():
+ if len(sample_list) < batch_size:
+ sample_list.append(sample)
+ if len(sample_list) == batch_size:
+ yield sample_list
+ sample_list = []
+ if len(sample_list) > 0:
+ yield sample_list
+
+ return wrapper
+
+
+# batch_generator: yields ready-made batches, each a list holding one
+# int64 numpy array of shape (batch_size, 1)
+def batch_generator(max_n, batch_size=500):
+ def wrapper():
+ batch = []
+ for sample in sample_generator(max_n)():
+ if len(batch) < batch_size:
+ batch.append(sample)
+ if len(batch) == batch_size:
+ yield [np.array(batch).astype('int64').reshape((-1, 1))]
+ batch = []
+ if len(batch) > 0:
+ yield [np.array(batch).astype('int64').reshape((-1, 1))]
+
+ return wrapper
diff --git a/demo/prune/README.md b/demo/prune/README.md
new file mode 100755
index 0000000000000000000000000000000000000000..c31f1791b9e1a496b74754e30e3b1f4b14e92483
--- /dev/null
+++ b/demo/prune/README.md
@@ -0,0 +1,75 @@
+# Convolution channel pruning example for image classification models
+
+This example shows how to prune the channels of every convolution layer at a specified pruning ratio. By default, the MNIST dataset is downloaded and used automatically.
+
+The example currently supports the following classification models:
+
+- MobileNetV1
+- MobileNetV2
+- ResNet50
+- PVANet
+
+
+## 1. Prepare the data
+
+This example supports both `MNIST` and `ImageNet`. By default, `MNIST` is downloaded and used automatically. To use `ImageNet` instead, proceed as follows:
+
+1). Follow the [ImageNet data preparation guide](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification#%E6%95%B0%E6%8D%AE%E5%87%86%E5%A4%87) of the classification models and download the data into `PaddleSlim/demo/data/ILSVRC2012`.
+2). When running `train.py`, set the `--data` option to `imagenet`.
+
+## 2. Download a pretrained model
+
+When using `ImageNet`, we recommend pruning on top of a pretrained model; download a suitable one from the [model zoo](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification#%E5%B7%B2%E5%8F%91%E5%B8%83%E6%A8%A1%E5%9E%8B%E5%8F%8A%E5%85%B6%E6%80%A7%E8%83%BD).
+
+Taking `MobileNetV1` as an example, download and extract the pretrained model into the current directory:
+
+```
+wget http://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV1_pretrained.tar
+tar -xf MobileNetV1_pretrained.tar
+```
+
+When running `train.py`, pass `--pretrained_model` to load the pretrained model.
+
+## 3. Start the pruning task
+
+Launch the pruning task with the following command:
+
+```
+export CUDA_VISIBLE_DEVICES=0
+python train.py \
+--model "MobileNet" \
+--pruned_ratio 0.31 \
+--data "mnist" \
+--criterion "l1_norm"
+```
+
+Here, `model` selects the model to prune, `pruned_ratio` sets the fraction of channels to prune from each convolution layer, and `data` selects the dataset.
+The `criterion` option selects the pruning criterion; `l1_norm`, `bn_scale` and `geometry_median` are currently supported, with `l1_norm` as the default.
+The four shell scripts in this directory run four `geometry_median` experiments on three models (ResNet34, MobileNetV1 and MobileNetV2); run them directly to launch those pruning experiments.
+
+Run `python train.py --help` for more options.
+
+In this example, the `FLOPs` before and after pruning are written to the log, and a model is saved to the file system after every training epoch.
+
+## 4. Load and evaluate the model
+
+This section shows how to load a model saved during training.
+
+Run the following command to load the model and evaluate its metrics on the test set:
+
+```
+python eval.py \
+--model "MobileNet" \
+--data "mnist" \
+--model_path "./models/0"
+```
+
+The `eval.py` script uses the `paddleslim.prune.load_model` API to load the pruned model.
+
+## 5. APIs used
+
+This example is built on the `paddleslim.Pruner` utility class; see the [API documentation](https://paddlepaddle.github.io/PaddleSlim/api/prune_api/) for usage details.
+
+When calling `paddleslim.Pruner`, the parameter names of the convolution layers to prune must be specified. Since parameter naming differs across models,
+the `train.py` script provides a `get_pruned_params` method that selects the parameters to prune according to the `--model` option.
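+
+The snippet below is a condensed sketch of how `train.py` wires these pieces together (names such as `args`, `place` and `get_pruned_params` refer to definitions in that script):
+
+```
+import paddle.fluid as fluid
+from paddleslim.prune import Pruner
+
+# Collect the conv weights to prune for the chosen model, then prune
+# each of them by the ratio given on the command line.
+params = get_pruned_params(args, fluid.default_main_program())
+pruner = Pruner(args.criterion)
+pruned_program, _, _ = pruner.prune(
+    fluid.default_main_program(),
+    fluid.global_scope(),
+    params=params,
+    ratios=[args.pruned_ratio] * len(params),
+    place=place)
+```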
diff --git a/demo/prune/eval.py b/demo/prune/eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2c9ea26cbb2223c9cb0ee4bac0962680637f2db
--- /dev/null
+++ b/demo/prune/eval.py
@@ -0,0 +1,104 @@
+import os
+import sys
+import logging
+import paddle
+import argparse
+import functools
+import math
+import time
+import numpy as np
+import paddle.fluid as fluid
+from paddleslim.prune import load_model
+from paddleslim.common import get_logger
+from paddleslim.analysis import flops
+sys.path[0] = os.path.join(os.path.dirname(__file__), os.path.pardir)
+import models
+from utility import add_arguments, print_arguments
+
+_logger = get_logger(__name__, level=logging.INFO)
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('batch_size', int, 64 * 4, "Minibatch size.")
+add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
+add_arg('model', str, "MobileNet", "The target model.")
+add_arg('model_path', str, "./models/0", "The path of the model to evaluate.")
+add_arg('data', str, "mnist", "Which data to use. 'mnist' or 'imagenet'")
+add_arg('log_period', int, 10, "Log period in batches.")
+# yapf: enable
+
+model_list = models.__all__
+
+
+def eval(args):
+    if args.data == "mnist":
+        import paddle.dataset.mnist as reader
+ val_reader = reader.test()
+ class_dim = 10
+ image_shape = "1,28,28"
+ elif args.data == "imagenet":
+ import imagenet_reader as reader
+ val_reader = reader.val()
+ class_dim = 1000
+ image_shape = "3,224,224"
+ else:
+ raise ValueError("{} is not supported.".format(args.data))
+ image_shape = [int(m) for m in image_shape.split(",")]
+ assert args.model in model_list, "{} is not in lists: {}".format(
+ args.model, model_list)
+ image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+ # model definition
+ model = models.__dict__[args.model]()
+ out = model.net(input=image, class_dim=class_dim)
+ acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+ acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+ val_program = fluid.default_main_program().clone(for_test=True)
+ place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+
+ val_reader = paddle.fluid.io.batch(val_reader, batch_size=args.batch_size)
+
+    val_feeder = fluid.DataFeeder(
+        [image, label], place, program=val_program)
+
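+    # Load the pruned model saved during training (the counterpart of
+    # paddleslim.prune.save_model used in train.py)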
+ load_model(exe, val_program, args.model_path)
+
+ batch_id = 0
+ acc_top1_ns = []
+ acc_top5_ns = []
+ for data in val_reader():
+ start_time = time.time()
+ acc_top1_n, acc_top5_n = exe.run(
+ val_program,
+ feed=val_feeder.feed(data),
+ fetch_list=[acc_top1.name, acc_top5.name])
+ end_time = time.time()
+ if batch_id % args.log_period == 0:
+ _logger.info(
+ "Eval batch[{}] - acc_top1: {}; acc_top5: {}; time: {}".format(
+ batch_id,
+ np.mean(acc_top1_n),
+ np.mean(acc_top5_n), end_time - start_time))
+ acc_top1_ns.append(np.mean(acc_top1_n))
+ acc_top5_ns.append(np.mean(acc_top5_n))
+ batch_id += 1
+
+ _logger.info("Final eval - acc_top1: {}; acc_top5: {}".format(
+ np.mean(np.array(acc_top1_ns)), np.mean(np.array(acc_top5_ns))))
+
+
+def main():
+ args = parser.parse_args()
+ print_arguments(args)
+ eval(args)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/demo/prune/fpgm_mobilenetv1_f-50_train.sh b/demo/prune/fpgm_mobilenetv1_f-50_train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0e84dafd5e9619631927aa118cef02a232016313
--- /dev/null
+++ b/demo/prune/fpgm_mobilenetv1_f-50_train.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+export CUDA_VISIBLE_DEVICES=0,1
+export FLAGS_fraction_of_gpu_memory_to_use=0.98
+python train.py \
+ --model="MobileNet" \
+ --pretrained_model="/workspace/models/MobileNetV1_pretrained" \
+ --data="imagenet" \
+ --pruned_ratio=0.3125 \
+ --lr=0.1 \
+ --num_epochs=120 \
+ --test_period=10 \
+    --step_epochs 30 60 90 \
+ --l2_decay=3e-5 \
+ --lr_strategy="piecewise_decay" \
+ --criterion="geometry_median" \
+ --model_path="./fpgm_mobilenetv1_models" \
+ 2>&1 | tee fpgm_mobilenetv1_train.log
diff --git a/demo/prune/fpgm_mobilenetv2_f-50_train.sh b/demo/prune/fpgm_mobilenetv2_f-50_train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7d0399775f41fb3b762599ab6f2cf63337b7eab4
--- /dev/null
+++ b/demo/prune/fpgm_mobilenetv2_f-50_train.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+export CUDA_VISIBLE_DEVICES=0,1
+export FLAGS_fraction_of_gpu_memory_to_use=0.98
+python train.py \
+ --model="MobileNetV2" \
+ --pretrained_model="/workspace/models/MobileNetV2_pretrained" \
+ --data="imagenet" \
+ --pruned_ratio=0.325 \
+ --lr=0.001 \
+ --num_epochs=90 \
+ --test_period=5 \
+    --step_epochs 30 60 80 \
+ --l2_decay=1e-4 \
+ --lr_strategy="piecewise_decay" \
+ --criterion="geometry_median" \
+ --model_path="./fpgm_mobilenetv2_models" \
+ 2>&1 | tee fpgm_mobilenetv2_train.log
diff --git a/demo/prune/fpgm_resnet34_f-42_train.sh b/demo/prune/fpgm_resnet34_f-42_train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ec24c4d8f13ab2d095e29274326c5136ed240e4d
--- /dev/null
+++ b/demo/prune/fpgm_resnet34_f-42_train.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+export FLAGS_fraction_of_gpu_memory_to_use=0.98
+python train.py \
+ --model="ResNet34" \
+ --pretrained_model="/workspace/models/ResNet34_pretrained" \
+ --data="imagenet" \
+ --pruned_ratio=0.25 \
+ --lr_strategy="cosine_decay" \
+ --criterion="geometry_median" \
+ --model_path="./fpgm_resnet34_025_120_models" \
+ 2>&1 | tee fpgm_resnet025_120_train.log
diff --git a/demo/prune/fpgm_resnet34_f-50_train.sh b/demo/prune/fpgm_resnet34_f-50_train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c181dfb66a1b0fb588cba3868b970f656a73f0d6
--- /dev/null
+++ b/demo/prune/fpgm_resnet34_f-50_train.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+export CUDA_VISIBLE_DEVICES=0,1
+export FLAGS_fraction_of_gpu_memory_to_use=0.98
+python train.py \
+ --model="ResNet34" \
+ --pretrained_model="/workspace/models/ResNet34_pretrained" \
+ --data="imagenet" \
+ --pruned_ratio=0.3125 \
+ --lr=0.001 \
+ --num_epochs=70 \
+ --test_period=5 \
+ --step_epochs 30 60 \
+ --l2_decay=1e-4 \
+ --lr_strategy="piecewise_decay" \
+ --criterion="geometry_median" \
+ --model_path="./fpgm_resnet34_models" \
+ 2>&1 | tee fpgm_resnet03_train.log
diff --git a/demo/prune/image_classification_pruning_quick_start.ipynb b/demo/prune/image_classification_pruning_quick_start.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..670a2edf611629a9f33b694b59b24138b0bfdc56
--- /dev/null
+++ b/demo/prune/image_classification_pruning_quick_start.ipynb
@@ -0,0 +1,352 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "# Channel pruning for image classification models - quick start\n",
+    "\n",
+    "This tutorial takes the image classification model MobileNetV1 as an example and shows how to quickly use [PaddleSlim's convolution channel pruning API]().\n",
+    "The example consists of the following steps:\n",
+    "\n",
+    "1. Import dependencies\n",
+    "2. Build the model\n",
+    "3. Prune\n",
+    "4. Train the pruned model\n",
+    "\n",
+    "The sections below walk through each step in turn.\n",
+    "\n",
+    "## 1. Import dependencies\n",
+    "\n",
+    "PaddleSlim requires Paddle 1.7. Make sure Paddle is installed correctly, then import Paddle and PaddleSlim as follows:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import paddle\n",
+ "import paddle.fluid as fluid\n",
+ "import paddleslim as slim"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "## 2. Build the network\n",
+    "\n",
+    "This section builds a model that classifies MNIST images: we choose `MobileNetV1`, set the input size to `[1, 28, 28]`, and use 10 output classes.\n",
+    "For convenience, `paddleslim.models` predefines helper methods for building classification models; run the following code to build the model:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "exe, train_program, val_program, inputs, outputs = slim.models.image_classification(\"MobileNet\", [1, 28, 28], 10, use_gpu=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    ">Note: the APIs under paddleslim.models are not regular PaddleSlim APIs; they are predefined helpers that simplify the examples, e.g. model definitions and Program construction."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "## 3. Prune convolution channels\n",
+    "\n",
+    "### 3.1 Compute FLOPs before pruning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "FLOPs: 10907072.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "FLOPs = slim.analysis.flops(train_program)\n",
+ "print(\"FLOPs: {}\".format(FLOPs))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "### 3.2 Prune\n",
+    "\n",
+    "Here we prune the convolution layers whose parameters are named `conv2_1_sep_weights` and `conv2_2_sep_weights`, cutting 33% of the channels from each (matching the `ratios` passed to `prune` below).\n",
+    "The code is as follows:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pruner = slim.prune.Pruner()\n",
+ "pruned_program, _, _ = pruner.prune(\n",
+ " train_program,\n",
+ " fluid.global_scope(),\n",
+ " params=[\"conv2_1_sep_weights\", \"conv2_2_sep_weights\"],\n",
+ " ratios=[0.33] * 2,\n",
+ " place=fluid.CPUPlace())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "The call above modifies the definition of the corresponding convolution parameters in `train_program` and prunes the parameter arrays stored in `fluid.global_scope()`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "### 3.3 Compute FLOPs after pruning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "FLOPs: 10907072.0\n"
+ ]
+ }
+ ],
+ "source": [
+    "FLOPs = slim.analysis.flops(pruned_program)\n",
+ "print(\"FLOPs: {}\".format(FLOPs))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "## 4. Train the pruned model\n",
+    "\n",
+    "### 4.1 Define the input data\n",
+    "\n",
+    "To keep this example fast we use the simple MNIST dataset; the Paddle package `paddle.dataset.mnist` handles downloading and reading the data.\n",
+    "The code is as follows:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import paddle.dataset.mnist as reader\n",
+ "train_reader = paddle.fluid.io.batch(\n",
+ " reader.train(), batch_size=128, drop_last=True)\n",
+ "train_feeder = fluid.DataFeeder(inputs, fluid.CPUPlace())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "### 4.2 Run training\n",
+    "The following code trains the pruned model for one `epoch`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[0.1484375] [0.4921875] [2.6727316]\n",
+ "[0.125] [0.546875] [2.6547904]\n",
+ "[0.125] [0.5546875] [2.795205]\n",
+ "[0.1171875] [0.578125] [2.8561475]\n",
+ "[0.1875] [0.59375] [2.470603]\n",
+ "[0.1796875] [0.578125] [2.8031898]\n",
+ "[0.1484375] [0.6015625] [2.7530417]\n",
+ "[0.1953125] [0.640625] [2.711596]\n",
+ "[0.125] [0.59375] [2.8637898]\n",
+ "[0.1796875] [0.53125] [2.9473038]\n",
+ "[0.25] [0.671875] [2.3943179]\n",
+ "[0.25] [0.6953125] [2.632146]\n",
+ "[0.2578125] [0.7265625] [2.723265]\n",
+ "[0.359375] [0.765625] [2.4263484]\n",
+ "[0.3828125] [0.8203125] [2.226284]\n",
+ "[0.421875] [0.8203125] [1.8042578]\n",
+ "[0.4765625] [0.890625] [1.6841211]\n",
+ "[0.53125] [0.8671875] [2.1971617]\n",
+ "[0.5546875] [0.8984375] [1.5361531]\n",
+ "[0.53125] [0.890625] [1.7211896]\n",
+ "[0.5078125] [0.8984375] [1.6586945]\n",
+ "[0.53125] [0.9140625] [1.8980236]\n",
+ "[0.546875] [0.9453125] [1.5279069]\n",
+ "[0.5234375] [0.8828125] [1.7356458]\n",
+ "[0.6015625] [0.9765625] [1.0375824]\n",
+ "[0.5546875] [0.921875] [1.639497]\n",
+ "[0.6015625] [0.9375] [1.5469061]\n",
+ "[0.578125] [0.96875] [1.3573356]\n",
+ "[0.65625] [0.9453125] [1.3787829]\n",
+ "[0.640625] [0.9765625] [0.9946856]\n",
+ "[0.65625] [0.96875] [1.1651027]\n",
+ "[0.625] [0.984375] [1.0487883]\n",
+ "[0.7265625] [0.9609375] [1.2526855]\n",
+ "[0.7265625] [0.9765625] [1.2954011]\n",
+ "[0.65625] [0.96875] [1.1181556]\n",
+ "[0.71875] [0.9765625] [0.97891223]\n",
+ "[0.640625] [0.9609375] [1.2135172]\n",
+ "[0.7265625] [0.9921875] [0.8950747]\n",
+ "[0.7578125] [0.96875] [1.0864108]\n",
+ "[0.734375] [0.9921875] [0.8392239]\n",
+ "[0.796875] [0.9609375] [0.7012155]\n",
+ "[0.7734375] [0.9765625] [0.7409136]\n",
+ "[0.8046875] [0.984375] [0.6108341]\n",
+ "[0.796875] [0.9765625] [0.63867176]\n",
+ "[0.7734375] [0.984375] [0.64099216]\n",
+ "[0.7578125] [0.9453125] [0.83827704]\n",
+ "[0.8046875] [0.9921875] [0.5311729]\n",
+ "[0.8984375] [0.9921875] [0.36445504]\n",
+ "[0.859375] [0.9921875] [0.40577835]\n",
+ "[0.8125] [0.9765625] [0.64629185]\n",
+ "[0.84375] [1.] [0.38400555]\n",
+ "[0.890625] [0.9765625] [0.45866236]\n",
+ "[0.8828125] [0.9921875] [0.3711415]\n",
+ "[0.7578125] [0.9921875] [0.6650479]\n",
+ "[0.7578125] [0.984375] [0.9030752]\n",
+ "[0.8671875] [0.9921875] [0.3678714]\n",
+ "[0.7421875] [0.9765625] [0.7424855]\n",
+ "[0.7890625] [1.] [0.6212543]\n",
+ "[0.8359375] [1.] [0.58529043]\n",
+ "[0.8203125] [0.96875] [0.5860813]\n",
+ "[0.8671875] [0.9921875] [0.415236]\n",
+ "[0.8125] [1.] [0.60501564]\n",
+ "[0.796875] [0.9765625] [0.60677457]\n",
+ "[0.8515625] [1.] [0.5338207]\n",
+ "[0.8046875] [0.9921875] [0.54180473]\n",
+ "[0.875] [0.9921875] [0.7293667]\n",
+ "[0.84375] [0.9765625] [0.5581689]\n",
+ "[0.8359375] [1.] [0.50712734]\n",
+ "[0.8671875] [0.9921875] [0.55217856]\n",
+ "[0.765625] [0.96875] [0.8076792]\n",
+ "[0.953125] [1.] [0.17031987]\n",
+ "[0.890625] [0.9921875] [0.42383268]\n",
+ "[0.828125] [0.9765625] [0.49300486]\n",
+ "[0.8671875] [0.96875] [0.57985115]\n",
+ "[0.8515625] [1.] [0.4901033]\n",
+ "[0.921875] [1.] [0.34583277]\n",
+ "[0.8984375] [0.984375] [0.41139168]\n",
+ "[0.9296875] [1.] [0.20420414]\n",
+ "[0.921875] [0.984375] [0.24322833]\n",
+ "[0.921875] [0.9921875] [0.30570173]\n",
+ "[0.875] [0.9921875] [0.3866225]\n",
+ "[0.9140625] [0.9921875] [0.20813875]\n",
+ "[0.9140625] [1.] [0.17933217]\n",
+ "[0.8984375] [0.9921875] [0.32508463]\n",
+ "[0.9375] [1.] [0.24799153]\n",
+ "[0.9140625] [1.] [0.26146784]\n",
+ "[0.90625] [1.] [0.24672262]\n",
+ "[0.8828125] [1.] [0.34094217]\n",
+ "[0.90625] [1.] [0.2964819]\n",
+ "[0.9296875] [1.] [0.18237087]\n",
+ "[0.84375] [1.] [0.7182543]\n",
+ "[0.8671875] [0.984375] [0.508474]\n",
+ "[0.8828125] [0.9921875] [0.367172]\n",
+ "[0.9453125] [1.] [0.2366665]\n",
+ "[0.9375] [1.] [0.12494276]\n",
+ "[0.8984375] [1.] [0.3395289]\n",
+ "[0.890625] [0.984375] [0.30877113]\n",
+ "[0.90625] [1.] [0.29763448]\n",
+ "[0.8828125] [0.984375] [0.4845504]\n",
+ "[0.8515625] [1.] [0.45548072]\n",
+ "[0.8828125] [1.] [0.33331633]\n",
+ "[0.90625] [1.] [0.4024018]\n",
+ "[0.890625] [0.984375] [0.73405886]\n",
+ "[0.9609375] [0.9921875] [0.15409982]\n",
+ "[0.9140625] [0.984375] [0.37103674]\n",
+ "[0.953125] [1.] [0.17628372]\n",
+ "[0.890625] [1.] [0.36522508]\n",
+ "[0.8828125] [1.] [0.407708]\n",
+ "[0.9375] [0.984375] [0.25090045]\n",
+ "[0.890625] [0.984375] [0.35742313]\n",
+ "[0.921875] [0.9921875] [0.2751101]\n",
+ "[0.890625] [0.984375] [0.43053097]\n",
+ "[0.875] [0.9921875] [0.34412643]\n",
+ "[0.90625] [1.] [0.35595697]\n"
+ ]
+    }
+ ],
+ "source": [
+ "for data in train_reader():\n",
+ " acc1, acc5, loss = exe.run(pruned_program, feed=train_feeder.feed(data), fetch_list=outputs)\n",
+ " print(acc1, acc5, loss)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.5.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/demo/prune/train.py b/demo/prune/train.py
index a8d923b3b9cdaee7c3e938cc6ac3729cd7e6f250..73e08756f34bd212db3b2239351cf9f8ce6962d9 100644
--- a/demo/prune/train.py
+++ b/demo/prune/train.py
@@ -8,10 +8,10 @@ import math
import time
import numpy as np
import paddle.fluid as fluid
-from paddleslim.prune import Pruner
+sys.path[0] = os.path.join(os.path.dirname(__file__), os.path.pardir)
+from paddleslim.prune import Pruner, save_model
from paddleslim.common import get_logger
from paddleslim.analysis import flops
-sys.path.append(sys.path[0] + "/../")
import models
from utility import add_arguments, print_arguments
@@ -35,9 +35,34 @@ add_arg('config_file', str, None, "The config file for comp
add_arg('data', str, "mnist", "Which data to use. 'mnist' or 'imagenet'")
add_arg('log_period', int, 10, "Log period in batches.")
add_arg('test_period', int, 10, "Test period in epoches.")
+add_arg('model_path', str, "./models", "The path to save model.")
+add_arg('pruned_ratio', float, None, "The ratio of channels to be pruned.")
+add_arg('criterion', str, "l1_norm", "The pruning criterion to use; supported: l1_norm, bn_scale, geometry_median.")
+add_arg('save_inference', bool, False, "Whether to save inference model.")
# yapf: enable
-model_list = [m for m in dir(models) if "__" not in m]
+model_list = models.__all__
+
+
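+# Select the convolution weights to prune; naming patterns differ per model.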
+def get_pruned_params(args, program):
+ params = []
+ if args.model == "MobileNet":
+ for param in program.global_block().all_parameters():
+ if "_sep_weights" in param.name:
+ params.append(param.name)
+ elif args.model == "MobileNetV2":
+ for param in program.global_block().all_parameters():
+ if "linear_weights" in param.name or "expand_weights" in param.name:
+ params.append(param.name)
+ elif args.model == "ResNet34":
+ for param in program.global_block().all_parameters():
+ if "weights" in param.name and "branch" in param.name:
+ params.append(param.name)
+ elif args.model == "PVANet":
+ for param in program.global_block().all_parameters():
+ if "conv_weights" in param.name:
+ params.append(param.name)
+ return params
def piecewise_decay(args):
@@ -45,6 +70,7 @@ def piecewise_decay(args):
bd = [step * e for e in args.step_epochs]
lr = [args.lr * (0.1**i) for i in range(len(bd) + 1)]
learning_rate = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
+
optimizer = fluid.optimizer.Momentum(
learning_rate=learning_rate,
momentum=args.momentum_rate,
@@ -112,10 +138,12 @@ def compress(args):
return os.path.exists(
os.path.join(args.pretrained_model, var.name))
+ _logger.info("Load pretrained model from {}".format(
+ args.pretrained_model))
fluid.io.load_vars(exe, args.pretrained_model, predicate=if_exist)
- val_reader = paddle.batch(val_reader, batch_size=args.batch_size)
- train_reader = paddle.batch(
+ val_reader = paddle.fluid.io.batch(val_reader, batch_size=args.batch_size)
+ train_reader = paddle.fluid.io.batch(
train_reader, batch_size=args.batch_size, drop_last=True)
train_feeder = feeder = fluid.DataFeeder([image, label], place)
@@ -176,38 +204,40 @@ def compress(args):
end_time - start_time))
batch_id += 1
- params = []
- for param in fluid.default_main_program().global_block().all_parameters():
- if "_sep_weights" in param.name:
- params.append(param.name)
- _logger.info("fops before pruning: {}".format(
+ test(0, val_program)
+
+ params = get_pruned_params(args, fluid.default_main_program())
+ _logger.info("FLOPs before pruning: {}".format(
flops(fluid.default_main_program())))
- pruner = Pruner()
- pruned_val_program = pruner.prune(
+ pruner = Pruner(args.criterion)
+ pruned_val_program, _, _ = pruner.prune(
val_program,
fluid.global_scope(),
params=params,
- ratios=[0.33] * len(params),
+ ratios=[args.pruned_ratio] * len(params),
place=place,
only_graph=True)
- pruned_program = pruner.prune(
+ pruned_program, _, _ = pruner.prune(
fluid.default_main_program(),
fluid.global_scope(),
params=params,
- ratios=[0.33] * len(params),
+ ratios=[args.pruned_ratio] * len(params),
place=place)
-
- for param in pruned_program[0].global_block().all_parameters():
- if "weights" in param.name:
- print param.name, param.shape
- return
- _logger.info("fops after pruning: {}".format(flops(pruned_program)))
-
+ _logger.info("FLOPs after pruning: {}".format(flops(pruned_program)))
for i in range(args.num_epochs):
train(i, pruned_program)
if i % args.test_period == 0:
test(i, pruned_val_program)
+ save_model(exe, pruned_val_program,
+ os.path.join(args.model_path, str(i)))
+ if args.save_inference:
+ infer_model_path = os.path.join(args.model_path, "infer_models",
+ str(i))
+ fluid.io.save_inference_model(infer_model_path, ["image"], [out],
+ exe, pruned_val_program)
+ _logger.info("Saved inference model into [{}]".format(
+ infer_model_path))
def main():
diff --git a/demo/quant/pact_quant_aware/README.md b/demo/quant/pact_quant_aware/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cf64d7e21b071263dc7cdb2810e87fde4a7d0dc8
--- /dev/null
+++ b/demo/quant/pact_quant_aware/README.md
@@ -0,0 +1,198 @@
+# Using a custom quantization method
+
+This example shows how to use a custom quantization method, taking PACT as an example, to quantize the trained classification model MobileNetV3 and reduce its storage and GPU-memory footprint.
+
+## Method
+PACT (Parameterized Clipping Activation for Quantized Neural Networks, [paper](https://arxiv.org/abs/1805.06085)) clips away activation outliers before quantization to improve quantization accuracy. The PACT formula given in the paper is:
+
+![PACT formula](./image/pact.png)
+
+i.e. `y = PACT(x) = 0.5 * (|x| - |x - α| + α)`, which clips `x` to the range `[0, α]`.
+
+The paper applies the PACT formula in place of the ReLU activation. In practice, however, the activations to be quantized do not necessarily come from ReLU; they may come from other functions, or from elementwise ops. This demo therefore inserts an improved PACT between the activation and the quantization op:
+
+![improved PACT formula](./image/pact_our.png)
+
+This variant clips `x` to `[-α, α]`. The change is needed because the activations to be quantized are not necessarily positive, while quantization uses the maximum absolute activation value, so values below zero must be clipped as well.
+
+### Defining the PACT function
+
+Custom quantization supports user-defined preprocessing of weights and activations, as well as fully custom quantization functions. The relevant `quant_aware` parameters are:
+
+- `weight_quantize_func`: a custom weight-quantization function whose input is the weight to be quantized and whose output is the dequantized weight, which makes it easy to verify quickly that the function works. When set, it replaces the method selected by `weight_quantize_type` in the quantization config; when unset, the `weight_quantize_type` method is used.
+- `act_quantize_func`: a custom activation-quantization function whose input is the activation to be quantized and whose output is the dequantized activation, which makes it easy to verify quickly that the function works. When set, it replaces the method selected by `activation_quantize_type`; when unset, the `activation_quantize_type` method is used.
+
+- `weight_preprocess_func`: a custom function applied to weights before quantization. Not every parameter tensor is well suited to direct quantization; reshaping its distribution first may improve quantization accuracy.
+
+- `act_preprocess_func`: a custom function applied to activations before quantization. Not every activation is well suited to direct quantization; preprocessing it first may improve quantization accuracy.
+
+- `optimizer_func`: a function that returns an optimizer, used to optimize the parameters introduced by the custom functions above.
+
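+For example, a hypothetical `weight_quantize_func` might fake-quantize weights with a simple abs-max scale. This is a minimal sketch only, assuming (as described above) that the function receives the weight tensor and must return its quantize-then-dequantize result:
+
+```
+import paddle.fluid as fluid
+
+def abs_max_weight_quantize(w, name=None):
+    # hypothetical sketch: map values to the int8 range using the
+    # tensor's abs-max as scale, round, then dequantize back to float32
+    scale = fluid.layers.reduce_max(fluid.layers.abs(w)) / 127.0
+    return fluid.layers.round(w / scale) * scale
+```
+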
+PACT is implemented as a custom `act_preprocess_func`; its input is the activation about to be quantized.
+
+It can be defined as follows:
+
+```
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.layer_helper import LayerHelper
+
+def pact(x, name=None):
+ helper = LayerHelper("pact", **locals())
+ dtype = 'float32'
+    # initial PACT clipping threshold
+ init_thres = 20
+ u_param_attr = fluid.ParamAttr(
+ name=x.name + '_pact',
+ initializer=fluid.initializer.ConstantInitializer(value=init_thres),
+ regularizer=fluid.regularizer.L2Decay(0.0001),
+ learning_rate=1)
+ u_param = helper.create_parameter(
+ attr=u_param_attr, shape=[1], dtype=dtype)
+ x = fluid.layers.elementwise_sub(
+ x, fluid.layers.relu(fluid.layers.elementwise_sub(x, u_param)))
+ x = fluid.layers.elementwise_add(
+ x, fluid.layers.relu(fluid.layers.elementwise_sub(-u_param, x)))
+
+ return x
+```
+
+The function defines the initial threshold and the coefficient of the L2 regularizer on it; during training, the threshold is updated through backpropagation toward a suitable value.
+
+The optimizer function is as follows:
+
+```
+def get_optimizer():
+ return fluid.optimizer.MomentumOptimizer(0.001, 0.9)
+```
+Since every parameter except the PACT thresholds is already trained, the learning rate of the thresholds can be set somewhat higher during training.
+
+> Note: PACT only removes outliers at quantization time, which affects the choice of the quantization scale. A model trained with PACT can therefore be loaded and tested with the ordinary quantization method; prediction is unaffected.
+
+## Quantization-aware training workflow for MobileNetV3
+
+### Prepare the data
+
+Create a ``data`` directory under ``demo`` and extract the ``ImageNet`` dataset into it. After extraction, ``data/ILSVRC2012`` should contain:
+- a ``'train'`` directory holding the training images
+- the ``'train_list.txt'`` file
+- a ``'val'`` directory holding the validation images
+- the ``'val_list.txt'`` file
+
+### Prepare the model to quantize
+
+We quantize the most accurate MobileNetV3 model released in the PaddlePaddle classification library [PaddleClas](https://github.com/PaddlePaddle/PaddleClas); its top-1 accuracy before quantization is 78.9%.
+
+```
+mkdir pretrain
+cd pretrain
+wget https://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV3_large_x1_0_ssld_pretrained.tar
+tar xf MobileNetV3_large_x1_0_ssld_pretrained.tar
+cd ..
+```
+
+This model is used because MobileNetV3, after ssld distillation, has many outliers among its activation values, which makes it a good test of PACT's effectiveness. The histogram below shows the distribution of one of MobileNetV3's intermediate activations:
+
+![activation distribution](./image/activation_dist.png)
+
+The x-axis of the histogram spans the minimum and maximum of the activation values: the minimum is around -60 and the maximum around 80, but most of the mass lies between -20 and 20.
+
+
+### Enable gradients on `image`
+
+Due to the current implementation, the gradient of the `image` variable must be enabled:
+
+```
+image.stop_gradient = False
+```
+
+### Configure the quantization parameters
+
+```
+quant_config = {
+ 'weight_quantize_type': 'channel_wise_abs_max',
+ 'activation_quantize_type': 'moving_average_abs_max',
+ 'weight_bits': 8,
+ 'activation_bits': 8,
+ 'quantize_op_types': ['conv2d', 'depthwise_conv2d', 'mul'],
+ 'dtype': 'int8',
+ 'window_size': 10000,
+ 'moving_rate': 0.9
+}
+```
+
+### Insert trainable quantization ops into the training and test programs
+
+Ordinary quantization:
+```
+val_program = quant_aware(val_program, place, quant_config, scope=None, for_test=True)
+
+compiled_train_prog = quant_aware(train_prog, place, quant_config, scope=None, for_test=False)
+```
+
+Quantization with PACT:
+```
+val_program = quant_aware(val_program, place, quant_config, scope=None, act_preprocess_func=pact, executor=exe, for_test=True)
+
+compiled_train_prog = quant_aware(train_prog, place, quant_config, scope=None, act_preprocess_func=pact, optimizer_func=get_optimizer, executor=exe, for_test=False)
+```
+
+### Disable certain build strategies
+
+```
+build_strategy = fluid.BuildStrategy()
+build_strategy.fuse_all_reduce_ops = False
+build_strategy.sync_batch_norm = False
+exec_strategy = fluid.ExecutionStrategy()
+compiled_train_prog = compiled_train_prog.with_data_parallel(
+ loss_name=avg_cost.name,
+ build_strategy=build_strategy,
+ exec_strategy=exec_strategy)
+```
+
+
+### Training commands
+
+Ordinary quantization:
+```
+python train.py --model MobileNetV3_large_x1_0 --pretrained_model ./pretrain/MobileNetV3_large_x1_0_ssld_pretrained --num_epochs 30 --lr 0.0001 --use_pact False
+
+```
+
+The output is:
+```
+2020-06-05 15:14:15,319-INFO: epoch[0]-batch[10] - loss: 2.50413322449; acc_top1: 0.515625; acc_top5: 0.75; time: 1.29066705704
+2020-06-05 15:14:28,950-INFO: epoch[0]-batch[20] - loss: 3.14219880104; acc_top1: 0.3828125; acc_top5: 0.62890625; time: 1.29546618462
+2020-06-05 15:14:42,479-INFO: epoch[0]-batch[30] - loss: 3.34660744667; acc_top1: 0.3671875; acc_top5: 0.609375; time: 1.20717287064
+2020-06-05 15:14:56,196-INFO: epoch[0]-batch[40] - loss: 3.69098854065; acc_top1: 0.2890625; acc_top5: 0.5546875; time: 1.29232215881
+2020-06-05 15:15:09,815-INFO: epoch[0]-batch[50] - loss: 3.5337202549; acc_top1: 0.30078125; acc_top5: 0.5546875; time: 1.34358000755
+2020-06-05 15:15:23,550-INFO: epoch[0]-batch[60] - loss: 3.22006082535; acc_top1: 0.359375; acc_top5: 0.609375; time: 1.34181118011
+2020-06-05 15:15:37,425-INFO: epoch[0]-batch[70] - loss: 3.06894540787; acc_top1: 0.4375; acc_top5: 0.65625; time: 1.33122491837
+2020-06-05 15:15:51,161-INFO: epoch[0]-batch[80] - loss: 3.00548839569; acc_top1: 0.3828125; acc_top5: 0.6328125; time: 1.27601099014
+2020-06-05 15:16:05,158-INFO: epoch[0]-batch[90] - loss: 2.52197813988; acc_top1: 0.484375; acc_top5: 0.71484375; time: 1.28280210495
+```
+The loss under ordinary quantization is clearly unstable, and by the second epoch it becomes NaN.
+
+Quantization-aware training with PACT:
+```
+python train.py --model MobileNetV3_large_x1_0 --pretrained_model ./pretrain/MobileNetV3_large_x1_0_ssld_pretrained --num_epochs 30 --lr 0.0001 --use_pact True --batch_size 128 --lr_strategy=piecewise_decay --step_epochs 20 --l2_decay 1e-5
+```
+
+The output is:
+```
+2020-06-05 15:25:37,647-INFO: epoch[0]-batch[10] - loss: 1.60160636902; acc_top1: 0.65625; acc_top5: 0.890625; time: 1.56788897514
+2020-06-05 15:25:53,191-INFO: epoch[0]-batch[20] - loss: 1.4748904705; acc_top1: 0.6484375; acc_top5: 0.84375; time: 1.4936029911
+2020-06-05 15:26:08,598-INFO: epoch[0]-batch[30] - loss: 1.427333951; acc_top1: 0.6953125; acc_top5: 0.875; time: 1.51066279411
+2020-06-05 15:26:24,009-INFO: epoch[0]-batch[40] - loss: 1.43955898285; acc_top1: 0.6640625; acc_top5: 0.8671875; time: 1.49221611023
+2020-06-05 15:26:39,501-INFO: epoch[0]-batch[50] - loss: 1.29342699051; acc_top1: 0.6953125; acc_top5: 0.90625; time: 1.50851297379
+2020-06-05 15:26:54,927-INFO: epoch[0]-batch[60] - loss: 1.49478590488; acc_top1: 0.6171875; acc_top5: 0.875; time: 1.50131177902
+2020-06-05 15:27:10,250-INFO: epoch[0]-batch[70] - loss: 1.34970903397; acc_top1: 0.7109375; acc_top5: 0.890625; time: 1.51333618164
+2020-06-05 15:27:25,309-INFO: epoch[0]-batch[80] - loss: 1.51600492001; acc_top1: 0.6796875; acc_top5: 0.859375; time: 1.44952607155
+2020-06-05 15:27:40,273-INFO: epoch[0]-batch[90] - loss: 1.5926772356; acc_top1: 0.6328125; acc_top5: 0.859375; time: 1.45620679855
+2020-06-05 15:27:55,660-INFO: epoch[0]-batch[100] - loss: 1.40280032158; acc_top1: 0.671875; acc_top5: 0.875; time: 1.50846099854
+```
+The loss is now stable, and in our experiments this setup yields a quantized model with 77.5% top-1 accuracy. Besides the settings in the command above, the initial `pact` threshold is set to 20. The quantized model can be downloaded [here](https://paddlemodels.bj.bcebos.com/PaddleSlim/mobilenetv3_pact_quant.tar).
diff --git a/demo/quant/pact_quant_aware/image/activation_dist.png b/demo/quant/pact_quant_aware/image/activation_dist.png
new file mode 100644
index 0000000000000000000000000000000000000000..9e133f8c6d9628d33410ce82d6ad4fa2233dd323
Binary files /dev/null and b/demo/quant/pact_quant_aware/image/activation_dist.png differ
diff --git a/demo/quant/pact_quant_aware/image/pact.png b/demo/quant/pact_quant_aware/image/pact.png
new file mode 100644
index 0000000000000000000000000000000000000000..86e0733fac37a968df73e24f1c9d2870be3e0988
Binary files /dev/null and b/demo/quant/pact_quant_aware/image/pact.png differ
diff --git a/demo/quant/pact_quant_aware/image/pact_our.png b/demo/quant/pact_quant_aware/image/pact_our.png
new file mode 100644
index 0000000000000000000000000000000000000000..62eefdb46bd634b832e0daf1d949dbe35d871406
Binary files /dev/null and b/demo/quant/pact_quant_aware/image/pact_our.png differ
diff --git a/demo/quant/pact_quant_aware/pact.py b/demo/quant/pact_quant_aware/pact.py
new file mode 100644
index 0000000000000000000000000000000000000000..26a2a5efd6e9b819db9b7134a62a1ac8c1fc296f
--- /dev/null
+++ b/demo/quant/pact_quant_aware/pact.py
@@ -0,0 +1,30 @@
+import sys
+import paddle
+import paddle.fluid as fluid
+from paddleslim.quant import quant_aware, convert
+import numpy as np
+
+from paddle.fluid.layer_helper import LayerHelper
+
+
+def pact(x, name=None):
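+    # Learnable clipping: x - relu(x - u) + relu(-u - x) == clip(x, -u, u),
+    # where u is a trainable threshold initialized to init_thres.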
+ helper = LayerHelper("pact", **locals())
+ dtype = 'float32'
+ init_thres = 20
+ u_param_attr = fluid.ParamAttr(
+ name=x.name + '_pact',
+ initializer=fluid.initializer.ConstantInitializer(value=init_thres),
+ regularizer=fluid.regularizer.L2Decay(0.0001),
+ learning_rate=1)
+ u_param = helper.create_parameter(
+ attr=u_param_attr, shape=[1], dtype=dtype)
+ x = fluid.layers.elementwise_sub(
+ x, fluid.layers.relu(fluid.layers.elementwise_sub(x, u_param)))
+ x = fluid.layers.elementwise_add(
+ x, fluid.layers.relu(fluid.layers.elementwise_sub(-u_param, x)))
+
+ return x
+
+
+def get_optimizer():
+ return fluid.optimizer.MomentumOptimizer(0.0001, 0.9)
diff --git a/demo/quant/pact_quant_aware/train.py b/demo/quant/pact_quant_aware/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..812c4a04861e25786b0d4f9503c5fd47bfb2a994
--- /dev/null
+++ b/demo/quant/pact_quant_aware/train.py
@@ -0,0 +1,362 @@
+import os
+import sys
+import logging
+import paddle
+import argparse
+import functools
+import math
+import time
+import numpy as np
+import paddle.fluid as fluid
+sys.path.append(os.path.dirname(__file__))
+sys.path.append(
+    os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
+from paddleslim.common import get_logger
+from paddleslim.analysis import flops
+from paddleslim.quant import quant_aware, quant_post, convert
+import models
+from utility import add_arguments, print_arguments
+from pact import *
+quantization_model_save_dir = './quantization_models/'
+
+_logger = get_logger(__name__, level=logging.INFO)
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('batch_size', int, 64 * 4,
+ "Minibatch size.")
+add_arg('use_gpu', bool, True,
+ "Whether to use GPU or not.")
+add_arg('model', str, "MobileNet",
+ "The target model.")
+add_arg('pretrained_model', str, "../pretrained_model/MobileNetV1_pretrained",
+        "The path of the pretrained model.")
+add_arg('lr', float, 0.0001,
+        "The learning rate used to fine-tune the model.")
+add_arg('lr_strategy', str, "piecewise_decay",
+ "The learning rate decay strategy.")
+add_arg('l2_decay', float, 3e-5,
+ "The l2_decay parameter.")
+add_arg('momentum_rate', float, 0.9,
+ "The value of momentum_rate.")
+add_arg('num_epochs', int, 1,
+ "The number of total epochs.")
+add_arg('total_images', int, 1281167,
+ "The number of total training images.")
+parser.add_argument('--step_epochs', nargs='+', type=int,
+ default=[30, 60, 90],
+ help="piecewise decay step")
+add_arg('config_file', str, None,
+ "The config file for compression with yaml format.")
+add_arg('data', str, "imagenet",
+ "Which data to use. 'mnist' or 'imagenet'")
+add_arg('log_period', int, 10,
+ "Log period in batches.")
+add_arg('checkpoint_dir', str, None,
+ "checkpoint dir")
+add_arg('checkpoint_epoch', int, None,
+ "checkpoint epoch")
+add_arg('output_dir', str, "output/MobileNetV3_large_x1_0",
+ "model save dir")
+add_arg('use_pact', bool, True,
+ "Whether to use PACT or not.")
+
+# yapf: enable
+
+model_list = [m for m in dir(models) if "__" not in m]
+
+
+def piecewise_decay(args):
+ step = int(math.ceil(float(args.total_images) / args.batch_size))
+ bd = [step * e for e in args.step_epochs]
+ lr = [args.lr * (0.1**i) for i in range(len(bd) + 1)]
+ learning_rate = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
+ optimizer = fluid.optimizer.Momentum(
+ learning_rate=learning_rate,
+ momentum=args.momentum_rate,
+ regularization=fluid.regularizer.L2Decay(args.l2_decay))
+ return optimizer
+
+
+def cosine_decay(args):
+ step = int(math.ceil(float(args.total_images) / args.batch_size))
+ learning_rate = fluid.layers.cosine_decay(
+ learning_rate=args.lr, step_each_epoch=step, epochs=args.num_epochs)
+ optimizer = fluid.optimizer.Momentum(
+ learning_rate=learning_rate,
+ momentum=args.momentum_rate,
+ regularization=fluid.regularizer.L2Decay(args.l2_decay))
+ return optimizer
+
+
+def create_optimizer(args):
+ if args.lr_strategy == "piecewise_decay":
+ return piecewise_decay(args)
+ elif args.lr_strategy == "cosine_decay":
+ return cosine_decay(args)
+
+
+def compress(args):
+ # 1. quantization configs
+ quant_config = {
+ # weight quantize type, default is 'channel_wise_abs_max'
+ 'weight_quantize_type': 'channel_wise_abs_max',
+ # activation quantize type, default is 'moving_average_abs_max'
+ 'activation_quantize_type': 'moving_average_abs_max',
+ # weight quantize bit num, default is 8
+ 'weight_bits': 8,
+ # activation quantize bit num, default is 8
+ 'activation_bits': 8,
+ # ops of name_scope in not_quant_pattern list, will not be quantized
+ 'not_quant_pattern': ['skip_quant'],
+ # ops of type in quantize_op_types, will be quantized
+ 'quantize_op_types': ['conv2d', 'depthwise_conv2d', 'mul'],
+ # data type after quantization, such as 'uint8', 'int8', etc. default is 'int8'
+ 'dtype': 'int8',
+        # window size for 'range_abs_max' quantization. default is 10000
+ 'window_size': 10000,
+ # The decay coefficient of moving average, default is 0.9
+ 'moving_rate': 0.9,
+ }
+
+ train_reader = None
+ test_reader = None
+ if args.data == "mnist":
+ import paddle.dataset.mnist as reader
+ train_reader = reader.train()
+ val_reader = reader.test()
+ class_dim = 10
+ image_shape = "1,28,28"
+ elif args.data == "imagenet":
+ import imagenet_reader as reader
+ train_reader = reader.train()
+ val_reader = reader.val()
+ class_dim = 1000
+ image_shape = "3,224,224"
+ else:
+ raise ValueError("{} is not supported.".format(args.data))
+
+ image_shape = [int(m) for m in image_shape.split(",")]
+ assert args.model in model_list, "{} is not in lists: {}".format(args.model,
+ model_list)
+ image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+ if args.use_pact:
+ image.stop_gradient = False
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+ # model definition
+ model = models.__dict__[args.model]()
+ out = model.net(input=image, class_dim=class_dim)
+ cost = fluid.layers.cross_entropy(input=out, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+ acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+ acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+
+ train_prog = fluid.default_main_program()
+ val_program = fluid.default_main_program().clone(for_test=True)
+
+ place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
+ opt = create_optimizer(args)
+ opt.minimize(avg_cost)
+
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+
+ # 2. quantization transform programs (training aware)
+    # Insert quantization transforms into the graph before training and testing.
+    # According to the weight and activation quantization types, fake quantize
+    # and fake dequantize operators are added to the graph.
+
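+    # When PACT is enabled, pass the activation preprocess function, the
+    # optimizer factory for its thresholds, and the executor to quant_aware.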
+ if args.use_pact:
+ act_preprocess_func = pact
+ optimizer_func = get_optimizer
+ executor = exe
+ else:
+ act_preprocess_func = None
+ optimizer_func = None
+ executor = None
+
+ val_program = quant_aware(
+ val_program,
+ place,
+ quant_config,
+ scope=None,
+ act_preprocess_func=act_preprocess_func,
+ optimizer_func=optimizer_func,
+ executor=executor,
+ for_test=True)
+ compiled_train_prog = quant_aware(
+ train_prog,
+ place,
+ quant_config,
+ scope=None,
+ act_preprocess_func=act_preprocess_func,
+ optimizer_func=optimizer_func,
+ executor=executor,
+ for_test=False)
+
+ assert os.path.exists(
+ args.pretrained_model), "pretrained_model doesn't exist"
+
+ if args.pretrained_model:
+
+ def if_exist(var):
+ return os.path.exists(os.path.join(args.pretrained_model, var.name))
+
+ fluid.io.load_vars(exe, args.pretrained_model, predicate=if_exist)
+
+ val_reader = paddle.fluid.io.batch(val_reader, batch_size=args.batch_size)
+ train_reader = paddle.fluid.io.batch(
+ train_reader, batch_size=args.batch_size, drop_last=True)
+
+    train_feeder = fluid.DataFeeder([image, label], place)
+    val_feeder = fluid.DataFeeder(
+        [image, label], place, program=val_program)
+
+ def test(epoch, program):
+ batch_id = 0
+ acc_top1_ns = []
+ acc_top5_ns = []
+ for data in val_reader():
+ start_time = time.time()
+ acc_top1_n, acc_top5_n = exe.run(
+ program,
+                feed=val_feeder.feed(data),
+ fetch_list=[acc_top1.name, acc_top5.name])
+ end_time = time.time()
+ if batch_id % args.log_period == 0:
+ _logger.info(
+ "Eval epoch[{}] batch[{}] - acc_top1: {}; acc_top5: {}; time: {}".
+ format(epoch, batch_id,
+ np.mean(acc_top1_n),
+ np.mean(acc_top5_n), end_time - start_time))
+ acc_top1_ns.append(np.mean(acc_top1_n))
+ acc_top5_ns.append(np.mean(acc_top5_n))
+ batch_id += 1
+
+ _logger.info("Final eval epoch[{}] - acc_top1: {}; acc_top5: {}".format(
+ epoch,
+ np.mean(np.array(acc_top1_ns)), np.mean(np.array(acc_top5_ns))))
+ return np.mean(np.array(acc_top1_ns))
+
+ def train(epoch, compiled_train_prog):
+
+ batch_id = 0
+ for data in train_reader():
+ start_time = time.time()
+ loss_n, acc_top1_n, acc_top5_n = exe.run(
+ compiled_train_prog,
+ feed=train_feeder.feed(data),
+ fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name])
+
+ end_time = time.time()
+ loss_n = np.mean(loss_n)
+ acc_top1_n = np.mean(acc_top1_n)
+ acc_top5_n = np.mean(acc_top5_n)
+ if batch_id % args.log_period == 0:
+ _logger.info(
+ "epoch[{}]-batch[{}] - loss: {}; acc_top1: {}; acc_top5: {}; time: {}".
+ format(epoch, batch_id, loss_n, acc_top1_n, acc_top5_n,
+ end_time - start_time))
+
+ if args.use_pact and batch_id % 1000 == 0:
+ threshold = {}
+ for var in val_program.list_vars():
+ if 'pact' in var.name:
+ array = np.array(fluid.global_scope().find_var(var.name)
+ .get_tensor())
+ threshold[var.name] = array[0]
+ print(threshold)
+
+ batch_id += 1
+
+ build_strategy = fluid.BuildStrategy()
+ build_strategy.memory_optimize = False
+ build_strategy.enable_inplace = False
+ build_strategy.fuse_all_reduce_ops = False
+ build_strategy.sync_batch_norm = False
+ exec_strategy = fluid.ExecutionStrategy()
+ compiled_train_prog = compiled_train_prog.with_data_parallel(
+ loss_name=avg_cost.name,
+ build_strategy=build_strategy,
+ exec_strategy=exec_strategy)
+
+ # train loop
+ best_acc1 = 0.0
+ best_epoch = 0
+
+ start_epoch = 0
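+    # Resume from a checkpoint: reload persistables and restore the
+    # piecewise-decay step counter to match the resumed epoch.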
+ if args.checkpoint_dir is not None:
+ ckpt_path = args.checkpoint_dir
+ assert args.checkpoint_epoch is not None, "checkpoint_epoch must be set"
+ start_epoch = args.checkpoint_epoch
+ fluid.io.load_persistables(
+ exe, dirname=args.checkpoint_dir, main_program=val_program)
+ start_step = start_epoch * int(
+ math.ceil(float(args.total_images) / args.batch_size))
+ v = fluid.global_scope().find_var('@LR_DECAY_COUNTER@').get_tensor()
+ v.set(np.array([start_step]).astype(np.float32), place)
+
+ for i in range(start_epoch, args.num_epochs):
+ train(i, compiled_train_prog)
+ acc1 = test(i, val_program)
+ fluid.io.save_persistables(
+ exe,
+ dirname=os.path.join(args.output_dir, str(i)),
+ main_program=val_program)
+ if acc1 > best_acc1:
+ best_acc1 = acc1
+ best_epoch = i
+ fluid.io.save_persistables(
+ exe,
+ dirname=os.path.join(args.output_dir, 'best_model'),
+ main_program=val_program)
+ if os.path.exists(os.path.join(args.output_dir, 'best_model')):
+ fluid.io.load_persistables(
+ exe,
+ dirname=os.path.join(args.output_dir, 'best_model'),
+ main_program=val_program)
+ # 3. Freeze the graph after training by adjusting the quantize
+ # operators' order for the inference.
+ # The dtype of float_program's weights is float32, but in int8 range.
+ float_program, int8_program = convert(val_program, place, quant_config, \
+ scope=None, \
+ save_int8=True)
+ print("eval best_model after convert")
+ final_acc1 = test(best_epoch, float_program)
+ # 4. Save inference model
+ model_path = os.path.join(quantization_model_save_dir, args.model,
+ 'act_' + quant_config['activation_quantize_type']
+ + '_w_' + quant_config['weight_quantize_type'])
+ float_path = os.path.join(model_path, 'float')
+ int8_path = os.path.join(model_path, 'int8')
+ if not os.path.isdir(model_path):
+ os.makedirs(model_path)
+
+ fluid.io.save_inference_model(
+ dirname=float_path,
+ feeded_var_names=[image.name],
+ target_vars=[out],
+ executor=exe,
+ main_program=float_program,
+ model_filename=float_path + '/model',
+ params_filename=float_path + '/params')
+
+ fluid.io.save_inference_model(
+ dirname=int8_path,
+ feeded_var_names=[image.name],
+ target_vars=[out],
+ executor=exe,
+ main_program=int8_program,
+ model_filename=int8_path + '/model',
+ params_filename=int8_path + '/params')
+
+
+def main():
+ args = parser.parse_args()
+ print_arguments(args)
+ compress(args)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/demo/quant/quant_aware/README.md b/demo/quant/quant_aware/README.md
index 5fae50c5ff752c36863bfa57a9a9f08135b90f00..0c7315625f04a51cede038a4ba6ebf991d9efaa0 100644
--- a/demo/quant/quant_aware/README.md
+++ b/demo/quant/quant_aware/README.md
@@ -4,11 +4,31 @@
## 接口介绍
-请参考 量化API文档。
+Please refer to the quantization API documentation.
-## 分类模型的离线量化流程
+## Quantization-aware training workflow for a classification model
-### 1. 配置量化参数
+### Prepare the data
+
+Create a ``data`` directory under ``demo`` and extract the ``ImageNet`` dataset into it. After extraction, ``data/ILSVRC2012`` should contain:
+- a ``'train'`` directory holding the training images
+- the ``'train_list.txt'`` file
+- a ``'val'`` directory holding the validation images
+- the ``'val_list.txt'`` file
+
+### Prepare the model to quantize
+
+Download a trained model and extract it with the following commands:
+
+```
+mkdir pretrain
+cd pretrain
+wget http://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV1_pretrained.tar
+tar xf MobileNetV1_pretrained.tar
+cd ..
+```
+
+### Configure the quantization parameters
```
quant_config = {
@@ -20,12 +40,11 @@ quant_config = {
'quantize_op_types': ['conv2d', 'depthwise_conv2d', 'mul'],
'dtype': 'int8',
'window_size': 10000,
- 'moving_rate': 0.9,
- 'quant_weight_only': False
+ 'moving_rate': 0.9
}
```
-### 2. 对训练和测试program插入可训练量化op
+### Insert trainable quantization ops into the training and test programs
```
val_program = quant_aware(val_program, place, quant_config, scope=None, for_test=True)
@@ -33,7 +52,7 @@ val_program = quant_aware(val_program, place, quant_config, scope=None, for_test
compiled_train_prog = quant_aware(train_prog, place, quant_config, scope=None, for_test=False)
```
-### 3.关掉指定build策略
+### Disable certain build strategies
```
build_strategy = fluid.BuildStrategy()
@@ -46,32 +65,10 @@ compiled_train_prog = compiled_train_prog.with_data_parallel(
exec_strategy=exec_strategy)
```
-### 4. freeze program
-
-```
-float_program, int8_program = convert(val_program,
- place,
- quant_config,
- scope=None,
- save_int8=True)
-```
-### 5.保存预测模型
+### Training command
```
-fluid.io.save_inference_model(
- dirname=float_path,
- feeded_var_names=[image.name],
- target_vars=[out], executor=exe,
- main_program=float_program,
- model_filename=float_path + '/model',
- params_filename=float_path + '/params')
-
-fluid.io.save_inference_model(
- dirname=int8_path,
- feeded_var_names=[image.name],
- target_vars=[out], executor=exe,
- main_program=int8_program,
- model_filename=int8_path + '/model',
- params_filename=int8_path + '/params')
+python train.py --model MobileNet --pretrained_model ./pretrain/MobileNetV1_pretrained --checkpoint_dir ./output/mobilenetv1 --num_epochs 30
```
+After the run finishes, the final test result of ``best_model`` is printed; it should be very close to MobileNet's accuracy before quantization, top1=70.99%, top5=89.68%.
diff --git a/demo/quant/quant_aware/image_classification_training_aware_quantization_quick_start.ipynb b/demo/quant/quant_aware/image_classification_training_aware_quantization_quick_start.ipynb
new file mode 100755
index 0000000000000000000000000000000000000000..0c771cc096b26d79cc7d0b0b2aa87ff29bbce27f
--- /dev/null
+++ b/demo/quant/quant_aware/image_classification_training_aware_quantization_quick_start.ipynb
@@ -0,0 +1,342 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 图像分类模型量化训练-快速开始"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "该教程以图像分类模型MobileNetV1为例,说明如何快速使用PaddleSlim的[量化训练接口](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/docs/docs/api/quantization_api.md)。 该示例包含以下步骤:\n",
+ "\n",
+ "1. 导入依赖\n",
+ "2. 构建模型\n",
+ "3. 训练模型\n",
+ "4. 量化\n",
+ "5. 训练和测试量化后的模型\n",
+ "6. 保存量化后的模型"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. 导入依赖\n",
+ "PaddleSlim依赖Paddle1.7版本,请确认已正确安装Paddle,然后按以下方式导入Paddle和PaddleSlim:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import paddle\n",
+ "import paddle.fluid as fluid\n",
+ "import paddleslim as slim\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. 构建网络\n",
+ "该章节构造一个用于对MNIST数据进行分类的分类模型,选用`MobileNetV1`,并将输入大小设置为`[1, 28, 28]`,输出类别数为10。 为了方便展示示例,我们在`paddleslim.models`下预定义了用于构建分类模型的方法,执行以下代码构建分类模型:\n",
+ "\n",
+ ">注意:paddleslim.models下的API并非PaddleSlim常规API,是为了简化示例而封装预定义的一系列方法,比如:模型结构的定义、Program的构建等。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "exe, train_program, val_program, inputs, outputs = \\\n",
+ " slim.models.image_classification(\"MobileNet\", [1, 28, 28], 10, use_gpu=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. 训练模型\n",
+ "该章节介绍了如何定义输入数据和如何训练和测试分类模型。先训练分类模型的原因是量化训练过程是在训练好的模型上进行的,也就是说是在训练好的模型的基础上加入量化反量化op之后,用小学习率进行参数微调。\n",
+ "\n",
+ "### 3.1 定义输入数据\n",
+ "\n",
+ "为了快速执行该示例,我们选取简单的MNIST数据,Paddle框架的`paddle.dataset.mnist`包定义了MNIST数据的下载和读取。\n",
+ "代码如下:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import paddle.dataset.mnist as reader\n",
+ "train_reader = paddle.fluid.io.batch(\n",
+ " reader.train(), batch_size=128, drop_last=True)\n",
+ "test_reader = paddle.fluid.io.batch(\n",
+ " reader.train(), batch_size=128, drop_last=True)\n",
+ "train_feeder = fluid.DataFeeder(inputs, fluid.CPUPlace())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.2 训练和测试\n",
+ "先定义训练和测试函数,正常训练和量化训练时只需要调用函数即可。在训练函数中执行了一个epoch的训练,因为MNIST数据集数据较少,一个epoch就可将top1精度训练到95%以上。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def train(prog):\n",
+ " iter = 0\n",
+ " for data in train_reader():\n",
+ " acc1, acc5, loss = exe.run(prog, feed=train_feeder.feed(data), fetch_list=outputs)\n",
+ " if iter % 100 == 0:\n",
+ " print('train iter={}, top1={}, top5={}, loss={}'.format(iter, acc1.mean(), acc5.mean(), loss.mean()))\n",
+ " iter += 1\n",
+ " \n",
+ "def test(prog):\n",
+ " iter = 0\n",
+ " res = [[], []]\n",
+ " for data in train_reader():\n",
+ " acc1, acc5, loss = exe.run(prog, feed=train_feeder.feed(data), fetch_list=outputs)\n",
+ " if iter % 100 == 0:\n",
+ " print('test iter={}, top1={}, top5={}, loss={}'.format(iter, acc1.mean(), acc5.mean(), loss.mean()))\n",
+ " res[0].append(acc1.mean())\n",
+ " res[1].append(acc5.mean())\n",
+ " iter += 1\n",
+ " print('final test result top1={}, top5={}'.format(np.array(res[0]).mean(), np.array(res[1]).mean()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "调用``train``函数训练分类网络,``train_program``是在第2步:构建网络中定义的。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "train iter=0, top1=0.1171875, top5=0.546875, loss=2.79680204391\n",
+ "train iter=100, top1=0.9296875, top5=1.0, loss=0.305284500122\n",
+ "train iter=200, top1=0.9609375, top5=0.9921875, loss=0.158525630832\n",
+ "train iter=300, top1=0.9609375, top5=0.9921875, loss=0.146427512169\n",
+ "train iter=400, top1=0.9609375, top5=1.0, loss=0.179066047072\n"
+ ]
+ }
+ ],
+ "source": [
+ "train(train_program)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "调用``test``函数测试分类网络,``val_program``是在第2步:构建网络中定义的。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "test iter=0, top1=0.96875, top5=1.0, loss=0.0801232308149\n",
+ "test iter=100, top1=0.9609375, top5=1.0, loss=0.104892581701\n",
+ "test iter=200, top1=0.96875, top5=1.0, loss=0.156774014235\n",
+ "test iter=300, top1=0.984375, top5=1.0, loss=0.0931615754962\n",
+ "test iter=400, top1=0.9453125, top5=1.0, loss=0.184863254428\n",
+ "final test result top1=0.970469415188, top5=0.999282181263\n"
+ ]
+ }
+ ],
+ "source": [
+ "test(val_program)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. 量化\n",
+ "\n",
+ "按照[默认配置](https://paddlepaddle.github.io/PaddleSlim/api/quantization_api/#_1)在``train_program``和``val_program``中加入量化和反量化op."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-06 09:08:49,489-INFO: quant_aware config {'moving_rate': 0.9, 'weight_quantize_type': 'channel_wise_abs_max', 'is_full_quantize': False, 'dtype': 'int8', 'weight_bits': 8, 'window_size': 10000, 'activation_bits': 8, 'quantize_op_types': ['conv2d', 'depthwise_conv2d', 'mul'], 'not_quant_pattern': ['skip_quant'], 'activation_quantize_type': 'moving_average_abs_max', 'for_tensorrt': False}\n",
+ "2020-02-06 09:08:50,943-INFO: quant_aware config {'moving_rate': 0.9, 'weight_quantize_type': 'channel_wise_abs_max', 'is_full_quantize': False, 'dtype': 'int8', 'weight_bits': 8, 'window_size': 10000, 'activation_bits': 8, 'quantize_op_types': ['conv2d', 'depthwise_conv2d', 'mul'], 'not_quant_pattern': ['skip_quant'], 'activation_quantize_type': 'moving_average_abs_max', 'for_tensorrt': False}\n"
+ ]
+ }
+ ],
+ "source": [
+ "quant_program = slim.quant.quant_aware(train_program, exe.place, for_test=False)\n",
+ "val_quant_program = slim.quant.quant_aware(val_program, exe.place, for_test=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5. 训练和测试量化后的模型\n",
+ "微调量化后的模型,训练一个epoch后测试。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "train iter=0, top1=0.953125, top5=1.0, loss=0.184170544147\n",
+ "train iter=100, top1=0.96875, top5=1.0, loss=0.0945074558258\n",
+ "train iter=200, top1=0.9765625, top5=1.0, loss=0.0915599390864\n",
+ "train iter=300, top1=0.9765625, top5=1.0, loss=0.0562560297549\n",
+ "train iter=400, top1=0.9609375, top5=1.0, loss=0.094195574522\n"
+ ]
+ }
+ ],
+ "source": [
+ "train(quant_program)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "测试量化后的模型,和``3.2 训练和测试``中得到的测试结果相比,精度相近,达到了无损量化。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "test iter=0, top1=0.984375, top5=1.0, loss=0.0542894415557\n",
+ "test iter=100, top1=0.9609375, top5=1.0, loss=0.0662319809198\n",
+ "test iter=200, top1=0.9609375, top5=1.0, loss=0.0832970961928\n",
+ "test iter=300, top1=0.9921875, top5=1.0, loss=0.0262515246868\n",
+ "test iter=400, top1=0.96875, top5=1.0, loss=0.123742781579\n",
+ "final test result top1=0.984057843685, top5=0.999799668789\n"
+ ]
+ }
+ ],
+ "source": [
+ "test(val_quant_program)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6. 保存量化后的模型\n",
+ "\n",
+ "在``4. 量化``中使用接口``slim.quant.quant_aware``接口得到的模型只适合训练时使用,为了得到最终使用时的模型,需要使用[slim.quant.convert](https://paddlepaddle.github.io/PaddleSlim/api/quantization_api/#convert)接口,然后使用[fluid.io.save_inference_model](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/io_cn/save_inference_model_cn.html#save-inference-model)保存模型。``float_prog``的参数数据类型是float32,但是数据范围是int8, 保存之后可使用fluid或者paddle-lite加载使用,paddle-lite在使用时,会先将类型转换为int8。``int8_prog``的参数数据类型是int8, 保存后可看到量化后模型大小,不可加载使用。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-06 09:09:27,529-INFO: convert config {'moving_rate': 0.9, 'weight_quantize_type': 'channel_wise_abs_max', 'is_full_quantize': False, 'dtype': 'int8', 'weight_bits': 8, 'window_size': 10000, 'activation_bits': 8, 'quantize_op_types': ['conv2d', 'depthwise_conv2d', 'mul'], 'not_quant_pattern': ['skip_quant'], 'activation_quantize_type': 'moving_average_abs_max', 'for_tensorrt': False}\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "[u'save_infer_model/scale_0',\n",
+ " u'save_infer_model/scale_1',\n",
+ " u'save_infer_model/scale_2']"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "float_prog, int8_prog = slim.quant.convert(val_quant_program, exe.place, save_int8=True)\n",
+ "target_vars = [float_prog.global_block().var(name) for name in outputs]\n",
+ "fluid.io.save_inference_model(dirname='./inference_model/float',\n",
+ " feeded_var_names=[var.name for var in inputs],\n",
+ " target_vars=target_vars,\n",
+ " executor=exe,\n",
+ " main_program=float_prog)\n",
+ "fluid.io.save_inference_model(dirname='./inference_model/int8',\n",
+ " feeded_var_names=[var.name for var in inputs],\n",
+ " target_vars=target_vars,\n",
+ " executor=exe,\n",
+ " main_program=int8_prog)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/demo/quant/quant_aware/train.py b/demo/quant/quant_aware/train.py
index 45b1aa72c062e1c8ceebdfb92e2f80df80246a58..c613380af9581414a45be86e8d4708e91c15b2e1 100644
--- a/demo/quant/quant_aware/train.py
+++ b/demo/quant/quant_aware/train.py
@@ -8,11 +8,11 @@ import math
import time
import numpy as np
import paddle.fluid as fluid
-sys.path.append(sys.path[0] + "../../../")
-sys.path.append(sys.path[0] + "../../")
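+# Point sys.path at the demo root so the shared `models` and `utility`
+# modules imported below can be found.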
+sys.path[0] = os.path.join(
+    os.path.dirname(__file__), os.path.pardir, os.path.pardir)
from paddleslim.common import get_logger
from paddleslim.analysis import flops
-from paddleslim.quant import quant_aware, quant_post, convert
+from paddleslim.quant import quant_aware, convert
import models
from utility import add_arguments, print_arguments
@@ -37,7 +37,7 @@ parser.add_argument('--step_epochs', nargs='+', type=int, default=[30, 60, 90],
add_arg('config_file', str, None, "The config file for compression with yaml format.")
add_arg('data', str, "imagenet", "Which data to use. 'mnist' or 'imagenet'")
add_arg('log_period', int, 10, "Log period in batches.")
-add_arg('test_period', int, 10, "Test period in epoches.")
+add_arg('checkpoint_dir', str, "output", "checkpoint save dir")
# yapf: enable
model_list = [m for m in dir(models) if "__" not in m]
@@ -78,27 +78,24 @@ def compress(args):
# 1. quantization configs
############################################################################################################
quant_config = {
- # weight quantize type, default is 'abs_max'
- 'weight_quantize_type': 'abs_max',
- # activation quantize type, default is 'abs_max'
+ # weight quantize type, default is 'channel_wise_abs_max'
+ 'weight_quantize_type': 'channel_wise_abs_max',
+ # activation quantize type, default is 'moving_average_abs_max'
'activation_quantize_type': 'moving_average_abs_max',
# weight quantize bit num, default is 8
'weight_bits': 8,
# activation quantize bit num, default is 8
'activation_bits': 8,
- # op of name_scope in not_quant_pattern list, will not quantized
+    # ops whose name_scope matches an entry in not_quant_pattern will not be quantized
'not_quant_pattern': ['skip_quant'],
- # op of types in quantize_op_types, will quantized
+    # ops whose type is listed in quantize_op_types will be quantized
'quantize_op_types': ['conv2d', 'depthwise_conv2d', 'mul'],
- # data type after quantization, default is 'int8'
+    # data type after quantization, e.g. 'uint8' or 'int8'; default is 'int8'
'dtype': 'int8',
     # window size for 'range_abs_max' quantization. default is 10000
'window_size': 10000,
# The decay coefficient of moving average, default is 0.9
'moving_rate': 0.9,
- # if set quant_weight_only True, then only quantize parameters of layers which need quantization,
- # and insert anti-quantization op for parameters of these layers.
- 'quant_weight_only': False
}
train_reader = None
@@ -141,23 +138,29 @@ def compress(args):
# According to the weight and activation quantization type, the graph will be added
# some fake quantize operators and fake dequantize operators.
############################################################################################################
- val_program = quant_aware(val_program, place, quant_config, scope=None, for_test=True)
- compiled_train_prog = quant_aware(train_prog, place, quant_config, scope=None, for_test=False)
+ val_program = quant_aware(
+ val_program, place, quant_config, scope=None, for_test=True)
+ compiled_train_prog = quant_aware(
+ train_prog, place, quant_config, scope=None, for_test=False)
opt = create_optimizer(args)
opt.minimize(avg_cost)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
+ assert os.path.exists(
+ args.pretrained_model), "pretrained_model doesn't exist"
+
if args.pretrained_model:
def if_exist(var):
- return os.path.exists(os.path.join(args.pretrained_model, var.name))
+ return os.path.exists(
+ os.path.join(args.pretrained_model, var.name))
fluid.io.load_vars(exe, args.pretrained_model, predicate=if_exist)
- val_reader = paddle.batch(val_reader, batch_size=args.batch_size)
- train_reader = paddle.batch(
+ val_reader = paddle.fluid.io.batch(val_reader, batch_size=args.batch_size)
+ train_reader = paddle.fluid.io.batch(
train_reader, batch_size=args.batch_size, drop_last=True)
train_feeder = feeder = fluid.DataFeeder([image, label], place)
@@ -192,16 +195,6 @@ def compress(args):
return np.mean(np.array(acc_top1_ns))
def train(epoch, compiled_train_prog):
- build_strategy = fluid.BuildStrategy()
- build_strategy.memory_optimize = False
- build_strategy.enable_inplace = False
- build_strategy.fuse_all_reduce_ops = False
- build_strategy.sync_batch_norm = False
- exec_strategy = fluid.ExecutionStrategy()
- compiled_train_prog = compiled_train_prog.with_data_parallel(
- loss_name=avg_cost.name,
- build_strategy=build_strategy,
- exec_strategy=exec_strategy)
batch_id = 0
for data in train_reader():
@@ -221,14 +214,41 @@ def compress(args):
end_time - start_time))
batch_id += 1
+ build_strategy = fluid.BuildStrategy()
+ build_strategy.memory_optimize = False
+ build_strategy.enable_inplace = False
+ build_strategy.fuse_all_reduce_ops = False
+ build_strategy.sync_batch_norm = False
+ exec_strategy = fluid.ExecutionStrategy()
+ compiled_train_prog = compiled_train_prog.with_data_parallel(
+ loss_name=avg_cost.name,
+ build_strategy=build_strategy,
+ exec_strategy=exec_strategy)
+
############################################################################################################
# train loop
############################################################################################################
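+    # Track the best top-1 validation accuracy and the epoch that produced it.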
+ best_acc1 = 0.0
+ best_epoch = 0
for i in range(args.num_epochs):
train(i, compiled_train_prog)
- if i % args.test_period == 0:
- test(i, val_program)
-
+ acc1 = test(i, val_program)
+ fluid.io.save_persistables(
+ exe,
+ dirname=os.path.join(args.checkpoint_dir, str(i)),
+ main_program=val_program)
+ if acc1 > best_acc1:
+ best_acc1 = acc1
+ best_epoch = i
+ fluid.io.save_persistables(
+ exe,
+ dirname=os.path.join(args.checkpoint_dir, 'best_model'),
+ main_program=val_program)
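+    # After the training loop, reload the best checkpoint before freezing.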
+    if os.path.exists(os.path.join(args.checkpoint_dir, 'best_model')):
+        fluid.io.load_persistables(
+            exe,
+            dirname=os.path.join(args.checkpoint_dir, 'best_model'),
+            main_program=val_program)
############################################################################################################
# 3. Freeze the graph after training by adjusting the quantize
# operators' order for the inference.
@@ -237,13 +257,14 @@ def compress(args):
float_program, int8_program = convert(val_program, place, quant_config, \
scope=None, \
save_int8=True)
-
+ print("eval best_model after convert")
+ final_acc1 = test(best_epoch, float_program)
############################################################################################################
# 4. Save inference model
############################################################################################################
model_path = os.path.join(quantization_model_save_dir, args.model,
- 'act_' + quant_config['activation_quantize_type'] + '_w_' + quant_config[
- 'weight_quantize_type'])
+ 'act_' + quant_config['activation_quantize_type']
+ + '_w_' + quant_config['weight_quantize_type'])
float_path = os.path.join(model_path, 'float')
int8_path = os.path.join(model_path, 'int8')
if not os.path.isdir(model_path):
@@ -252,7 +273,8 @@ def compress(args):
fluid.io.save_inference_model(
dirname=float_path,
feeded_var_names=[image.name],
- target_vars=[out], executor=exe,
+ target_vars=[out],
+ executor=exe,
main_program=float_program,
model_filename=float_path + '/model',
params_filename=float_path + '/params')
@@ -260,7 +282,8 @@ def compress(args):
fluid.io.save_inference_model(
dirname=int8_path,
feeded_var_names=[image.name],
- target_vars=[out], executor=exe,
+ target_vars=[out],
+ executor=exe,
main_program=int8_program,
model_filename=int8_path + '/model',
params_filename=int8_path + '/params')
diff --git a/demo/quant/quant_embedding/README.md b/demo/quant/quant_embedding/README.md
index 422ef5b6ecbf96a356dfb6e8943d2863f6da5e23..609b25159e2baa97e3b1e370edc7074e1a29dcb8 100755
--- a/demo/quant/quant_embedding/README.md
+++ b/demo/quant/quant_embedding/README.md
@@ -3,7 +3,7 @@
This example shows how to use the Embedding quantization API [paddleslim.quant.quant_embedding](). The ``quant_embedding`` API quantizes the Embedding parameters of a network from ``float32`` to ``8-bit`` integers, reducing model storage and memory usage with almost no loss in accuracy.
-For details of the API, please refer to the quantization API documentation.
+For details of the API, please refer to the quantization API documentation.
The modifications this API makes to the program:
diff --git a/demo/quant/quant_embedding/infer.py b/demo/quant/quant_embedding/infer.py
index 40ae2ee8c639754d24a5474c5e58d7e062a1d4d0..8cfcd5a15a3e1e95f5d0ae71673be016c12b077f 100755
--- a/demo/quant/quant_embedding/infer.py
+++ b/demo/quant/quant_embedding/infer.py
@@ -80,7 +80,12 @@ def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w):
dirname=model_path,
main_program=copy_program)
if args.emb_quant:
- config = {'params_name': 'emb', 'quantize_type': 'abs_max'}
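+            # quant_embedding config: name the op types to quantize and nest
+            # each op type's own quantization settings under its key.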
+ config = {
+ 'quantize_op_types': 'lookup_table',
+ 'lookup_table': {
+ 'quantize_type': 'abs_max'
+ },
+ }
copy_program = quant_embedding(copy_program, place, config)
fluid.io.save_persistables(
exe,
diff --git a/demo/quant/quant_post/README.md b/demo/quant/quant_post/README.md
index 72cd68781d6de71aca19d3b34f1daf187494f371..b5c66cc7c8c0e5761e798f24ccc48e6918fd6ec6 100755
--- a/demo/quant/quant_post/README.md
+++ b/demo/quant/quant_post/README.md
@@ -1,16 +1,16 @@
-# Post-training quantization example
+# Static post-training quantization example
-This example shows how to use the post-training quantization API ``paddleslim.quant.quant_post`` to quantize a trained classification model. The API produces a quantized model without any training, reducing the model's storage and memory footprint.
+This example shows how to use the post-training quantization API ``paddleslim.quant.quant_post_static`` to quantize a trained classification model. The API produces a quantized model without any training, reducing the model's storage and memory footprint.
## 接口介绍
-Please refer to the quantization API documentation.
+Please refer to the quantization API documentation.
 ## Post-training quantization workflow for a classification model
 ### Prepare the data
-Create a ``data`` folder in the current folder and extract the ``imagenet`` dataset into it. After extraction, the ``data`` folder should contain:
+Create a ``data`` folder under the ``demo`` folder and extract the ``ImageNet`` dataset into it. After extraction, the ``data/ILSVRC2012`` folder should contain:
- the ``'train'`` folder with training images
- the ``'train_list.txt'`` file
- the ``'val'`` folder with validation images
@@ -30,10 +30,10 @@ python export_model.py --model "MobileNet" --pretrained_model ./pretrain/MobileN
```
The exported model is stored under ``inference_model/MobileNet/``; that folder should contain the two files ``'model'`` and ``'weights'``.
-### Post-training quantization
-Next, run post-training quantization on the exported model. The script is [quant_post.py](./quant_post.py), which calls ``paddleslim.quant.quant_post`` to quantize the model. Run:
+### Static post-training quantization
+Next, run static post-training quantization on the exported model. The script is [quant_post.py](./quant_post.py), which calls ``paddleslim.quant.quant_post_static`` to quantize the model. Run:
```
-python quant_post.py --model_path ./inference_model/MobileNet --save_path ./quant_model_train/MobileNet --model_filename model --params_filename weights
+python quant_post_static.py --model_path ./inference_model/MobileNet --save_path ./quant_model_train/MobileNet --model_filename model --params_filename weights
```
- ``model_path``: the folder containing the model to be quantized
@@ -62,11 +62,11 @@ top1_acc/top5_acc= [0.70913923 0.89548034]
Test the accuracy of the quantized model with the following command:
```
-python eval.py --model_path ./quant_model_train/MobileNet
+python eval.py --model_path ./quant_model_train/MobileNet --model_name __model__ --params_name __params__
```
The accuracy output is
```
top1_acc/top5_acc= [0.70141864 0.89086477]
```
-From the accuracy comparison above, post-training quantization of the ``mobilenet`` classification model on ``imagenet`` costs ``0.77%`` in ``top1`` accuracy and ``0.46%`` in ``top5`` accuracy.
+From the accuracy comparison above, post-training quantization of the ``mobilenet`` classification model on ``imagenet`` costs ``0.77%`` in ``top1`` accuracy and ``0.46%`` in ``top5`` accuracy.
diff --git a/demo/quant/quant_post/eval.py b/demo/quant/quant_post/eval.py
index 8d5cfa003d8b7077224ae2f54194069aadc3dc90..c144fd1690ed55304cc722472cb155814bd26f64 100755
--- a/demo/quant/quant_post/eval.py
+++ b/demo/quant/quant_post/eval.py
@@ -20,7 +20,8 @@ import functools
import paddle
import paddle.fluid as fluid
-sys.path.append('../../')
+sys.path[0] = os.path.join(
+    os.path.dirname(__file__), os.path.pardir, os.path.pardir)
import imagenet_reader as reader
from utility import add_arguments, print_arguments
@@ -45,7 +46,7 @@ def eval(args):
exe,
model_filename=args.model_name,
params_filename=args.params_name)
- val_reader = paddle.batch(reader.val(), batch_size=128)
+ val_reader = paddle.fluid.io.batch(reader.val(), batch_size=128)
feeder = fluid.DataFeeder(
place=place, feed_list=feed_target_names, program=val_program)
diff --git a/demo/quant/quant_post/export_model.py b/demo/quant/quant_post/export_model.py
index dbfeb2b042139ec85b390ccd6f242c0aa93e8835..be3751bc55a512fc51d6d3f10334d61c112042e3 100755
--- a/demo/quant/quant_post/export_model.py
+++ b/demo/quant/quant_post/export_model.py
@@ -8,9 +8,9 @@ import math
import time
import numpy as np
import paddle.fluid as fluid
-sys.path.append(sys.path[0] + "/../../../")
+sys.path[0] = os.path.join(
+    os.path.dirname(__file__), os.path.pardir, os.path.pardir)
from paddleslim.common import get_logger
-sys.path.append(sys.path[0] + "/../../")
import models
from utility import add_arguments, print_arguments
diff --git a/demo/quant/quant_post/image_classification_post_training_quantization_quick_start.ipynb b/demo/quant/quant_post/image_classification_post_training_quantization_quick_start.ipynb
new file mode 100755
index 0000000000000000000000000000000000000000..7374efb2df5ddb29d2d99301c785ed1367ae49a5
--- /dev/null
+++ b/demo/quant/quant_post/image_classification_post_training_quantization_quick_start.ipynb
@@ -0,0 +1,313 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 图像分类模型离线量化-快速开始"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "该教程以图像分类模型MobileNetV1为例,说明如何快速使用PaddleSlim的[离线量化接口](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/docs/docs/api/quantization_api.md)。 该示例包含以下步骤:\n",
+ "\n",
+ "1. 导入依赖\n",
+ "2. 构建模型\n",
+ "3. 训练模型\n",
+ "4. 离线量化"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. 导入依赖\n",
+ "PaddleSlim依赖Paddle1.7版本,请确认已正确安装Paddle,然后按以下方式导入Paddle和PaddleSlim:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import paddle\n",
+ "import paddle.fluid as fluid\n",
+ "import paddleslim as slim\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. 构建网络\n",
+ "该章节构造一个用于对MNIST数据进行分类的分类模型,选用`MobileNetV1`,并将输入大小设置为`[1, 28, 28]`,输出类别数为10。 为了方便展示示例,我们在`paddleslim.models`下预定义了用于构建分类模型的方法,执行以下代码构建分类模型:\n",
+ "\n",
+ ">注意:paddleslim.models下的API并非PaddleSlim常规API,是为了简化示例而封装预定义的一系列方法,比如:模型结构的定义、Program的构建等。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "exe, train_program, val_program, inputs, outputs = \\\n",
+ " slim.models.image_classification(\"MobileNet\", [1, 28, 28], 10, use_gpu=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. 训练模型\n",
+ "该章节介绍了如何定义输入数据和如何训练和测试分类模型。先训练分类模型的原因是离线量化需要一个训练好的模型。\n",
+ "\n",
+ "### 3.1 定义输入数据\n",
+ "\n",
+ "为了快速执行该示例,我们选取简单的MNIST数据,Paddle框架的`paddle.dataset.mnist`包定义了MNIST数据的下载和读取。\n",
+ "代码如下:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import paddle.dataset.mnist as reader\n",
+ "train_reader = paddle.fluid.io.batch(\n",
+ " reader.train(), batch_size=128, drop_last=True)\n",
+ "test_reader = paddle.fluid.io.batch(\n",
+ " reader.train(), batch_size=128, drop_last=True)\n",
+ "train_feeder = fluid.DataFeeder(inputs, fluid.CPUPlace())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.2 训练和测试\n",
+ "先定义训练和测试函数。在训练函数中执行了一个epoch的训练,因为MNIST数据集数据较少,一个epoch就可将top1精度训练到95%以上。\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def train(prog):\n",
+ " iter = 0\n",
+ " for data in train_reader():\n",
+ " acc1, acc5, loss = exe.run(prog, feed=train_feeder.feed(data), fetch_list=outputs)\n",
+ " if iter % 100 == 0:\n",
+ " print('train', acc1.mean(), acc5.mean(), loss.mean())\n",
+ " iter += 1\n",
+ " \n",
+ "def test(prog, outputs=outputs):\n",
+ " iter = 0\n",
+ " res = [[], []]\n",
+ " for data in train_reader():\n",
+ " acc1, acc5, loss = exe.run(prog, feed=train_feeder.feed(data), fetch_list=outputs)\n",
+ " if iter % 100 == 0:\n",
+ " print('test', acc1.mean(), acc5.mean(), loss.mean())\n",
+ " res[0].append(acc1.mean())\n",
+ " res[1].append(acc5.mean())\n",
+ " iter += 1\n",
+ " print('final test result', np.array(res[0]).mean(), np.array(res[1]).mean())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "调用``train``函数训练分类网络,``train_program``是在第2步:构建网络中定义的。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "('train', 0.0625, 0.5234375, 2.6373053)\n",
+ "('train', 0.9375, 0.9921875, 0.20106347)\n",
+ "('train', 0.953125, 1.0, 0.13234669)\n",
+ "('train', 0.96875, 0.9921875, 0.18056682)\n",
+ "('train', 0.9453125, 1.0, 0.15847622)\n"
+ ]
+ }
+ ],
+ "source": [
+ "train(train_program)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "调用``test``函数测试分类网络,``val_program``是在第2步:构建网络中定义的。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "('test', 0.9609375, 0.9921875, 0.12996897)\n",
+ "('test', 0.9609375, 1.0, 0.094265014)\n",
+ "('test', 0.9453125, 1.0, 0.10511534)\n",
+ "('test', 0.9765625, 1.0, 0.11341806)\n",
+ "('test', 0.953125, 1.0, 0.17046008)\n",
+ "('final test result', 0.9647603, 0.99943244)\n"
+ ]
+ }
+ ],
+ "source": [
+ "test(val_program)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "保存inference model,将训练好的分类模型保存在``'./inference_model'``下,后续进行离线量化时将加载保存在此处的模型。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[u'save_infer_model/scale_0',\n",
+ " u'save_infer_model/scale_1',\n",
+ " u'save_infer_model/scale_2']"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "target_vars = [val_program.global_block().var(name) for name in outputs]\n",
+ "fluid.io.save_inference_model(dirname='./inference_model',\n",
+ " feeded_var_names=[var.name for var in inputs],\n",
+ " target_vars=target_vars,\n",
+ " executor=exe,\n",
+ " main_program=val_program)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. 离线量化\n",
+ "\n",
+ "调用离线量化接口,加载文件夹``'./inference_model'``训练好的分类模型,并使用10个batch的数据进行参数校正。此过程无需训练,只需跑前向过程来计算量化所需参数。离线量化后的模型保存在文件夹``'./quant_post_model'``下。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-06 09:32:42,944-INFO: run batch: 0\n",
+ "2020-02-06 09:32:42,944-INFO: run batch: 0\n",
+ "2020-02-06 09:32:43,233-INFO: run batch: 5\n",
+ "2020-02-06 09:32:43,233-INFO: run batch: 5\n",
+ "2020-02-06 09:32:43,362-INFO: all run batch: 10\n",
+ "2020-02-06 09:32:43,362-INFO: all run batch: 10\n",
+ "2020-02-06 09:32:43,365-INFO: calculate scale factor ...\n",
+ "2020-02-06 09:32:43,365-INFO: calculate scale factor ...\n",
+ "2020-02-06 09:32:54,841-INFO: update the program ...\n",
+ "2020-02-06 09:32:54,841-INFO: update the program ...\n"
+ ]
+ }
+ ],
+ "source": [
+ "slim.quant.quant_post(\n",
+ " executor=exe,\n",
+ " model_dir='./inference_model',\n",
+ " quantize_model_path='./quant_post_model',\n",
+ " sample_generator=reader.test(),\n",
+ " batch_nums=10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "加载保存在文件夹``'./quant_post_model'``下的量化后的模型进行测试,可看到精度和``3.2 训练和测试``中得到的测试精度相近,因此离线量化过程对于此分类模型几乎无损。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "('test', 0.9765625, 0.9921875, 0.11411239)\n",
+ "('test', 0.953125, 1.0, 0.111179784)\n",
+ "('test', 0.953125, 1.0, 0.101078615)\n",
+ "('test', 0.96875, 1.0, 0.0993958)\n",
+ "('test', 0.9609375, 1.0, 0.16066414)\n",
+ "('final test result', 0.9643096, 0.99931556)\n"
+ ]
+ }
+ ],
+ "source": [
+ "quant_post_prog, feed_target_names, fetch_targets = fluid.io.load_inference_model(\n",
+ " dirname='./quant_post_model',\n",
+ " model_filename='__model__',\n",
+ " params_filename='__params__',\n",
+ " executor=exe)\n",
+ "test(quant_post_prog, fetch_targets)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/demo/quant/quant_post/quant_post.py b/demo/quant/quant_post/quant_post.py
index 5a2c1c834c82e125adad7a597f0d8667d8b19bfc..96f8dca60e928056aa6761501d9355b569161e53 100755
--- a/demo/quant/quant_post/quant_post.py
+++ b/demo/quant/quant_post/quant_post.py
@@ -9,13 +9,12 @@ import time
import numpy as np
import paddle.fluid as fluid
-import reader
-sys.path.append(sys.path[0] + "/../../../")
+sys.path[0] = os.path.join(
+    os.path.dirname(__file__), os.path.pardir, os.path.pardir)
from paddleslim.common import get_logger
from paddleslim.quant import quant_post
-sys.path.append(sys.path[0] + "/../../")
from utility import add_arguments, print_arguments
-
+import imagenet_reader as reader
_logger = get_logger(__name__, level=logging.INFO)
parser = argparse.ArgumentParser(description=__doc__)
diff --git a/demo/sensitive/image_classification_sensitivity_analysis.ipynb b/demo/sensitive/image_classification_sensitivity_analysis.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..19234337f744ac01aceaaf92b7443b5e97e40923
--- /dev/null
+++ b/demo/sensitive/image_classification_sensitivity_analysis.ipynb
@@ -0,0 +1,859 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 图像分类模型通道剪裁-敏感度分析\n",
+ "\n",
+ "该教程以图像分类模型MobileNetV1为例,说明如何快速使用[PaddleSlim的敏感度分析接口](https://paddlepaddle.github.io/PaddleSlim/api/prune_api/#sensitivity)。\n",
+ "该示例包含以下步骤:\n",
+ "\n",
+ "1. 导入依赖\n",
+ "2. 构建模型\n",
+ "3. 定义输入数据\n",
+ "4. 定义模型评估方法\n",
+ "5. 训练模型\n",
+ "6. 获取待分析卷积参数名称\n",
+ "7. 分析敏感度\n",
+ "8. 剪裁模型\n",
+ "\n",
+ "以下章节依次次介绍每个步骤的内容。\n",
+ "\n",
+ "## 1. 导入依赖\n",
+ "\n",
+ "PaddleSlim依赖Paddle1.7版本,请确认已正确安装Paddle,然后按以下方式导入Paddle和PaddleSlim:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import paddle\n",
+ "import paddle.fluid as fluid\n",
+ "import paddleslim as slim"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. 构建网络\n",
+ "\n",
+ "该章节构造一个用于对MNIST数据进行分类的分类模型,选用`MobileNetV1`,并将输入大小设置为`[1, 28, 28]`,输出类别数为10。\n",
+ "为了方便展示示例,我们在`paddleslim.models`下预定义了用于构建分类模型的方法,执行以下代码构建分类模型:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "exe, train_program, val_program, inputs, outputs = slim.models.image_classification(\"MobileNet\", [1, 28, 28], 10, use_gpu=True)\n",
+ "place = fluid.CUDAPlace(0)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3 定义输入数据\n",
+ "\n",
+ "为了快速执行该示例,我们选取简单的MNIST数据,Paddle框架的`paddle.dataset.mnist`包定义了MNIST数据的下载和读取。\n",
+ "代码如下:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import paddle.dataset.mnist as reader\n",
+ "train_reader = paddle.fluid.io.batch(\n",
+ " reader.train(), batch_size=128, drop_last=True)\n",
+ "test_reader = paddle.fluid.io.batch(\n",
+ " reader.test(), batch_size=128, drop_last=True)\n",
+ "data_feeder = fluid.DataFeeder(inputs, place)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. 定义模型评估方法\n",
+ "\n",
+ "在计算敏感度时,需要裁剪单个卷积层后的模型在测试数据上的效果,我们定义以下方法实现该功能:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "def test(program):\n",
+ " acc_top1_ns = []\n",
+ " acc_top5_ns = []\n",
+ " for data in test_reader():\n",
+ " acc_top1_n, acc_top5_n, _ = exe.run(\n",
+ " program,\n",
+ " feed=data_feeder.feed(data),\n",
+ " fetch_list=outputs)\n",
+ " acc_top1_ns.append(np.mean(acc_top1_n))\n",
+ " acc_top5_ns.append(np.mean(acc_top5_n))\n",
+ " print(\"Final eva - acc_top1: {}; acc_top5: {}\".format(\n",
+ " np.mean(np.array(acc_top1_ns)), np.mean(np.array(acc_top5_ns))))\n",
+ " return np.mean(np.array(acc_top1_ns))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5. 训练模型\n",
+ "\n",
+ "只有训练好的模型才能做敏感度分析,因为该示例任务相对简单,我这里用训练一个`epoch`产出的模型做敏感度分析。对于其它训练比较耗时的模型,您可以加载训练好的模型权重。\n",
+ "\n",
+ "以下为模型训练代码:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.984375 1.0 0.04038039\n"
+ ]
+ }
+ ],
+ "source": [
+ "for data in train_reader():\n",
+ " acc1, acc5, loss = exe.run(train_program, feed=data_feeder.feed(data), fetch_list=outputs)\n",
+ "print(np.mean(acc1), np.mean(acc5), np.mean(loss))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "用上节定义的模型评估方法,评估当前模型在测试集上的精度:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final eva - acc_top1: 0.9574319124221802; acc_top5: 0.999098539352417\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.9574319"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test(val_program)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6. 获取待分析卷积参数\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['conv2_1_sep_weights', 'conv2_2_sep_weights', 'conv3_1_sep_weights', 'conv3_2_sep_weights', 'conv4_1_sep_weights', 'conv4_2_sep_weights', 'conv5_1_sep_weights', 'conv5_2_sep_weights', 'conv5_3_sep_weights', 'conv5_4_sep_weights', 'conv5_5_sep_weights', 'conv5_6_sep_weights', 'conv6_sep_weights']\n"
+ ]
+ }
+ ],
+ "source": [
+ "params = []\n",
+ "for param in train_program.global_block().all_parameters():\n",
+ " if \"_sep_weights\" in param.name:\n",
+ " params.append(param.name)\n",
+ "print(params)\n",
+ "params = params[:5]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 7. 分析敏感度\n",
+ "\n",
+ "### 7.1 简单计算敏感度"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "调用[sensitivity接口](https://paddlepaddle.github.io/PaddleSlim/api/prune_api/#sensitivity)对训练好的模型进行敏感度分析。\n",
+ "\n",
+ "在计算过程中,敏感度信息会不断追加保存到选项`sensitivities_file`指定的文件中,该文件中已有的敏感度信息不会被重复计算。\n",
+ "\n",
+ "先用以下命令删除当前路径下可能已有的`sensitivities_0.data`文件:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!rm -rf sensitivities_0.data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "除了指定待分析的卷积层参数,我们还可以指定敏感度分析的粒度和范围,即单个卷积层参数分别被剪裁掉的比例。\n",
+ "\n",
+ "如果待分析的模型比较敏感,剪掉单个卷积层的40%的通道,模型在测试集上的精度损失就达90%,那么`pruned_ratios`最大设置到0.4即可,比如:\n",
+ "`[0.1, 0.2, 0.3, 0.4]`\n",
+ "\n",
+ "为了得到更精确的敏感度信息,我可以适当调小`pruned_ratios`的粒度,比如:`[0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]`\n",
+ "\n",
+ "`pruned_ratios`的粒度越小,计算敏感度的速度越慢。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-04 15:29:33,091-INFO: sensitive - param: conv2_2_sep_weights; ratios: 0.1\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final eva - acc_top1: 0.9574319124221802; acc_top5: 0.999098539352417\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-04 15:29:35,971-INFO: pruned param: conv2_2_sep_weights; 0.1; loss=0.025107262656092644\n",
+ "2020-02-04 15:29:35,975-INFO: sensitive - param: conv2_2_sep_weights; ratios: 0.2\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final eva - acc_top1: 0.9333934187889099; acc_top5: 0.999098539352417\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-04 15:29:38,797-INFO: pruned param: conv2_2_sep_weights; 0.2; loss=0.04069465771317482\n",
+ "2020-02-04 15:29:38,801-INFO: sensitive - param: conv2_1_sep_weights; ratios: 0.1\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final eva - acc_top1: 0.9184695482254028; acc_top5: 0.9983974099159241\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-04 15:29:42,056-INFO: pruned param: conv2_1_sep_weights; 0.1; loss=0.035987019538879395\n",
+ "2020-02-04 15:29:42,059-INFO: sensitive - param: conv2_1_sep_weights; ratios: 0.2\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final eva - acc_top1: 0.9229767918586731; acc_top5: 0.9989984035491943\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-04 15:29:45,121-INFO: pruned param: conv2_1_sep_weights; 0.2; loss=0.031697917729616165\n",
+ "2020-02-04 15:29:45,124-INFO: sensitive - param: conv3_1_sep_weights; ratios: 0.1\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final eva - acc_top1: 0.9270833134651184; acc_top5: 0.999098539352417\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-04 15:29:48,070-INFO: pruned param: conv3_1_sep_weights; 0.1; loss=-0.00010458791075507179\n",
+ "2020-02-04 15:29:48,073-INFO: sensitive - param: conv3_1_sep_weights; ratios: 0.2\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final eva - acc_top1: 0.9575320482254028; acc_top5: 0.9992988705635071\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-04 15:29:51,172-INFO: pruned param: conv3_1_sep_weights; 0.2; loss=0.004707638639956713\n",
+ "2020-02-04 15:29:51,174-INFO: sensitive - param: conv4_1_sep_weights; ratios: 0.1\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final eva - acc_top1: 0.9529246687889099; acc_top5: 0.9993990659713745\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-04 15:29:54,379-INFO: pruned param: conv4_1_sep_weights; 0.1; loss=0.0015692544402554631\n",
+ "2020-02-04 15:29:54,382-INFO: sensitive - param: conv4_1_sep_weights; ratios: 0.2\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final eva - acc_top1: 0.9559294581413269; acc_top5: 0.9993990659713745\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-04 15:29:57,316-INFO: pruned param: conv4_1_sep_weights; 0.2; loss=0.001987668452784419\n",
+ "2020-02-04 15:29:57,319-INFO: sensitive - param: conv3_2_sep_weights; ratios: 0.1\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final eva - acc_top1: 0.9555288553237915; acc_top5: 0.9989984035491943\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-04 15:30:00,300-INFO: pruned param: conv3_2_sep_weights; 0.1; loss=-0.005021402612328529\n",
+ "2020-02-04 15:30:00,306-INFO: sensitive - param: conv3_2_sep_weights; ratios: 0.2\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final eva - acc_top1: 0.9622395634651184; acc_top5: 0.999098539352417\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-04 15:30:03,400-INFO: pruned param: conv3_2_sep_weights; 0.2; loss=0.0008369522984139621\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final eva - acc_top1: 0.9566305875778198; acc_top5: 0.9991987347602844\n",
+ "{'conv2_2_sep_weights': {0.1: 0.025107263, 0.2: 0.040694658}, 'conv2_1_sep_weights': {0.1: 0.03598702, 0.2: 0.031697918}, 'conv3_1_sep_weights': {0.1: -0.00010458791, 0.2: 0.0047076386}, 'conv4_1_sep_weights': {0.1: 0.0015692544, 0.2: 0.0019876685}, 'conv3_2_sep_weights': {0.1: -0.0050214026, 0.2: 0.0008369523}}\n"
+ ]
+ }
+ ],
+ "source": [
+ "sens_0 = slim.prune.sensitivity(\n",
+ " val_program,\n",
+ " place,\n",
+ " params,\n",
+ " test,\n",
+ " sensitivities_file=\"sensitivities_0.data\",\n",
+ " pruned_ratios=[0.1, 0.2])\n",
+ "print(sens_0)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 7.2 扩展敏感度信息\n",
+ "\n",
+ "第7.1节计算敏感度用的是`pruned_ratios=[0.1, 0.2]`, 我们可以在此基础上将其扩展到`[0.1, 0.2, 0.3]`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-04 15:30:16,173-INFO: sensitive - param: conv2_2_sep_weights; ratios: 0.3\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final eva - acc_top1: 0.9574319124221802; acc_top5: 0.999098539352417\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-04 15:30:19,087-INFO: pruned param: conv2_2_sep_weights; 0.3; loss=0.2279527187347412\n",
+ "2020-02-04 15:30:19,091-INFO: sensitive - param: conv2_1_sep_weights; ratios: 0.3\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final eva - acc_top1: 0.739182710647583; acc_top5: 0.9918870329856873\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-04 15:30:22,079-INFO: pruned param: conv2_1_sep_weights; 0.3; loss=0.08871221542358398\n",
+ "2020-02-04 15:30:22,082-INFO: sensitive - param: conv3_1_sep_weights; ratios: 0.3\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final eva - acc_top1: 0.8724960088729858; acc_top5: 0.9975961446762085\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-04 15:30:24,974-INFO: pruned param: conv3_1_sep_weights; 0.3; loss=0.005439940840005875\n",
+ "2020-02-04 15:30:24,976-INFO: sensitive - param: conv4_1_sep_weights; ratios: 0.3\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final eva - acc_top1: 0.952223539352417; acc_top5: 0.999098539352417\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-04 15:30:28,071-INFO: pruned param: conv4_1_sep_weights; 0.3; loss=0.03535936772823334\n",
+ "2020-02-04 15:30:28,073-INFO: sensitive - param: conv3_2_sep_weights; ratios: 0.3\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final eva - acc_top1: 0.9235777258872986; acc_top5: 0.9978966116905212\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-04 15:30:31,068-INFO: pruned param: conv3_2_sep_weights; 0.3; loss=0.008055261336266994\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final eva - acc_top1: 0.9497195482254028; acc_top5: 0.9986979365348816\n",
+ "{'conv2_2_sep_weights': {0.1: 0.025107263, 0.2: 0.040694658, 0.3: 0.22795272}, 'conv2_1_sep_weights': {0.1: 0.03598702, 0.2: 0.031697918, 0.3: 0.088712215}, 'conv3_1_sep_weights': {0.1: -0.00010458791, 0.2: 0.0047076386, 0.3: 0.005439941}, 'conv4_1_sep_weights': {0.1: 0.0015692544, 0.2: 0.0019876685, 0.3: 0.035359368}, 'conv3_2_sep_weights': {0.1: -0.0050214026, 0.2: 0.0008369523, 0.3: 0.008055261}}\n"
+ ]
+ }
+ ],
+ "source": [
+ "sens_0 = slim.prune.sensitivity(\n",
+ " val_program,\n",
+ " place,\n",
+ " params,\n",
+ " test,\n",
+ " sensitivities_file=\"sensitivities_0.data\",\n",
+ " pruned_ratios=[0.3])\n",
+ "print(sens_0)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 7.3 多进程加速计算敏感度信息\n",
+ "\n",
+ "敏感度分析所用时间取决于待分析的卷积层数量和模型评估的速度,我们可以通过多进程的方式加速敏感度计算。\n",
+ "\n",
+ "在不同的进程设置不同`pruned_ratios`, 然后将结果合并。\n",
+ "\n",
+ "#### 7.3.1 多进程计算敏感度\n",
+ "\n",
+ "在以上章节,我们计算了`pruned_ratios=[0.1, 0.2, 0.3]`的敏感度,并将其保存到了文件`sensitivities_0.data`中。\n",
+ "\n",
+ "在另一个进程中,我们可以设置`pruned_ratios=[0.4]`,并将结果保存在文件`sensitivities_1.data`中。代码如下:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'conv2_2_sep_weights': {0.4: 0.06348718}, 'conv2_1_sep_weights': {0.4: 0.15917951}, 'conv4_1_sep_weights': {0.4: 0.16246155}, 'conv3_1_sep_weights': {0.4: 0.034871764}, 'conv3_2_sep_weights': {0.4: 0.115384646}}\n"
+ ]
+ }
+ ],
+ "source": [
+ "sens_1 = slim.prune.sensitivity(\n",
+ " val_program,\n",
+ " place,\n",
+ " params,\n",
+ " test,\n",
+ " sensitivities_file=\"sensitivities_1.data\",\n",
+ " pruned_ratios=[0.4])\n",
+ "print(sens_1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 7.3.2 加载多个进程产出的敏感度文件"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'conv2_2_sep_weights': {0.1: 0.025107263, 0.2: 0.040694658, 0.3: 0.22795272}, 'conv2_1_sep_weights': {0.1: 0.03598702, 0.2: 0.031697918, 0.3: 0.088712215}, 'conv3_1_sep_weights': {0.1: -0.00010458791, 0.2: 0.0047076386, 0.3: 0.005439941}, 'conv4_1_sep_weights': {0.1: 0.0015692544, 0.2: 0.0019876685, 0.3: 0.035359368}, 'conv3_2_sep_weights': {0.1: -0.0050214026, 0.2: 0.0008369523, 0.3: 0.008055261}}\n",
+ "{'conv2_2_sep_weights': {0.4: 0.06348718}, 'conv2_1_sep_weights': {0.4: 0.15917951}, 'conv4_1_sep_weights': {0.4: 0.16246155}, 'conv3_1_sep_weights': {0.4: 0.034871764}, 'conv3_2_sep_weights': {0.4: 0.115384646}}\n"
+ ]
+ }
+ ],
+ "source": [
+ "s_0 = slim.prune.load_sensitivities(\"sensitivities_0.data\")\n",
+ "s_1 = slim.prune.load_sensitivities(\"sensitivities_1.data\")\n",
+ "print(s_0)\n",
+ "print(s_1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 7.3.3 合并敏感度信息"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'conv2_2_sep_weights': {0.1: 0.025107263, 0.2: 0.040694658, 0.3: 0.22795272, 0.4: 0.06348718}, 'conv2_1_sep_weights': {0.1: 0.03598702, 0.2: 0.031697918, 0.3: 0.088712215, 0.4: 0.15917951}, 'conv3_1_sep_weights': {0.1: -0.00010458791, 0.2: 0.0047076386, 0.3: 0.005439941, 0.4: 0.034871764}, 'conv4_1_sep_weights': {0.1: 0.0015692544, 0.2: 0.0019876685, 0.3: 0.035359368, 0.4: 0.16246155}, 'conv3_2_sep_weights': {0.1: -0.0050214026, 0.2: 0.0008369523, 0.3: 0.008055261, 0.4: 0.115384646}}\n"
+ ]
+ }
+ ],
+ "source": [
+ "s = slim.prune.merge_sensitive([s_0, s_1])\n",
+ "print(s)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 8. 剪裁模型\n",
+ "\n",
+ "根据以上章节产出的敏感度信息,对模型进行剪裁。\n",
+ "\n",
+ "### 8.1 计算剪裁率\n",
+ "\n",
+ "首先,调用PaddleSlim提供的[get_ratios_by_loss](https://paddlepaddle.github.io/PaddleSlim/api/prune_api/#get_ratios_by_loss)方法根据敏感度计算剪裁率,通过调整参数`loss`大小获得合适的一组剪裁率:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'conv3_1_sep_weights': 0.3, 'conv4_1_sep_weights': 0.22400936122727166, 'conv3_2_sep_weights': 0.3}\n"
+ ]
+ }
+ ],
+ "source": [
+ "loss = 0.01\n",
+ "ratios = slim.prune.get_ratios_by_loss(s_0, loss)\n",
+ "print(ratios)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 8.2 剪裁训练网络"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "FLOPs before pruning: 10896832.0\n",
+ "FLOPs after pruning: 9777980.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "pruner = slim.prune.Pruner()\n",
+ "print(\"FLOPs before pruning: {}\".format(slim.analysis.flops(train_program)))\n",
+ "pruned_program, _, _ = pruner.prune(\n",
+ " train_program,\n",
+ " fluid.global_scope(),\n",
+ " params=ratios.keys(),\n",
+ " ratios=ratios.values(),\n",
+ " place=place)\n",
+ "print(\"FLOPs after pruning: {}\".format(slim.analysis.flops(pruned_program)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 8.3 剪裁测试网络\n",
+ "\n",
+ ">注意:对测试网络进行剪裁时,需要将`only_graph`设置为True,具体原因请参考[Pruner API文档](https://paddlepaddle.github.io/PaddleSlim/api/prune_api/#pruner)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "FLOPs before pruning: 10896832.0\n",
+ "FLOPs after pruning: 9777980.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "pruner = slim.prune.Pruner()\n",
+ "print(\"FLOPs before pruning: {}\".format(slim.analysis.flops(val_program)))\n",
+ "pruned_val_program, _, _ = pruner.prune(\n",
+ " val_program,\n",
+ " fluid.global_scope(),\n",
+ " params=ratios.keys(),\n",
+ " ratios=ratios.values(),\n",
+ " place=place,\n",
+ " only_graph=True)\n",
+ "print(\"FLOPs after pruning: {}\".format(slim.analysis.flops(pruned_val_program)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "测试一下剪裁后的模型在测试集上的精度:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final eva - acc_top1: 0.9721554517745972; acc_top5: 0.9995993375778198\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.97215545"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test(pruned_val_program)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 8.4 训练剪裁后的模型\n",
+ "\n",
+ "对剪裁后的模型在训练集上训练一个`epoch`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.984375 1.0 0.04675974\n"
+ ]
+ }
+ ],
+ "source": [
+ "for data in train_reader():\n",
+ " acc1, acc5, loss = exe.run(pruned_program, feed=data_feeder.feed(data), fetch_list=outputs)\n",
+ "print(np.mean(acc1), np.mean(acc5), np.mean(loss))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "测试训练后模型的精度:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final eva - acc_top1: 0.9721554517745972; acc_top5: 0.9995993375778198\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.97215545"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test(pruned_val_program)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.5.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/demo/sensitive/train.py b/demo/sensitive/train.py
index f043961668dbe1eb4ef8ead91b3f4f460cf80c8a..693d6bf4e167d68b04c255485a3b64282cbebdc1 100644
--- a/demo/sensitive/train.py
+++ b/demo/sensitive/train.py
@@ -11,7 +11,7 @@ import paddle.fluid as fluid
from paddleslim.prune import merge_sensitive, get_ratios_by_loss
from paddleslim.prune import sensitivity
from paddleslim.common import get_logger
-sys.path.append(sys.path[0] + "/../")
+sys.path[0] = os.path.join(os.path.dirname(__file__), os.path.pardir)
import models
from utility import add_arguments, print_arguments
@@ -68,7 +68,7 @@ def compress(args):
fluid.io.load_vars(exe, args.pretrained_model, predicate=if_exist)
- val_reader = paddle.batch(val_reader, batch_size=args.batch_size)
+ val_reader = paddle.fluid.io.batch(val_reader, batch_size=args.batch_size)
val_feeder = feeder = fluid.DataFeeder(
[image, label], place, program=val_program)
diff --git a/demo/sensitive_prune/greedy_prune.py b/demo/sensitive_prune/greedy_prune.py
index e3f8bb57eb3eb0e5c515376970d9484eeca78764..f59b7eaa6e5b02fb977be730ab6f72a2a2518fb8 100644
--- a/demo/sensitive_prune/greedy_prune.py
+++ b/demo/sensitive_prune/greedy_prune.py
@@ -119,8 +119,8 @@ def compress(args):
fluid.io.load_vars(exe, args.pretrained_model, predicate=if_exist)
- val_reader = paddle.batch(val_reader, batch_size=args.batch_size)
- train_reader = paddle.batch(
+ val_reader = paddle.fluid.io.batch(val_reader, batch_size=args.batch_size)
+ train_reader = paddle.fluid.io.batch(
train_reader, batch_size=args.batch_size, drop_last=True)
train_feeder = feeder = fluid.DataFeeder([image, label], place)
diff --git a/demo/sensitive_prune/prune.py b/demo/sensitive_prune/prune.py
index e6c1ba7ccd09f41c8d0652075036a1c279251517..a4cb8ac242558c6daa6b92c51bcea534b406098f 100644
--- a/demo/sensitive_prune/prune.py
+++ b/demo/sensitive_prune/prune.py
@@ -117,8 +117,8 @@ def compress(args):
fluid.io.load_vars(exe, args.pretrained_model, predicate=if_exist)
- val_reader = paddle.batch(val_reader, batch_size=args.batch_size)
- train_reader = paddle.batch(
+ val_reader = paddle.fluid.io.batch(val_reader, batch_size=args.batch_size)
+ train_reader = paddle.fluid.io.batch(
train_reader, batch_size=args.batch_size, drop_last=True)
train_feeder = feeder = fluid.DataFeeder([image, label], place)
diff --git a/demo/slimfacenet/README.md b/demo/slimfacenet/README.md
new file mode 100755
index 0000000000000000000000000000000000000000..7cb5c8acaf40883b28364b9d3e3dd4fc4357c661
--- /dev/null
+++ b/demo/slimfacenet/README.md
@@ -0,0 +1,87 @@
+# slimfacenet usage example
+
+This example demonstrates how to train `slimfacenet` and evaluate its quantized models.
+
+This example requires Paddle 1.8 and PaddleSlim 1.1.1.
+
+The example currently supports the following face recognition models:
+
+- `SlimFaceNet_A_x0_60`
+- `SlimFaceNet_B_x0_75`
+
+To match the paper, the LFW accuracy is reported for 112x96 input; to reflect deployment settings, FLOPs and latency are reported for 112x112 input, with latency measured on an RK3288.
+
+|Method|LFW|FLOPs|Latency|
+|------|-----|-----|-----|
+|MobileNetV2|98.58%|277M|270ms|
+|MobileFaceNet|99.18%|224M|102ms|
+|SlimFaceNet_A_x0_60|99.21%|128M|63ms|
+|SlimFaceNet_B_x0_75|99.22%|151M|70ms|
+|SlimFaceNet_A_x0_60_quant|99.17%|32M|42ms|
+|SlimFaceNet_B_x0_75_quant|99.21%|38M|45ms|
+
+## 1. Data preparation
+
+This example supports the public `CASIA` and `lfw` datasets. By default:
+
+[CASIA training dataset](https://paddlemodels.bj.bcebos.com/PaddleSlim/dataset/CASIA.zip)
+
+[lfw test dataset](https://paddlemodels.bj.bcebos.com/PaddleSlim/dataset/lfw.zip)
+
+1). Place the training dataset at `./CASIA`
+2). Place the test dataset at `./lfw` (a download sketch follows below)
+
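+For example, assuming `wget` and `unzip` are available and that each archive unpacks into a top-level `CASIA/` or `lfw/` directory (adjust the paths if your extraction differs):
+
+```
+wget https://paddlemodels.bj.bcebos.com/PaddleSlim/dataset/CASIA.zip
+wget https://paddlemodels.bj.bcebos.com/PaddleSlim/dataset/lfw.zip
+unzip CASIA.zip
+unzip lfw.zip
+```
+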
+## 2. Downloading pretrained models
+
+To use `slimfacenet` models that have already been trained and quantized, download them from:
+
+[SlimFaceNet_A_x0_60 pretrained model](https://paddlemodels.bj.bcebos.com/PaddleSlim/SlimFaceNet_A_x0_60_112x96.tar)
+
+[SlimFaceNet_A_x0_60 quantized model](https://paddlemodels.bj.bcebos.com/PaddleSlim/SlimFaceNet_A_x0_60_112x96_quant_post.tar)
+
+[SlimFaceNet_B_x0_75 pretrained model](https://paddlemodels.bj.bcebos.com/PaddleSlim/SlimFaceNet_B_x0_75_112x96.tar)
+
+[SlimFaceNet_B_x0_75 quantized model](https://paddlemodels.bj.bcebos.com/PaddleSlim/SlimFaceNet_B_x0_75_112x96_quant_post.tar)
+
+## 3. Launching a `slimfacenet` training job
+
+Start training with either of the following commands:
+
+```
+sh slim_train.sh
+# or:
+export CUDA_VISIBLE_DEVICES=0
+python -u train_eval.py \
+    --action train \
+    --model=SlimFaceNet_B_x0_75
+```
+
+Here, `SlimFaceNet_A_x0_60` names one architecture in the `slimfacenet` search space with a channel scaling factor of 0.6; for each scaling factor the search space contains 6**15 (about 470 billion) candidate architectures. After training, the model is exported to `./out_inference/` and can be loaded as shown below.
+
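+The exported model is a standard Paddle inference model. A minimal loading sketch (assuming CPU execution and the feed/fetch layout exported by `train_eval.py`; the random array merely stands in for a preprocessed 112x96 face crop):
+
+```
+import numpy as np
+import paddle.fluid as fluid
+
+exe = fluid.Executor(fluid.CPUPlace())
+# load the feature-extraction program exported by train_eval.py
+[program, feed_names, fetch_targets] = fluid.io.load_inference_model(
+    dirname='./out_inference', executor=exe)
+img = np.random.rand(1, 3, 112, 96).astype('float32')
+feature = exe.run(program,
+                  feed={feed_names[0]: img},
+                  fetch_list=fetch_targets)
+print(feature[0].shape)
+```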
+
+## 4. Quantizing the float32 model to int8
+
+Start the quantization job with either of the following commands:
+
+```
+sh slim_quant.sh
+# or:
+export CUDA_VISIBLE_DEVICES=0
+python -u train_eval.py --action quant
+```
+
+After it finishes, the quantized model is saved to `./quant_model/`. Note that at this stage the quantized model is still stored as float32; the weights become int8 after conversion with Paddle Lite.
+
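+Under the hood, `--action quant` applies PaddleSlim's post-training quantization. A minimal sketch of the call (the fixed batch settings and the random calibration generator are illustrative assumptions; `train_eval.py` calibrates on real test images instead):
+
+```
+import numpy as np
+import paddle.fluid as fluid
+from paddleslim.quant import quant_post_static
+
+def sample_generator():
+    # placeholder: real calibration should yield preprocessed face crops
+    for _ in range(1000):
+        yield np.random.rand(3, 112, 96).astype('float32')
+
+exe = fluid.Executor(fluid.CUDAPlace(0))
+# calibrate the float32 model in ./out_inference/ and write the
+# quantized program to ./quant_model/
+quant_post_static(
+    executor=exe,
+    model_dir='./out_inference/',
+    quantize_model_path='./quant_model/',
+    sample_generator=sample_generator,  # the generator function itself
+    batch_size=100,
+    batch_nums=8)
+```
+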
+## 5. Loading and evaluating the quantized model
+
+This section shows how to load the pre-trained, quantized model and evaluate it on the test set.
+
+Run the following to load the model from its default location `./quant_model/` and report its accuracy:
+
+```
+sh slim_eval.sh
+# or:
+export CUDA_VISIBLE_DEVICES=0
+python train_eval.py --action test
+```
diff --git a/demo/slimfacenet/dataloader/__init__.py b/demo/slimfacenet/dataloader/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7dfaa94ac2a5e4f1844d37cc8b9ea7ce76f4437c
--- /dev/null
+++ b/demo/slimfacenet/dataloader/__init__.py
@@ -0,0 +1 @@
+from .casia import CASIA_Face
diff --git a/demo/slimfacenet/dataloader/casia.py b/demo/slimfacenet/dataloader/casia.py
new file mode 100644
index 0000000000000000000000000000000000000000..064abc6113e6ea8c2629b216d43ca5f821753936
--- /dev/null
+++ b/demo/slimfacenet/dataloader/casia.py
@@ -0,0 +1,97 @@
+# ================================================================
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import six
+if six.PY2:
+ import scipy.misc as imgreader
+else:
+ import imageio as imgreader
+import os
+import paddle
+from paddle import fluid
+
+
+class CASIA_Face(object):
+ def __init__(self, root):
+ self.root = root
+
+ img_txt_dir = os.path.join(root, 'CASIA-WebFace-112X96.txt')
+ image_list = []
+ label_list = []
+ with open(img_txt_dir) as f:
+ img_label_list = f.read().splitlines()
+ for info in img_label_list:
+ image_dir, label_name = info.split(' ')
+ image_list.append(
+ os.path.join(root, 'CASIA-WebFace-112X96', image_dir))
+ label_list.append(int(label_name))
+
+ self.image_list = image_list
+ self.label_list = label_list
+ self.class_nums = len(np.unique(self.label_list))
+ self.shuffle_idx = list(
+ np.random.choice(
+ len(self.image_list), len(self.image_list), False))
+
+ def reader(self):
+ while True:
+ if len(self.shuffle_idx) == 0:
+ self.shuffle_idx = list(
+ np.random.choice(
+ len(self.image_list), len(self.image_list), False))
+ return
+ index = self.shuffle_idx.pop()
+
+ img_path = self.image_list[index]
+ target = self.label_list[index]
+
+ try:
+ img = imgreader.imread(img_path)
+            except Exception:
+ continue
+
+ if len(img.shape) == 2:
+ img = np.stack([img] * 3, 2)
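+            # random horizontal flip (flip == -1 reverses the width axis),
+            # then normalize to roughly [-1, 1] and convert HWC to CHW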
+ flip = np.random.choice(2) * 2 - 1
+ img = img[:, ::flip, :]
+ img = (img - 127.5) / 128.0
+ img = img.transpose(2, 0, 1)
+
+ yield img, target
+
+ def __len__(self):
+ return len(self.image_list)
+
+
+if __name__ == '__main__':
+ data_dir = 'PATH to CASIA dataset'
+
+ place = fluid.CPUPlace()
+ with fluid.dygraph.guard(place):
+ dataset = CASIA_Face(root=data_dir)
+ print(len(dataset))
+ print(dataset.class_nums)
+ trainloader = paddle.fluid.io.batch(
+ dataset.reader, batch_size=1, drop_last=False)
+ for i in range(10):
+ for data in trainloader():
+ img = np.array([x[0] for x in data]).astype('float32')
+ img = fluid.dygraph.to_variable(img)
+ print(img.shape)
+ label = np.array([x[1] for x in data]).astype('int64').reshape(
+ -1, 1)
+ label = fluid.dygraph.to_variable(label)
+ print(label.shape)
+ print(len(dataset))
diff --git a/demo/slimfacenet/dataloader/lfw.py b/demo/slimfacenet/dataloader/lfw.py
new file mode 100644
index 0000000000000000000000000000000000000000..e13fd513b9e6e26474b48a025c4bebd205fb0f3a
--- /dev/null
+++ b/demo/slimfacenet/dataloader/lfw.py
@@ -0,0 +1,59 @@
+# ================================================================
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import six
+if six.PY2:
+ import scipy.misc as imgreader
+else:
+ import imageio as imgreader
+import paddle
+from paddle import fluid
+
+
+class LFW(object):
+ def __init__(self, imgl, imgr):
+
+ self.imgl_list = imgl
+ self.imgr_list = imgr
+ self.shuffle_idx = [i for i in range(len(self.imgl_list))]
+
+ def reader(self):
+ while True:
+ if len(self.shuffle_idx) == 0:
+ self.shuffle_idx = [i for i in range(len(self.imgl_list))]
+ return
+ index = self.shuffle_idx.pop(0)
+
+ imgl = imgreader.imread(self.imgl_list[index])
+ if len(imgl.shape) == 2:
+ imgl = np.stack([imgl] * 3, 2)
+ imgr = imgreader.imread(self.imgr_list[index])
+ if len(imgr.shape) == 2:
+ imgr = np.stack([imgr] * 3, 2)
+
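+            # four crops per pair: left image, its horizontal flip, right image,
+            # and its flip; each is normalized to [-1, 1] and transposed to CHW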
+ imglist = [imgl, imgl[:, ::-1, :], imgr, imgr[:, ::-1, :]]
+ for i in range(len(imglist)):
+ imglist[i] = (imglist[i] - 127.5) / 128.0
+ imglist[i] = imglist[i].transpose(2, 0, 1)
+
+ imgs = [img.astype('float32') for img in imglist]
+ yield imgs
+
+ def __len__(self):
+ return len(self.imgl_list)
+
+
+if __name__ == '__main__':
+ pass
diff --git a/demo/slimfacenet/lfw_eval.py b/demo/slimfacenet/lfw_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..974fe0783ca4a7f040153514feb7c10851865bf1
--- /dev/null
+++ b/demo/slimfacenet/lfw_eval.py
@@ -0,0 +1,175 @@
+# ================================================================
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import argparse
+import time
+import scipy.io
+import numpy as np
+
+import paddle
+from paddle import fluid
+
+from dataloader.casia import CASIA_Face
+from dataloader.lfw import LFW
+from paddleslim import models
+
+
+def parse_filelist(root):
+ with open(os.path.join(root, 'pairs.txt')) as f:
+ pairs = f.read().splitlines()[1:]
+ folder_name = 'lfw-112X96'
+ nameLs = []
+ nameRs = []
+ folds = []
+ flags = []
+ for i, p in enumerate(pairs):
+ p = p.split('\t')
+ if len(p) == 3:
+ nameL = os.path.join(root, folder_name, p[0],
+ p[0] + '_' + '{:04}.jpg'.format(int(p[1])))
+ nameR = os.path.join(root, folder_name, p[0],
+ p[0] + '_' + '{:04}.jpg'.format(int(p[2])))
+ fold = i // 600
+ flag = 1
+ elif len(p) == 4:
+ nameL = os.path.join(root, folder_name, p[0],
+ p[0] + '_' + '{:04}.jpg'.format(int(p[1])))
+ nameR = os.path.join(root, folder_name, p[2],
+ p[2] + '_' + '{:04}.jpg'.format(int(p[3])))
+ fold = i // 600
+ flag = -1
+ nameLs.append(nameL)
+ nameRs.append(nameR)
+ folds.append(fold)
+ flags.append(flag)
+ return [nameLs, nameRs, folds, flags]
+
+
+def get_accuracy(scores, flags, threshold):
+ p = np.sum(scores[flags == 1] > threshold)
+ n = np.sum(scores[flags == -1] < threshold)
+ return 1.0 * (p + n) / len(scores)
+
+
+def get_threshold(scores, flags, thrNum):
+ accuracys = np.zeros((2 * thrNum + 1, 1))
+ thresholds = np.arange(-thrNum, thrNum + 1) * 1.0 / thrNum
+ for i in range(2 * thrNum + 1):
+ accuracys[i] = get_accuracy(scores, flags, thresholds[i])
+
+ max_index = np.squeeze(accuracys == np.max(accuracys))
+ bestThreshold = np.mean(thresholds[max_index])
+ return bestThreshold
+
+
+def evaluation_10_fold(root='result.mat'):
+ ACCs = np.zeros(10)
+ result = scipy.io.loadmat(root)
+ for i in range(10):
+ fold = result['fold']
+ flags = result['flag']
+ featureLs = result['fl']
+ featureRs = result['fr']
+
+ valFold = fold != i
+ testFold = fold == i
+ flags = np.squeeze(flags)
+
+ mu = np.mean(
+ np.concatenate(
+ (featureLs[valFold[0], :], featureRs[valFold[0], :]), 0), 0)
+ mu = np.expand_dims(mu, 0)
+ featureLs = featureLs - mu
+ featureRs = featureRs - mu
+ featureLs = featureLs / np.expand_dims(
+ np.sqrt(np.sum(np.power(featureLs, 2), 1)), 1)
+ featureRs = featureRs / np.expand_dims(
+ np.sqrt(np.sum(np.power(featureRs, 2), 1)), 1)
+
+ scores = np.sum(np.multiply(featureLs, featureRs), 1)
+ threshold = get_threshold(scores[valFold[0]], flags[valFold[0]], 10000)
+ ACCs[i] = get_accuracy(scores[testFold[0]], flags[testFold[0]],
+ threshold)
+ return ACCs
+
+
+def test(test_reader, flods, flags, net, args):
+ net.eval()
+ featureLs = None
+ featureRs = None
+ for idx, data in enumerate(test_reader()):
+ data_list = [[] for _ in range(4)]
+ for _ in range(len(data)):
+ data_list[0].append(data[_][0])
+ data_list[1].append(data[_][1])
+ data_list[2].append(data[_][2])
+ data_list[3].append(data[_][3])
+ res = [
+ net(fluid.dygraph.to_variable(np.array(d))).numpy()
+ for d in data_list
+ ]
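+        # concatenate the features of each image and its flipped copy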
+ featureL = np.concatenate((res[0], res[1]), 1)
+ featureR = np.concatenate((res[2], res[3]), 1)
+ if featureLs is None:
+ featureLs = featureL
+ else:
+ featureLs = np.concatenate((featureLs, featureL), 0)
+ if featureRs is None:
+ featureRs = featureR
+ else:
+ featureRs = np.concatenate((featureRs, featureR), 0)
+ result = {'fl': featureLs, 'fr': featureRs, 'fold': flods, 'flag': flags}
+ scipy.io.savemat(args.feature_save_dir, result)
+ ACCs = evaluation_10_fold(args.feature_save_dir)
+ for i in range(len(ACCs)):
+ print('{} {:.2f}'.format(i + 1, ACCs[i] * 100))
+ print('--------')
+ print('AVE {:.2f}'.format(np.mean(ACCs) * 100))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description='PaddlePaddle SlimFaceNet')
+ parser.add_argument(
+ '--use_gpu', default=0, type=int, help='Use GPU or not, 0 is not used')
+    parser.add_argument(
+        '--model',
+        default='SlimFaceNet_B_x0_75',
+        type=str,
+        help='SlimFaceNet_B_x0_75/SlimFaceNet_C_x0_75/SlimFaceNet_A_x0_60')
+    parser.add_argument(
+        '--train_data_dir', default='./CASIA', type=str, help='train_data_dir')
+    parser.add_argument(
+        '--test_data_dir', default='./lfw', type=str, help='lfw_data_dir')
+    parser.add_argument(
+        '--test_batchsize', default=500, type=int, help='test_batchsize')
+    parser.add_argument(
+        '--resume', default='output/0', type=str, help='resume')
+ parser.add_argument(
+ '--feature_save_dir',
+ default='result.mat',
+ type=str,
+ help='The path of the extract features save, must be .mat file')
+ args = parser.parse_args()
+
+ place = fluid.CPUPlace() if args.use_gpu == 0 else fluid.CUDAPlace(0)
+ with fluid.dygraph.guard(place):
+ train_dataset = CASIA_Face(root=args.train_data_dir)
+ nl, nr, flods, flags = parse_filelist(args.test_data_dir)
+ test_dataset = LFW(nl, nr)
+ test_reader = paddle.fluid.io.batch(
+ test_dataset.reader,
+ batch_size=args.test_batchsize,
+ drop_last=False)
+
+ net = models.__dict__[args.model](class_dim=train_dataset.class_nums)
+ if args.resume:
+ assert os.path.exists(args.resume + ".pdparams"
+ ), "Given dir {}.pdparams not exist.".format(
+ args.resume)
+ para_dict, opti_dict = fluid.dygraph.load_dygraph(args.resume)
+ net.set_dict(para_dict)
+
+ test(test_reader, flods, flags, net, args)
diff --git a/demo/slimfacenet/slim_eval.sh b/demo/slimfacenet/slim_eval.sh
new file mode 100644
index 0000000000000000000000000000000000000000..922adaf6f403bef6dbbdea5cfe18790819aefac7
--- /dev/null
+++ b/demo/slimfacenet/slim_eval.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# ================================================================
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+export CUDA_VISIBLE_DEVICES=0
+python train_eval.py --action test \
+    --train_data_dir=./CASIA/ \
+    --test_data_dir=./lfw/
diff --git a/demo/slimfacenet/slim_quant.sh b/demo/slimfacenet/slim_quant.sh
new file mode 100644
index 0000000000000000000000000000000000000000..41244347e4d51b1949686ddffecc83f561cbfcc6
--- /dev/null
+++ b/demo/slimfacenet/slim_quant.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# ================================================================
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+export CUDA_VISIBLE_DEVICES=0
+python train_eval.py --action quant \
+    --train_data_dir=./CASIA/ \
+    --test_data_dir=./lfw/
diff --git a/demo/slimfacenet/slim_train.sh b/demo/slimfacenet/slim_train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..62cbd192fd5a1fff994a564a3a28f56867455ad4
--- /dev/null
+++ b/demo/slimfacenet/slim_train.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# ================================================================
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+export CUDA_VISIBLE_DEVICES=0
+python -u train_eval.py \
+    --train_data_dir=./CASIA/ \
+    --test_data_dir=./lfw/ \
+    --action train \
+    --model=SlimFaceNet_B_x0_75
diff --git a/demo/slimfacenet/train_eval.py b/demo/slimfacenet/train_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..77b366b92842445616e022d8c43a377c749fb832
--- /dev/null
+++ b/demo/slimfacenet/train_eval.py
@@ -0,0 +1,385 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import shutil
+import subprocess
+import argparse
+import time
+import scipy.io
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.compiler as compiler
+
+from dataloader.casia import CASIA_Face
+from dataloader.lfw import LFW
+from lfw_eval import parse_filelist, evaluation_10_fold
+from paddleslim import models
+from paddleslim.quant import quant_post_static
+
+
+def now():
+ return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
+
+
+def create_optimizer(args, trainset_scale):
+ start_step = trainset_scale * args.start_epoch // args.train_batchsize
+
+ if args.lr_strategy == 'piecewise_decay':
+ bd = [
+ trainset_scale * int(e) // args.train_batchsize
+ for e in args.lr_steps.strip().split(',')
+ ]
+ lr = [float(e) for e in args.lr_list.strip().split(',')]
+ assert len(bd) == len(lr) - 1
+ optimizer = fluid.optimizer.Momentum(
+ learning_rate=fluid.layers.piecewise_decay(
+ boundaries=bd, values=lr),
+ momentum=0.9,
+ regularization=fluid.regularizer.L2Decay(args.l2_decay))
+ elif args.lr_strategy == 'cosine_decay':
+ lr = args.lr
+ step_each_epoch = trainset_scale // args.train_batchsize
+ optimizer = fluid.optimizer.Momentum(
+ learning_rate=fluid.layers.cosine_decay(lr, step_each_epoch,
+ args.total_epoch),
+ momentum=0.9,
+ regularization=fluid.regularizer.L2Decay(args.l2_decay))
+ else:
+ print('Wrong learning rate strategy')
+ exit()
+ return optimizer
+
+
+def test(test_exe, test_program, test_out, args):
+ featureLs = None
+ featureRs = None
+ out_feature, test_reader, flods, flags = test_out
+ for idx, data in enumerate(test_reader()):
+ res = []
+ res.append(
+ test_exe.run(test_program,
+ feed={u'image_test': data[0][u'image_test1']},
+ fetch_list=out_feature))
+ res.append(
+ test_exe.run(test_program,
+ feed={u'image_test': data[0][u'image_test2']},
+ fetch_list=out_feature))
+ res.append(
+ test_exe.run(test_program,
+ feed={u'image_test': data[0][u'image_test3']},
+ fetch_list=out_feature))
+ res.append(
+ test_exe.run(test_program,
+ feed={u'image_test': data[0][u'image_test4']},
+ fetch_list=out_feature))
+ featureL = np.concatenate((res[0][0], res[1][0]), 1)
+ featureR = np.concatenate((res[2][0], res[3][0]), 1)
+ if featureLs is None:
+ featureLs = featureL
+ else:
+ featureLs = np.concatenate((featureLs, featureL), 0)
+ if featureRs is None:
+ featureRs = featureR
+ else:
+ featureRs = np.concatenate((featureRs, featureR), 0)
+ result = {'fl': featureLs, 'fr': featureRs, 'fold': flods, 'flag': flags}
+ scipy.io.savemat(args.feature_save_dir, result)
+ ACCs = evaluation_10_fold(args.feature_save_dir)
+ with open(os.path.join(args.save_ckpt, 'log.txt'), 'a+') as f:
+ f.writelines('eval model {}\n'.format(args.model))
+ for i in range(len(ACCs)):
+ print('{} {}'.format(i + 1, ACCs[i] * 100))
+ with open(os.path.join(args.save_ckpt, 'log.txt'), 'a+') as f:
+ f.writelines('{} {}\n'.format(i + 1, ACCs[i] * 100))
+ print('--------')
+ print('AVE {}'.format(np.mean(ACCs) * 100))
+ with open(os.path.join(args.save_ckpt, 'log.txt'), 'a+') as f:
+ f.writelines('--------\n')
+ f.writelines('AVE {}\n'.format(np.mean(ACCs) * 100))
+ return np.mean(ACCs) * 100
+
+
+def train(exe, train_program, train_out, test_program, test_out, args):
+ loss, acc, global_lr, train_reader = train_out
+ fetch_list_train = [loss.name, acc.name, global_lr.name]
+ build_strategy = fluid.BuildStrategy()
+ build_strategy.fuse_all_optimizer_ops = True
+ compiled_prog = compiler.CompiledProgram(
+ train_program, build_strategy=build_strategy).with_data_parallel(
+ loss_name=loss.name, build_strategy=build_strategy)
+ best_ave = 0
+ for epoch_id in range(args.start_epoch, args.total_epoch):
+ for batch_id, data in enumerate(train_reader()):
+ loss, acc, global_lr = exe.run(compiled_prog,
+ feed=data,
+ fetch_list=fetch_list_train)
+ avg_loss = np.mean(np.array(loss))
+ avg_acc = np.mean(np.array(acc))
+ print(
+ '{} Epoch: {:^4d} step: {:^4d} loss: {:.6f}, acc: {:.6f}, lr: {}'.
+ format(now(), epoch_id, batch_id, avg_loss, avg_acc,
+ float(np.mean(np.array(global_lr)))))
+ if batch_id % args.save_frequency == 0:
+ model_path = os.path.join(args.save_ckpt, str(epoch_id))
+ fluid.io.save_persistables(
+ executor=exe, dirname=model_path, main_program=train_program)
+ temp_ave = test(exe, test_program, test_out, args)
+ if temp_ave > best_ave:
+ best_ave = temp_ave
+ print('Best AVE: {}'.format(best_ave))
+ out_feature, test_reader, flods, flags = test_out
+ fluid.io.save_inference_model(
+ executor=exe,
+ dirname='./out_inference',
+ feeded_var_names=['image_test'],
+ target_vars=[out_feature],
+ main_program=test_program)
+
+
+def build_program(program, startup, args, is_train=True):
+ if args.use_gpu:
+ num_trainers = fluid.core.get_cuda_device_count()
+ else:
+ num_trainers = int(os.environ.get('CPU_NUM', 1))
+ places = fluid.cuda_places() if args.use_gpu else fluid.CPUPlace()
+
+ train_dataset = CASIA_Face(root=args.train_data_dir)
+ trainset_scale = len(train_dataset)
+
+ with fluid.program_guard(main_program=program, startup_program=startup):
+ with fluid.unique_name.guard():
+ # Model construction
+ model = models.__dict__[args.model](
+ class_dim=train_dataset.class_nums)
+
+ if is_train:
+ image = fluid.data(
+ name='image', shape=[-1, 3, 112, 96], dtype='float32')
+ label = fluid.data(name='label', shape=[-1, 1], dtype='int64')
+ train_reader = fluid.io.batch(
+ train_dataset.reader,
+ batch_size=args.train_batchsize // num_trainers,
+ drop_last=False)
+ reader = fluid.io.DataLoader.from_generator(
+ feed_list=[image, label],
+ capacity=64,
+ iterable=True,
+ return_list=False)
+ reader.set_sample_list_generator(train_reader, places=places)
+
+ model.extract_feature = False
+ loss, acc = model.net(image, label)
+                optimizer = create_optimizer(args, trainset_scale)
+ optimizer.minimize(loss)
+ global_lr = optimizer._global_learning_rate()
+ out = (loss, acc, global_lr, reader)
+
+ else:
+ nl, nr, flods, flags = parse_filelist(args.test_data_dir)
+ test_dataset = LFW(nl, nr)
+ test_reader = fluid.io.batch(
+ test_dataset.reader,
+ batch_size=args.test_batchsize,
+ drop_last=False)
+ image_test = fluid.data(
+ name='image_test', shape=[-1, 3, 112, 96], dtype='float32')
+ image_test1 = fluid.data(
+ name='image_test1',
+ shape=[-1, 3, 112, 96],
+ dtype='float32')
+ image_test2 = fluid.data(
+ name='image_test2',
+ shape=[-1, 3, 112, 96],
+ dtype='float32')
+ image_test3 = fluid.data(
+ name='image_test3',
+ shape=[-1, 3, 112, 96],
+ dtype='float32')
+ image_test4 = fluid.data(
+ name='image_test4',
+ shape=[-1, 3, 112, 96],
+ dtype='float32')
+ reader = fluid.io.DataLoader.from_generator(
+ feed_list=[
+ image_test1, image_test2, image_test3, image_test4
+ ],
+ capacity=64,
+ iterable=True,
+ return_list=False)
+ reader.set_sample_list_generator(
+ test_reader,
+ places=fluid.cuda_places()
+ if args.use_gpu else fluid.CPUPlace())
+
+ model.extract_feature = True
+ feature = model.net(image_test)
+ out = (feature, reader, flods, flags)
+
+ return out
+
+
+def quant_val_reader_batch():
+ nl, nr, flods, flags = parse_filelist(args.test_data_dir)
+ test_dataset = LFW(nl, nr)
+ test_reader = fluid.io.batch(
+ test_dataset.reader, batch_size=1, drop_last=False)
+ shuffle_reader = fluid.io.shuffle(test_reader, 3)
+
+ def _reader():
+ while True:
+ for idx, data in enumerate(shuffle_reader()):
+ yield np.expand_dims(data[0][0], axis=0)
+
+ return _reader
+
+
+def main():
+ global args
+ parser = argparse.ArgumentParser(description='PaddlePaddle SlimFaceNet')
+ parser.add_argument(
+ '--action', default='train', type=str, help='train/test/quant')
+ parser.add_argument(
+ '--model',
+ default='SlimFaceNet_B_x0_75',
+ type=str,
+ help='SlimFaceNet_B_x0_75/SlimFaceNet_C_x0_75/SlimFaceNet_A_x0_60')
+ parser.add_argument(
+ '--use_gpu', default=1, type=int, help='Use GPU or not, 0 is not used')
+ parser.add_argument(
+ '--lr_strategy',
+ default='piecewise_decay',
+ type=str,
+ help='lr_strategy')
+ parser.add_argument('--lr', default=0.1, type=float, help='learning rate')
+ parser.add_argument(
+ '--lr_list',
+ default='0.1,0.01,0.001,0.0001',
+ type=str,
+ help='learning rate list (piecewise_decay)')
+ parser.add_argument(
+ '--lr_steps',
+ default='36,52,58',
+ type=str,
+ help='learning rate decay at which epochs')
+ parser.add_argument(
+ '--l2_decay', default=4e-5, type=float, help='base l2_decay')
+ parser.add_argument(
+ '--train_data_dir', default='./CASIA', type=str, help='train_data_dir')
+ parser.add_argument(
+ '--test_data_dir', default='./lfw', type=str, help='lfw_data_dir')
+ parser.add_argument(
+ '--train_batchsize', default=512, type=int, help='train_batchsize')
+ parser.add_argument(
+ '--test_batchsize', default=500, type=int, help='test_batchsize')
+ parser.add_argument(
+ '--img_shape', default='3,112,96', type=str, help='img_shape')
+ parser.add_argument(
+ '--start_epoch', default=0, type=int, help='start_epoch')
+ parser.add_argument(
+ '--total_epoch', default=80, type=int, help='total_epoch')
+ parser.add_argument(
+ '--save_frequency', default=1, type=int, help='save_frequency')
+ parser.add_argument(
+ '--save_ckpt', default='output', type=str, help='save_ckpt')
+ parser.add_argument(
+ '--feature_save_dir',
+ default='result.mat',
+ type=str,
+ help='The path of the extract features save, must be .mat file')
+ args = parser.parse_args()
+
+ if args.use_gpu:
+ num_trainers = fluid.core.get_cuda_device_count()
+ else:
+ num_trainers = int(os.environ.get('CPU_NUM', 1))
+ print(args)
+ print('num_trainers: {}'.format(num_trainers))
+    if args.save_ckpt is None:
+ args.save_ckpt = 'output'
+ if not os.path.isdir(args.save_ckpt):
+ os.makedirs(args.save_ckpt)
+ with open(os.path.join(args.save_ckpt, 'log.txt'), 'w+') as f:
+ f.writelines(str(args) + '\n')
+ f.writelines('num_trainers: {}'.format(num_trainers) + '\n')
+
+    train_program = fluid.Program()
+    test_program = fluid.Program()
+    startup_program = fluid.Program()
+
+    if args.action == 'train':
+        train_out = build_program(train_program, startup_program, args, True)
+        test_out = build_program(test_program, startup_program, args, False)
+        test_program = test_program.clone(for_test=True)
+ place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(startup_program)
+
+ if args.action == 'train':
+ train(exe, train_program, train_out, test_program, test_out, args)
+ elif args.action == 'quant':
+ quant_post_static(
+ executor=exe,
+ model_dir='./out_inference/',
+ quantize_model_path='./quant_model/',
+ sample_generator=quant_val_reader_batch(),
+ model_filename=None, #'model',
+ params_filename=None, #'params',
+ save_model_filename=None, #'model',
+ save_params_filename=None, #'params',
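+            # the calibration batch size and batch count are sampled randomly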
+ batch_size=np.random.randint(80, 160),
+ batch_nums=np.random.randint(4, 10))
+ elif args.action == 'test':
+ [inference_program, feed_target_names,
+ fetch_targets] = fluid.io.load_inference_model(
+ dirname='./quant_model/',
+ model_filename=None,
+ params_filename=None,
+ executor=exe)
+ nl, nr, flods, flags = parse_filelist(args.test_data_dir)
+ test_dataset = LFW(nl, nr)
+ test_reader = fluid.io.batch(
+ test_dataset.reader,
+ batch_size=args.test_batchsize,
+ drop_last=False)
+ image_test = fluid.data(
+ name='image_test', shape=[-1, 3, 112, 96], dtype='float32')
+ image_test1 = fluid.data(
+ name='image_test1', shape=[-1, 3, 112, 96], dtype='float32')
+ image_test2 = fluid.data(
+ name='image_test2', shape=[-1, 3, 112, 96], dtype='float32')
+ image_test3 = fluid.data(
+ name='image_test3', shape=[-1, 3, 112, 96], dtype='float32')
+ image_test4 = fluid.data(
+ name='image_test4', shape=[-1, 3, 112, 96], dtype='float32')
+ reader = fluid.io.DataLoader.from_generator(
+ feed_list=[image_test1, image_test2, image_test3, image_test4],
+ capacity=64,
+ iterable=True,
+ return_list=False)
+ reader.set_sample_list_generator(
+ test_reader,
+ places=fluid.cuda_places() if args.use_gpu else fluid.CPUPlace())
+ test_out = (fetch_targets, reader, flods, flags)
+ print('fetch_targets[0]: ', fetch_targets[0])
+ print('feed_target_names: ', feed_target_names)
+ test(exe, inference_program, test_out, args)
+ else:
+ print('WRONG ACTION')
+
+
+if __name__ == '__main__':
+ main()
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8a59d77da236bcf66f62ff537404fb5ec9e7ac4b
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,137 @@
+# Building and Publishing the Documentation
+
+## 1. Documentation layout
+
+The PaddleSlim documentation consists of the following parts:
+
+- Introduction: a brief overview of PaddleSlim's features.
+- Installation: the install guide.
+- Quick start: one small-data example per strategy that finishes quickly.
+- Advanced tutorials: step-by-step guides for real tasks and advanced features.
+- API reference: the user-facing API documentation.
+
+Each part is available in both Chinese and English; the **English API reference is generated automatically from code comments**.
+
+The documentation files are organized as follows:
+
+```bash
+docs
+├── en
+│   ├── api_en          # English API reference; files here are auto-generated
+│   ├── conf.py         # build configuration for the English docs
+│   ├── index_en.rst    # top-level navigation page of the English docs
+│   ├── index.rst       # helper file for the Chinese/English switch; no real content
+│   ├── install_en.md   # install guide
+│   ├── intro_en.md     # introduction
+│   ├── Makefile        # build file for the English docs
+│   ├── model_zoo_en.md # model zoo
+│   ├── quick_start     # quick start
+│   └── tutorials       # advanced tutorials
+├── requirements.txt    # dependencies for building the docs
+└── zh_cn
+    ├── algo            # algorithm theory
+    ├── api_cn          # Chinese API reference
+    ├── conf.py         # build configuration for the Chinese docs
+    ├── index_en.rst    # helper file for the Chinese/English switch; no real content
+    ├── index.rst       # top-level navigation page of the Chinese docs
+    ├── install.md      # install guide
+    ├── intro.md        # introduction
+    ├── Makefile        # build file
+    ├── model_zoo.md    # model zoo
+    ├── quick_start     # quick start
+    └── tutorials       # advanced tutorials
+```
+
+## 2. Building the documentation
+
+Before building, make sure PaddleSlim is installed correctly and that Python can run `import paddleslim`.
+
+Install the libraries required to build the docs:
+
+```
+pip install -r requirements.txt
+```
+
+## 2.1 Building the Chinese docs
+
+Change into `./docs/zh_cn`.
+
+Clean any previous build output:
+
+```
+make clean
+```
+
+Build the `html` pages:
+
+```
+make html
+```
+
+The command above writes the `html` files to `./build/html/`.
+
+
+## 2.2 Previewing the docs
+
+Change into the `PaddleSlim/docs/zh_cn/build/html` directory and serve it over HTTP, as shown below.
+Assuming the machine's IP address is `server_ip`, open `server_ip:8883` in a browser to preview the documentation.
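+
+For example (the original instructions use Python 2's `SimpleHTTPServer` module; `http.server` is the Python 3 equivalent):
+
+```
+python -m SimpleHTTPServer 8883    # Python 2
+python3 -m http.server 8883        # Python 3
+```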
+
+## 2.3 Building the English docs
+
+Change into `PaddleSlim/docs/en`.
+
+Before building, the API reference must be generated from the code comments.
+
+### 2.3.1 Generating the API reference
+
+```
+sphinx-apidoc -M -o api_en/ ../../paddleslim
+```
+
+If a new `package` has been added, also add it to the `./api_en/index_en.rst` file.
+
+
+### 2.3.2 Building the docs
+
+Follow the same steps as in Section 2.1.
+
+## 3. Publishing the pages to GitHub
+
+Return to the `PaddleSlim/` directory.
+
+Switch to the `gh-pages` branch:
+
+```
+git checkout gh-pages
+```
+
+>Note: switching directly to the gh-pages branch can fail; if it does, try switching to the develop branch first and then to gh-pages.
+
+Remove the placeholder index files and copy the freshly built pages into the repository root:
+
+```
+rm docs/en/build/html/index.html
+rm docs/zh_cn/build/html/index_en.html
+cp -rf docs/en/build/html/* ./
+cp -rf docs/zh_cn/build/html/* ./
+```
+
+Stage the updated files:
+```
+git add -u
+```
+
+Newly added html pages must be staged individually with `git add`.
+
+Commit and push to GitHub:
+
+```
+git commit -m "update pages"
+git push origin gh-pages
+```
+
+## 4. Miscellaneous
+
+For the English API documentation format, see: https://wanghaoshuang.github.io/PaddleSlim/api_en/paddleslim.analysis.html
diff --git a/docs/docs/api/analysis_api.md b/docs/docs/api/analysis_api.md
deleted file mode 100644
index 2880e42a289fd80e1356ce0f3f18bc212661d33e..0000000000000000000000000000000000000000
--- a/docs/docs/api/analysis_api.md
+++ /dev/null
@@ -1,166 +0,0 @@
-## FLOPs
-paddleslim.analysis.flops(program, detail=False) [源代码](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/analysis/flops.py)
-
-: 获得指定网络的浮点运算次数(FLOPs)。
-
-**参数:**
-
-- **program(paddle.fluid.Program)** - 待分析的目标网络。更多关于Program的介绍请参考:[Program概念介绍](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/Program_cn.html#program)。
-
-- **detail(bool)** - 是否返回每个卷积层的FLOPs。默认为False。
-
-- **only_conv(bool)** - 如果设置为True,则仅计算卷积层和全连接层的FLOPs,即浮点数的乘加(multiplication-adds)操作次数。如果设置为False,则也会计算卷积和全连接层之外的操作的FLOPs。
-
-**返回值:**
-
-- **flops(float)** - 整个网络的FLOPs。
-
-- **params2flops(dict)** - 每层卷积对应的FLOPs,其中key为卷积层参数名称,value为FLOPs值。
-
-**示例:**
-
-```
-import paddle.fluid as fluid
-from paddle.fluid.param_attr import ParamAttr
-from paddleslim.analysis import flops
-
-def conv_bn_layer(input,
- num_filters,
- filter_size,
- name,
- stride=1,
- groups=1,
- act=None):
- conv = fluid.layers.conv2d(
- input=input,
- num_filters=num_filters,
- filter_size=filter_size,
- stride=stride,
- padding=(filter_size - 1) // 2,
- groups=groups,
- act=None,
- param_attr=ParamAttr(name=name + "_weights"),
- bias_attr=False,
- name=name + "_out")
- bn_name = name + "_bn"
- return fluid.layers.batch_norm(
- input=conv,
- act=act,
- name=bn_name + '_output',
- param_attr=ParamAttr(name=bn_name + '_scale'),
- bias_attr=ParamAttr(bn_name + '_offset'),
- moving_mean_name=bn_name + '_mean',
- moving_variance_name=bn_name + '_variance', )
-
-main_program = fluid.Program()
-startup_program = fluid.Program()
-# X X O X O
-# conv1-->conv2-->sum1-->conv3-->conv4-->sum2-->conv5-->conv6
-# | ^ | ^
-# |____________| |____________________|
-#
-# X: prune output channels
-# O: prune input channels
-with fluid.program_guard(main_program, startup_program):
- input = fluid.data(name="image", shape=[None, 3, 16, 16])
- conv1 = conv_bn_layer(input, 8, 3, "conv1")
- conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
- sum1 = conv1 + conv2
- conv3 = conv_bn_layer(sum1, 8, 3, "conv3")
- conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
- sum2 = conv4 + sum1
- conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
- conv6 = conv_bn_layer(conv5, 8, 3, "conv6")
-
-print("FLOPs: {}".format(flops(main_program)))
-```
-
-## model_size
-paddleslim.analysis.model_size(program) [源代码](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/analysis/model_size.py)
-
-获得指定网络的参数数量。
-
-**参数:**
-
-- **program(paddle.fluid.Program)** - 待分析的目标网络。更多关于Program的介绍请参考:[Program概念介绍](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/Program_cn.html#program)。
-
-**返回值:**
-
-- **model_size(int)** - 整个网络的参数数量。
-
-**示例:**
-
-```
-import paddle.fluid as fluid
-from paddle.fluid.param_attr import ParamAttr
-from paddleslim.analysis import model_size
-
-def conv_layer(input,
- num_filters,
- filter_size,
- name,
- stride=1,
- groups=1,
- act=None):
- conv = fluid.layers.conv2d(
- input=input,
- num_filters=num_filters,
- filter_size=filter_size,
- stride=stride,
- padding=(filter_size - 1) // 2,
- groups=groups,
- act=None,
- param_attr=ParamAttr(name=name + "_weights"),
- bias_attr=False,
- name=name + "_out")
- return conv
-
-main_program = fluid.Program()
-startup_program = fluid.Program()
-# X X O X O
-# conv1-->conv2-->sum1-->conv3-->conv4-->sum2-->conv5-->conv6
-# | ^ | ^
-# |____________| |____________________|
-#
-# X: prune output channels
-# O: prune input channels
-with fluid.program_guard(main_program, startup_program):
- input = fluid.data(name="image", shape=[None, 3, 16, 16])
- conv1 = conv_layer(input, 8, 3, "conv1")
- conv2 = conv_layer(conv1, 8, 3, "conv2")
- sum1 = conv1 + conv2
- conv3 = conv_layer(sum1, 8, 3, "conv3")
- conv4 = conv_layer(conv3, 8, 3, "conv4")
- sum2 = conv4 + sum1
- conv5 = conv_layer(sum2, 8, 3, "conv5")
- conv6 = conv_layer(conv5, 8, 3, "conv6")
-
-print("FLOPs: {}".format(model_size(main_program)))
-```
-
-## TableLatencyEvaluator
-paddleslim.analysis.TableLatencyEvaluator(table_file, delimiter=",") [源代码](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/analysis/latency.py)
-
-: 基于硬件延时表的模型延时评估器。
-
-**参数:**
-
-- **table_file(str)** - 所使用的延时评估表的绝对路径。关于演示评估表格式请参考:[PaddleSlim硬件延时评估表格式](../paddleslim/analysis/table_latency.md)
-
-- **delimiter(str)** - 硬件延时评估表中,操作信息之前所使用的分割符,默认为英文字符逗号。
-
-**返回值:**
-
-- **Evaluator** - 硬件延时评估器的实例。
-
-paddleslim.analysis.TableLatencyEvaluator.latency(graph) [源代码](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/analysis/latency.py)
-
-: 获得指定网络的预估延时。
-
-**参数:**
-
-- **graph(Program)** - 待预估的目标网络。
-
-**返回值:**
-
-- **latency** - 目标网络的预估延时。
diff --git a/docs/docs/api/api_guide.md b/docs/docs/api/api_guide.md
deleted file mode 100644
index 79910a06f3bc55083dcca410e681f2a94a5018ab..0000000000000000000000000000000000000000
--- a/docs/docs/api/api_guide.md
+++ /dev/null
@@ -1,24 +0,0 @@
-# PaddleSlim API文档导航
-
-## [模型分析](./analysis_api.md)
-
-## [卷积通道剪裁](./prune_api.md)
-
-## [蒸馏]()
-
-- [单进程蒸馏](./single_distiller_api.md)
-
-- [通道剪裁](./prune_api.md)
-
-### [量化](./quantization_api.md)
-
-- [量化训练](./quantization_api.md#量化训练API)
-
-- [离线量化](./quantization_api.md#离线量化API)
-
-- [embedding量化](./quantization_api.md#Embedding量化API)
-
-## [小模型结构搜索]()
-
-- [nas API](./nas_api.md)
-- [SearchSpace](./search_space.md)
diff --git a/docs/docs/api/nas_api.md b/docs/docs/api/nas_api.md
deleted file mode 100644
index 68b6fef39fc58ff4ae340b95c1279fabdf2c54f7..0000000000000000000000000000000000000000
--- a/docs/docs/api/nas_api.md
+++ /dev/null
@@ -1,182 +0,0 @@
-# paddleslim.nas API文档
-
-## SANAS API文档
-
-## class SANAS
-SANAS(Simulated Annealing Neural Architecture Search)是基于模拟退火算法进行模型结构搜索的算法,一般用于离散搜索任务。
-
----
-
->paddleslim.nas.SANAS(configs, server_addr, init_temperature, reduce_rate, search_steps, save_checkpoint, load_checkpoint, is_server)
-
-**参数:**
-- **configs(list):** 搜索空间配置列表,格式是`[(key, {input_size, output_size, block_num, block_mask})]`或者`[(key)]`(MobileNetV2、MobilenetV1和ResNet的搜索空间使用和原本网络结构相同的搜索空间,所以仅需指定`key`即可), `input_size` 和`output_size`表示输入和输出的特征图的大小,`block_num`是指搜索网络中的block数量,`block_mask`是一组由0和1组成的列表,0代表不进行下采样的block,1代表下采样的block。 更多paddleslim提供的搜索空间配置可以参考。
-- **server_addr(tuple):** SANAS的地址,包括server的ip地址和端口号,如果ip地址为None或者为""的话则默认使用本机ip。默认:("", 8881)。
-- **init_temperature(float):** 基于模拟退火进行搜索的初始温度。默认:100。
-- **reduce_rate(float):** 基于模拟退火进行搜索的衰减率。默认:0.85。
-- **search_steps(int):** 搜索过程迭代的次数。默认:300。
-- **save_checkpoint(str|None):** 保存checkpoint的文件目录,如果设置为None的话则不保存checkpoint。默认:`./nas_checkpoint`。
-- **load_checkpoint(str|None):** 加载checkpoint的文件目录,如果设置为None的话则不加载checkpoint。默认:None。
-- **is_server(bool):** 当前实例是否要启动一个server。默认:True。
-
-**返回:**
-一个SANAS类的实例
-
-**示例代码:**
-```
-from paddleslim.nas import SANAS
-config = [('MobileNetV2Space')]
-sanas = SANAS(config=config)
-```
-
----
-
->tokens2arch(tokens)
-通过一组token得到实际的模型结构,一般用来把搜索到最优的token转换为模型结构用来做最后的训练。
-
-**参数:**
-- **tokens(list):** 一组token。
-
-**返回**
-返回一个模型结构实例。
-
-**示例代码:**
-```
-import paddle.fluid as fluid
-input = fluid.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
-archs = sanas.token2arch(tokens)
-for arch in archs:
- output = arch(input)
- input = output
-```
----
-
->next_archs():
-获取下一组模型结构。
-
-**返回**
-返回模型结构实例的列表,形式为list。
-
-**示例代码:**
-```
-import paddle.fluid as fluid
-input = fluid.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
-archs = sanas.next_archs()
-for arch in archs:
- output = arch(input)
- input = output
-```
-
----
-
->reward(score):
-把当前模型结构的得分情况回传。
-
-**参数:**
-**score:** 当前模型的得分,分数越大越好。
-
-**返回**
-模型结构更新成功或者失败,成功则返回`True`,失败则返回`False`。
-
-
-**代码示例**
-```python
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-from paddleslim.nas import SANAS
-from paddleslim.analysis import flops
-
-max_flops = 321208544
-batch_size = 256
-
-# 搜索空间配置
-config=[('MobileNetV2Space')]
-
-# 实例化SANAS
-sa_nas = SANAS(config, server_addr=("", 8887), init_temperature=10.24, reduce_rate=0.85, search_steps=100, is_server=True)
-
-for step in range(100):
- archs = sa_nas.next_archs()
- train_program = fluid.Program()
- test_program = fluid.Program()
- startup_program = fluid.Program()
- ### 构造训练program
- with fluid.program_guard(train_program, startup_program):
- image = fluid.data(name='image', shape=[None, 3, 32, 32], dtype='float32')
- label = fluid.data(name='label', shape=[None, 1], dtype='int64')
-
- for arch in archs:
- output = arch(image)
- out = fluid.layers.fc(output, size=10, act="softmax")
- softmax_out = fluid.layers.softmax(input=out, use_cudnn=False)
- cost = fluid.layers.cross_entropy(input=softmax_out, label=label)
- avg_cost = fluid.layers.mean(cost)
- acc_top1 = fluid.layers.accuracy(input=softmax_out, label=label, k=1)
-
- ### 构造测试program
- test_program = train_program.clone(for_test=True)
- ### 定义优化器
- sgd = fluid.optimizer.SGD(learning_rate=1e-3)
- sgd.minimize(avg_cost)
-
-
- ### 增加限制条件,如果没有则进行无限制搜索
- if flops(train_program) > max_flops:
- continue
-
- ### 定义代码是在cpu上运行
- place = fluid.CPUPlace()
- exe = fluid.Executor(place)
- exe.run(startup_program)
-
- ### 定义训练输入数据
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.train10(cycle=False), buf_size=1024),
- batch_size=batch_size,
- drop_last=True)
-
- ### 定义预测输入数据
- test_reader = paddle.batch(
- paddle.dataset.cifar.test10(cycle=False),
- batch_size=batch_size,
- drop_last=False)
- train_feeder = fluid.DataFeeder([image, label], place, program=train_program)
- test_feeder = fluid.DataFeeder([image, label], place, program=test_program)
-
-
- ### 开始训练,每个搜索结果训练5个epoch
- for epoch_id in range(5):
- for batch_id, data in enumerate(train_reader()):
- fetches = [avg_cost.name]
- outs = exe.run(train_program,
- feed=train_feeder.feed(data),
- fetch_list=fetches)[0]
- if batch_id % 10 == 0:
- print('TRAIN: steps: {}, epoch: {}, batch: {}, cost: {}'.format(step, epoch_id, batch_id, outs[0]))
-
- ### 开始预测,得到最终的测试结果作为score回传给sa_nas
- reward = []
- for batch_id, data in enumerate(test_reader()):
- test_fetches = [
- avg_cost.name, acc_top1.name
- ]
- batch_reward = exe.run(test_program,
- feed=test_feeder.feed(data),
- fetch_list=test_fetches)
- reward_avg = np.mean(np.array(batch_reward), axis=1)
- reward.append(reward_avg)
-
- print('TEST: step: {}, batch: {}, avg_cost: {}, acc_top1: {}'.
- format(step, batch_id, batch_reward[0],batch_reward[1]))
-
- finally_reward = np.mean(np.array(reward), axis=0)
- print(
- 'FINAL TEST: avg_cost: {}, acc_top1: {}'.format(
- finally_reward[0], finally_reward[1]))
-
- ### 回传score
- sa_nas.reward(float(finally_reward[1]))
-
-```
diff --git a/docs/docs/api/prune_api.md b/docs/docs/api/prune_api.md
deleted file mode 100644
index 2b659a17ead7737d293a72941136a411c3378160..0000000000000000000000000000000000000000
--- a/docs/docs/api/prune_api.md
+++ /dev/null
@@ -1,340 +0,0 @@
-
-## Pruner
-paddleslim.prune.Pruner(criterion="l1_norm")[源代码](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/prune/pruner.py#L28)
-
-: 对卷积网络的通道进行一次剪裁。剪裁一个卷积层的通道,是指剪裁该卷积层输出的通道。卷积层的权重形状为`[output_channel, input_channel, kernel_size, kernel_size]`,通过剪裁该权重的第一纬度达到剪裁输出通道数的目的。
-
-**参数:**
-
-- **criterion** - 评估一个卷积层内通道重要性所参考的指标。目前仅支持`l1_norm`。默认为`l1_norm`。
-
-**返回:** 一个Pruner类的实例
-
-**示例代码:**
-
-```
-from paddleslim.prune import Pruner
-pruner = Pruner()
-```
-
-paddleslim.prune.Pruner.prune(program, scope, params, ratios, place=None, lazy=False, only_graph=False, param_backup=False, param_shape_backup=False)[源代码](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/prune/pruner.py#L36)
-
-: 对目标网络的一组卷积层的权重进行裁剪。
-
-**参数:**
-
-- **program(paddle.fluid.Program)** - 要裁剪的目标网络。更多关于Program的介绍请参考:[Program概念介绍](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/Program_cn.html#program)。
-
-- **scope(paddle.fluid.Scope)** - 要裁剪的权重所在的`scope`,Paddle中用`scope`实例存放模型参数和运行时变量的值。Scope中的参数值会被`inplace`的裁剪。更多介绍请参考[Scope概念介绍]()
-
-- **params(list)** - 需要被裁剪的卷积层的参数的名称列表。可以通过以下方式查看模型中所有参数的名称:
-```
-for block in program.blocks:
- for param in block.all_parameters():
- print("param: {}; shape: {}".format(param.name, param.shape))
-```
-
-- **ratios(list)** - 用于裁剪`params`的剪切率,类型为列表。该列表长度必须与`params`的长度一致。
-
-- **place(paddle.fluid.Place)** - 待裁剪参数所在的设备位置,可以是`CUDAPlace`或`CPUPlace`。[Place概念介绍]()
-
-- **lazy(bool)** - `lazy`为True时,通过将指定通道的参数置零达到裁剪的目的,参数的`shape保持不变`;`lazy`为False时,直接将要裁的通道的参数删除,参数的`shape`会发生变化。
-
-- **only_graph(bool)** - 是否只裁剪网络结构。在Paddle中,Program定义了网络结构,Scope存储参数的数值。一个Scope实例可以被多个Program使用,比如定义了训练网络的Program和定义了测试网络的Program是使用同一个Scope实例的。`only_graph`为True时,只对Program中定义的卷积的通道进行剪裁;`only_graph`为false时,Scope中卷积参数的数值也会被剪裁。默认为False。
-
-- **param_backup(bool)** - 是否返回对参数值的备份。默认为False。
-
-- **param_shape_backup(bool)** - 是否返回对参数`shape`的备份。默认为False。
-
-**返回:**
-
-- **pruned_program(paddle.fluid.Program)** - 被裁剪后的Program。
-
-- **param_backup(dict)** - 对参数数值的备份,用于恢复Scope中的参数数值。
-
-- **param_shape_backup(dict)** - 对参数形状的备份。
-
-**示例:**
-
-点击[AIStudio](https://aistudio.baidu.com/aistudio/projectDetail/200786)执行以下示例代码。
-```
-
-import paddle.fluid as fluid
-from paddle.fluid.param_attr import ParamAttr
-from paddleslim.prune import Pruner
-
-def conv_bn_layer(input,
- num_filters,
- filter_size,
- name,
- stride=1,
- groups=1,
- act=None):
- conv = fluid.layers.conv2d(
- input=input,
- num_filters=num_filters,
- filter_size=filter_size,
- stride=stride,
- padding=(filter_size - 1) // 2,
- groups=groups,
- act=None,
- param_attr=ParamAttr(name=name + "_weights"),
- bias_attr=False,
- name=name + "_out")
- bn_name = name + "_bn"
- return fluid.layers.batch_norm(
- input=conv,
- act=act,
- name=bn_name + '_output',
- param_attr=ParamAttr(name=bn_name + '_scale'),
- bias_attr=ParamAttr(bn_name + '_offset'),
- moving_mean_name=bn_name + '_mean',
- moving_variance_name=bn_name + '_variance', )
-
-main_program = fluid.Program()
-startup_program = fluid.Program()
-# X X O X O
-# conv1-->conv2-->sum1-->conv3-->conv4-->sum2-->conv5-->conv6
-# | ^ | ^
-# |____________| |____________________|
-#
-# X: prune output channels
-# O: prune input channels
-with fluid.program_guard(main_program, startup_program):
- input = fluid.data(name="image", shape=[None, 3, 16, 16])
- conv1 = conv_bn_layer(input, 8, 3, "conv1")
- conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
- sum1 = conv1 + conv2
- conv3 = conv_bn_layer(sum1, 8, 3, "conv3")
- conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
- sum2 = conv4 + sum1
- conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
- conv6 = conv_bn_layer(conv5, 8, 3, "conv6")
-
-place = fluid.CPUPlace()
-exe = fluid.Executor(place)
-scope = fluid.Scope()
-exe.run(startup_program, scope=scope)
-pruner = Pruner()
-main_program, _, _ = pruner.prune(
- main_program,
- scope,
- params=["conv4_weights"],
- ratios=[0.5],
- place=place,
- lazy=False,
- only_graph=False,
- param_backup=False,
- param_shape_backup=False)
-
-for param in main_program.global_block().all_parameters():
- if "weights" in param.name:
- print("param name: {}; param shape: {}".format(param.name, param.shape))
-
-```
-
-
----
-
-## sensitivity
-paddleslim.prune.sensitivity(program, place, param_names, eval_func, sensitivities_file=None, pruned_ratios=None) [源代码](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/prune/sensitive.py#L34)
-
-: 计算网络中每个卷积层的敏感度。每个卷积层的敏感度信息统计方法为:依次剪掉当前卷积层不同比例的输出通道数,在测试集上计算剪裁后的精度损失。得到敏感度信息后,可以通过观察或其它方式确定每层卷积的剪裁率。
-
-**参数:**
-
-- **program(paddle.fluid.Program)** - 待评估的目标网络。更多关于Program的介绍请参考:[Program概念介绍](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/Program_cn.html#program)。
-
-- **place(paddle.fluid.Place)** - 待分析的参数所在的设备位置,可以是`CUDAPlace`或`CPUPlace`。[Place概念介绍]()
-
-- **param_names(list)** - 待分析的卷积层的参数的名称列表。可以通过以下方式查看模型中所有参数的名称:
-
-```
-for block in program.blocks:
- for param in block.all_parameters():
- print("param: {}; shape: {}".format(param.name, param.shape))
-```
-
-- **eval_func(function)** - 用于评估裁剪后模型效果的回调函数。该回调函数接受被裁剪后的`program`为参数,返回一个表示当前program的精度,用以计算当前裁剪带来的精度损失。
-
-- **sensitivities_file(str)** - 保存敏感度信息的本地文件系统的文件。在敏感度计算过程中,会持续将新计算出的敏感度信息追加到该文件中。重启任务后,文件中已有敏感度信息不会被重复计算。该文件可以用`pickle`加载。
-
-- **pruned_ratios(list)** - 计算卷积层敏感度信息时,依次剪掉的通道数比例。默认为[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]。
-
-**返回:**
-
-- **sensitivities(dict)** - 存放敏感度信息的dict,其格式为:
-
-```
-{"weight_0":
- {0.1: 0.22,
- 0.2: 0.33
- },
- "weight_1":
- {0.1: 0.21,
- 0.2: 0.4
- }
-}
-```
-
-其中,`weight_0`是卷积层参数的名称,sensitivities['weight_0']的`value`为剪裁比例,`value`为精度损失的比例。
-
-**示例:**
-
-点击[AIStudio](https://aistudio.baidu.com/aistudio/projectdetail/201401)运行以下示例代码。
-
-```
-import paddle
-import numpy as np
-import paddle.fluid as fluid
-from paddle.fluid.param_attr import ParamAttr
-from paddleslim.prune import sensitivity
-import paddle.dataset.mnist as reader
-
-def conv_bn_layer(input,
- num_filters,
- filter_size,
- name,
- stride=1,
- groups=1,
- act=None):
- conv = fluid.layers.conv2d(
- input=input,
- num_filters=num_filters,
- filter_size=filter_size,
- stride=stride,
- padding=(filter_size - 1) // 2,
- groups=groups,
- act=None,
- param_attr=ParamAttr(name=name + "_weights"),
- bias_attr=False,
- name=name + "_out")
- bn_name = name + "_bn"
- return fluid.layers.batch_norm(
- input=conv,
- act=act,
- name=bn_name + '_output',
- param_attr=ParamAttr(name=bn_name + '_scale'),
- bias_attr=ParamAttr(bn_name + '_offset'),
- moving_mean_name=bn_name + '_mean',
- moving_variance_name=bn_name + '_variance', )
-
-main_program = fluid.Program()
-startup_program = fluid.Program()
-# X X O X O
-# conv1-->conv2-->sum1-->conv3-->conv4-->sum2-->conv5-->conv6
-# | ^ | ^
-# |____________| |____________________|
-#
-# X: prune output channels
-# O: prune input channels
-image_shape = [1,28,28]
-with fluid.program_guard(main_program, startup_program):
- image = fluid.data(name='image', shape=[None]+image_shape, dtype='float32')
- label = fluid.data(name='label', shape=[None, 1], dtype='int64')
- conv1 = conv_bn_layer(image, 8, 3, "conv1")
- conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
- sum1 = conv1 + conv2
- conv3 = conv_bn_layer(sum1, 8, 3, "conv3")
- conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
- sum2 = conv4 + sum1
- conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
- conv6 = conv_bn_layer(conv5, 8, 3, "conv6")
- out = fluid.layers.fc(conv6, size=10, act="softmax")
-# cost = fluid.layers.cross_entropy(input=out, label=label)
-# avg_cost = fluid.layers.mean(x=cost)
- acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
-# acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
-
-
-place = fluid.CPUPlace()
-exe = fluid.Executor(place)
-exe.run(startup_program)
-
-val_reader = paddle.batch(reader.test(), batch_size=128)
-val_feeder = feeder = fluid.DataFeeder(
- [image, label], place, program=main_program)
-
-def eval_func(program):
-
- acc_top1_ns = []
- for data in val_reader():
- acc_top1_n = exe.run(program,
- feed=val_feeder.feed(data),
- fetch_list=[acc_top1.name])
- acc_top1_ns.append(np.mean(acc_top1_n))
- return np.mean(acc_top1_ns)
-param_names = []
-for param in main_program.global_block().all_parameters():
- if "weights" in param.name:
- param_names.append(param.name)
-sensitivities = sensitivity(main_program,
- place,
- param_names,
- eval_func,
- sensitivities_file="./sensitive.data",
- pruned_ratios=[0.1, 0.2, 0.3])
-print(sensitivities)
-
-```
-
-## merge_sensitive
-paddleslim.prune.merge_sensitive(sensitivities)[源代码](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/prune/sensitive.py#L161)
-
-: 合并多个敏感度信息。
-
-参数:
-
-- **sensitivities(list | list)** - 待合并的敏感度信息,可以是字典的列表,或者是存放敏感度信息的文件的路径列表。
-
-返回:
-
-- **sensitivities(dict)** - 合并后的敏感度信息。其格式为:
-
-```
-{"weight_0":
- {0.1: 0.22,
- 0.2: 0.33
- },
- "weight_1":
- {0.1: 0.21,
- 0.2: 0.4
- }
-}
-```
-
-其中,`weight_0`是卷积层参数的名称,sensitivities['weight_0']的`value`为剪裁比例,`value`为精度损失的比例。
-
-示例:
-
-
-## load_sensitivities
-paddleslim.prune.load_sensitivities(sensitivities_file)[源代码](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/prune/sensitive.py#L184)
-
-: 从文件中加载敏感度信息。
-
-参数:
-
-- **sensitivities_file(str)** - 存放敏感度信息的本地文件.
-
-返回:
-
-- **sensitivities(dict)** - 敏感度信息。
-
-示例:
-
-
-## get_ratios_by_loss
-paddleslim.prune.get_ratios_by_loss(sensitivities, loss)[源代码](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/prune/sensitive.py#L206)
-
-: 根据敏感度和精度损失阈值计算出一组剪切率。对于参数`w`, 其剪裁率为使精度损失低于`loss`的最大剪裁率。
-
-参数:
-
-- **sensitivities(dict)** - 敏感度信息。
-
-- **loss** - 精度损失阈值。
-
-返回:
-
-- **ratios(dict)** - 一组剪切率。`key`是待剪裁参数的名称。`value`是对应参数的剪裁率。
diff --git a/docs/docs/api/quantization_api.md b/docs/docs/api/quantization_api.md
deleted file mode 100644
index 356b937e8495d0895d0831bea8508a8cd285cbbe..0000000000000000000000000000000000000000
--- a/docs/docs/api/quantization_api.md
+++ /dev/null
@@ -1,239 +0,0 @@
-
-
-## 量化配置
-通过字典配置量化参数
-
-```
-quant_config_default = {
- 'weight_quantize_type': 'abs_max',
- 'activation_quantize_type': 'abs_max',
- 'weight_bits': 8,
- 'activation_bits': 8,
- # ops of name_scope in not_quant_pattern list, will not be quantized
- 'not_quant_pattern': ['skip_quant'],
- # ops of type in quantize_op_types, will be quantized
- 'quantize_op_types':
- ['conv2d', 'depthwise_conv2d', 'mul', 'elementwise_add', 'pool2d'],
- # data type after quantization, such as 'uint8', 'int8', etc. default is 'int8'
- 'dtype': 'int8',
- # window size for 'range_abs_max' quantization. defaulf is 10000
- 'window_size': 10000,
- # The decay coefficient of moving average, default is 0.9
- 'moving_rate': 0.9,
-}
-```
-
-**参数:**
-
-- **weight_quantize_type(str)** - 参数量化方式。可选``'abs_max'``, ``'channel_wise_abs_max'``, ``'range_abs_max'``, ``'moving_average_abs_max'``。 默认``'abs_max'``。
-- **activation_quantize_type(str)** - 激活量化方式,可选``'abs_max'``, ``'range_abs_max'``, ``'moving_average_abs_max'``,默认``'abs_max'``。
-- **weight_bits(int)** - 参数量化bit数,默认8, 推荐设为8。
-- **activation_bits(int)** - 激活量化bit数,默认8, 推荐设为8。
-- **not_quant_pattern(str | list[str])** - 所有``name_scope``包含``'not_quant_pattern'``字符串的``op``,都不量化, 设置方式请参考[*fluid.name_scope*](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/name_scope_cn.html#name-scope)。
-- **quantize_op_types(list[str])** - 需要进行量化的``op``类型,目前支持``'conv2d', 'depthwise_conv2d', 'mul' ``。
-- **dtype(int8)** - 量化后的参数类型,默认 ``int8``, 目前仅支持``int8``。
-- **window_size(int)** - ``'range_abs_max'``量化方式的``window size``,默认10000。
-- **moving_rate(int)** - ``'moving_average_abs_max'``量化方式的衰减系数,默认 0.9。
-
-
-## quant_aware
-paddleslim.quant.quant_aware(program, place, config, scope=None, for_test=False)[[源代码]](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/quant/quanter.py)
-: 在``program``中加入量化和反量化``op``, 用于量化训练。
-
-
-**参数:**
-
-* **program (fluid.Program)** - 传入训练或测试``program``。
-* **place(fluid.CPUPlace | fluid.CUDAPlace)** - 该参数表示``Executor``执行所在的设备。
-* **config(dict)** - 量化配置表。
-* **scope(fluid.Scope, optional)** - 传入用于存储``Variable``的``scope``,需要传入``program``所使用的``scope``,一般情况下,是[*fluid.global_scope()*](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/executor_cn/global_scope_cn.html)。设置为``None``时将使用[*fluid.global_scope()*](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/executor_cn/global_scope_cn.html),默认值为``None``。
-* **for_test(bool)** - 如果``program``参数是一个测试``program``,``for_test``应设为``True``,否则设为``False``。
-
-**返回**
-
-含有量化和反量化``operator``的``program``
-
-**返回类型**
-
-* 当``for_test=False``,返回类型为``fluid.CompiledProgram``, **注意,此返回值不能用于保存参数**。
-* 当``for_test=True``,返回类型为``fluid.Program``。
-
-!!! note "注意事项"
-
-* 此接口会改变``program``结构,并且可能增加一些``persistable``的变量,所以加载模型参数时请注意和相应的``program``对应。
-* 此接口底层经历了``fluid.Program``-> ``fluid.framework.IrGraph``->``fluid.Program``的转变,在``fluid.framework.IrGraph``中没有``Parameter``的概念,``Variable``只有``persistable``和``not persistable``的区别,所以在保存和加载参数时,请使用``fluid.io.save_persistables``和``fluid.io.load_persistables``接口。
-* 由于此接口会根据``program``的结构和量化配置来对``program``添加op,所以``Paddle``中一些通过``fuse op``来加速训练的策略不能使用。已知以下策略在使用量化时必须设为``False``: ``fuse_all_reduce_ops, sync_batch_norm``。
-* 如果传入的``program``中存在和任何op都没有连接的``Variable``,则会在量化的过程中被优化掉。
-
-
-
-## convert
-paddleslim.quant.convert(program, place, config, scope=None, save_int8=False)[[源代码]](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/quant/quanter.py)
-
-
-: Converts a trained quantization ``program`` into a ``program`` that can be saved as an ``inference model``.
-
-**Parameters:**
-
-- **program (fluid.Program)** - The test ``program`` to convert.
-- **place(fluid.CPUPlace | fluid.CUDAPlace)** - The device on which the ``Executor`` runs.
-- **config(dict)** - The quantization configuration table.
-- **scope(fluid.Scope)** - The ``scope`` holding the ``Variable``s; it must be the ``scope`` used by ``program``, which is normally [*fluid.global_scope()*](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/executor_cn/global_scope_cn.html). When set to ``None``, [*fluid.global_scope()*](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/executor_cn/global_scope_cn.html) is used. Default: ``None``.
-- **save_int8(bool)** - Whether to also return a ``program`` with ``int8`` parameters. This is currently only useful for checking model size. Default: ``False``.
-
-**Returns**
-
-- **program (fluid.Program)** - The frozen program, which can be saved as an inference model; its parameters are ``float32`` but their value range is representable in int8.
-- **int8_program (fluid.Program)** - A frozen program with ``int8`` parameters, also saveable as an inference model. Not returned when ``save_int8`` is ``False``.
-
-!!! note "Notes"
-
-Because this API deletes and modifies ``op``s and ``Variable``s, it can only be called after training has finished. To convert an intermediate checkpoint, load the corresponding parameters first and then call this API.
-
-**Code example**
-
-```python hl_lines="27 28"
-#encoding=utf8
-import paddle.fluid as fluid
-import paddleslim.quant as quant
-
-
-train_program = fluid.Program()
-
-with fluid.program_guard(train_program):
- image = fluid.data(name='x', shape=[None, 1, 28, 28])
- label = fluid.data(name='label', shape=[None, 1], dtype='int64')
- conv = fluid.layers.conv2d(image, 32, 1)
- feat = fluid.layers.fc(conv, 10, act='softmax')
- cost = fluid.layers.cross_entropy(input=feat, label=label)
- avg_cost = fluid.layers.mean(x=cost)
-
-use_gpu = True
-place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
-exe = fluid.Executor(place)
-exe.run(fluid.default_startup_program())
-eval_program = train_program.clone(for_test=True)
-# configuration
-config = {'weight_quantize_type': 'abs_max',
- 'activation_quantize_type': 'moving_average_abs_max'}
-build_strategy = fluid.BuildStrategy()
-exec_strategy = fluid.ExecutionStrategy()
-# call the APIs
-quant_train_program = quant.quant_aware(train_program, place, config, for_test=False)
-quant_eval_program = quant.quant_aware(eval_program, place, config, for_test=True)
-# disable the fuse strategies
-build_strategy.fuse_all_reduce_ops = False
-build_strategy.sync_batch_norm = False
-quant_train_program = quant_train_program.with_data_parallel(
- loss_name=avg_cost.name,
- build_strategy=build_strategy,
- exec_strategy=exec_strategy)
-
-inference_prog = quant.convert(quant_eval_program, place, config)
-```
-
-For more detailed usage, please refer to the quantization-aware training demo.
-
-## quant_post
-paddleslim.quant.quant_post(executor, model_dir, quantize_model_path, sample_generator, model_filename=None, params_filename=None, batch_size=16, batch_nums=None, scope=None, algo='KL', quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"])[[Source]](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/quant/quanter.py)
-
-: Quantizes the model saved under ``${model_dir}``, using data from ``sample_generator`` to calibrate the quantization parameters.
-
-**Parameters:**
-
-- **executor (fluid.Executor)** - The executor that runs the model, on CPU or GPU.
-- **model_dir(str)** - Directory containing the model to quantize.
-- **quantize_model_path(str)** - Path where the quantized model is saved.
-- **sample_generator(python generator)** - Data reader that yields one sample at a time.
-- **model_filename(str, optional)** - Model file name. If the model to quantize is stored in a single model file, set ``model_filename`` to that file's name; otherwise set it to ``None``. Default: ``None``.
-- **params_filename(str)** - Parameter file name. If the parameters of the model to quantize are stored in a single file, set ``params_filename`` to that file's name; otherwise set it to ``None``. Default: ``None``.
-- **batch_size(int)** - Number of images per batch. Default: 16.
-- **batch_nums(int, optional)** - Number of iterations. If ``None``, iteration continues until ``sample_generator`` is exhausted; otherwise ``batch_nums`` batches are used, so the number of samples used to calibrate the ``Scale`` values is ``batch_nums * batch_size``.
-- **scope(fluid.Scope, optional)** - Scope used to read and write ``Variable``s. If ``None``, [*fluid.global_scope()*](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/executor_cn/global_scope_cn.html) is used. Default: ``None``.
-- **algo(str)** - Quantization algorithm, either ``'KL'`` or ``'direct'``. This parameter only affects activation quantization, since weights are always quantized with ``'channel_wise_abs_max'``. With ``'direct'``, the maximum absolute activation value over the calibration data is used as the ``Scale``; with ``'KL'``, the ``Scale`` is computed by minimizing KL divergence. Default: ``'KL'``.
-- **quantizable_op_type(list[str])** - List of ``op`` types to quantize. Default: ``["conv2d", "depthwise_conv2d", "mul"]``.
-
-**Returns**
-
-None.
-
-!!! note "Notes"
-
-Because this API collects all activation values over the calibration data, do not use too many calibration images. Computing the ``'KL'`` divergence is also relatively time-consuming.
-
-**Code example**
-
-> Note: this example cannot be run directly, because it needs to load the model under ``${model_dir}``.
-
-```python hl_lines="9"
-import paddle.fluid as fluid
-import paddle.dataset.mnist as reader
-from paddleslim.quant import quant_post
-val_reader = reader.train()
-use_gpu = True
-place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
-
-exe = fluid.Executor(place)
-quant_post(
- executor=exe,
- model_dir='./model_path',
- quantize_model_path='./save_path',
- sample_generator=val_reader,
- model_filename='__model__',
- params_filename='__params__',
- batch_size=16,
- batch_nums=10)
-```
-For more detailed usage, please refer to the post-training quantization demo.
-
-## quant_embedding
-paddleslim.quant.quant_embedding(program, place, config, scope=None)[[Source]](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/quant/quant_embedding.py)
-: Quantizes ``Embedding`` parameters.
-
-**Parameters:**
-
-- **program(fluid.Program)** - The program to quantize.
-- **scope(fluid.Scope, optional)** - Scope used to read and write ``Variable``s. If ``None``, [*fluid.global_scope()*](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/executor_cn/global_scope_cn.html) is used.
-- **place(fluid.CPUPlace | fluid.CUDAPlace)** - The device on which the program runs.
-- **config(dict)** - The quantization configuration. Configurable keys:
-    - ``'params_name'`` (str, required): Name of the parameter to quantize. This key must be set.
-    - ``'quantize_type'`` (str, optional): Quantization type. Currently ``'abs_max'`` is supported; ``'log'`` and ``'product_quantization'`` are planned. Default: ``'abs_max'``.
-    - ``'quantize_bits'``(int, optional): Number of quantization ``bit``s. Currently only 8 is supported. Default: 8.
-    - ``'dtype'``(str, optional): Data type after quantization. Currently only ``'int8'`` is supported. Default: ``'int8'``.
-    - ``'threshold'``(float, optional): The parameter to quantize is ``clip``ped to this threshold before quantization. If unset, the ``clip`` step is skipped and values are quantized directly.
-
-**Returns**
-
-The quantized program.
-
-**Return type**
-
-``fluid.Program``
-
-**Code example**
-```python hl_lines="22"
-import paddle.fluid as fluid
-import paddleslim.quant as quant
-
-train_program = fluid.Program()
-with fluid.program_guard(train_program):
- input_word = fluid.data(name="input_word", shape=[None, 1], dtype='int64')
- input_emb = fluid.embedding(
- input=input_word,
- is_sparse=False,
- size=[100, 128],
- param_attr=fluid.ParamAttr(name='emb',
- initializer=fluid.initializer.Uniform(-0.005, 0.005)))
-
-infer_program = train_program.clone(for_test=True)
-
-use_gpu = True
-place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
-exe = fluid.Executor(place)
-exe.run(fluid.default_startup_program())
-
-config = {'params_name': 'emb', 'quantize_type': 'abs_max'}
-quant_program = quant.quant_embedding(infer_program, place, config)
-```
-
-For more detailed usage, please refer to the Embedding quantization demo.
diff --git a/docs/docs/api/single_distiller_api.md b/docs/docs/api/single_distiller_api.md
deleted file mode 100644
index c3685f7a63fc98602ce91fce17c03bd37149016d..0000000000000000000000000000000000000000
--- a/docs/docs/api/single_distiller_api.md
+++ /dev/null
@@ -1,216 +0,0 @@
-## merge
-paddleslim.dist.merge(teacher_program, student_program, data_name_map, place, scope=fluid.global_scope(), name_prefix='teacher_') [[Source]](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py#L19)
-
-: ``merge`` fuses two Paddle programs (teacher_program and student_program) into a single program and returns it. In the merged program, distillation loss functions can be added between suitable pairs of teacher and student feature maps, so that the teacher model's dark knowledge guides the student model's learning.
-
-**Parameters:**
-
-- **teacher_program**(Program) - The [*paddle program*](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/Program_cn.html#program) that defines the teacher model
-- **student_program**(Program) - The [*paddle program*](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/Program_cn.html#program) that defines the student model
-- **data_name_map**(dict) - Mapping from teacher input names to student input names, where each *key* is a teacher input name and the *value* is the corresponding student input name
-- **place**(fluid.CPUPlace()|fluid.CUDAPlace(N)) - The device the program runs on, where N is the GPU ID
-- **scope**(Scope) - The variable scope used by the program; if unspecified, the default global scope is used. Default: [*fluid.global_scope()*](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/global_scope_cn.html#global-scope)
-- **name_prefix**(str) - The name prefix that merge uniformly adds to the teacher's [*Variables*](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.3/api_guides/low_level/program.html#variable). Default: 'teacher_'
-
-**Returns:** The program obtained by merging student_program and teacher_program
-
-!!! note "Note"
-    *data_name_map* maps **teacher_var names to student_var names**; if the direction is reversed, the merge may not work correctly
-
-
-**Usage example:**
-
-```python hl_lines="17 18"
-import paddle.fluid as fluid
-import paddleslim.dist as dist
-student_program = fluid.Program()
-with fluid.program_guard(student_program):
- x = fluid.layers.data(name='x', shape=[1, 28, 28])
- conv = fluid.layers.conv2d(x, 32, 1)
- out = fluid.layers.conv2d(conv, 64, 3, padding=1)
-teacher_program = fluid.Program()
-with fluid.program_guard(teacher_program):
- y = fluid.layers.data(name='y', shape=[1, 28, 28])
- conv = fluid.layers.conv2d(y, 32, 1)
- conv = fluid.layers.conv2d(conv, 32, 3, padding=1)
- out = fluid.layers.conv2d(conv, 64, 3, padding=1)
-data_name_map = {'y':'x'}
-USE_GPU = False
-place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
-main_program = dist.merge(teacher_program, student_program,
- data_name_map, place)
-```
-
-
-## fsp_loss
-paddleslim.dist.fsp_loss(teacher_var1_name, teacher_var2_name, student_var1_name, student_var2_name, program=fluid.default_main_program()) [[Source]](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py#L90)
-
-: ``fsp_loss`` adds an FSP loss between teacher vars and student vars in the program, following the paper [*A Gift from Knowledge Distillation*](http://openaccess.thecvf.com/content_cvpr_2017/papers/Yim_A_Gift_From_CVPR_2017_paper.pdf)
-
-**Parameters:**
-
-- **teacher_var1_name**(str): Name of teacher_var1. The corresponding variable is a 4-D feature-map Tensor of shape `[batch_size, x_channel, height, width]` with dtype float32 or float64
-- **teacher_var2_name**(str): Name of teacher_var2. The corresponding variable is a 4-D feature-map Tensor of shape `[batch_size, y_channel, height, width]` with dtype float32 or float64. Only y_channel may differ from teacher_var1's x_channel; all other dimensions must match teacher_var1
-- **student_var1_name**(str): Name of student_var1. The corresponding variable must match teacher_var1's shape: a 4-D feature-map Tensor of shape `[batch_size, x_channel, height, width]` with dtype float32 or float64
-- **student_var2_name**(str): Name of student_var2. The corresponding variable must match teacher_var2's shape: a 4-D feature-map Tensor of shape `[batch_size, y_channel, height, width]` with dtype float32 or float64. Only y_channel may differ from student_var1's x_channel; all other dimensions must match student_var1
-- **program**(Program): The fluid program used for distillation training. Default: [*fluid.default_main_program()*](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.3/api_cn/fluid_cn.html#default-main-program)
-
-**Returns:** The fsp_loss built from teacher_var1, teacher_var2, student_var1, and student_var2
-
-**Usage example:**
-
-```python hl_lines="19 20"
-import paddle.fluid as fluid
-import paddleslim.dist as dist
-student_program = fluid.Program()
-with fluid.program_guard(student_program):
- x = fluid.layers.data(name='x', shape=[1, 28, 28])
- conv = fluid.layers.conv2d(x, 32, 1, name='s1')
- out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='s2')
-teacher_program = fluid.Program()
-with fluid.program_guard(teacher_program):
- y = fluid.layers.data(name='y', shape=[1, 28, 28])
- conv = fluid.layers.conv2d(y, 32, 1, name='t1')
- conv = fluid.layers.conv2d(conv, 32, 3, padding=1)
- out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='t2')
-data_name_map = {'y':'x'}
-USE_GPU = False
-place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
-main_program = dist.merge(teacher_program, student_program, data_name_map, place)
-with fluid.program_guard(main_program):
- distillation_loss = dist.fsp_loss('teacher_t1.tmp_1', 'teacher_t2.tmp_1',
- 's1.tmp_1', 's2.tmp_1', main_program)
-```
-
-
-
-## l2_loss
-paddleslim.dist.l2_loss(teacher_var_name, student_var_name, program=fluid.default_main_program())[[Source]](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py#L118)
-
-: ``l2_loss`` adds an L2 loss between a teacher var and a student var in the program
-
-**Parameters:**
-
-- **teacher_var_name**(str): Name of the teacher_var.
-- **student_var_name**(str): Name of the student_var.
-- **program**(Program): The fluid program used for distillation training. Default: [*fluid.default_main_program()*](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.3/api_cn/fluid_cn.html#default-main-program)
-
-**Returns:** The l2_loss built from teacher_var and student_var
-
-**Usage example:**
-
-```python hl_lines="19 20"
-import paddle.fluid as fluid
-import paddleslim.dist as dist
-student_program = fluid.Program()
-with fluid.program_guard(student_program):
- x = fluid.layers.data(name='x', shape=[1, 28, 28])
- conv = fluid.layers.conv2d(x, 32, 1, name='s1')
- out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='s2')
-teacher_program = fluid.Program()
-with fluid.program_guard(teacher_program):
- y = fluid.layers.data(name='y', shape=[1, 28, 28])
- conv = fluid.layers.conv2d(y, 32, 1, name='t1')
- conv = fluid.layers.conv2d(conv, 32, 3, padding=1)
- out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='t2')
-data_name_map = {'y':'x'}
-USE_GPU = False
-place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
-main_program = dist.merge(teacher_program, student_program, data_name_map, place)
-with fluid.program_guard(main_program):
- distillation_loss = dist.l2_loss('teacher_t2.tmp_1', 's2.tmp_1',
- main_program)
-```
-
-
-
-## soft_label_loss
-paddleslim.dist.soft_label_loss(teacher_var_name, student_var_name, program=fluid.default_main_program(), teacher_temperature=1., student_temperature=1.)[[Source]](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py#L136)
-
-: ``soft_label_loss`` adds a soft-label loss between a teacher var and a student var in the program, following the paper [*Distilling the Knowledge in a Neural Network*](https://arxiv.org/pdf/1503.02531.pdf)
-
-**Parameters:**
-
-- **teacher_var_name**(str): Name of the teacher_var.
-- **student_var_name**(str): Name of the student_var.
-- **program**(Program): The fluid program used for distillation training. Default: [*fluid.default_main_program()*](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.3/api_cn/fluid_cn.html#default-main-program)
-- **teacher_temperature**(float): Temperature used to soften teacher_var; higher temperatures give smoother feature maps
-- **student_temperature**(float): Temperature used to soften student_var; higher temperatures give smoother feature maps
-
-**Returns:** The soft_label_loss built from teacher_var and student_var
-
-**Usage example:**
-
-```python hl_lines="19 20"
-import paddle.fluid as fluid
-import paddleslim.dist as dist
-student_program = fluid.Program()
-with fluid.program_guard(student_program):
- x = fluid.layers.data(name='x', shape=[1, 28, 28])
- conv = fluid.layers.conv2d(x, 32, 1, name='s1')
- out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='s2')
-teacher_program = fluid.Program()
-with fluid.program_guard(teacher_program):
- y = fluid.layers.data(name='y', shape=[1, 28, 28])
- conv = fluid.layers.conv2d(y, 32, 1, name='t1')
- conv = fluid.layers.conv2d(conv, 32, 3, padding=1)
- out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='t2')
-data_name_map = {'y':'x'}
-USE_GPU = False
-place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
-main_program = dist.merge(teacher_program, student_program, data_name_map, place)
-with fluid.program_guard(main_program):
- distillation_loss = dist.soft_label_loss('teacher_t2.tmp_1',
- 's2.tmp_1', main_program, 1., 1.)
-```
-
-
-
-## loss
-paddleslim.dist.loss(loss_func, program=fluid.default_main_program(), **kwargs) [[Source]](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py#L165)
-
-: The ``loss`` function supports custom loss functions over any number of teacher_var and student_var pairs
-
-**Parameters:**
-
-- **loss_func**(python function): A custom loss function whose inputs are teacher vars and student vars and whose output is the custom loss
-- **program**(Program): The fluid program used for distillation training. Default: [*fluid.default_main_program()*](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.3/api_cn/fluid_cn.html#default-main-program)
-- **\**kwargs**: Mapping from loss_func argument names to variable names
-
-**Returns**: The custom loss returned by loss_func
-
-**Usage example:**
-
-```python hl_lines="24 25"
-import paddle.fluid as fluid
-import paddleslim.dist as dist
-student_program = fluid.Program()
-with fluid.program_guard(student_program):
- x = fluid.layers.data(name='x', shape=[1, 28, 28])
- conv = fluid.layers.conv2d(x, 32, 1, name='s1')
- out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='s2')
-teacher_program = fluid.Program()
-with fluid.program_guard(teacher_program):
- y = fluid.layers.data(name='y', shape=[1, 28, 28])
- conv = fluid.layers.conv2d(y, 32, 1, name='t1')
- conv = fluid.layers.conv2d(conv, 32, 3, padding=1)
- out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='t2')
-data_name_map = {'y':'x'}
-USE_GPU = False
-place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
-main_program = dist.merge(teacher_program, student_program, data_name_map, place)
-def adaptation_loss(t_var, s_var):
- teacher_channel = t_var.shape[1]
- s_hint = fluid.layers.conv2d(s_var, teacher_channel, 1)
- hint_loss = fluid.layers.reduce_mean(fluid.layers.square(s_hint - t_var))
- return hint_loss
-with fluid.program_guard(main_program):
-    distillation_loss = dist.loss(adaptation_loss, main_program,
-                            t_var='teacher_t2.tmp_1', s_var='s2.tmp_1')
-```
-
-!!! note "Notes"
-    Adding a distillation loss introduces new variables; take care that the newly introduced variables do not collide with student variable names. Two usages are recommended (either one is enough):
-
-    1. Use the same name space as student_program, so that unnamed variables (e.g. tmp_0, tmp_1, ...) are not defined twice under the same name
-    2. Specify a name-scope prefix when adding the distillation loss; see the Paddle documentation for [*fluid.name_scope*](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/name_scope_cn.html#name-scope) and the sketch below
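-
-A minimal sketch of option 2, reusing ``main_program`` and the variable names from the example above (the scope name "distill" is only an illustration):
-
-```python
-with fluid.program_guard(main_program):
-    # new variables created by the loss op land under the "distill" scope
-    with fluid.name_scope("distill"):
-        distillation_loss = dist.l2_loss('teacher_t2.tmp_1', 's2.tmp_1',
-                                         main_program)
-```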
diff --git a/docs/docs/index.md b/docs/docs/index.md
deleted file mode 100644
index 1c0f22919759f6bf2d0d8f980e18079ff1f3a220..0000000000000000000000000000000000000000
--- a/docs/docs/index.md
+++ /dev/null
@@ -1,61 +0,0 @@
-
-
-# PaddleSlim
-
-PaddleSlim is a submodule of the PaddlePaddle framework, mainly used to compress models in the computer vision domain. PaddleSlim implements the three mainstream compression strategies of network pruning, quantization, and distillation, and additionally provides hyperparameter search and small-model architecture search. Later versions will add more compression strategies and improve support for NLP models.
-
-## Features
-
-- Model pruning
-    - Uniform channel pruning
-    - Sensitivity-based pruning
-    - Automatic pruning based on an evolutionary algorithm
-
-- Quantization
-    - Quantization-aware training
-    - Post-training quantization
-    - Whole-weight and channel-wise weight quantization
-
-- Distillation
-
-- Lightweight neural architecture search (Light-NAS)
-    - Light-NAS based on an evolutionary algorithm
-    - FLOPS / hardware latency constraints
-    - Model latency evaluation on multiple platforms
-
-
-## Installation
-
-Before installing PaddleSlim, make sure Paddle 1.6 or a later version is correctly installed. For Paddle installation, see the [Paddle installation guide](https://www.paddlepaddle.org.cn/install/quick).
-
-
-- Install the develop version
-
-
-```
-git clone https://github.com/PaddlePaddle/PaddleSlim.git
-cd PaddleSlim
-python setup.py install
-```
-
-- Install the latest official release
-
-```
-pip install paddleslim -i https://pypi.org/simple
-```
-
-- Install a historical version
-
-See [pypi.org](https://pypi.org/project/paddleslim/#history) for the historical versions that can be installed.
-
-## Usage
-
-- [API documentation](doc/api_guide.md): API usage, covering [distillation](), [pruning](), [quantization](), and [architecture search]().
-- [Examples](doc/demo_guide.md): Model compression examples on simple classification tasks such as mnist and cifar10; a quick way to experience and understand what PaddleSlim can do.
-- [Tutorials](): Analysis and compression tutorials for classic models.
-- [Model zoo](): Compressed classification, detection, and semantic segmentation models, including weight files, network structure files, and performance data.
-- [Paddle detection library](): How to use PaddleSlim in the detection library.
-- [Paddle segmentation library](): How to use PaddleSlim in the segmentation library.
-- [PaddleLite](): How to deploy models produced by PaddleSlim with the PaddleLite inference library.
-
-## Contributing and feedback
diff --git a/docs/docs/tutorials/demo_guide.md b/docs/docs/tutorials/demo_guide.md
deleted file mode 100644
index ec1fc0e133d86c32d8576039027a1369999ce275..0000000000000000000000000000000000000000
--- a/docs/docs/tutorials/demo_guide.md
+++ /dev/null
@@ -1,18 +0,0 @@
-
-## [Distillation](../demo/distillation/distillation_demo.py)
-
-The distillation demo uses ResNet50 as the teacher network and MobileNet as the student network by default; the teacher and student can also be swapped for any model supported in the [models directory](../demo/models).
-
-The demo adds an l2_loss distillation loss between one pair of feature maps of the teacher and student models; fsp_loss, soft_label_loss, or a custom loss function can also be selected as needed.
-
-By default, training runs 120 epochs of distillation on the cifar10 dataset with the piecewise_decay learning-rate schedule and the momentum optimizer. Users can easily switch to the ImageNet dataset, the cosine_decay learning-rate schedule, or other training configurations through the args parameters.
-
-## Quantization
-
-### [Quantization-aware training demo](./quant_aware_demo.md)
-### [Post-training quantization demo](./quant_post_demo.md)
-### [Embedding quantization demo](./quant_embedding_demo.md)
-
-## NAS
-
-### [NAS example](./nas_demo.md)
diff --git a/docs/docs/tutorials/distillation_demo.md b/docs/docs/tutorials/distillation_demo.md
deleted file mode 100644
index c565eee72fea52c07e30bab47707eec759257eb7..0000000000000000000000000000000000000000
--- a/docs/docs/tutorials/distillation_demo.md
+++ /dev/null
@@ -1,110 +0,0 @@
-This example shows how to use the PaddleSlim distillation API for distillation training of a model.
-
-## API reference
-
-Please refer to the [distillation API documentation](https://paddlepaddle.github.io/PaddleSlim/api/single_distiller_api/).
-
-## PaddleSlim distillation training workflow
-
-In general, the more parameters a model has and the more complex its structure, the better it performs, but the larger its computation and resource cost. **Knowledge distillation** is a method that compresses the useful information (dark knowledge) learned by a large model into a smaller, faster model whose results can match the large model's.
-
-In this example, the large high-accuracy model is called the teacher, and the smaller but faster model with slightly lower accuracy is called the student.
-
-### 1. Define student_program
-
-```python
-student_program = fluid.Program()
-student_startup = fluid.Program()
-with fluid.program_guard(student_program, student_startup):
- image = fluid.data(
- name='image', shape=[None] + [3, 224, 224], dtype='float32')
- label = fluid.data(name='label', shape=[None, 1], dtype='int64')
- # student model definition
- model = MobileNet()
- out = model.net(input=image, class_dim=1000)
- cost = fluid.layers.cross_entropy(input=out, label=label)
- avg_cost = fluid.layers.mean(x=cost)
-```
-
-### 2. Define teacher_program
-
-After defining `teacher_program`, the trained pretrained_model can be loaded along with it.
-
-Wrap the body of `teacher_program` in `with fluid.unique_name.guard():` to keep the teacher's variable names from being affected by `student_program`, so that the pretrained parameters load correctly.
-
-```python
-teacher_program = fluid.Program()
-teacher_startup = fluid.Program()
-with fluid.program_guard(teacher_program, teacher_startup):
- with fluid.unique_name.guard():
- image = fluid.data(
- name='data', shape=[None] + [3, 224, 224], dtype='float32')
- # teacher model definition
- teacher_model = ResNet()
- predict = teacher_model.net(image, class_dim=1000)
-exe.run(teacher_startup)
-def if_exist(var):
-    return os.path.exists(
-        os.path.join("./pretrained", var.name))
-fluid.io.load_vars(
- exe,
- "./pretrained",
- main_program=teacher_program,
- predicate=if_exist)
-```
-
-### 3. Select feature maps
-
-After defining `student_program` and `teacher_program`, we need to pick several pairs of corresponding feature maps from them, to which distillation loss functions will be added later.
-
-```python
-# get all student variables
-student_vars = []
-for v in student_program.list_vars():
- try:
- student_vars.append((v.name, v.shape))
- except:
- pass
-print("="*50+"student_model_vars"+"="*50)
-print(student_vars)
-# get all teacher variables
-teacher_vars = []
-for v in teacher_program.list_vars():
- try:
- teacher_vars.append((v.name, v.shape))
- except:
- pass
-print("="*50+"teacher_model_vars"+"="*50)
-print(teacher_vars)
-```
-
-### 4. Merge the programs (merge)
-
-PaddlePaddle uses a Program to describe the computation graph. To compute the student and teacher Programs together, the two must be merged into a single Program.
-
-The merge process involves quite a few operations; for details, see the [merge API documentation](https://paddlepaddle.github.io/PaddleSlim/api/single_distiller_api/#merge).
-
-```python
-data_name_map = {'data': 'image'}
-student_program = merge(teacher_program, student_program, data_name_map, place)
-```
-
-### 5. Add the distillation loss
-
-Adding the distillation loss may introduce some new variables (Variable). To avoid name collisions, `with fluid.name_scope("distill"):` can be used to place the newly introduced variables under their own name scope.
-
-Also note that the merge step adds a uniform name prefix, `"teacher_"` by default, to `teacher_program`'s variables; the same prefix must be used for teacher variables when adding the `l2_loss` here.
-
-```python
-with fluid.program_guard(student_program, student_startup):
- with fluid.name_scope("distill"):
- distill_loss = l2_loss('teacher_bn5c_branch2b.output.1.tmp_3',
- 'depthwise_conv2d_11.tmp_0', student_program)
- distill_weight = 1
- loss = avg_cost + distill_loss * distill_weight
- opt = create_optimizer()
- opt.minimize(loss)
-exe.run(student_startup)
-```
-
-At this point we have the `student_program` for distillation training, which can then be trained and evaluated like an ordinary program; a minimal loop sketch follows.
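-
-A minimal training-loop sketch, assuming the variables defined above plus a hypothetical `train_reader` and `place` (both are placeholders, not part of the demo code):
-
-```python
-feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
-for epoch in range(120):
-    for data in train_reader():
-        # fetch the combined loss defined in step 5
-        loss_np = exe.run(student_program,
-                          feed=feeder.feed(data),
-                          fetch_list=[loss.name])[0]
-    print("epoch {} loss {}".format(epoch, loss_np))
-```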
diff --git a/docs/docs/tutorials/nas_demo.md b/docs/docs/tutorials/nas_demo.md
deleted file mode 100644
index f28dbd71dba67703935598699935d9a91fd60c3f..0000000000000000000000000000000000000000
--- a/docs/docs/tutorials/nas_demo.md
+++ /dev/null
@@ -1,70 +0,0 @@
-# Neural architecture search example
-
-This example shows how to use the architecture search API to find a smaller or more accurate model. It only covers how to use SANAS in paddleslim and how to obtain a model architecture with it; for complete example code, see sa_nas_mobilenetv2.py or block_sa_nas_mobilenetv2.py.
-
-## API reference
-Please refer to the neural architecture search API documentation.
-
-### 1. Configure the search space
-For detailed search space configuration, see the neural architecture search API documentation.
-```
-config = [('MobileNetV2Space')]
-
-```
-
-### 2. Initialize a SANAS instance with the search space
-```
-from paddleslim.nas import SANAS
-
-sa_nas = SANAS(
- config,
- server_addr=("", 8881),
- init_temperature=10.24,
- reduce_rate=0.85,
- search_steps=300,
- is_server=True)
-
-```
-
-### 3. Get the current network architectures from the SANAS instance
-```
-archs = sa_nas.next_archs()
-```
-
-### 4. Build the training and test programs from the returned architectures and the input
-```
-import paddle.fluid as fluid
-
-train_program = fluid.Program()
-test_program = fluid.Program()
-startup_program = fluid.Program()
-
-with fluid.program_guard(train_program, startup_program):
- data = fluid.data(name='data', shape=[None, 3, 32, 32], dtype='float32')
- label = fluid.data(name='label', shape=[None, 1], dtype='int64')
- for arch in archs:
- data = arch(data)
- output = fluid.layers.fc(data, 10)
- softmax_out = fluid.layers.softmax(input=output, use_cudnn=False)
- cost = fluid.layers.cross_entropy(input=softmax_out, label=label)
- avg_cost = fluid.layers.mean(cost)
- acc_top1 = fluid.layers.accuracy(input=softmax_out, label=label, k=1)
-
- test_program = train_program.clone(for_test=True)
- sgd = fluid.optimizer.SGD(learning_rate=1e-3)
- sgd.minimize(avg_cost)
-
-```
-
-### 5. Add constraints based on the constructed training program
-```
-from paddleslim.analysis import flops
-
-if flops(train_program) > 321208544:
- continue
-```
-
-### 6. Return the score
-```
-sa_nas.reward(score)
-```
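-
-Putting steps 3 to 6 together, a minimal search-loop sketch (`build_program` stands in for step 4 and `train_and_eval` is a hypothetical helper that trains the candidate and returns its score; both names are illustrative):
-
-```
-for step in range(300):
-    archs = sa_nas.next_archs()
-    train_program, test_program, startup_program = build_program(archs)
-    # step 5: skip candidates that exceed the FLOPs budget
-    if flops(train_program) > 321208544:
-        continue
-    score = train_and_eval(train_program, test_program, startup_program)
-    sa_nas.reward(score)
-```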
diff --git a/docs/docs/tutorials/quant_aware_demo.md b/docs/docs/tutorials/quant_aware_demo.md
deleted file mode 100644
index 5fae50c5ff752c36863bfa57a9a9f08135b90f00..0000000000000000000000000000000000000000
--- a/docs/docs/tutorials/quant_aware_demo.md
+++ /dev/null
@@ -1,77 +0,0 @@
-# Quantization-aware training example
-
-This example shows how to use the quantization-aware training API to quantize a trained classification model, reducing its storage size and memory footprint.
-
-## API reference
-
-Please refer to the quantization API documentation.
-
-## Quantization-aware training workflow for a classification model
-
-### 1. Configure the quantization parameters
-
-```
-quant_config = {
- 'weight_quantize_type': 'abs_max',
- 'activation_quantize_type': 'moving_average_abs_max',
- 'weight_bits': 8,
- 'activation_bits': 8,
- 'not_quant_pattern': ['skip_quant'],
- 'quantize_op_types': ['conv2d', 'depthwise_conv2d', 'mul'],
- 'dtype': 'int8',
- 'window_size': 10000,
- 'moving_rate': 0.9,
- 'quant_weight_only': False
-}
-```
-
-### 2. Insert trainable quantization ops into the training and test programs
-
-```
-val_program = quant_aware(val_program, place, quant_config, scope=None, for_test=True)
-
-compiled_train_prog = quant_aware(train_prog, place, quant_config, scope=None, for_test=False)
-```
-
-### 3. Disable the specified build strategies
-
-```
-build_strategy = fluid.BuildStrategy()
-build_strategy.fuse_all_reduce_ops = False
-build_strategy.sync_batch_norm = False
-exec_strategy = fluid.ExecutionStrategy()
-compiled_train_prog = compiled_train_prog.with_data_parallel(
- loss_name=avg_cost.name,
- build_strategy=build_strategy,
- exec_strategy=exec_strategy)
-```
-
-### 4. Freeze the program
-
-```
-float_program, int8_program = convert(val_program,
- place,
- quant_config,
- scope=None,
- save_int8=True)
-```
-
-### 5. Save the inference model
-
-```
-fluid.io.save_inference_model(
- dirname=float_path,
- feeded_var_names=[image.name],
- target_vars=[out], executor=exe,
- main_program=float_program,
- model_filename=float_path + '/model',
- params_filename=float_path + '/params')
-
-fluid.io.save_inference_model(
- dirname=int8_path,
- feeded_var_names=[image.name],
- target_vars=[out], executor=exe,
- main_program=int8_program,
- model_filename=int8_path + '/model',
- params_filename=int8_path + '/params')
-```
diff --git a/docs/docs/tutorials/quant_embedding_demo.md b/docs/docs/tutorials/quant_embedding_demo.md
deleted file mode 100755
index 422ef5b6ecbf96a356dfb6e8943d2863f6da5e23..0000000000000000000000000000000000000000
--- a/docs/docs/tutorials/quant_embedding_demo.md
+++ /dev/null
@@ -1,226 +0,0 @@
-# Embedding quantization example
-
-This example shows how to use the Embedding quantization API [paddleslim.quant.quant_embedding]() . The ``quant_embedding`` API quantizes the Embedding parameters of a network from ``float32`` to ``8-bit`` integers, reducing model storage size and memory footprint with almost no loss in model accuracy.
-
-
-For the API reference, see the quantization API documentation.
-
-Changes this API makes to the program:
-
-Before quantization:
-
-
-
-Figure 1: model structure before quantization
-
-
-After quantization:
-
-
-
-Figure 2: model structure after quantization
-
-
-The following uses a ``skip-gram word2vector model`` as an example of how to use the ``quant_embedding`` API. First we describe the normal training and testing workflow of the ``skip-gram word2vector model``.
-
-## Skip-gram word2vector model
-
-A brief directory layout of this example:
-
-```text
-.
-├── cluster_train.py # distributed training
-├── cluster_train.sh # script to simulate multi-machine training locally
-├── train.py         # training entry point
-├── infer.py         # inference script
-├── net.py           # network structure
-├── preprocess.py    # preprocessing, including building the dictionary and preprocessing the text
-├── reader.py        # text reader for the training phase
-└── utils.py         # common utilities
-
-```
-
-### Introduction
-This example implements a skip-gram word2vector model.
-
-Users are also encouraged to check out the [IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/124377).
-
-### Downloading the data
-The full dataset is the [1 Billion Word Language Model Benchmark](http://www.statmt.org/lm-benchmark) dataset.
-
-```bash
-mkdir data
-wget http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
-tar xzvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
-mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/ data/
-```
-
-To download from the backup location:
-
-```bash
-mkdir data
-wget https://paddlerec.bj.bcebos.com/word2vec/1-billion-word-language-modeling-benchmark-r13output.tar
-tar xvf 1-billion-word-language-modeling-benchmark-r13output.tar
-mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/ data/
-```
-
-For quick verification, we also provide the classic text8 sample dataset, containing about 17 million words. Download it with:
-
-```bash
-mkdir data
-wget https://paddlerec.bj.bcebos.com/word2vec/text.tar
-tar xvf text.tar
-mv text data/
-```
-
-
-### Preprocessing
-The sample dataset is used for preprocessing below. For the full dataset, note that after extraction the preprocessing directory is training-monolingual.tokenized.shuffled, alongside the sample dataset's text directory.
-
-Dictionary format: word<space>count. Note that low-frequency words are represented as 'UNK'.
-
-You can build your own dictionary in this format; if you do, skip the first step.
-```
-the 1061396
-of 593677
-and 416629
-one 411764
-in 372201
-a 325873
- 324608
-to 316376
-zero 264975
-nine 250430
-```
-
-Step 1: build the dictionary from the English corpus. For a Chinese corpus, the processing can be customized by modifying the text_strip method.
-
-```bash
-python preprocess.py --build_dict --build_dict_corpus_dir data/text/ --dict_path data/test_build_dict
-```
-
-Step 2: convert the text to ids using the dictionary, downsample frequent words by probability, and generate the word-to-id mapping file, named as the dictionary file name plus "_word_to_id_".
-
-```bash
-python preprocess.py --filter_corpus --dict_path data/test_build_dict --input_corpus_dir data/text --output_corpus_dir data/convert_text8 --min_count 5 --downsample 0.001
-```
-
-### Training
-To see the full list of options, run:
-
-
-```bash
-python train.py -h
-```
-
-Single-machine multithreaded training:
-```bash
-OPENBLAS_NUM_THREADS=1 CPU_NUM=5 python train.py --train_data_dir data/convert_text8 --dict_path data/test_build_dict --num_passes 10 --batch_size 100 --model_output_dir v1_cpu5_b100_lr1dir --base_lr 1.0 --print_batch 1000 --with_speed --is_sparse
-```
-
-Simulated multi-machine training on a single machine:
-
-```bash
-sh cluster_train.sh
-```
-
-This example trains with the single-machine multithreaded command above. After training, the models are saved under ``v1_cpu5_b100_lr1dir``; running ``ls v1_cpu5_b100_lr1dir`` shows the model files saved for the 10 trained epochs:
-```
-pass-0 pass-1 pass-2 pass-3 pass-4 pass-5 pass-6 pass-7 pass-8 pass-9
-```
-
-### Inference
-Download the test sets with:
-
-```bash
-# full-dataset test set
-wget https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar
-# sample-dataset test set
-wget https://paddlerec.bj.bcebos.com/word2vec/test_mid_dir.tar
-```
-
-Inference command. Note that the dictionary name needs the suffix "_word_to_id_"; this file is generated during preprocessing.
-```bash
-python infer.py --infer_epoch --test_dir data/test_mid_dir --dict_path data/test_build_dict_word_to_id_ --batch_size 20000 --model_dir v1_cpu5_b100_lr1dir/ --start_index 0 --last_index 9
-```
-Running this inference command produces output like:
-```
-('start index: ', 0, ' last_index:', 9)
-('vocab_size:', 63642)
-step:1 249
-epoch:0 acc:0.014
-step:1 590
-epoch:1 acc:0.033
-step:1 982
-epoch:2 acc:0.055
-step:1 1338
-epoch:3 acc:0.075
-step:1 1653
-epoch:4 acc:0.093
-step:1 1914
-epoch:5 acc:0.107
-step:1 2204
-epoch:6 acc:0.124
-step:1 2416
-epoch:7 acc:0.136
-step:1 2606
-epoch:8 acc:0.146
-step:1 2722
-epoch:9 acc:0.153
-```
-
-## Quantizing the ``skip-gram word2vector model``
-
-The quantization configuration is:
-```
-config = {
- 'params_name': 'emb',
- 'quantize_type': 'abs_max'
- }
-```
-
-Run the command:
-
-```bash
-python infer.py --infer_epoch --test_dir data/test_mid_dir --dict_path data/test_build_dict_word_to_id_ --batch_size 20000 --model_dir v1_cpu5_b100_lr1dir/ --start_index 0 --last_index 9 --emb_quant True
-```
-
-The output is:
-
-```
-('start index: ', 0, ' last_index:', 9)
-('vocab_size:', 63642)
-quant_embedding config {'quantize_type': 'abs_max', 'params_name': 'emb', 'quantize_bits': 8, 'dtype': 'int8'}
-step:1 253
-epoch:0 acc:0.014
-quant_embedding config {'quantize_type': 'abs_max', 'params_name': 'emb', 'quantize_bits': 8, 'dtype': 'int8'}
-step:1 586
-epoch:1 acc:0.033
-quant_embedding config {'quantize_type': 'abs_max', 'params_name': 'emb', 'quantize_bits': 8, 'dtype': 'int8'}
-step:1 970
-epoch:2 acc:0.054
-quant_embedding config {'quantize_type': 'abs_max', 'params_name': 'emb', 'quantize_bits': 8, 'dtype': 'int8'}
-step:1 1364
-epoch:3 acc:0.077
-quant_embedding config {'quantize_type': 'abs_max', 'params_name': 'emb', 'quantize_bits': 8, 'dtype': 'int8'}
-step:1 1642
-epoch:4 acc:0.092
-quant_embedding config {'quantize_type': 'abs_max', 'params_name': 'emb', 'quantize_bits': 8, 'dtype': 'int8'}
-step:1 1936
-epoch:5 acc:0.109
-quant_embedding config {'quantize_type': 'abs_max', 'params_name': 'emb', 'quantize_bits': 8, 'dtype': 'int8'}
-step:1 2216
-epoch:6 acc:0.124
-quant_embedding config {'quantize_type': 'abs_max', 'params_name': 'emb', 'quantize_bits': 8, 'dtype': 'int8'}
-step:1 2419
-epoch:7 acc:0.136
-quant_embedding config {'quantize_type': 'abs_max', 'params_name': 'emb', 'quantize_bits': 8, 'dtype': 'int8'}
-step:1 2603
-epoch:8 acc:0.146
-quant_embedding config {'quantize_type': 'abs_max', 'params_name': 'emb', 'quantize_bits': 8, 'dtype': 'int8'}
-step:1 2719
-epoch:9 acc:0.153
-```
-
-The quantized model is saved in ``./output_quant``, where the quantized parameter ``'emb.int8'`` is 3.9M; in ``./v1_cpu5_b100_lr1dir`` the original parameter ``'emb'`` is 16M.
diff --git a/docs/docs/tutorials/quant_post_demo.md b/docs/docs/tutorials/quant_post_demo.md
deleted file mode 100755
index 72cd68781d6de71aca19d3b34f1daf187494f371..0000000000000000000000000000000000000000
--- a/docs/docs/tutorials/quant_post_demo.md
+++ /dev/null
@@ -1,72 +0,0 @@
-# Post-training quantization example
-
-This example shows how to use the post-training quantization API ``paddleslim.quant.quant_post`` to quantize a trained classification model. The API produces a quantized model without any training, reducing model storage size and memory footprint.
-
-## API reference
-
-Please refer to the quantization API documentation.
-
-## Post-training quantization workflow for a classification model
-
-### Preparing the data
-
-Create a ``data`` folder in the current directory and extract the ``imagenet`` dataset into it. After extraction, the ``data`` folder should contain:
-- the ``'train'`` folder with training images
-- the ``'train_list.txt'`` file
-- the ``'val'`` folder with validation images
-- the ``'val_list.txt'`` file
-
-### Preparing the model to quantize
-The post-training quantization API can only load models saved with the ``fluid.io.save_inference_model`` API, so models saved through other APIs must be converted first. This example uses a classification model as an illustration.
-
-First, download the trained ``mobilenetv1`` model from the [imagenet classification models](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification#%E5%B7%B2%E5%8F%91%E5%B8%83%E6%A8%A1%E5%9E%8B%E5%8F%8A%E5%85%B6%E6%80%A7%E8%83%BD).
-
-Create a ``'pretrain'`` folder in the current directory and extract the ``mobilenetv1`` model into it; the extracted directory is ``pretrain/MobileNetV1_pretrained``.
-
-### Exporting the model
-Run the following command to convert the model into one usable by the post-training quantization API:
-```
-python export_model.py --model "MobileNet" --pretrained_model ./pretrain/MobileNetV1_pretrained --data imagenet
-```
-The converted model is stored under the ``inference_model/MobileNet/`` folder, which contains the two files ``'model'`` and ``'weights'``.
-
-### Post-training quantization
-Next, quantize the exported model files. The quantization script is [quant_post.py](./quant_post.py), which uses the ``paddleslim.quant.quant_post`` API to quantize the model. Run:
-```
-python quant_post.py --model_path ./inference_model/MobileNet --save_path ./quant_model_train/MobileNet --model_filename model --params_filename weights
-```
-
-- ``model_path``: directory containing the model to quantize
-- ``save_path``: path where the quantized model is saved
-- ``model_filename``: if the model to quantize is saved in a single model file, set this to that model file's name; if the model is saved in multiple files, this need not be set.
-- ``params_filename``: if the parameters of the model to quantize are saved in a single file, set this to that parameter file's name; if the parameters are saved in multiple files, this need not be set.
-
-After running the command above, the quantized model and parameter files appear under ``${save_path}``.
-
-> The quantization algorithm used is ``'KL'``, with 160 images from the training set used to calibrate the quantization parameters.
-
-
-### Evaluating accuracy
-
-Use the [eval.py](./eval.py) script to evaluate the models before and after quantization and compare their classification accuracy.
-
-First evaluate the accuracy of the model before quantization:
-```
-python eval.py --model_path ./inference_model/MobileNet --model_name model --params_name weights
-```
-The accuracy output is:
-```
-top1_acc/top5_acc= [0.70913923 0.89548034]
-```
-
-Evaluate the accuracy of the post-training quantized model with:
-
-```
-python eval.py --model_path ./quant_model_train/MobileNet
-```
-
-The accuracy output is:
-```
-top1_acc/top5_acc= [0.70141864 0.89086477]
-```
-The comparison above shows that post-training quantization of the ``mobilenet`` classification model on ``imagenet`` costs ``0.77%`` of ``top1`` accuracy and ``0.46%`` of ``top5`` accuracy.
diff --git a/docs/docs/tutorials/sensitivity_demo.md b/docs/docs/tutorials/sensitivity_demo.md
deleted file mode 100644
index 920c98ef34523f98577265022b3b2888dbeb61c9..0000000000000000000000000000000000000000
--- a/docs/docs/tutorials/sensitivity_demo.md
+++ /dev/null
@@ -1,81 +0,0 @@
-This example shows how to analyze the sensitivity of each convolution layer in a convolutional network, and how to select a set of suitable pruning ratios from the computed sensitivities.
-By default, the example automatically downloads and uses the MNIST data. The following models are supported:
-
-- MobileNetV1
-- MobileNetV2
-- ResNet50
-
-## 1. API reference
-
-This example uses the following APIs:
-
-- [paddleslim.prune.sensitivity](https://paddlepaddle.github.io/PaddleSlim/api/prune_api/#sensitivity)
-- [paddleslim.prune.merge_sensitive](https://paddlepaddle.github.io/PaddleSlim/api/prune_api/#merge_sensitive)
-- [paddleslim.prune.get_ratios_by_loss](https://paddlepaddle.github.io/PaddleSlim/api/prune_api/#get_ratios_by_losssensitivities-loss)
-
-## 2. Running the example
-
-
-Run the example with the following commands under the path `PaddleSlim/demo/sensitive`:
-
-```
-export CUDA_VISIBLE_DEVICES=0
-python train.py --model "MobileNetV1"
-```
-
-See more options via `python train.py --help`.
-
-## 3. Key steps
-
-### 3.1 Computing sensitivities
-
-Before computing sensitivities, the user needs to build the network used for evaluation and implement a callback that evaluates the model's accuracy.
-
-Call the `paddleslim.prune.sensitivity` API to compute sensitivities. Sensitivity information is appended to the file given by the `sensitivities_file` option; to recompute sensitivities, delete the `sensitivities_file` file first.
-
-If model evaluation is slow, the sensitivity computation can be accelerated with multiple processes. For example, set `pruned_ratios=[0.1, 0.2, 0.3, 0.4]` in process 1 and store the sensitivities in `sensitivities_0.data`, then set `pruned_ratios=[0.5, 0.6, 0.7]` in process 2 and store them in `sensitivities_1.data`. Each process then only computes sensitivities for its assigned pruning ratios. The processes can run on multiple cards of one machine, or across machines.
-
-The code is as follows:
-
-```
-# process 1
-sensitivity(
- val_program,
- place,
- params,
- test,
- sensitivities_file="sensitivities_0.data",
- pruned_ratios=[0.1, 0.2, 0.3, 0.4])
-```
-
-```
-# process 2
-sensitivity(
- val_program,
- place,
- params,
- test,
- sensitivities_file="sensitivities_1.data",
- pruned_ratios=[0.5, 0.6, 0.7])
-```
-
-
-### 3.2 Merging sensitivities
-
-If the user produced multiple sensitivity files via the multi-process approach above, they can be merged with `paddleslim.prune.merge_sensitive`; the merged sensitivities are stored in a `dict`. The code is as follows:
-
-```
-sens = merge_sensitive(["./sensitivities_0.data", "./sensitivities_1.data"])
-```
-
-### 3.3 Computing pruning ratios
-
-Call the `paddleslim.prune.get_ratios_by_loss` API to compute a set of pruning ratios:
-
-```
-ratios = get_ratios_by_loss(sens, 0.01)
-```
-
-Here `0.01` is a threshold: for any convolution layer, its pruning ratio is the largest ratio that keeps the accuracy loss below the threshold `0.01`.
-
-After computing a set of pruning ratios, the user can prune the network with the `paddleslim.prune.Pruner` API and compute the `FLOPs` with `paddleslim.analysis.flops`. If the `FLOPs` does not meet the requirement, adjust the threshold and compute a new set of ratios, as in the sketch below.
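-
-A minimal pruning sketch, assuming `val_program`, `place`, and the `ratios` computed above are in scope (the exact `Pruner.prune` signature and return values should be checked against the pruning API documentation):
-
-```
-import paddle.fluid as fluid
-from paddleslim.prune import Pruner
-from paddleslim.analysis import flops
-
-pruner = Pruner()
-# prune() is assumed to return the pruned program first
-pruned_program = pruner.prune(
-    val_program,
-    fluid.global_scope(),
-    params=list(ratios.keys()),
-    ratios=list(ratios.values()),
-    place=place)[0]
-print("FLOPs after pruning:", flops(pruned_program))
-```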
diff --git a/docs/en/Makefile b/docs/en/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..141d0b25f71fd0d96f59c5f682ea537d2ba767ea
--- /dev/null
+++ b/docs/en/Makefile
@@ -0,0 +1,19 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+SOURCEDIR = ./
+BUILDDIR = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/en/api_en/index_en.rst b/docs/en/api_en/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6e5a142cb5c30098646bb22d4efd26f2fca3e425
--- /dev/null
+++ b/docs/en/api_en/index_en.rst
@@ -0,0 +1,21 @@
+.. PaddleSlim documentation master file, created by
+ sphinx-quickstart on Wed Feb 5 14:04:52 2020.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+API Documents
+==============
+
+.. toctree::
+ :maxdepth: 1
+
+ paddleslim.analysis.rst
+ paddleslim.prune.rst
+ paddleslim.dist.rst
+ paddleslim.quant.rst
+ paddleslim.nas.rst
+ paddleslim.nas.one_shot.rst
+ paddleslim.nas.darts.rst
+ paddleslim.pantheon.rst
+ search_space_en.md
+ table_latency_en.md
diff --git a/docs/en/api_en/modules.rst b/docs/en/api_en/modules.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b6e3e88a6f5b63ddf4381af1b86c1b19fd53c150
--- /dev/null
+++ b/docs/en/api_en/modules.rst
@@ -0,0 +1,7 @@
+paddleslim
+==========
+
+.. toctree::
+ :maxdepth: 4
+
+ paddleslim
diff --git a/docs/en/api_en/paddleslim.analysis.rst b/docs/en/api_en/paddleslim.analysis.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e9dc6db11dce2d4df4704fe5d43d3c943f50677f
--- /dev/null
+++ b/docs/en/api_en/paddleslim.analysis.rst
@@ -0,0 +1,36 @@
+paddleslim\.analysis package
+============================
+
+.. automodule:: paddleslim.analysis
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Submodules
+----------
+
+paddleslim\.analysis\.flops module
+----------------------------------
+
+.. automodule:: paddleslim.analysis.flops
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.analysis\.latency module
+------------------------------------
+
+.. automodule:: paddleslim.analysis.latency
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.analysis\.model\_size module
+----------------------------------------
+
+.. automodule:: paddleslim.analysis.model_size
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
diff --git a/docs/en/api_en/paddleslim.common.rst b/docs/en/api_en/paddleslim.common.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a59bd0085952d4b75bb4c336e7e19e21c0353bea
--- /dev/null
+++ b/docs/en/api_en/paddleslim.common.rst
@@ -0,0 +1,68 @@
+paddleslim\.common package
+==========================
+
+.. automodule:: paddleslim.common
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Submodules
+----------
+
+paddleslim\.common\.cached\_reader module
+-----------------------------------------
+
+.. automodule:: paddleslim.common.cached_reader
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.common\.controller module
+-------------------------------------
+
+.. automodule:: paddleslim.common.controller
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.common\.controller\_client module
+---------------------------------------------
+
+.. automodule:: paddleslim.common.controller_client
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.common\.controller\_server module
+---------------------------------------------
+
+.. automodule:: paddleslim.common.controller_server
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.common\.lock module
+-------------------------------
+
+.. automodule:: paddleslim.common.lock
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.common\.log\_helper module
+--------------------------------------
+
+.. automodule:: paddleslim.common.log_helper
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.common\.sa\_controller module
+-----------------------------------------
+
+.. automodule:: paddleslim.common.sa_controller
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
diff --git a/docs/en/api_en/paddleslim.core.rst b/docs/en/api_en/paddleslim.core.rst
new file mode 100644
index 0000000000000000000000000000000000000000..38ed2f4d153a4d84833d1dca458292acef350072
--- /dev/null
+++ b/docs/en/api_en/paddleslim.core.rst
@@ -0,0 +1,28 @@
+paddleslim\.core package
+========================
+
+.. automodule:: paddleslim.core
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Submodules
+----------
+
+paddleslim\.core\.graph\_wrapper module
+---------------------------------------
+
+.. automodule:: paddleslim.core.graph_wrapper
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.core\.registry module
+---------------------------------
+
+.. automodule:: paddleslim.core.registry
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
diff --git a/docs/en/api_en/paddleslim.dist.rst b/docs/en/api_en/paddleslim.dist.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a886778da14600e33ee0f02d98dce1be4b9a1e89
--- /dev/null
+++ b/docs/en/api_en/paddleslim.dist.rst
@@ -0,0 +1,20 @@
+paddleslim\.dist package
+========================
+
+.. automodule:: paddleslim.dist
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Submodules
+----------
+
+paddleslim\.dist\.single\_distiller module
+------------------------------------------
+
+.. automodule:: paddleslim.dist.single_distiller
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
diff --git a/docs/en/api_en/paddleslim.models.rst b/docs/en/api_en/paddleslim.models.rst
new file mode 100644
index 0000000000000000000000000000000000000000..958a9682e08ff89be5ccbb1d61fcc36ceab00e55
--- /dev/null
+++ b/docs/en/api_en/paddleslim.models.rst
@@ -0,0 +1,52 @@
+paddleslim\.models package
+==========================
+
+.. automodule:: paddleslim.models
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Submodules
+----------
+
+paddleslim\.models\.classification\_models module
+-------------------------------------------------
+
+.. automodule:: paddleslim.models.classification_models
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.models\.mobilenet module
+------------------------------------
+
+.. automodule:: paddleslim.models.mobilenet
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.models\.mobilenet\_v2 module
+----------------------------------------
+
+.. automodule:: paddleslim.models.mobilenet_v2
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.models\.resnet module
+---------------------------------
+
+.. automodule:: paddleslim.models.resnet
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.models\.util module
+-------------------------------
+
+.. automodule:: paddleslim.models.util
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
diff --git a/docs/en/api_en/paddleslim.nas.darts.rst b/docs/en/api_en/paddleslim.nas.darts.rst
new file mode 100644
index 0000000000000000000000000000000000000000..20d2fe042818c5990664fcd2a82bad8a63b7f6cb
--- /dev/null
+++ b/docs/en/api_en/paddleslim.nas.darts.rst
@@ -0,0 +1,9 @@
+paddleslim\.nas\.darts package
+==============================
+
+.. automodule:: paddleslim.nas.darts
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
diff --git a/docs/en/api_en/paddleslim.nas.one_shot.rst b/docs/en/api_en/paddleslim.nas.one_shot.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f9ebde94783a33ede7463148b909d3147e9bbb1f
--- /dev/null
+++ b/docs/en/api_en/paddleslim.nas.one_shot.rst
@@ -0,0 +1,28 @@
+paddleslim\.nas\.one\_shot package
+==================================
+
+.. automodule:: paddleslim.nas.one_shot
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Submodules
+----------
+
+paddleslim\.nas\.one\_shot\.one\_shot\_nas module
+-------------------------------------------------
+
+.. automodule:: paddleslim.nas.one_shot.one_shot_nas
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.nas\.one\_shot\.super\_mnasnet module
+-------------------------------------------------
+
+.. automodule:: paddleslim.nas.one_shot.super_mnasnet
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
diff --git a/docs/en/api_en/paddleslim.nas.rst b/docs/en/api_en/paddleslim.nas.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f6b17bcf87600e96c4b3b31573d9db3b0291a08c
--- /dev/null
+++ b/docs/en/api_en/paddleslim.nas.rst
@@ -0,0 +1,22 @@
+paddleslim\.nas package
+=======================
+
+Subpackages
+-----------
+
+.. toctree::
+
+ paddleslim.nas.one_shot
+
+Submodules
+----------
+
+paddleslim\.nas\.sa\_nas module
+-------------------------------
+
+.. automodule:: paddleslim.nas.sa_nas
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
diff --git a/docs/en/api_en/paddleslim.pantheon.rst b/docs/en/api_en/paddleslim.pantheon.rst
new file mode 100644
index 0000000000000000000000000000000000000000..59f48ce9dc4c653b9724eda46050da768623a0b3
--- /dev/null
+++ b/docs/en/api_en/paddleslim.pantheon.rst
@@ -0,0 +1,36 @@
+paddleslim\.pantheon package
+============================
+
+.. automodule:: paddleslim.pantheon
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Submodules
+----------
+
+paddleslim\.pantheon\.student module
+------------------------------------
+
+.. automodule:: paddleslim.pantheon.student
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.pantheon\.teacher module
+------------------------------------
+
+.. automodule:: paddleslim.pantheon.teacher
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.pantheon\.utils module
+----------------------------------
+
+.. automodule:: paddleslim.pantheon.utils
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
diff --git a/docs/en/api_en/paddleslim.prune.rst b/docs/en/api_en/paddleslim.prune.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ec663d35ecd0f240fd47328654619feff16e805b
--- /dev/null
+++ b/docs/en/api_en/paddleslim.prune.rst
@@ -0,0 +1,60 @@
+paddleslim\.prune package
+=========================
+
+.. automodule:: paddleslim.prune
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Submodules
+----------
+
+paddleslim\.prune\.auto\_pruner module
+--------------------------------------
+
+.. automodule:: paddleslim.prune.auto_pruner
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.prune\.prune\_io module
+-----------------------------------
+
+.. automodule:: paddleslim.prune.prune_io
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.prune\.prune\_walker module
+---------------------------------------
+
+.. automodule:: paddleslim.prune.prune_walker
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.prune\.pruner module
+--------------------------------
+
+.. automodule:: paddleslim.prune.pruner
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.prune\.sensitive module
+-----------------------------------
+
+.. automodule:: paddleslim.prune.sensitive
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.prune\.sensitive\_pruner module
+-------------------------------------------
+
+.. automodule:: paddleslim.prune.sensitive_pruner
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
diff --git a/docs/en/api_en/paddleslim.quant.rst b/docs/en/api_en/paddleslim.quant.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6051606a65214c4d3e9027472bd26701d99d5f54
--- /dev/null
+++ b/docs/en/api_en/paddleslim.quant.rst
@@ -0,0 +1,28 @@
+paddleslim\.quant package
+=========================
+
+.. automodule:: paddleslim.quant
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Submodules
+----------
+
+paddleslim\.quant\.quant\_embedding module
+------------------------------------------
+
+.. automodule:: paddleslim.quant.quant_embedding
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+paddleslim\.quant\.quanter module
+---------------------------------
+
+.. automodule:: paddleslim.quant.quanter
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
diff --git a/docs/en/api_en/paddleslim.rst b/docs/en/api_en/paddleslim.rst
new file mode 100644
index 0000000000000000000000000000000000000000..85bf130ce662f5fa165879dfa766b4672e09b9ab
--- /dev/null
+++ b/docs/en/api_en/paddleslim.rst
@@ -0,0 +1,35 @@
+paddleslim package
+==================
+
+.. automodule:: paddleslim
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Subpackages
+-----------
+
+.. toctree::
+
+ paddleslim.analysis
+ paddleslim.common
+ paddleslim.core
+ paddleslim.dist
+ paddleslim.models
+ paddleslim.nas
+ paddleslim.pantheon
+ paddleslim.prune
+ paddleslim.quant
+
+Submodules
+----------
+
+paddleslim\.version module
+--------------------------
+
+.. automodule:: paddleslim.version
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
diff --git a/docs/en/api_en/search_space_en.md b/docs/en/api_en/search_space_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..f568bf830346186c5f7f651f6a65d18fd0f0e675
--- /dev/null
+++ b/docs/en/api_en/search_space_en.md
@@ -0,0 +1,111 @@
+# Search space
+The search space is used in neural architecture search. A search space is a collection of model architectures; the purpose of SANAS is to find, within it, a model with smaller FLOPs or latency, or higher precision.
+
+## Search spaces provided by paddleslim.nas
+
+#### Based on an original model architecture:
+1. MobileNetV2Space
+  For the MobileNetV2 architecture, see: [code](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/image_classification/models/mobilenet_v2.py#L29), [paper](https://arxiv.org/abs/1801.04381)
+
+2. MobileNetV1Space
+  For the MobileNetV1 architecture, see: [code](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/image_classification/models/mobilenet_v1.py#L29), [paper](https://arxiv.org/abs/1704.04861)
+
+3. ResNetSpace
+  For the ResNet architecture, see: [code](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/image_classification/models/resnet.py#L30), [paper](https://arxiv.org/pdf/1512.03385.pdf)
+
+
+#### Based on a block from a model:
+
+1. MobileNetV1BlockSpace
+  For the MobileNetV1 block, see: [code](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/image_classification/models/mobilenet_v1.py#L173)
+
+2. MobileNetV2BlockSpace
+  For the MobileNetV2 block, see: [code](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/image_classification/models/mobilenet_v2.py#L174)
+
+3. ResNetBlockSpace
+  For the ResNet block, see: [code](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/image_classification/models/resnet.py#L148)
+
+4. InceptionABlockSpace
+  For the InceptionA block, see: [code](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/image_classification/models/inception_v4.py#L140)
+
+5. InceptionCBlockSpace
+  For the InceptionC block, see: [code](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/image_classification/models/inception_v4.py#L291)
+
+## How to use a search space
+1. To use a search space based on an original model architecture, you only need to specify its name. For example, the config for the SANAS class is [('MobileNetV2Space')] if you want to search over the original MobileNetV2.
+2. To use a block-based search space provided by paddleslim.nas:
+  2.1 Construct the search space from `input_size`, `output_size`, and `block_num`; for example, the config for the SANAS class is [('MobileNetV2BlockSpace', {'input_size': 224, 'output_size': 32, 'block_num': 10})].
+  2.2 Construct the search space from `block_mask`; for example, the config for the SANAS class is [('MobileNetV2BlockSpace', {'block_mask': [0, 1, 1, 1, 1, 0, 1, 0]})].
+
+## How to write your own search space
+To write your own search space, inherit from the base class SearchSpaceBase and override the following functions:
+  1. A function returning the initial tokens (`init_tokens`): set the initial tokens you want. Every token is an index into a search list; for example, if tokens=[0, 3, 5], the channel list of the current model architecture is [8, 40, 128].
+  2. A function giving the range of every token (`range_table`): the allowed value range of each token in tokens.
+  3. A function mapping tokens to a model architecture (`token2arch`): builds the model architecture corresponding to the tokens during the search process.
+
+For example, here is how to add a search space built from a resnet block. A new search space must NOT have the same name as an existing one.
+
+```python
+### import the necessary modules
+from .search_space_base import SearchSpaceBase
+from .search_space_registry import SEARCHSPACE
+import numpy as np
+import paddle.fluid as fluid
+
+### use the decorator SEARCHSPACE.register to register your own search space in the search space namespace
+@SEARCHSPACE.register
+### define a search space class that inherits from the base class SearchSpaceBase
+class ResNetBlockSpace2(SearchSpaceBase):
+    def __init__(self, input_size, output_size, block_num, block_mask):
+        ### keep block_mask; init_tokens, range_table, and token2arch below all depend on its length
+        self.block_mask = block_mask
+        ### define the items you want to search over, such as the number of channels, the number of repeated convolutions, and the kernel size.
+        ### self.filter_num is the search list for the number of channels.
+        self.filter_num = np.array([8, 16, 32, 40, 64, 128, 256, 512])
+
+    ### define the initial tokens; their length depends on block_num or block_mask.
+ def init_tokens(self):
+ return [0] * 3 * len(self.block_mask)
+
+    ### define the value range of each token in tokens.
+ def range_table(self):
+ return [len(self.filter_num)] * 3 * len(self.block_mask)
+
+    ### transform tokens into a model architecture.
+    def token2arch(self, tokens=None):
+        if tokens is None:
+            tokens = self.init_tokens()
+
+        self.bottleneck_params_list = []
+        for i in range(len(self.block_mask)):
+            self.bottleneck_params_list.append(
+                (self.filter_num[tokens[i * 3 + 0]],
+                 self.filter_num[tokens[i * 3 + 1]],
+                 self.filter_num[tokens[i * 3 + 2]],
+                 2 if self.block_mask[i] == 1 else 1))
+
+ def net_arch(input):
+ for i, layer_setting in enumerate(self.bottleneck_params_list):
+ channel_num, stride = layer_setting[:-1], layer_setting[-1]
+ input = self._resnet_block(input, channel_num, stride, name='resnet_layer{}'.format(i+1))
+
+ return input
+
+ return net_arch
+
+    ### helpers that build one block.
+ def _resnet_block(self, input, channel_num, stride, name=None):
+ shortcut_conv = self._shortcut(input, channel_num[2], stride, name=name)
+ input = self._conv_bn_layer(input=input, num_filters=channel_num[0], filter_size=1, act='relu', name=name + '_conv0')
+ input = self._conv_bn_layer(input=input, num_filters=channel_num[1], filter_size=3, stride=stride, act='relu', name=name + '_conv1')
+ input = self._conv_bn_layer(input=input, num_filters=channel_num[2], filter_size=1, name=name + '_conv2')
+ return fluid.layers.elementwise_add(x=shortcut_conv, y=input, axis=0, name=name+'_elementwise_add')
+
+ def _shortcut(self, input, channel_num, stride, name=None):
+ channel_in = input.shape[1]
+ if channel_in != channel_num or stride != 1:
+            return self._conv_bn_layer(input, num_filters=channel_num, filter_size=1, stride=stride, name=name+'_shortcut')
+ else:
+ return input
+
+ def _conv_bn_layer(self, input, num_filters, filter_size, stride=1, padding='SAME', act=None, name=None):
+ conv = fluid.layers.conv2d(input, num_filters, filter_size, stride, name=name+'_conv')
+ bn = fluid.layers.batch_norm(conv, act=act, name=name+'_bn')
+ return bn
+```
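+
+Once registered, the new search space can be referenced by name just like the built-in ones. A minimal sketch, assuming the SANAS setup shown in the NAS quick-start tutorial:
+
+```python
+import paddleslim as slim
+
+### refer to the registered search space by its class name
+sanas = slim.nas.SANAS(configs=[('ResNetBlockSpace2')], server_addr=("", 8337))
+```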
diff --git a/docs/en/api_en/table_latency_en.md b/docs/en/api_en/table_latency_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..6a6c6ac665dd8fa021780b753e2287e52c16d404
--- /dev/null
+++ b/docs/en/api_en/table_latency_en.md
@@ -0,0 +1,144 @@
+# Hardware latency table
+
+The hardware latency table is used to estimate the inference time of a model in a specific environment with a specific inference engine. The following text introduces the format supported by PaddleSlim.
+
+## Introduction
+
+The hardware latency table stores all possible operations. Each operation in the table consists of a type and parameters; for example, the type can be `conv2d`, and the corresponding parameters can be the size of the feature map, the number of kernels, and the kernel size.
+The latency of every operation depends on the hardware and the inference engine.
+
+## Overall format
+The hardware latency table is stored as a file or a multi-line string.
+The first line of the table stores the version information; every following line represents one operation and its latency.
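+
+For example, a minimal hypothetical table might look as follows, where `\t` denotes a tab character and the per-operation fields are explained in the sections below:
+
+```text
+armv8-a,paddlelite-v2.3,2020-02-05
+conv2d,1,1,1,32,112,112,64,1,3,1,1,1\t5.423
+relu,1,64,112,112\t0.353
+```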
+
+## Version
+
+The version information consists of comma-separated fields (English commas): hardware, inference engine, and timestamp.
+
+- **hardware:** Marks the hardware environment, including the architecture type, version, and so on.
+
+- **inference engine:** Marks the inference engine, including its name, version, optimization options, and so on.
+
+- **timestamp:** Marks the time when this table was created.
+
+## Operation
+
+The fields of an operation are separated by commas (English format); the operation information and its latency are separated by a tab.
+
+### conv2d
+
+**format**
+
+```text
+op_type,flag_bias,flag_relu,n_in,c_in,h_in,w_in,c_out,groups,kernel,padding,stride,dilation\tlatency
+```
+
+**description**
+
+- **op_type (str)** - The type of this op.
+- **flag_bias (int)** - Whether the op has a bias (0: no bias, 1: has bias).
+- **flag_relu (int)** - Whether the op is followed by relu (0: no relu, 1: has relu).
+- **n_in (int)** - The batch size of the input.
+- **c_in (int)** - The number of input channels.
+- **h_in (int)** - The height of the input feature map.
+- **w_in (int)** - The width of the input feature map.
+- **c_out (int)** - The number of output channels.
+- **groups (int)** - The number of groups of the conv2d.
+- **kernel (int)** - The kernel size.
+- **padding (int)** - The padding size.
+- **stride (int)** - The stride size.
+- **dilation (int)** - The dilation size.
+- **latency (float)** - The latency of this op.
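+
+For example, a single hypothetical `conv2d` entry with bias and relu, batch size 1, a 32-channel 112x112 input, 64 output channels, one group, a 3x3 kernel, padding 1, stride 1 and dilation 1, followed by an example latency value (`\t` denoting a tab), could be:
+
+```text
+conv2d,1,1,1,32,112,112,64,1,3,1,1,1\t5.423
+```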
+
+### activation
+
+**format**
+
+```text
+op_type,n_in,c_in,h_in,w_in\tlatency
+```
+
+**description**
+
+- **op_type (str)** - The type of this op.
+- **n_in (int)** - The batch size of the input.
+- **c_in (int)** - The number of input channels.
+- **h_in (int)** - The height of the input feature map.
+- **w_in (int)** - The width of the input feature map.
+- **latency (float)** - The latency of this op.
+
+### batch_norm
+
+**format**
+
+```text
+op_type,active_type,n_in,c_in,h_in,w_in\tlatency
+```
+
+**description**
+
+- **op_type (str)** - The type of this op.
+- **active_type (str|None)** - The type of the activation function, one of relu, prelu, sigmoid, relu6, tanh.
+- **n_in (int)** - The batch size of the input.
+- **c_in (int)** - The number of input channels.
+- **h_in (int)** - The height of the input feature map.
+- **w_in (int)** - The width of the input feature map.
+- **latency (float)** - The latency of this op.
+
+### eltwise
+
+**format**
+
+```text
+op_type,n_in,c_in,h_in,w_in\tlatency
+```
+
+**description**
+
+- **op_type (str)** - The type of this op.
+- **n_in (int)** - The batch size of the input.
+- **c_in (int)** - The number of input channels.
+- **h_in (int)** - The height of the input feature map.
+- **w_in (int)** - The width of the input feature map.
+- **latency (float)** - The latency of this op.
+
+### pooling
+
+**format**
+
+```text
+op_type,flag_global_pooling,n_in,c_in,h_in,w_in,kernel,padding,stride,ceil_mode,pool_type\tlatency
+```
+
+**description**
+
+- **op_type (str)** - The type of this op.
+- **flag_global_pooling (int)** - Whether the pooling is global (0: not global, 1: global pooling).
+- **n_in (int)** - The batch size of the input.
+- **c_in (int)** - The number of input channels.
+- **h_in (int)** - The height of the input feature map.
+- **w_in (int)** - The width of the input feature map.
+- **kernel (int)** - The kernel size.
+- **padding (int)** - The padding size.
+- **stride (int)** - The stride size.
+- **ceil_mode (int)** - Whether to compute the output height and width with the ceil function (0: floor function, 1: ceil function).
+- **pool_type (int)** - The pooling type (1: max pooling, 2: average pooling including padding, 3: average pooling excluding padding).
+- **latency (float)** - The latency of this op.
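+
+For example, a hypothetical entry for a non-global 2x2 max pooling with padding 0, stride 2 and floor mode, followed by an example latency value (`\t` denoting a tab), could be:
+
+```text
+pooling,0,1,64,112,112,2,0,2,0,1\t1.814
+```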
+
+### softmax
+
+**format**
+
+```text
+op_type,axis,n_in,c_in,h_in,w_in\tlatency
+```
+
+**description**
+
+- **op_type (str)** - The type of this op.
+- **axis (int)** - The axis along which softmax is computed, in the range [-1, rank-1], where `rank` is the rank of the input.
+- **n_in (int)** - The batch size of the input.
+- **c_in (int)** - The number of input channels.
+- **h_in (int)** - The height of the input feature map.
+- **w_in (int)** - The width of the input feature map.
+- **latency (float)** - The latency of this op.
diff --git a/docs/en/conf.py b/docs/en/conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..30d2fd07094e3d60c62a6ae5bab72ef661391f4c
--- /dev/null
+++ b/docs/en/conf.py
@@ -0,0 +1,171 @@
+# -*- coding: utf-8 -*-
+#
+# Configuration file for the Sphinx documentation builder.
+#
+# This file does only contain a selection of the most common options. For a
+# full list see the documentation:
+# http://www.sphinx-doc.org/en/master/config
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+import sys
+sys.path.insert(0, os.path.abspath('../../'))
+
+# -- Project information -----------------------------------------------------
+
+project = u'PaddleSlim'
+copyright = u'2020, paddleslim'
+author = u'paddleslim'
+
+# The short X.Y version
+version = u''
+# The full version, including alpha/beta/rc tags
+release = u''
+
+# -- General configuration ---------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+ 'sphinx.ext.autodoc',
+ 'sphinx.ext.doctest',
+ 'sphinx.ext.coverage',
+ 'sphinx.ext.mathjax',
+ 'sphinx.ext.githubpages',
+ 'sphinx.ext.napoleon',
+ 'recommonmark',
+ 'sphinx_markdown_tables',
+# 'm2r',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+source_suffix = ['.rst', '.md']
+#source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'index'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = u'en_US'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = None
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'sphinx_rtd_theme'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# The default sidebars (for documents that don't match any pattern) are
+# defined by theme itself. Builtin themes are using these templates by
+# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
+# 'searchbox.html']``.
+#
+# html_sidebars = {}
+
+# -- Options for HTMLHelp output ---------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'PaddleSlimdoc'
+
+# -- Options for LaTeX output ------------------------------------------------
+
+latex_elements = {
+ # The paper size ('letterpaper' or 'a4paper').
+ #
+ # 'papersize': 'letterpaper',
+
+ # The font size ('10pt', '11pt' or '12pt').
+ #
+ # 'pointsize': '10pt',
+
+ # Additional stuff for the LaTeX preamble.
+ #
+ # 'preamble': '',
+
+ # Latex figure (float) alignment
+ #
+ # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+# author, documentclass [howto, manual, or own class]).
+latex_documents = [(master_doc, 'PaddleSlim.tex', u'PaddleSlim Documentation',
+ u'paddleslim', 'manual'), ]
+
+# -- Options for manual page output ------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [(master_doc, 'paddleslim', u'PaddleSlim Documentation', [author],
+ 1)]
+
+# -- Options for Texinfo output ----------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ (master_doc, 'PaddleSlim', u'PaddleSlim Documentation', author,
+ 'PaddleSlim', 'One line description of project.', 'Miscellaneous'),
+]
+
+# -- Options for Epub output -------------------------------------------------
+
+# Bibliographic Dublin Core info.
+epub_title = project
+
+# The unique identifier of the text. This can be a ISBN number
+# or the project homepage.
+#
+# epub_identifier = ''
+
+# A unique identification for the text.
+#
+# epub_uid = ''
+
+# A list of files that should not be packed into the epub file.
+epub_exclude_files = ['search.html']
+
+# -- Extension configuration -------------------------------------------------
diff --git a/docs/en/index.rst b/docs/en/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..21bdecb667343be7e4afddd1e8b07b2ee84eac4e
--- /dev/null
+++ b/docs/en/index.rst
@@ -0,0 +1,17 @@
+.. PaddleSlim documentation master file, created by
+ sphinx-quickstart on Wed Feb 5 14:04:52 2020.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+Index
+==============
+
+.. toctree::
+ :maxdepth: 1
+
+ intro_en.md
+ install_en.md
+ quick_start/index_en
+ tutorials/index_en
+ api_en/index_en
+ model_zoo_en.md
diff --git a/docs/en/install_en.md b/docs/en/install_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..9d0eedba8ea361e4529eccdc538f0906a78c4000
--- /dev/null
+++ b/docs/en/install_en.md
@@ -0,0 +1,22 @@
+# Install
+
+Please ensure you have installed PaddlePaddle 1.7 or later. See [How to install PaddlePaddle](https://www.paddlepaddle.org.cn/install/quick).
+
+
+- Install by pip
+
+```bash
+pip install paddleslim -i https://pypi.org/simple
+```
+
+- Install from source
+
+```bash
+git clone https://github.com/PaddlePaddle/PaddleSlim.git
+cd PaddleSlim
+python setup.py install
+```
+
+- Historical packages
+
+Historical packages are available on [pypi.org](https://pypi.org/project/paddleslim/#history).
diff --git a/docs/en/intro_en.md b/docs/en/intro_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..524c826f907f21863a48f331b38a04745d61c2d1
--- /dev/null
+++ b/docs/en/intro_en.md
@@ -0,0 +1,85 @@
+# Introduction
+
+PaddleSlim is a toolkit for model compression. It contains a collection of compression strategies, such as pruning, fixed point quantization, knowledge distillation, hyperparameter searching and neural architecture search.
+
+PaddleSlim provides compression solutions for computer vision models, such as image classification, object detection and semantic segmentation. Meanwhile, PaddleSlim keeps exploring advanced compression strategies for language models. Furthermore, benchmarks of compression strategies on some open tasks are available for your reference.
+
+PaddleSlim also provides auxiliary and primitive APIs for developers and researchers to survey, implement and apply the methods in the latest papers. PaddleSlim supports developers with framework capabilities and technical consulting.
+
+## Features
+
+### Pruning
+
+ - Uniform pruning of convolution
+ - Sensitivity-based pruning
+ - Automated pruning based on an evolutionary search strategy
+ - Support pruning of various deep architectures such as VGG, ResNet, and MobileNet.
+ - Support self-defined range of pruning, i.e., layers to be pruned.
+
+### Fixed Point Quantization
+
+ - **Training aware**
+ - Dynamic strategy: During inference, we quantize models with hyperparameters dynamically estimated from small batches of samples.
+ - Static strategy: During inference, we quantize models with the same hyperparameters estimated from training data.
+ - Support layer-wise and channel-wise quantization.
+ - **Post training**
+
+### Knowledge Distillation
+
+ - **Naive knowledge distillation:** transfers dark knowledge by merging the teacher and student models into the same Program
+ - **Paddle large-scale scalable knowledge distillation framework Pantheon:** a universal solution for knowledge distillation that is more flexible than naive knowledge distillation and easier to scale to large-scale applications.
+
+ - Decouple the teacher and student models --- they run in different processes in the same or different nodes, and transfer knowledge via TCP/IP ports or local files;
+ - Friendly to assemble multiple teacher models and each of them can work in either online or offline mode independently;
+ - Merge knowledge from different teachers and make batch data for the student model automatically;
+ - Support the large-scale knowledge prediction of teacher models on multiple devices.
+
+### Neural Architecture Search
+
+ - Neural architecture search based on evolution strategy.
+ - Support distributed search.
+ - One-Shot neural architecture search.
+ - Differentiable Architecture Search.
+ - Support FLOPs and latency constrained search.
+ - Support the latency estimation on different hardware and platforms.
+
+## Performance
+
+### Image Classification
+
+Dataset: ImageNet2012; Model: MobileNetV1;
+
+|Method |Accuracy(baseline: 70.91%) |Model Size(baseline: 17.0M)|
+|:---:|:---:|:---:|
+| Knowledge Distillation(ResNet50)| **+1.06%** | |
+| Knowledge Distillation(ResNet50) + int8 quantization |**+1.10%**| **-71.76%**|
+| Pruning(FLOPs-50%) + int8 quantization|**-1.71%**|**-86.47%**|
+
+
+### Object Detection
+
+#### Dataset: Pascal VOC; Model: MobileNet-V1-YOLOv3
+
+| Method | mAP(baseline: 76.2%) | Model Size(baseline: 94MB) |
+| :---------------------: | :------------: | :------------:|
+| Knowledge Distillation(ResNet34-YOLOv3) | **+2.8%** | |
+| Pruning(FLOPs -52.88%) | **+1.4%** | **-67.76%** |
+|Knowledge Distillation(ResNet34-YOLOv3)+Pruning(FLOPs-69.57%)| **+2.6%**|**-67.00%**|
+
+
+#### Dataset: COCO; Model: MobileNet-V1-YOLOv3
+
+| Method | mAP(baseline: 29.3%) | Model Size|
+| :---------------------: | :------------: | :------:|
+| Knowledge Distillation(ResNet34-YOLOv3) | **+2.1%** |-|
+| Knowledge Distillation(ResNet34-YOLOv3)+Pruning(FLOPs-67.56%) | **-0.3%** | **-66.90%**|
+
+### NAS
+
+Dataset: ImageNet2012; Model: MobileNetV2
+
+|Device | Infer time cost | Top1 accuracy(baseline:71.90%) |
+|:---------------:|:---------:|:--------------------:|
+| RK3288 | **-23%** | +0.07% |
+| Android cellphone | **-20%** | +0.16% |
+| iPhone 6s | **-17%** | +0.32% |
diff --git a/docs/en/model_zoo_en.md b/docs/en/model_zoo_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..89a83c635561c5d251d26dd5612f9aa0f0a42917
--- /dev/null
+++ b/docs/en/model_zoo_en.md
@@ -0,0 +1,251 @@
+# Model Zoo
+
+## 1. Image Classification
+
+Dataset: ImageNet1000
+
+### 1.1 Quantization
+
+| Model | Method | Top-1/Top-5 Acc | Model Size(MB) | TensorRT latency(V100, ms) | Download |
+|:--:|:---:|:--:|:--:|:--:|:--:|
+|MobileNetV1|-|70.99%/89.68%| 17 | -| [model](http://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV1_pretrained.tar) |
+|MobileNetV1|quant_post|70.18%/89.25% (-0.81%/-0.43%)| 4.4 | - | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/MobileNetV1_quant_post.tar) |
+|MobileNetV1|quant_aware|70.60%/89.57% (-0.39%/-0.11%)| 4.4 | -| [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/MobileNetV1_quant_aware.tar) |
+| MobileNetV2 | - |72.15%/90.65%| 15 | - | [model](https://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV2_pretrained.tar) |
+| MobileNetV2 | quant_post | 71.15%/90.11% (-1%/-0.54%)| 4.0 | - | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/MobileNetV2_quant_post.tar) |
+| MobileNetV2 | quant_aware |72.05%/90.63% (-0.1%/-0.02%)| 4.0 | - | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/MobileNetV2_quant_aware.tar) |
+|ResNet50|-|76.50%/93.00%| 99 | 2.71 | [model](http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_pretrained.tar) |
+|ResNet50|quant_post|76.33%/93.02% (-0.17%/+0.02%)| 25.1| 1.19 | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/ResNet50_quant_post.tar) |
+|ResNet50|quant_aware| 76.48%/93.11% (-0.02%/+0.11%)| 25.1 | 1.17 | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/ResNet50_quant_awre.tar) |
+
+PaddleLite latency(ms)
+
+| Device | Model | Method | armv7 Thread 1 | armv7 Thread 2 | armv7 Thread 4 | armv8 Thread 1 | armv8 Thread 2 | armv8 Thread 4 |
+| ------- | ----------- | ------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- |
+| Qualcomm 835 | MobileNetV1 | FP32 baseline | 96.1942 | 53.2058 | 32.4468 | 88.4955 | 47.95 | 27.5189 |
+| Qualcomm 835 | MobileNetV1 | quant_aware | 60.8186 | 32.1931 | 16.4275 | 56.4311 | 29.5446 | 15.1053 |
+| Qualcomm 835 | MobileNetV1 | quant_post | 60.5615 | 32.4016 | 16.6596 | 56.5266 | 29.7178 | 15.1459 |
+| Qualcomm 835 | MobileNetV2 | FP32 baseline | 65.715 | 38.1346 | 25.155 | 61.3593 | 36.2038 | 22.849 |
+| Qualcomm 835 | MobileNetV2 | quant_aware | 48.3655 | 30.2021 | 21.9303 | 46.1487 | 27.3146 | 18.3053 |
+| Qualcomm 835 | MobileNetV2 | quant_post | 48.3495 | 30.3069 | 22.1506 | 45.8715 | 27.4105 | 18.2223 |
+| Qualcomm 835 | ResNet50 | FP32 baseline | 526.811 | 319.6486 | 205.8345 | 506.1138 | 335.1584 | 214.8936 |
+| Qualcomm 835 | ResNet50 | quant_aware | 475.4538 | 256.8672 | 139.699 | 461.7344 | 247.9506 | 145.9847 |
+| Qualcomm 835 | ResNet50 | quant_post | 476.0507 | 256.5963 | 139.7266 | 461.9176 | 248.3795 | 149.353 |
+| Qualcomm 855 | MobileNetV1 | FP32 baseline | 33.5086 | 19.5773 | 11.7534 | 31.3474 | 18.5382 | 10.0811 |
+| Qualcomm 855 | MobileNetV1 | quant_aware | 36.7067 | 21.628 | 11.0372 | 14.0238 | 8.199 | 4.2588 |
+| Qualcomm 855 | MobileNetV1 | quant_post | 37.0498 | 21.7081 | 11.0779 | 14.0947 | 8.1926 | 4.2934 |
+| Qualcomm 855 | MobileNetV2 | FP32 baseline | 25.0396 | 15.2862 | 9.6609 | 22.909 | 14.1797 | 8.8325 |
+| Qualcomm 855 | MobileNetV2 | quant_aware | 28.1583 | 18.3317 | 11.8103 | 16.9158 | 11.1606 | 7.4148 |
+| Qualcomm 855 | MobileNetV2 | quant_post | 28.1631 | 18.3917 | 11.8333 | 16.9399 | 11.1772 | 7.4176 |
+| Qualcomm 855 | ResNet50 | FP32 baseline | 185.3705 | 113.0825 | 87.0741 | 177.7367 | 110.0433 | 74.4114 |
+| Qualcomm 855 | ResNet50 | quant_aware | 327.6883 | 202.4536 | 106.243 | 243.5621 | 150.0542 | 78.4205 |
+| Qualcomm 855 | ResNet50 | quant_post | 328.2683 | 201.9937 | 106.744 | 242.6397 | 150.0338 | 79.8659 |
+| Kirin 970 | MobileNetV1 | FP32 baseline | 101.2455 | 56.4053 | 35.6484 | 94.8985 | 51.7251 | 31.9511 |
+| Kirin 970 | MobileNetV1 | quant_aware | 62.5012 | 32.1863 | 16.6018 | 57.7477 | 29.2116 | 15.0703 |
+| Kirin 970 | MobileNetV1 | quant_post | 62.4412 | 32.2585 | 16.6215 | 57.825 | 29.2573 | 15.1206 |
+| Kirin 970 | MobileNetV2 | FP32 baseline | 70.4176 | 42.0795 | 25.1939 | 68.9597 | 39.2145 | 22.6617 |
+| Kirin 970 | MobileNetV2 | quant_aware | 52.9961 | 31.5323 | 22.1447 | 49.4858 | 28.0856 | 18.7287 |
+| Kirin 970 | MobileNetV2 | quant_post | 53.0961 | 31.7987 | 21.8334 | 49.383 | 28.2358 | 18.3642 |
+| Kirin 970 | ResNet50 | FP32 baseline | 586.8943 | 344.0858 | 228.2293 | 573.3344 | 351.4332 | 225.8006 |
+| Kirin 970 | ResNet50 | quant_aware | 488.361 | 260.1697 | 142.416 | 479.5668 | 249.8485 | 138.1742 |
+| Kirin 970 | ResNet50 | quant_post | 489.6188 | 258.3279 | 142.6063 | 480.0064 | 249.5339 | 138.5284 |
+
+### 1.2 Pruning
+
+PaddleLite:
+
+env: Qualcomm SnapDragon 845 + armv8
+
+criterion: time cost in Thread1/Thread2/Thread4
+
+PaddleLite version: v2.3
+
+
+|Model | Method | Top-1/Top-5 Acc | ModelSize(MB) | GFLOPs |PaddleLite cost(ms)|TensorRT speed(FPS)| download |
+|:--:|:---:|:--:|:--:|:--:|:--:|:--:|:--:|
+| MobileNetV1 | Baseline | 70.99%/89.68% | 17 | 1.11 |66.052\35.8014\19.5762|-| [download](http://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV1_pretrained.tar) |
+| MobileNetV1 | uniform -50% | 69.4%/88.66% (-1.59%/-1.02%) | 9 | 0.56 | 33.5636\18.6834\10.5076|-|[download](https://paddlemodels.bj.bcebos.com/PaddleSlim/MobileNetV1_uniform-50.tar) |
+| MobileNetV1 | sensitive -30% | 70.4%/89.3% (-0.59%/-0.38%) | 12 | 0.74 | 46.5958\25.3098\13.6982|-|[download](https://paddlemodels.bj.bcebos.com/PaddleSlim/MobileNetV1_sensitive-30.tar) |
+| MobileNetV1 | sensitive -50% | 69.8% / 88.9% (-1.19%/-0.78%) | 9 | 0.56 |37.9892\20.7882\11.3144|-| [download](https://paddlemodels.bj.bcebos.com/PaddleSlim/MobileNetV1_sensitive-50.tar) |
+| MobileNetV2 | - | 72.15%/90.65% | 15 | 0.59 |41.7874\23.375\13.3998|-| [download](https://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV2_pretrained.tar) |
+| MobileNetV2 | uniform -50% | 65.79%/86.11% (-6.35%/-4.47%) | 11 | 0.296 |23.8842\13.8698\8.5572|-| [download](https://paddlemodels.bj.bcebos.com/PaddleSlim/MobileNetV2_uniform-50.tar) |
+| ResNet34 | - | 74.57%/92.14% | 84 | 7.36 |217.808\139.943\96.7504|342.32| [download](https://paddle-imagenet-models-name.bj.bcebos.com/ResNet34_pretrained.tar) |
+| ResNet34 | uniform -50% | 70.99%/89.95% (-3.58%/-2.19%) | 41 | 3.67 |114.787\75.0332\51.8438|452.41| [download](https://paddlemodels.bj.bcebos.com/PaddleSlim/ResNet34_uniform-50.tar) |
+| ResNet34 | auto -55.05% | 70.24%/89.63% (-4.33%/-2.51%) | 33 | 3.31 |105.924\69.3222\48.0246|457.25| [download](https://paddlemodels.bj.bcebos.com/PaddleSlim/ResNet34_auto-55.tar) |
+
+### 1.3 Distillation
+
+| Model | Method | Top-1/Top-5 Acc | Model Size(MB) | Download |
+|:--:|:---:|:--:|:--:|:--:|
+| MobileNetV1 | student | 70.99%/89.68% | 17 | [model](http://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV1_pretrained.tar) |
+|ResNet50_vd|teacher|79.12%/94.44%| 99 | [model](https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_pretrained.tar) |
+|MobileNetV1|ResNet50_vd[1](#trans1) distill|72.77%/90.68% (+1.78%/+1.00%)| 17 | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/MobileNetV1_distilled.tar) |
+| MobileNetV2 | student | 72.15%/90.65% | 15 | [model](https://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV2_pretrained.tar) |
+| MobileNetV2 | ResNet50_vd distill | 74.28%/91.53% (+2.13%/+0.88%) | 15 | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/MobileNetV2_distilled.tar) |
+| ResNet50 | student | 76.50%/93.00% | 99 | [model](http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_pretrained.tar) |
+|ResNet101|teacher|77.56%/93.64%| 173 | [model](http://paddle-imagenet-models-name.bj.bcebos.com/ResNet101_pretrained.tar) |
+| ResNet50 | ResNet101 distill | 77.29%/93.65% (+0.79%/+0.65%) | 99 | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/ResNet50_distilled.tar) |
+
+Note: The `_vd` suffix indicates that the pre-trained model uses Mixup. Please refer to the detailed introduction: [mixup: Beyond Empirical Risk Minimization](https://arxiv.org/abs/1710.09412)
+
+
+### 1.4 NAS
+
+| Model | Method | Top-1/Top-5 Acc | Volume(MB) | GFLOPs | Download |
+|:--:|:---:|:--:|:--:|:--:|:--:|
+| MobileNetV2 | - | 72.15%/90.65% | 15 | 0.59 | [model](https://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV2_pretrained.tar) |
+| MobileNetV2_NAS | SANAS | 71.518%/90.208% (-0.632%/-0.442%) | 14 | 0.295 | [model](https://paddlemodels.cdn.bcebos.com/PaddleSlim/MobileNetV2_sanas.tar) |
+
+Dataset: Cifar10
+
+| Model | Method | Acc | Params(MB) | Download |
+|:---:|:--:|:--:|:--:|:--:|
+| Darts | - | 97.135% | 3.767 | - |
+| Darts_SA(Based on Darts) | SANAS | 97.276%(+0.141%) | 3.344(-11.2%) | - |
+
+Note: The token of MobileNetV2_NAS is [4, 4, 5, 1, 1, 2, 1, 1, 0, 2, 6, 2, 0, 3, 4, 5, 0, 4, 5, 5, 1, 4, 8, 0, 0]. The token of Darts_SA is [5, 5, 0, 5, 5, 10, 7, 7, 5, 7, 7, 11, 10, 12, 10, 0, 5, 3, 10, 8].
+
+
+## 2. Object Detection
+
+### 2.1 Quantization
+
+Dataset: COCO 2017
+
+| Model | Method | Dataset | Image/GPU | Input 608 Box AP | Input 416 Box AP | Input 320 Box AP | Model Size(MB) | TensorRT latency(V100, ms) | Download |
+| :----------------------------: | :---------: | :----: | :-------: | :------------: | :------------: | :------------: | :------------: | :----------: |:----------: |
+| MobileNet-V1-YOLOv3 | - | COCO | 8 | 29.3 | 29.3 | 27.1 | 95 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) |
+| MobileNet-V1-YOLOv3 | quant_post | COCO | 8 | 27.9 (-1.4)| 28.0 (-1.3) | 26.0 (-1.0) | 25 | - | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_coco_quant_post.tar) |
+| MobileNet-V1-YOLOv3 | quant_aware | COCO | 8 | 28.1 (-1.2)| 28.2 (-1.1) | 25.8 (-1.2) | 26.3 | - | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenet_coco_quant_aware.tar) |
+| R34-YOLOv3 | - | COCO | 8 | 36.2 | 34.3 | 31.4 | 162 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34.tar) |
+| R34-YOLOv3 | quant_post | COCO | 8 | 35.7 (-0.5) | - | - | 42.7 | - | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r34_coco_quant_post.tar) |
+| R34-YOLOv3 | quant_aware | COCO | 8 | 35.2 (-1.0) | 33.3 (-1.0) | 30.3 (-1.1)| 44 | - | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r34_coco_quant_aware.tar) |
+| R50-dcn-YOLOv3 obj365_pretrain | - | COCO | 8 | 41.4 | - | - | 177 | 18.56 |[model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r50vd_dcn_obj365_pretrained_coco.tar) |
+| R50-dcn-YOLOv3 obj365_pretrain | quant_aware | COCO | 8 | 40.6 (-0.8) | 37.5 | 34.1 | 66 | 14.64 | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r50vd_dcn_obj365_pretrained_coco_quant_aware.tar) |
+
+
+
+Dataset: WIDER-FACE
+
+
+
+| Model | Method | Image/GPU | Input Size | Easy/Medium/Hard | Model Size(KB) | Download |
+| :------------: | :---------: | :-------: | :--------: | :-----------------------------: | :--------------: | :----------------------------------------------------------: |
+| BlazeFace | - | 8 | 640 | 91.5/89.2/79.7 | 815 | [model](https://paddlemodels.bj.bcebos.com/object_detection/blazeface_original.tar) |
+| BlazeFace | quant_post | 8 | 640 | 87.8/85.1/74.9 (-3.7/-4.1/-4.8) | 228 | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/blazeface_origin_quant_post.tar) |
+| BlazeFace | quant_aware | 8 | 640 | 90.5/87.9/77.6 (-1.0/-1.3/-2.1) | 228 | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/blazeface_origin_quant_aware.tar) |
+| BlazeFace-Lite | - | 8 | 640 | 90.9/88.5/78.1 | 711 | [model](https://paddlemodels.bj.bcebos.com/object_detection/blazeface_lite.tar) |
+| BlazeFace-Lite | quant_post | 8 | 640 | 89.4/86.7/75.7 (-1.5/-1.8/-2.4) | 211 | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/blazeface_lite_quant_post.tar) |
+| BlazeFace-Lite | quant_aware | 8 | 640 | 89.7/87.3/77.0 (-1.2/-1.2/-1.1) | 211 | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/blazeface_lite_quant_aware.tar) |
+| BlazeFace-NAS | - | 8 | 640 | 83.7/80.7/65.8 | 244 | [model](https://paddlemodels.bj.bcebos.com/object_detection/blazeface_nas.tar) |
+| BlazeFace-NAS | quant_post | 8 | 640 | 81.6/78.3/63.6 (-2.1/-2.4/-2.2) | 71 | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/blazeface_nas_quant_post.tar) |
+| BlazeFace-NAS | quant_aware | 8 | 640 | 83.1/79.7/64.2 (-0.6/-1.0/-1.6) | 71 | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/blazeface_nas_quant_aware.tar) |
+
+### 2.2 Pruning
+
+Dataset: Pascal VOC & COCO 2017
+
+PaddleLite:
+
+env: Qualcomm SnapDragon 845 + armv8
+
+criterion: time cost in Thread1/Thread2/Thread4
+
+PaddleLite version: v2.3
+
+| Model | Method | Dataset | Image/GPU | Input 608 Box AP | Input 416 Box AP | Input 320 Box AP | Model Size(MB) | GFLOPs (608*608) | PaddleLite cost(ms)(608*608) | TensorRT speed(FPS)(608*608) | Download |
+| :----------------------------: | :---------------: | :--------: | :-------: | :--------------: | :--------------: | :--------------: | :------------: | :--------------: | :--------------: | :--------------: | :----------------------------: |
+| MobileNet-V1-YOLOv3 | Baseline | Pascal VOC | 8 | 76.2 | 76.7 | 75.3 | 94 | 40.49 | 1238\796.943\520.101 |60.40| [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) |
+| MobileNet-V1-YOLOv3 | sensitive -52.88% | Pascal VOC | 8 | 77.6 (+1.4) | 77.7 (+1.0) | 75.5 (+0.2) | 31 | 19.08 | 602.497\353.759\222.427 |99.36| [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenet_v1_voc_prune.tar) |
+| MobileNet-V1-YOLOv3 | - | COCO | 8 | 29.3 | 29.3 | 27.0 | 95 | 41.35 |-|-| [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) |
+| MobileNet-V1-YOLOv3 | sensitive -51.77% | COCO | 8 | 26.0 (-3.3) | 25.1 (-4.2) | 22.6 (-4.4) | 32 | 19.94 |-|73.93| [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenet_v1_prune.tar) |
+| R50-dcn-YOLOv3 | - | COCO | 8 | 39.1 | - | - | 177 | 89.60 |-|27.68| [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r50vd_dcn.tar) |
+| R50-dcn-YOLOv3 | sensitive -9.37% | COCO | 8 | 39.3 (+0.2) | - | - | 150 | 81.20 |-|30.08| [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r50vd_dcn_prune.tar) |
+| R50-dcn-YOLOv3 | sensitive -24.68% | COCO | 8 | 37.3 (-1.8) | - | - | 113 | 67.48 |-|34.32| [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r50vd_dcn_prune578.tar) |
+| R50-dcn-YOLOv3 obj365_pretrain | - | COCO | 8 | 41.4 | - | - | 177 | 89.60 |-|-| [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r50vd_dcn_obj365_pretrained_coco.tar) |
+| R50-dcn-YOLOv3 obj365_pretrain | sensitive -9.37% | COCO | 8 | 40.5 (-0.9) | - | - | 150 | 81.20 |-|-| [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r50vd_dcn_obj365_pretrained_coco_prune.tar) |
+| R50-dcn-YOLOv3 obj365_pretrain | sensitive -24.68% | COCO | 8 | 37.8 (-3.3) | - | - | 113 | 67.48 |-|-| [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r50vd_dcn_obj365_pretrained_coco_prune578.tar) |
+
+### 2.3 Distillation
+
+Dataset: Pascal VOC & COCO 2017
+
+
+| Model | Method | Dataset | Image/GPU | Input 608 Box AP | Input 416 Box AP | Input 320 Box AP | Model Size(MB) | Download |
+| :-----------------: | :---------------------: | :--------: | :-------: | :--------------: | :--------------: | :--------------: | :--------------: | :----------------------------------------------------------: |
+| MobileNet-V1-YOLOv3 | - | Pascal VOC | 8 | 76.2 | 76.7 | 75.3 | 94 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) |
+| ResNet34-YOLOv3 | - | Pascal VOC | 8 | 82.6 | 81.9 | 80.1 | 162 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34_voc.tar) |
+| MobileNet-V1-YOLOv3 | ResNet34-YOLOv3 distill | Pascal VOC | 8 | 79.0 (+2.8) | 78.2 (+1.5) | 75.5 (+0.2) | 94 | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_voc_distilled.tar) |
+| MobileNet-V1-YOLOv3 | - | COCO | 8 | 29.3 | 29.3 | 27.0 | 95 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) |
+| ResNet34-YOLOv3 | - | COCO | 8 | 36.2 | 34.3 | 31.4 | 163 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34.tar) |
+| MobileNet-V1-YOLOv3 | ResNet34-YOLOv3 distill | COCO | 8 | 31.4 (+2.1) | 30.0 (+0.7) | 27.1 (+0.1) | 95 | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_coco_distilled.tar) |
+
+
+### 2.4 NAS
+
+Dataset: WIDER-FACE
+
+| Model | Method | Image/GPU | Input size | Easy/Medium/Hard | volume(KB) | latency(ms)| Download |
+| :------------: | :---------: | :-------: | :------: | :-----------------------------: | :------------: | :------------: | :----------------------------------------------------------: |
+| BlazeFace | - | 8 | 640 | 91.5/89.2/79.7 | 815 | 71.862 | [model](https://paddlemodels.bj.bcebos.com/object_detection/blazeface_original.tar) |
+| BlazeFace-NAS | - | 8 | 640 | 83.7/80.7/65.8 | 244 | 21.117 |[model](https://paddlemodels.bj.bcebos.com/object_detection/blazeface_nas.tar) |
+| BlazeFace-NASV2 | SANAS | 8 | 640 | 87.0/83.7/68.5 | 389 | 22.558 | [model](https://paddlemodels.bj.bcebos.com/object_detection/blazeface_nas2.tar) |
+
+Note: latency is based on latency_855.txt; the file was measured on the Qualcomm 855 with PaddleLite. The config of BlazeFace-NASV2 is [here](https://github.com/PaddlePaddle/PaddleDetection/blob/master/configs/face_detection/blazeface_nas_v2.yml).
+
+
+## 3. Image Segmentation
+Dataset: Cityscapes
+
+### 3.1 Quantization
+
+| Model | Method | mIoU | Model Size(MB) | Download |
+| :--------------------: | :---------: | :-----------: | :--------------: | :----------------------------------------------------------: |
+| DeepLabv3+/MobileNetv1 | - | 63.26 | 6.6 | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/deeplabv3_mobilenetv1.tar ) |
+| DeepLabv3+/MobileNetv1 | quant_post | 58.63 (-4.63) | 1.8 | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/deeplabv3_mobilenetv1_2049x1025_quant_post.tar) |
+| DeepLabv3+/MobileNetv1 | quant_aware | 62.03 (-1.23) | 1.8 | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/deeplabv3_mobilenetv1_2049x1025_quant_aware.tar) |
+| DeepLabv3+/MobileNetv2 | - | 69.81 | 7.4 | [model](https://paddleseg.bj.bcebos.com/models/mobilenet_cityscapes.tgz) |
+| DeepLabv3+/MobileNetv2 | quant_post | 67.59 (-2.22) | 2.1 | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/deeplabv3_mobilenetv2_2049x1025_quant_post.tar) |
+| DeepLabv3+/MobileNetv2 | quant_aware | 68.33 (-1.48) | 2.1 | [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/deeplabv3_mobilenetv2_2049x1025_quant_aware.tar) |
+
+Image segmentation model PaddleLite latency (ms), input size 769x769
+
+| Device | Model | Method | armv7 Thread 1 | armv7 Thread 2 | armv7 Thread 4 | armv8 Thread 1 | armv8 Thread 2 | armv8 Thread 4 |
+| ------------ | ---------------------- | ------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- |
+| Qualcomm 835 | Deeplabv3- MobileNetV1 | FP32 baseline | 1227.9894 | 734.1922 | 527.9592 | 1109.96 | 699.3818 | 479.0818 |
+| Qualcomm 835 | Deeplabv3- MobileNetV1 | quant_aware | 848.6544 | 512.785 | 382.9915 | 752.3573 | 455.0901 | 307.8808 |
+| Qualcomm 835 | Deeplabv3- MobileNetV1 | quant_post | 840.2323 | 510.103 | 371.9315 | 748.9401 | 452.1745 | 309.2084 |
+| Qualcomm 835 | Deeplabv3-MobileNetV2 | FP32 baseline | 1282.8126 | 793.2064 | 653.6538 | 1193.9908 | 737.1827 | 593.4522 |
+| Qualcomm 835 | Deeplabv3-MobileNetV2 | quant_aware | 976.0495 | 659.0541 | 513.4279 | 892.1468 | 582.9847 | 484.7512 |
+| Qualcomm 835 | Deeplabv3-MobileNetV2 | quant_post | 981.44 | 658.4969 | 538.6166 | 885.3273 | 586.1284 | 484.0018 |
+| Qualcomm 855 | Deeplabv3- MobileNetV1 | FP32 baseline | 568.8748 | 339.8578 | 278.6316 | 420.6031 | 281.3197 | 217.5222 |
+| Qualcomm 855 | Deeplabv3- MobileNetV1 | quant_aware | 608.7578 | 347.2087 | 260.653 | 241.2394 | 177.3456 | 143.9178 |
+| Qualcomm 855 | Deeplabv3- MobileNetV1 | quant_post | 609.0142 | 347.3784 | 259.9825 | 239.4103 | 180.1894 | 139.9178 |
+| Qualcomm 855 | Deeplabv3-MobileNetV2 | FP32 baseline | 639.4425 | 390.1851 | 322.7014 | 477.7667 | 339.7411 | 262.2847 |
+| Qualcomm 855 | Deeplabv3-MobileNetV2 | quant_aware | 703.7275 | 497.689 | 417.1296 | 394.3586 | 300.2503 | 239.9204 |
+| Qualcomm 855 | Deeplabv3-MobileNetV2 | quant_post | 705.7589 | 474.4076 | 427.2951 | 394.8352 | 297.4035 | 264.6724 |
+| Kirin 970 | Deeplabv3- MobileNetV1 | FP32 baseline | 1682.1792 | 1437.9774 | 1181.0246 | 1261.6739 | 1068.6537 | 690.8225 |
+| Kirin 970 | Deeplabv3- MobileNetV1 | quant_aware | 1062.3394 | 1248.1014 | 878.3157 | 774.6356 | 710.6277 | 528.5376 |
+| Kirin 970 | Deeplabv3- MobileNetV1 | quant_post | 1109.1917 | 1339.6218 | 866.3587 | 771.5164 | 716.5255 | 500.6497 |
+| Kirin 970 | Deeplabv3-MobileNetV2 | FP32 baseline | 1771.1301 | 1746.0569 | 1222.4805 | 1448.9739 | 1192.4491 | 760.606 |
+| Kirin 970 | Deeplabv3-MobileNetV2 | quant_aware | 1320.2905 | 921.4522 | 676.0732 | 1145.8801 | 821.5685 | 590.1713 |
+| Kirin 970 | Deeplabv3-MobileNetV2 | quant_post | 1320.386 | 918.5328 | 672.2481 | 1020.753 | 820.094 | 591.4114 |
+
+
+
+
+
+### 3.2 Pruning
+
+PaddleLite:
+
+env: Qualcomm SnapDragon 845 + armv8
+
+criterion: time cost in Thread1/Thread2/Thread4
+
+PaddleLite version: v2.3
+
+| Model | Method | mIoU | Model Size(MB) | GFLOPs | PaddleLite cost(ms) | TensorRT speed(FPS) | Download |
+| :-------: | :---------------: | :-----------: | :--------------: | :----: | :--------------: | :----: | :-------------------: |
+| fast-scnn | baseline | 69.64 | 11 | 14.41 | 1226.36\682.96\415.664 |39.53| [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/fast_scnn_cityscape.tar) |
+| fast-scnn | uniform -17.07% | 69.58 (-0.06) | 8.5 | 11.95 | 1140.37\656.612\415.888 |42.01| [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/fast_scnn_cityscape_uniform-17.tar) |
+| fast-scnn | sensitive -47.60% | 66.68 (-2.96) | 5.7 | 7.55 | 866.693\494.467\291.748 |51.48| [model](https://paddlemodels.bj.bcebos.com/PaddleSlim/fast_scnn_cityscape_sensitive-47.tar) |
diff --git a/docs/en/quick_start/distillation_tutorial_en.md b/docs/en/quick_start/distillation_tutorial_en.md
new file mode 100755
index 0000000000000000000000000000000000000000..7fb410655fe086fcee84e1f84a49c1e38609cde1
--- /dev/null
+++ b/docs/en/quick_start/distillation_tutorial_en.md
@@ -0,0 +1,115 @@
+# Knowledge Distillation for Image Classification
+
+In this tutorial, you will learn how to use the knowledge distillation API of PaddleSlim
+through a demo of a MobileNetV1 model on the MNIST dataset. The tutorial follows this workflow:
+
+1. Import dependency
+2. Define student_program and teacher_program
+3. Select feature maps
+4. Merge program and add distillation loss
+5. Train distillation model
+
+## 1. Import dependency
+
+PaddleSlim depends on Paddle 1.7. Please ensure that you have installed Paddle correctly. Import Paddle and PaddleSlim as below:
+
+```python
+import paddle
+import paddle.fluid as fluid
+import paddleslim as slim
+```
+
+## 2. Define student_program and teacher_program
+
+This tutorial trains and verifies the distillation model on the MNIST dataset. The input image shape is `[1, 28, 28]` and the number of output categories is 10.
+Select `ResNet50` as the teacher to perform distillation training on a student of the `MobileNet` architecture.
+
+```python
+model = slim.models.MobileNet()
+student_program = fluid.Program()
+student_startup = fluid.Program()
+with fluid.program_guard(student_program, student_startup):
+ image = fluid.data(
+ name='image', shape=[None] + [1, 28, 28], dtype='float32')
+ label = fluid.data(name='label', shape=[None, 1], dtype='int64')
+ out = model.net(input=image, class_dim=10)
+ cost = fluid.layers.cross_entropy(input=out, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+ acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+ acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+```
+
+
+
+```python
+teacher_model = slim.models.ResNet50()
+teacher_program = fluid.Program()
+teacher_startup = fluid.Program()
+with fluid.program_guard(teacher_program, teacher_startup):
+ with fluid.unique_name.guard():
+ image = fluid.data(
+ name='image', shape=[None] + [1, 28, 28], dtype='float32')
+ predict = teacher_model.net(image, class_dim=10)
+exe = fluid.Executor(fluid.CPUPlace())
+exe.run(teacher_startup)
+```
+
+## 3. Select feature maps
+
+We can use student_program's list_vars method to enumerate all of its Variables, and select one or more of them to fit against the corresponding variables of the teacher.
+
+```python
+# get all student variables
+student_vars = []
+for v in student_program.list_vars():
+ student_vars.append((v.name, v.shape))
+#uncomment the following lines to observe student's variables for distillation
+#print("="*50+"student_model_vars"+"="*50)
+#print(student_vars)
+
+# get all teacher variables
+teacher_vars = []
+for v in teacher_program.list_vars():
+ teacher_vars.append((v.name, v.shape))
+#uncomment the following lines to observe teacher's variables for distillation
+#print("="*50+"teacher_model_vars"+"="*50)
+#print(teacher_vars)
+```
+
+We can see that 'bn5c_branch2b.output.1.tmp_3' in the teacher_program and 'depthwise_conv2d_11.tmp_0' in the student_program have the same shape and can form the distillation loss function.
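+
+A quick way to confirm such a pair (an illustrative check; the two variable names are taken from the printouts above) is to compare the shapes collected in Step 3 directly:
+
+```python
+student_shapes = dict(student_vars)
+teacher_shapes = dict(teacher_vars)
+# the two feature maps must have identical shapes to form an l2 distillation loss
+assert student_shapes['depthwise_conv2d_11.tmp_0'] == teacher_shapes['bn5c_branch2b.output.1.tmp_3']
+```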
+
+## 4. Merge program and add distillation loss
+The merge operation adds all Variables and Ops in teacher_program to student_program. To avoid naming conflicts between variables with the same name in the two programs, merge also adds a unified naming prefix **name_prefix** to the Variables in teacher_program; its default value is 'teacher_'.
+
+To ensure that the teacher network and the student network see the same data, the merge operation also merges the input data layers of the two programs, so you need to specify a data layer name mapping ***data_name_map***, where the key is the teacher's input data name and the value is the student's.
+
+```python
+data_name_map = {'image': 'image'}
+slim.dist.merge(teacher_program, student_program, data_name_map, fluid.CPUPlace())
+with fluid.program_guard(student_program, student_startup):
+ l2_loss = slim.dist.l2_loss('teacher_bn5c_branch2b.output.1.tmp_3', 'depthwise_conv2d_11.tmp_0', student_program)
+ loss = l2_loss + avg_cost
+ opt = fluid.optimizer.Momentum(0.01, 0.9)
+ opt.minimize(loss)
+exe.run(student_startup)
+```
+
+## 5. Train distillation model
+
+The package `paddle.dataset.mnist` of Paddle defines the downloading and reading of the MNIST dataset.
+Define the training data reader as below:
+
+```python
+train_reader = paddle.fluid.io.batch(
+ paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
+train_feeder = fluid.DataFeeder(['image', 'label'], fluid.CPUPlace(), student_program)
+```
+
+Execute the following code to train for one epoch:
+
+
+```python
+for data in train_reader():
+ acc1, acc5, loss_np = exe.run(student_program, feed=train_feeder.feed(data), fetch_list=[acc_top1.name, acc_top5.name, loss.name])
+ print("Acc1: {:.6f}, Acc5: {:.6f}, Loss: {:.6f}".format(acc1.mean(), acc5.mean(), loss_np.mean()))
+```
diff --git a/docs/en/quick_start/index_en.rst b/docs/en/quick_start/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..bd4d8ec1907bd0d15eb055f86b4aa6032a3befa9
--- /dev/null
+++ b/docs/en/quick_start/index_en.rst
@@ -0,0 +1,12 @@
+
+Quick Start
+===========
+
+.. toctree::
+ :maxdepth: 1
+
+ pruning_tutorial_en.md
+ nas_tutorial_en.md
+ quant_aware_tutorial_en.md
+ quant_post_static_tutorial_en.md
+
diff --git a/docs/en/quick_start/nas_tutorial_en.md b/docs/en/quick_start/nas_tutorial_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..040f46530cd61c815f633fdf478b368c0d88e07e
--- /dev/null
+++ b/docs/en/quick_start/nas_tutorial_en.md
@@ -0,0 +1,155 @@
+# Neural Architecture Search for Image Classification
+
+This tutorial shows how to use the SANAS [API](../api/nas_api.md) in PaddleSlim. We use a search space based on MobileNetV2 as the example. The tutorial contains the following sections:
+
+1. necessary imports
+2. initialize the SANAS instance
+3. define a function that builds the programs
+4. define a function that provides input data
+5. define a function for training
+6. define a function for evaluation
+7. start the search
+    7.1 fetch a model architecture
+    7.2 build the programs
+    7.3 define input data
+    7.4 train the model
+    7.5 evaluate the model
+    7.6 return the score
+8. full example
+
+
+The following sections describe each step in order.
+
+## 1. import dependency
+Please make sure that you have installed Paddle correctly, then do the necessary imports.
+```python
+import paddle
+import paddle.fluid as fluid
+import paddleslim as slim
+import numpy as np
+```
+
+## 2. initialize the SANAS instance
+```python
+sanas = slim.nas.SANAS(configs=[('MobileNetV2Space')], server_addr=("", 8337), save_checkpoint=None)
+```
+
+## 3. define a function that builds the programs
+Build the training and evaluation programs according to the model architecture.
+```python
+def build_program(archs):
+ train_program = fluid.Program()
+ startup_program = fluid.Program()
+ with fluid.program_guard(train_program, startup_program):
+ data = fluid.data(name='data', shape=[None, 3, 32, 32], dtype='float32')
+ label = fluid.data(name='label', shape=[None, 1], dtype='int64')
+ output = archs(data)
+ output = fluid.layers.fc(input=output, size=10)
+
+ softmax_out = fluid.layers.softmax(input=output, use_cudnn=False)
+ cost = fluid.layers.cross_entropy(input=softmax_out, label=label)
+ avg_cost = fluid.layers.mean(cost)
+ acc_top1 = fluid.layers.accuracy(input=softmax_out, label=label, k=1)
+ acc_top5 = fluid.layers.accuracy(input=softmax_out, label=label, k=5)
+ test_program = fluid.default_main_program().clone(for_test=True)
+
+ optimizer = fluid.optimizer.Adam(learning_rate=0.1)
+ optimizer.minimize(avg_cost)
+
+ place = fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(startup_program)
+ return exe, train_program, test_program, (data, label), avg_cost, acc_top1, acc_top5
+```
+
+## 4. define a function that provides input data
+The dataset we use is cifar10; `paddle.dataset.cifar` in Paddle handles downloading and reading the cifar data.
+```python
+def input_data(inputs):
+ train_reader = paddle.fluid.io.batch(paddle.reader.shuffle(paddle.dataset.cifar.train10(cycle=False), buf_size=1024),batch_size=256)
+ train_feeder = fluid.DataFeeder(inputs, fluid.CPUPlace())
+ eval_reader = paddle.fluid.io.batch(paddle.dataset.cifar.test10(cycle=False), batch_size=256)
+ eval_feeder = fluid.DataFeeder(inputs, fluid.CPUPlace())
+ return train_reader, train_feeder, eval_reader, eval_feeder
+```
+
+## 5. define a function for training
+Start training.
+```python
+def start_train(program, data_reader, data_feeder):
+ outputs = [avg_cost.name, acc_top1.name, acc_top5.name]
+ for data in data_reader():
+ batch_reward = exe.run(program, feed=data_feeder.feed(data), fetch_list = outputs)
+ print("TRAIN: loss: {}, acc1: {}, acc5:{}".format(batch_reward[0], batch_reward[1], batch_reward[2]))
+```
+
+## 6. define a function for evaluation
+Start evaluating.
+```python
+def start_eval(program, data_reader, data_feeder):
+ reward = []
+ outputs = [avg_cost.name, acc_top1.name, acc_top5.name]
+ for data in data_reader():
+ batch_reward = exe.run(program, feed=data_feeder.feed(data), fetch_list = outputs)
+ reward_avg = np.mean(np.array(batch_reward), axis=1)
+ reward.append(reward_avg)
+ print("TEST: loss: {}, acc1: {}, acc5:{}".format(batch_reward[0], batch_reward[1], batch_reward[2]))
+ finally_reward = np.mean(np.array(reward), axis=0)
+ print("FINAL TEST: avg_cost: {}, acc1: {}, acc5: {}".format(finally_reward[0], finally_reward[1], finally_reward[2]))
+ return finally_reward
+```
+
+## 7. start the search
+The following steps describe how to get the current model architecture and what to do after obtaining it. If you want to run the full example directly, please jump to Step 8.
+
+### 7.1 fetch a model architecture
+Get the next model architecture with `next_archs()`.
+```python
+archs = sanas.next_archs()[0]
+```
+
+### 7.2 build the programs
+Build the programs with the function from Step 3 and the model architecture from Step 7.1.
+```python
+exe, train_program, eval_program, inputs, avg_cost, acc_top1, acc_top5 = build_program(archs)
+```
+
+### 7.3 define input data
+```python
+train_reader, train_feeder, eval_reader, eval_feeder = input_data(inputs)
+```
+
+### 7.4 train the model
+Start training with the training program and data.
+```python
+start_train(train_program, train_reader, train_feeder)
+```
+### 7.5 evaluate the model
+Start evaluation with the evaluation program and data.
+```python
+finally_reward = start_eval(eval_program, eval_reader, eval_feeder)
+```
+### 7.6 return the score
+```python
+sanas.reward(float(finally_reward[1]))
+```
+
+## 8. full example
+The following is a full example of neural architecture search. It uses FLOPs as the constraint and runs 3 search steps, i.e. it trains 3 model architectures that satisfy the constraint, training each architecture for 7 epochs.
+```python
+for step in range(3):
+ archs = sanas.next_archs()[0]
+    exe, train_program, eval_program, inputs, avg_cost, acc_top1, acc_top5 = build_program(archs)
+ train_reader, train_feeder, eval_reader, eval_feeder = input_data(inputs)
+
+ current_flops = slim.analysis.flops(train_program)
+ if current_flops > 321208544:
+ continue
+
+ for epoch in range(7):
+ start_train(train_program, train_reader, train_feeder)
+
+ finally_reward = start_eval(eval_program, eval_reader, eval_feeder)
+
+ sanas.reward(float(finally_reward[1]))
+```
diff --git a/docs/en/quick_start/pruning_tutorial_en.md b/docs/en/quick_start/pruning_tutorial_en.md
new file mode 100755
index 0000000000000000000000000000000000000000..9107a38b58255f08ce3da78fa18274c05d844604
--- /dev/null
+++ b/docs/en/quick_start/pruning_tutorial_en.md
@@ -0,0 +1,90 @@
+# Channel Pruning for Image Classification
+
+In this tutorial, you will learn how to use the channel pruning API of PaddleSlim
+through a demo of a MobileNetV1 model on the MNIST dataset. The tutorial follows this workflow:
+
+1. Import dependency
+2. Build model
+3. Prune model
+4. Train pruned model
+
+## 1. Import dependency
+
+PaddleSlim depends on Paddle 1.7. Please ensure that you have installed Paddle correctly. Import Paddle and PaddleSlim as below:
+
+```python
+import paddle
+import paddle.fluid as fluid
+import paddleslim as slim
+```
+
+## 2. Build Model
+
+This section builds a classification model based on `MobileNetV1` for the MNIST task. The shape of the input is `[1, 28, 28]` and the number of output classes is 10.
+
+To keep the code simple, we use a predefined function in the package `paddleslim.models` to build the classification model.
+Execute the following code to build the model:
+
+```python
+exe, train_program, val_program, inputs, outputs = \
+    slim.models.image_classification("MobileNet", [1, 28, 28], 10, use_gpu=False)
+```
+
+>Note: The functions in paddleslim.models are only used in tutorials and demos.
+
+## 3. Prune model
+
+### 3.1 Compute FLOPs before pruning
+
+```python
+FLOPs = slim.analysis.flops(train_program)
+print("FLOPs: {}".format(FLOPs))
+```
+
+### 3.2 Pruning
+
+This section prunes the parameters named `conv2_1_sep_weights` and `conv2_2_sep_weights`, each by a ratio of 33% (the `ratios` argument below).
+
+```python
+pruner = slim.prune.Pruner()
+pruned_program, _, _ = pruner.prune(
+ train_program,
+ fluid.global_scope(),
+ params=["conv2_1_sep_weights", "conv2_2_sep_weights"],
+ ratios=[0.33] * 2,
+ place=fluid.CPUPlace())
+```
+
+It changes the shapes of the parameters defined in `train_program`, and the parameters' values stored in `fluid.global_scope()` are pruned accordingly.
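+
+To verify the effect, one can inspect a pruned parameter's shape in the scope. An illustrative check (`conv2_1_sep_weights` is one of the parameters pruned above):
+
+```python
+import numpy as np
+param = fluid.global_scope().find_var("conv2_1_sep_weights").get_tensor()
+# the output-channel dimension is smaller after pruning
+print(np.array(param).shape)
+```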
+
+
+### 3.3 Compute FLOPs after pruning
+
+```python
+FLOPs = slim.analysis.flops(pruned_program)
+print("FLOPs: {}".format(FLOPs))
+```
+
+## 4. Train pruned model
+
+### 4.1 Define dataset
+
+To make this demo easy to run, it trains on the MNIST dataset. The package `paddle.dataset.mnist` of Paddle defines the downloading and reading of the MNIST dataset.
+Define the training data reader as below:
+
+```python
+import paddle.dataset.mnist as reader
+train_reader = paddle.fluid.io.batch(
+ reader.train(), batch_size=128, drop_last=True)
+train_feeder = fluid.DataFeeder(inputs, fluid.CPUPlace())
+```
+
+### 4.2 Training
+
+Execute the following code to train for one epoch:
+
+```python
+for data in train_reader():
+ acc1, acc5, loss = exe.run(pruned_program, feed=train_feeder.feed(data), fetch_list=outputs)
+ print(acc1, acc5, loss)
+```
diff --git a/docs/en/quick_start/quant_aware_tutorial_en.md b/docs/en/quick_start/quant_aware_tutorial_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..8b169294ce1ae7bde64ed59035b40f7b2f588d0b
--- /dev/null
+++ b/docs/en/quick_start/quant_aware_tutorial_en.md
@@ -0,0 +1,133 @@
+# Training-aware Quantization of image classification model - quick start
+
+This tutorial shows how to do training-aware quantization using the [API](https://paddlepaddle.github.io/PaddleSlim/api_en/paddleslim.quant.html#paddleslim.quant.quanter.quant_aware) in PaddleSlim. We use MobileNetV1 to train an image classification model as the example. The tutorial contains the following sections:
+
+1. Necessary imports
+2. Model architecture
+3. Train normal model
+4. Quantization
+5. Train model after quantization
+6. Save model after quantization
+
+## 1. Necessary imports
+PaddleSlim depends on Paddle 1.7. Please make sure that you have installed Paddle correctly. Then do the necessary imports:
+
+```python
+import paddle
+import paddle.fluid as fluid
+import paddleslim as slim
+import numpy as np
+```
+
+## 2. Model architecture
+
+This section constructs a classification model that uses ``MobileNetV1`` and the MNIST dataset. The model's input shape is `[1, 28, 28]` and the output size is 10. To keep the tutorial concise, we pre-defined a method in `paddleslim.models` to get the image classification model.
+
+>note: The APIs in `paddleslim.models` are not a formal interface of PaddleSlim. They are defined to simplify the tutorial, such as the definition of the model structure and the construction of the Program.
+
+
+```python
+exe, train_program, val_program, inputs, outputs = \
+ slim.models.image_classification("MobileNet", [1, 28, 28], 10, use_gpu=True)
+```
+
+## 3. Train normal model
+
+This section shows how to define the model inputs and how to train and test the model. We train the normal image classification model first because quantization training is performed on a well-trained model: we add quantization and dequantization operators to the well-trained model and finetune it with a smaller learning rate.
+
+### 3.1 input data definition
+
+To speed up the training process, we select the MNIST dataset to train the image classification model. The API `paddle.dataset.mnist` in the Paddle framework handles downloading and reading the images of the dataset.
+
+```python
+import paddle.dataset.mnist as reader
+train_reader = paddle.fluid.io.batch(
+ reader.train(), batch_size=128, drop_last=True)
+test_reader = paddle.fluid.io.batch(
+    reader.test(), batch_size=128, drop_last=True)
+train_feeder = fluid.DataFeeder(inputs, fluid.CPUPlace())
+```
+
+### 3.2 training model and testing
+
+Define functions to train and test the model. We only need to call these functions for both the normal training and the quantization training. Each function runs one epoch of training, because the MNIST dataset is small and top-1 accuracy reaches about 95% after one epoch.
+
+```python
+def train(prog):
+    iter = 0
+    for data in train_reader():
+        acc1, acc5, loss = exe.run(prog, feed=train_feeder.feed(data), fetch_list=outputs)
+        if iter % 100 == 0:
+            print('train iter={}, top1={}, top5={}, loss={}'.format(iter, acc1.mean(), acc5.mean(), loss.mean()))
+        iter += 1
+
+def test(prog):
+    iter = 0
+    res = [[], []]
+    for data in test_reader():
+        acc1, acc5, loss = exe.run(prog, feed=train_feeder.feed(data), fetch_list=outputs)
+        if iter % 100 == 0:
+            print('test iter={}, top1={}, top5={}, loss={}'.format(iter, acc1.mean(), acc5.mean(), loss.mean()))
+        res[0].append(acc1.mean())
+        res[1].append(acc5.mean())
+        iter += 1
+    print('final test result top1={}, top5={}'.format(np.array(res[0]).mean(), np.array(res[1]).mean()))
+```
+
+Call the ``train`` function to train the normal classification model. ``train_program`` was defined in section 2 (Model architecture).
+
+```python
+train(train_program)
+```
+
+Call the ``test`` function to test the normal classification model. ``val_program`` was defined in section 2 (Model architecture).
+
+```python
+test(val_program)
+```
+
+
+## 4. Quantization
+
+We call the ``quant_aware`` API to insert quantization and dequantization operators into ``train_program`` and ``val_program`` according to the [default configuration](https://paddlepaddle.github.io/PaddleSlim/api_cn/quantization_api.html#id2).
+
+```python
+quant_program = slim.quant.quant_aware(train_program, exe.place, for_test=False)
+val_quant_program = slim.quant.quant_aware(val_program, exe.place, for_test=True)
+```
+
+
+## 5. Train model after quantization
+
+Finetune the quantized model, then test it after one epoch of training.
+
+```python
+train(quant_program)
+```
+
+Test the model after quantization. The top-1 and top-5 accuracy are close to the results in ``3.2 Training and testing``, so training-aware quantization is lossless on this image classification model.
+
+```python
+test(val_quant_program)
+```
+
+
+## 6. Save model after quantization
+
+The model returned by the ``slim.quant.quant_aware`` API in ``4. Quantization`` is only suitable for training. To get an inference model, we should use the [slim.quant.convert](https://paddlepaddle.github.io/PaddleSlim/api_en/paddleslim.quant.html#paddleslim.quant.quanter.convert) API to change the model architecture and use [fluid.io.save_inference_model](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/io_cn/save_inference_model_cn.html#save-inference-model) to save the model. ``float_prog``'s parameters are of float32 dtype but constrained to int8's range, so it can be used in ``fluid`` or ``paddle-lite``; when loading the inference model, ``paddle-lite`` first converts the parameters' dtype from float32 to int8. ``int8_prog``'s parameters are of int8 dtype, and by saving it we can measure the model size after quantization; ``int8_prog`` cannot be used in ``fluid`` or ``paddle-lite``.
+
+
+```python
+float_prog, int8_prog = slim.quant.convert(val_quant_program, exe.place, save_int8=True)
+target_vars = [float_prog.global_block().var(name) for name in outputs]
+fluid.io.save_inference_model(dirname='./inference_model/float',
+ feeded_var_names=[var.name for var in inputs],
+ target_vars=target_vars,
+ executor=exe,
+ main_program=float_prog)
+fluid.io.save_inference_model(dirname='./inference_model/int8',
+ feeded_var_names=[var.name for var in inputs],
+ target_vars=target_vars,
+ executor=exe,
+ main_program=int8_prog)
+```
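+
+To double-check the exported float model, you can load it back and run a forward pass. Below is a minimal sketch (assuming, as saved above, that the exported feed variables are the image and label defined in this tutorial; the random input is only for illustration):
+
+```python
+import numpy as np
+
+infer_prog, feed_names, fetch_targets = fluid.io.load_inference_model(
+    dirname='./inference_model/float', executor=exe)
+# feed one fake image (shape [1, 28, 28]) and a fake label, matching the saved feed variables
+image = np.random.random((1, 1, 28, 28)).astype('float32')
+label = np.zeros((1, 1)).astype('int64')
+results = exe.run(infer_prog,
+                  feed=dict(zip(feed_names, [image, label])),
+                  fetch_list=fetch_targets)
+print(results)
+```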
diff --git a/docs/en/quick_start/quant_post_static_tutorial_en.md b/docs/en/quick_start/quant_post_static_tutorial_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..0f7c5f414c2671aa97fe9189381358a76df7fba0
--- /dev/null
+++ b/docs/en/quick_start/quant_post_static_tutorial_en.md
@@ -0,0 +1,124 @@
+# Post-training Quantization of image classification model - quick start
+
+This tutorial shows how to perform post-training quantization using the [quant_post API](https://paddlepaddle.github.io/PaddleSlim/api_en/paddleslim.quant.html#paddleslim.quant.quanter.quant_post) in PaddleSlim. We use MobileNetV1 trained on an image classification task as the example. The tutorial contains the following sections:
+
+1. Necessary imports
+2. Model architecture
+3. Train normal model
+4. Post training quantization
+
+## 1. Necessary imports
+PaddleSlim depends on Paddle 1.7. Please make sure that you have installed Paddle correctly, then do the necessary imports:
+
+
+```python
+import paddle
+import paddle.fluid as fluid
+import paddleslim as slim
+import numpy as np
+```
+## 2. Model architecture
+
+This section constructs a classification model that uses ``MobileNetV1`` on the MNIST dataset. The model's input shape is `[1, 28, 28]` and the output size is 10. To keep the tutorial concise, we pre-defined a method in `paddleslim.models` that builds the image classification model.
+
+>note: The APIs in `paddleslim.models` are not formal interfaces of PaddleSlim. They are defined only to simplify this tutorial, e.g. the definition of the model structure and the construction of the Program.
+
+
+```python
+exe, train_program, val_program, inputs, outputs = \
+ slim.models.image_classification("MobileNet", [1, 28, 28], 10, use_gpu=True)
+```
+
+## 3. Train normal model
+
+This section shows how to define the model inputs and how to train and test the model. We train the normal image classification model first because post-training quantization is performed on a well-trained model.
+
+### 3.1 Input data definition
+
+To speed up the training process, we use the MNIST dataset to train the image classification model. The `paddle.dataset.mnist` API in the Paddle framework handles downloading and reading the dataset images.
+
+```python
+import paddle.dataset.mnist as reader
+train_reader = paddle.fluid.io.batch(
+        reader.train(), batch_size=128, drop_last=True)
+test_reader = paddle.fluid.io.batch(
+        reader.test(), batch_size=128, drop_last=True)
+train_feeder = fluid.DataFeeder(inputs, fluid.CPUPlace())
+```
+
+### 3.2 Training and testing
+
+Define functions to train and test the model. We will reuse these functions for both the normal training and the quantized model's evaluation. Each function runs only one epoch, because the MNIST dataset is small and the top-1 accuracy reaches about 95% after a single epoch.
+
+```python
+def train(prog):
+    iter = 0
+    for data in train_reader():
+        acc1, acc5, loss = exe.run(prog, feed=train_feeder.feed(data), fetch_list=outputs)
+        if iter % 100 == 0:
+            print('train', acc1.mean(), acc5.mean(), loss.mean())
+        iter += 1
+
+def test(prog, outputs=outputs):
+    iter = 0
+    res = [[], []]
+    for data in test_reader():
+        acc1, acc5, loss = exe.run(prog, feed=train_feeder.feed(data), fetch_list=outputs)
+        if iter % 100 == 0:
+            print('test', acc1.mean(), acc5.mean(), loss.mean())
+        res[0].append(acc1.mean())
+        res[1].append(acc5.mean())
+        iter += 1
+    print('final test result', np.array(res[0]).mean(), np.array(res[1]).mean())
+```
+
+Call the ``train`` function to train the normal classification model. ``train_program`` was defined in section 2 (Model architecture).
+
+
+```python
+train(train_program)
+```
+
+Call the ``test`` function to test the normal classification model. ``val_program`` was defined in section 2 (Model architecture).
+
+```python
+test(val_program)
+```
+
+
+Save the inference model. The well-trained model is saved in ``'./inference_model'``; we will load it when doing post-training quantization.
+
+
+```python
+target_vars = [val_program.global_block().var(name) for name in outputs]
+fluid.io.save_inference_model(dirname='./inference_model',
+ feeded_var_names=[var.name for var in inputs],
+ target_vars=target_vars,
+ executor=exe,
+ main_program=val_program)
+```
+
+## 4. Post training quantization
+
+Call the ``slim.quant.quant_post`` API to do post-training quantization. The API first loads the inference model in ``'./inference_model'`` and then calibrates the quantization parameters using the data produced by ``sample_generator``. In this tutorial, we use 10 mini-batches to calibrate the quantization parameters. No training is needed; only forward passes are run to collect the activations used for computing quantization scales. The quantized model is saved in ``'./quant_post_model'``.
+
+```python
+slim.quant.quant_post(
+ executor=exe,
+ model_dir='./inference_model',
+ quantize_model_path='./quant_post_model',
+ sample_generator=reader.test(),
+ batch_nums=10)
+```
+
+Load the quantized model from ``'./quant_post_model'`` and run the ``test`` function. The top-1 and top-5 accuracy are close to the results in ``3.2 Training and testing``, so post-training quantization is lossless on this image classification model.
+
+```python
+quant_post_prog, feed_target_names, fetch_targets = fluid.io.load_inference_model(
+ dirname='./quant_post_model',
+ model_filename='__model__',
+ params_filename='__params__',
+ executor=exe)
+test(quant_post_prog, fetch_targets)
+```
diff --git a/docs/en/tutorials/image_classification_sensitivity_analysis_tutorial_en.md b/docs/en/tutorials/image_classification_sensitivity_analysis_tutorial_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..043e144a1f122fd9abd1598f35c688f5bc7b6f71
--- /dev/null
+++ b/docs/en/tutorials/image_classification_sensitivity_analysis_tutorial_en.md
@@ -0,0 +1,263 @@
+# Pruning of image classification model - sensitivity
+
+In this tutorial, you will learn how to use the [sensitivity API of PaddleSlim](https://paddlepaddle.github.io/PaddleSlim/api/prune_api/#sensitivity) through a demo of a MobileNetV1 model on the MNIST dataset.
+This tutorial follows the workflow below:
+
+1. Import dependency
+2. Build model
+3. Define data reader
+4. Define test function
+5. Training model
+6. Get names of parameters
+7. Compute sensitivities
+8. Pruning model
+
+
+## 1. Import dependency
+
+PaddleSlim depends on Paddle 1.7. Please ensure that you have installed Paddle correctly. Import Paddle and PaddleSlim as below:
+
+```python
+import paddle
+import paddle.fluid as fluid
+import paddleslim as slim
+```
+
+## 2. Build model
+
+This section builds a classification model based on `MobileNetV1` for the MNIST task. The shape of the input is `[1, 28, 28]` and the number of output classes is 10.
+
+To keep the code simple, we define a function in the package `paddleslim.models` to build the classification model.
+Execute the following code to build the model:
+
+
+```python
+exe, train_program, val_program, inputs, outputs = slim.models.image_classification("MobileNet", [1, 28, 28], 10, use_gpu=True)
+place = fluid.CUDAPlace(0)
+```
+
+>Note: The functions in `paddleslim.models` are only used in tutorials and demos.
+
+## 3. Define data reader
+
+The MNIST dataset is used so that the demo can be executed quickly. The package `paddle.dataset.mnist` defines functions for downloading and reading the MNIST dataset, as shown below:
+
+```python
+import paddle.dataset.mnist as reader
+train_reader = paddle.fluid.io.batch(
+ reader.train(), batch_size=128, drop_last=True)
+test_reader = paddle.fluid.io.batch(
+ reader.test(), batch_size=128, drop_last=True)
+data_feeder = fluid.DataFeeder(inputs, place)
+```
+
+## 4. Define test function
+
+To get the performance of model on test dataset after pruning a convolution layer, we define a test function as below:
+
+```python
+import numpy as np
+def test(program):
+    acc_top1_ns = []
+    acc_top5_ns = []
+    for data in test_reader():
+        acc_top1_n, acc_top5_n, _ = exe.run(
+            program,
+            feed=data_feeder.feed(data),
+            fetch_list=outputs)
+        acc_top1_ns.append(np.mean(acc_top1_n))
+        acc_top5_ns.append(np.mean(acc_top5_n))
+    print("Final eval - acc_top1: {}; acc_top5: {}".format(
+        np.mean(np.array(acc_top1_ns)), np.mean(np.array(acc_top5_ns))))
+    return np.mean(np.array(acc_top1_ns))
+```
+
+## 5. Training model
+
+Sensitivity analysis depends on a pretrained model, so we should train the model defined in section 2 for some epochs. One epoch of training is enough for this simple demo, while more epochs may be necessary for other models. Alternatively, you can load a pretrained model from the filesystem.
+
+Training model as below:
+
+
+```python
+for data in train_reader():
+    acc1, acc5, loss = exe.run(train_program, feed=data_feeder.feed(data), fetch_list=outputs)
+print(np.mean(acc1), np.mean(acc5), np.mean(loss))
+```
+
+Get the performance using the test function defined in section 4:
+
+```python
+test(val_program)
+```
+
+## 6. Get names of parameters
+
+Collect the names of the parameters to be analyzed. Here we pick the depthwise-separable convolution weights and, to keep the demo fast, keep only the first 5 of them:
+
+```python
+params = []
+for param in train_program.global_block().all_parameters():
+    if "_sep_weights" in param.name:
+        params.append(param.name)
+print(params)
+params = params[:5]
+```
+
+## 7. Compute sensitivities
+
+### 7.1 Compute in a single process
+
+Apply sensitivity analysis on pretrained model by calling [sensitivity API](https://paddlepaddle.github.io/PaddleSlim/api/prune_api/#sensitivity).
+
+The sensitivities are appended to the file given by the option `sensitivities_file` during computation.
+The information already stored in this file won't be computed again.
+
+Remove the file `sensitivities_0.data` from the current directory first:
+
+```python
+!rm -rf sensitivities_0.data
+```
+
+Apart from the parameters to be analyzed, the API also supports setting the ratios by which each convolution will be pruned.
+
+If a model loses 90% of its accuracy on the test dataset when a single convolution layer is pruned by 40%, then we can set `pruned_ratios` to `[0.1, 0.2, 0.3, 0.4]`.
+
+The granularity of `pruned_ratios` should be small to get more accurate sensitivities, but a smaller granularity also slows down the computation.
+
+```python
+sens_0 = slim.prune.sensitivity(
+ val_program,
+ place,
+ params,
+ test,
+ sensitivities_file="sensitivities_0.data",
+ pruned_ratios=[0.1, 0.2])
+print(sens_0)
+```
+
+### 7.2 Expand sensitivities
+
+We can expand `pruned_ratios` to `[0.1, 0.2, 0.3]` based on the sensitivities generated in section 7.1.
+
+```python
+sens_0 = slim.prune.sensitivity(
+ val_program,
+ place,
+ params,
+ test,
+ sensitivities_file="sensitivities_0.data",
+ pruned_ratios=[0.3])
+print(sens_0)
+```
+
+### 7.3 Computing sensitivities with multiple processes
+
+The time cost of computing sensitivities depends on the number of parameters and on the speed of model evaluation on the test dataset. We can speed up the computation with multiple processes.
+
+Split `pruned_ratios` across multiple processes, then merge the sensitivities they produce.
+
+#### 7.3.1 Computing in each process
+
+We have computed the sensitivities for `pruned_ratios=[0.1, 0.2, 0.3]` and saved them into the file named `sensitivities_0.data`.
+
+Then we start a task in another process by setting `pruned_ratios=[0.4]` and save the result into a file named `sensitivities_1.data`, as shown below:
+
+
+```python
+sens_1 = slim.prune.sensitivity(
+ val_program,
+ place,
+ params,
+ test,
+ sensitivities_file="sensitivities_1.data",
+ pruned_ratios=[0.4])
+print(sens_1)
+```
+
+#### 7.3.2 Load sensitivity files generated by multiple processes
+
+```python
+s_0 = slim.prune.load_sensitivities("sensitivities_0.data")
+s_1 = slim.prune.load_sensitivities("sensitivities_1.data")
+print(s_0)
+print(s_1)
+```
+
+#### 7.3.3 Merge sensitivities
+
+
+```python
+s = slim.prune.merge_sensitive([s_0, s_1])
+print(s)
+```
+
+## 8. Pruning model
+
+Prune the model according to the sensitivities generated in section 7.3.3.
+
+### 8.1 Get pruning ratios
+
+Get a group of ratios by calling the [get_ratios_by_loss](https://paddlepaddle.github.io/PaddleSlim/api/prune_api/#get_ratios_by_loss) function:
+
+
+```python
+loss = 0.01
+ratios = slim.prune.get_ratios_by_loss(s_0, loss)
+print(ratios)
+```
+
+### 8.2 Pruning training network
+
+
+```python
+pruner = slim.prune.Pruner()
+print("FLOPs before pruning: {}".format(slim.analysis.flops(train_program)))
+pruned_program, _, _ = pruner.prune(
+ train_program,
+ fluid.global_scope(),
+ params=ratios.keys(),
+ ratios=ratios.values(),
+ place=place)
+print("FLOPs after pruning: {}".format(slim.analysis.flops(pruned_program)))
+```
+
+### 8.3 Pruning test network
+
+Note: `only_graph` should be set to True when pruning the test network. See the [Pruner API](https://paddlepaddle.github.io/PaddleSlim/api/prune_api/#pruner) for details.
+
+
+```python
+pruner = slim.prune.Pruner()
+print("FLOPs before pruning: {}".format(slim.analysis.flops(val_program)))
+pruned_val_program, _, _ = pruner.prune(
+ val_program,
+ fluid.global_scope(),
+ params=ratios.keys(),
+ ratios=ratios.values(),
+ place=place,
+ only_graph=True)
+print("FLOPs after pruning: {}".format(slim.analysis.flops(pruned_val_program)))
+```
+
+Get the accuracy of the pruned model on the test dataset:
+
+```python
+test(pruned_val_program)
+```
+
+### 8.4 Training pruned model
+
+Train the pruned model:
+
+
+```python
+for data in train_reader():
+    acc1, acc5, loss = exe.run(pruned_program, feed=data_feeder.feed(data), fetch_list=outputs)
+print(np.mean(acc1), np.mean(acc5), np.mean(loss))
+```
+
+Get the accuracy of the model after training:
+
+```python
+test(pruned_val_program)
+```
diff --git a/docs/en/tutorials/index_en.rst b/docs/en/tutorials/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..10bdbd38761f9f1c72a0cb427636cd3431986cd7
--- /dev/null
+++ b/docs/en/tutorials/index_en.rst
@@ -0,0 +1,8 @@
+Advanced Tutorials
+===================
+
+.. toctree::
+ :maxdepth: 1
+
+ image_classification_sensitivity_analysis_tutorial_en.md
+
diff --git a/docs/docs/images/algo/distillation_0.png b/docs/images/algo/distillation_0.png
similarity index 100%
rename from docs/docs/images/algo/distillation_0.png
rename to docs/images/algo/distillation_0.png
diff --git a/docs/docs/images/algo/light-nas-block.png b/docs/images/algo/light-nas-block.png
similarity index 100%
rename from docs/docs/images/algo/light-nas-block.png
rename to docs/images/algo/light-nas-block.png
diff --git a/docs/docs/images/algo/pruning_0.png b/docs/images/algo/pruning_0.png
similarity index 100%
rename from docs/docs/images/algo/pruning_0.png
rename to docs/images/algo/pruning_0.png
diff --git a/docs/docs/images/algo/pruning_1.png b/docs/images/algo/pruning_1.png
similarity index 100%
rename from docs/docs/images/algo/pruning_1.png
rename to docs/images/algo/pruning_1.png
diff --git a/docs/docs/images/algo/pruning_2.png b/docs/images/algo/pruning_2.png
similarity index 100%
rename from docs/docs/images/algo/pruning_2.png
rename to docs/images/algo/pruning_2.png
diff --git a/docs/docs/images/algo/pruning_3.png b/docs/images/algo/pruning_3.png
similarity index 100%
rename from docs/docs/images/algo/pruning_3.png
rename to docs/images/algo/pruning_3.png
diff --git a/docs/docs/images/algo/pruning_4.png b/docs/images/algo/pruning_4.png
similarity index 100%
rename from docs/docs/images/algo/pruning_4.png
rename to docs/images/algo/pruning_4.png
diff --git a/docs/docs/images/algo/quan_bwd.png b/docs/images/algo/quan_bwd.png
similarity index 100%
rename from docs/docs/images/algo/quan_bwd.png
rename to docs/images/algo/quan_bwd.png
diff --git a/docs/docs/images/algo/quan_forward.png b/docs/images/algo/quan_forward.png
similarity index 100%
rename from docs/docs/images/algo/quan_forward.png
rename to docs/images/algo/quan_forward.png
diff --git a/docs/docs/images/algo/quan_fwd_1.png b/docs/images/algo/quan_fwd_1.png
similarity index 100%
rename from docs/docs/images/algo/quan_fwd_1.png
rename to docs/images/algo/quan_fwd_1.png
diff --git a/docs/docs/images/algo/quan_table_0.png b/docs/images/algo/quan_table_0.png
similarity index 100%
rename from docs/docs/images/algo/quan_table_0.png
rename to docs/images/algo/quan_table_0.png
diff --git a/docs/docs/images/algo/quan_table_1.png b/docs/images/algo/quan_table_1.png
similarity index 100%
rename from docs/docs/images/algo/quan_table_1.png
rename to docs/images/algo/quan_table_1.png
diff --git a/docs/images/framework_0.png b/docs/images/framework_0.png
new file mode 100644
index 0000000000000000000000000000000000000000..223f384f23775403129a69967bbe9e891dfa77ff
Binary files /dev/null and b/docs/images/framework_0.png differ
diff --git a/docs/images/framework_1.png b/docs/images/framework_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..642bc13f8e12eace8fe8e70d8f3ec04a39e4275a
Binary files /dev/null and b/docs/images/framework_1.png differ
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
deleted file mode 100644
index 970e4b8f7e0ee7940098ea77929c7214c49d9ba9..0000000000000000000000000000000000000000
--- a/docs/mkdocs.yml
+++ /dev/null
@@ -1,51 +0,0 @@
-site_name: PaddleSlim Docs
-repo_url: https://github.com/PaddlePaddle/PaddleSlim
-nav:
-- Home: index.md
-- 教程:
- - 离线量化: tutorials/quant_post_demo.md
- - 量化训练: tutorials/quant_aware_demo.md
- - Embedding量化: tutorials/quant_embedding_demo.md
- - SA搜索: tutorials/nas_demo.md
- - 知识蒸馏: tutorials/distillation_demo.md
-- API:
- - 量化: api/quantization_api.md
- - 剪枝与敏感度: api/prune_api.md
- - 模型分析: api/analysis_api.md
- - 知识蒸馏: api/single_distiller_api.md
- - SA搜索: api/nas_api.md
- - 搜索空间: api/search_space.md
- - 硬件延时评估表: table_latency.md
-- 算法原理: algo/algo.md
-
-theme:
- name: readthedocs
- highlightjs: true
-
-markdown_extensions:
- - admonition
- - codehilite:
- guess_lang: true
- linenums: true
- - toc:
- permalink: "#"
- - footnotes
- - meta
- - def_list
- - pymdownx.arithmatex
- - pymdownx.betterem:
- smart_enable: all
- - pymdownx.caret
- - pymdownx.critic
- - pymdownx.details
- - pymdownx.magiclink
- - pymdownx.mark
- - pymdownx.smartsymbols
- - pymdownx.superfences
- - pymdownx.tasklist
- - pymdownx.tilde
- - mdx_math
-
-extra_javascript:
- - mathjax-config.js
- - https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-AMS-MML_HTMLorMML
diff --git a/docs/requirements.txt b/docs/requirements.txt
index aa4ce367e579b1c3e004e04c20f261a1e75548fc..b15f2f146d1b52d3f7b2c5edc870131b8c8f54c3 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,4 +1,5 @@
-mkdocs
-markdown
-python-markdown-math
-pymdown-extensions
+sphinx
+recommonmark
+sphinx_markdown_tables
+sphinx_rtd_theme
+m2r
diff --git a/docs/zh_cn/CHANGELOG.md b/docs/zh_cn/CHANGELOG.md
new file mode 100644
index 0000000000000000000000000000000000000000..43105675f071812dec17dd76c3fc30e47a95fbe4
--- /dev/null
+++ b/docs/zh_cn/CHANGELOG.md
@@ -0,0 +1,46 @@
+# 版本更新信息
+
+## 最新版本信息
+
+### v1.1.0(05/2020)
+
+- 量化
+ - 增加无校准数据训练后量化方法,int16精度无损,int8精度损失低于0.1%。
+ - 增强量化功能,完善量化OP的输出scale信息,支持CPU预测端全面适配量化模型。
+- 剪裁
+ - 新增FPGM和BN scale两种剪裁策略, 在MobileNetV3-YOLOV3-COCO任务上,同等压缩率下精度提升0.6%。
+ - 新增自定义剪裁策略接口,方便开发者快速新增压缩策略。
+ - 剪裁功能添加对新增Operator的默认处理逻辑,扩展支持剪裁更多复杂网络。
+- NAS
+ - 新增DARTS系列搜索算法,并提供扩展接口,方便用户调研和实现新的模型结构搜索策略。
+ - 模型结构搜索添加早停机制,提升搜索功能易用性。
+ - 新增一种基于强化学习的模型结构搜索策略,并提供扩展接口,为用户调研实现新策略提供参考。
+
+
+## 历史版本信息
+
+### v1.0.1
+
+ - 拆分PaddleSlim为独立repo。
+ - 重构裁剪、量化、蒸馏、搜索接口,对用户开放底层接口。
+ - 量化:
+ - 新增基于KL散度的离线量化功能,支持对Embedding层量化。
+ - 新增对FC的QAT MKL-DNN量化策略支持
+ - 新增PostTrainingQuantization,完整实现训练后量化功能:支持量化30种OP,支持灵活设置需要量化的OP,生成统一格式的量化模型,具有耗时短、易用性强、精度损失较小的优点。
+ - 量化训练支持设定需要量化的OP类型。
+ - 裁剪: 重构剪裁实现,方便扩展支持更多类型的网络。
+ - 搜索:
+ - 支持SA搜索,增加更多的搜索空间,支持用户自定义搜索空间。
+ - 新增one-shot搜索算法,搜索速度比上个版本快20倍。
+ - 新增大规模可扩展知识蒸馏框架 Pantheon
+ - student 与 teacher 、teacher与 teacher 模型之间充分解耦,可分别独立运行在不同的物理设备上,便于充分利用计算资源;
+ - 支持 teacher 模型的单节点多设备大规模预测,在 BERT 等模型上测试加速比达到线性;
+ - 用 TCP/IP 协议实现在线蒸馏模式的通信,支持在同一网络环境下,运行在任意两个物理设备上的 teacher 模型和 student 模型之间进行知识传输;
+ - 统一在线和离线两种蒸馏模式的 API 接口,不同的 teacher 模型可以工作在不同的模式下;
+ - 在 student 端自动完成知识的归并与知识数据的 batch 重组,便于多 teacher 模型的知识融合。
+ - 模型库:
+ - 发布ResNet50、MobileNet模型的压缩benchmark
+ - 打通检测库,并发布YOLOv3系列模型的压缩benchmark
+ - 打通分割库,并发布DeepLabv3+系列分割模型的压缩benchmark
+ - 完善文档:
+ - 补充API文档;新增入门教程和高级教程;增加ModelZoo文档,覆盖分类、检测、分割任务。所有文档包含中、英文。
diff --git a/docs/zh_cn/FAQ/index.rst b/docs/zh_cn/FAQ/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a9578403444c471af3512be25e750595419b64ce
--- /dev/null
+++ b/docs/zh_cn/FAQ/index.rst
@@ -0,0 +1,9 @@
+
+FAQ
+========
+
+.. toctree::
+ :maxdepth: 1
+
+ quantization_FAQ.md
+
diff --git a/docs/zh_cn/FAQ/quantization_FAQ.md b/docs/zh_cn/FAQ/quantization_FAQ.md
new file mode 100644
index 0000000000000000000000000000000000000000..8ac10e43b3ab5bdea3ae0005c1ba76adfaa53195
--- /dev/null
+++ b/docs/zh_cn/FAQ/quantization_FAQ.md
@@ -0,0 +1,89 @@
+## 量化FAQ
+
+1. 量化训练或者离线量化后的模型体积为什么没有变小?
+2. 量化训练或者离线量化后的模型使用fluid加载为什么没有加速?怎样才能加速?
+3. 该怎么设置适合的量化配置?
+4. 离线量化出现'KeyError: '报错
+5. 离线量化或者量化训练时出现CUDNN或者CUDA错误
+6. 量化训练时loss是nan
+7. cpu上跑量化后的模型出nan
+
+#### 1. 量化训练或者离线量化后的模型体积为什么没有变小?
+
+答:这是因为量化后保存的参数虽然在int8范围内,但类型仍是float。这是由于fluid没有int8 kernel,而为了方便在量化后验证量化精度,必须保证fluid能够加载量化模型。
+
+#### 2. 量化训练或者离线量化后的模型使用fluid加载为什么没有加速?怎样才能加速?
+
+答:这是因为量化后保存的参数虽然在int8范围内,但类型仍是float,而fluid并不具备加速量化模型的能力。量化模型必须配合预测库使用才能加速。
+
+- 如果量化模型在ARM上线,则需要使用[Paddle-Lite](https://paddle-lite.readthedocs.io/zh/latest/index.html).
+
+ - Paddle-Lite会对量化模型进行模型转化和优化,转化方法见[链接](https://paddle-lite.readthedocs.io/zh/latest/user_guides/model_quantization.html#paddle-lite)。
+
+ - 转化之后可以像非量化模型一样使用[Paddle-Lite API](https://paddle-lite.readthedocs.io/zh/latest/user_guides/tutorial.html#lite)进行加载预测。
+
+- 如果量化模型在GPU上线,则需要使用[Paddle-TensorRT 预测接口](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/advanced_guide/performance_improving/inference_improving/paddle_tensorrt_infer.html).
+
+ - 和非量化模型的区别在于以下参数设置:
+
+```python
+config->EnableTensorRtEngine(1 << 20 /* workspace_size*/,
+ batch_size /* max_batch_size*/,
+ 3 /* min_subgraph_size*/,
+ AnalysisConfig::Precision::kInt8 /* precision*/,
+ false /* use_static*/,
+ false /* use_calib_mode*/);
+```
+
+- 如果量化模型在x86上线,需要使用[INT8 MKL-DNN](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/contrib/slim/tests/slim_int8_mkldnn_post_training_quantization.md)
+
+ - 首先对模型进行转化,可以参考[脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/contrib/slim/tests/save_quant_model.py)
+
+ - 转化之后可使用预测部署API进行加载。比如[c++ API](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/advanced_guide/inference_deployment/inference/native_infer.html)
+
+
+#### 3. 该怎么设置适合的量化配置?
+
+- 首先需要考虑量化模型上线的平台
+
+ | 平台 | 支持weight量化方式 | 支持activation量化方式 | 支持量化的OP |
+ | ---------------- | ------------------------------ | ------------------------------------- | ------------------------------------------------------------ |
+ | ARM(Paddle-Lite) | channel_wise_abs_max, abs_max | moving_average_abs_max,range_abs_max | conv2d, depthwise_conv2d, mul |
+ | x86(MKL-DNN) | abs_max | moving_average_abs_max,range_abs_max | conv2d, depthwise_conv2d, mul, matmul |
+ | GPU(TensorRT) | channel_wise_abs_max | moving_average_abs_max,range_abs_max | mul, conv2d, pool2d, depthwise_conv2d, elementwise_add, leaky_relu |
+
+- 部分层跳过量化
+
+ 如果量化后精度损失较大,可以考虑跳过部分对量化敏感的计算(即不对其量化),比如最后一层或者attention计算。
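+
+ 下面给出一个在量化训练中跳过部分层的配置示意(仅为示例:假设已有组网得到的 `train_program` 和 `place`,且 `not_quant_pattern` 中的名称需要与网络中实际的 `name_scope` 对应,请结合自己的网络调整):
+
+```python
+import paddleslim as slim
+
+### 示意配置:跳过name_scope中包含'skip_quant'的op不做量化,
+### 需要在组网时用 fluid.name_scope('skip_quant') 包住想跳过的层
+config = {
+    'weight_quantize_type': 'channel_wise_abs_max',
+    'activation_quantize_type': 'moving_average_abs_max',
+    'not_quant_pattern': ['skip_quant'],
+}
+quant_program = slim.quant.quant_aware(train_program, place, config, for_test=False)
+```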
+
+
+
+#### 4. 离线量化出现'KeyError: '报错
+
+
+
+答:一般是reader没写对,导致离线量化时前向计算没有跑起来,没有收集到中间的激活值。
+
+
+
+#### 5. 离线量化或者量化训练时出现CUDNN或者CUDA错误
+
+
+
+答:因为离线量化或者量化训练并没有涉及到对cuda或者cudnn做修改, 因此这个错误一般是机器上的cuda或者cudnn版本和Paddle所需的cuda或者cudnn版本不一致。
+
+
+
+#### 6. 量化训练时loss是nan
+
+
+
+答:需要适当调小学习率。如果调小学习率依然不能解决问题,则需要考虑是否某些层对量化敏感,需要跳过量化,比如attention层。
+
+
+
+#### 7. cpu上跑量化后的模型出nan
+
+
+
+答:可查看使用的paddle版本是否包含[pr](https://github.com/PaddlePaddle/Paddle/pull/22966), 该pr修复了在对几乎是0的tensor进行量化时的bug。
diff --git a/docs/zh_cn/Makefile b/docs/zh_cn/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..141d0b25f71fd0d96f59c5f682ea537d2ba767ea
--- /dev/null
+++ b/docs/zh_cn/Makefile
@@ -0,0 +1,19 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+SOURCEDIR = ./
+BUILDDIR = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/docs/algo/algo.md b/docs/zh_cn/algo/algo.md
similarity index 93%
rename from docs/docs/algo/algo.md
rename to docs/zh_cn/algo/algo.md
index 5f268b74eb4c073d0240b1fc2d4651edeb1cf606..67977e70f1f9618352575374aa8605bde3a80a62 100644
--- a/docs/docs/algo/algo.md
+++ b/docs/zh_cn/algo/algo.md
@@ -1,4 +1,6 @@
-## 目录
+# 算法原理
+
+## 目录
- [量化原理介绍](#1-quantization-aware-training量化介绍)
- [剪裁原理介绍](#2-卷积核剪裁原理)
@@ -12,14 +14,14 @@
近年来,定点量化使用更少的比特数(如8-bit、3-bit、2-bit等)表示神经网络的权重和激活已被验证是有效的。定点量化的优点包括低内存带宽、低功耗、低计算资源占用以及低模型存储需求等。
-
+
表1: 不同类型操作的开销对比
由表1可知,低精度定点数操作的硬件面积大小及能耗比高精度浮点数要少几个数量级。 使用定点量化可带来4倍的模型压缩、4倍的内存带宽提升,以及更高效的cache利用(很多硬件设备,内存访问是主要能耗)。除此之外,计算速度也会更快(通常具有2x-3x的性能提升)。由表2可知,在很多场景下,定点量化操作对精度并不会造成损失。另外,定点量化对神经网络于嵌入式设备上的推断来说是极其重要的。
-
+
表2:模型量化前后精度对比
@@ -43,7 +45,7 @@ $q = scale * r + b$
前向传播过程采用模拟量化的方式,具体描述如下:
-
+
图1:基于模拟量化训练的前向过程
@@ -67,7 +69,7 @@ $$
上述公式表明反量化操作可以被移动到`GEMM`之前,即先对$Xq$和$Wq$执行反量化操作再做`GEMM`操作。因此,前向传播的工作流亦可表示为如下方式:
-
+
图2:基于模拟量化训练前向过程的等价工作流
@@ -77,7 +79,7 @@ $$
由图3可知,权重更新所需的梯度值可以由量化后的权重和量化后的激活求得。反向传播过程中的所有输入和输出均为32-bit浮点型数据。注意,梯度更新操作需要在原始权重上进行,即计算出的梯度将被加到原始权重上而非量化后或反量化后的权重上。
-
+
图3:基于模拟量化训练的反向传播和权重更新过程
@@ -125,7 +127,7 @@ $$ Vt = (1 - k) * V + k * V_{t-1} $$
-
+
图4
@@ -137,7 +139,7 @@ $$ Vt = (1 - k) * V + k * V_{t-1} $$
减去被删除的一行:greedy pruning
-
+
图5
@@ -147,7 +149,7 @@ $$ Vt = (1 - k) * V + k * V_{t-1} $$
-
+
图6
@@ -174,7 +176,7 @@ $$ Vt = (1 - k) * V + k * V_{t-1} $$
#### 敏感度的理解
-
+
图7
@@ -187,7 +189,7 @@ $$ Vt = (1 - k) * V + k * V_{t-1} $$
用户给定一个模型整体的剪裁率,我们通过移动**图5**中的黑色实线来找到一组满足条件的且合法的剪裁率。
-
+
图8
@@ -204,12 +206,11 @@ $$ Vt = (1 - k) * V + k * V_{t-1} $$
一般情况下,模型参数量越多,结构越复杂,其性能越好,但参数也越冗余,运算量和资源消耗也越大;模型蒸馏是将复杂网络中的有用信息提取出来,迁移到一个更小的网络中去。在我们的工具包中,支持两种蒸馏的方法。
第一种是传统的蒸馏方法(参考论文:[Distilling the Knowledge in a Neural Network](https://arxiv.org/pdf/1503.02531.pdf))
使用复杂的网络作为teacher模型去监督训练一个参数量和运算量更少的student模型。teacher模型可以是一个或者多个提前训练好的高性能模型。student模型的训练有两个目标:一个是原始的目标函数,为student模型输出的类别概率和label的交叉熵,记为hard-target;另一个是student模型输出的类别概率和teacher模型输出的类别概率的交叉熵,记为soft target,这两个loss加权后得到最终的训练loss,共同监督student模型的训练。
- 第二种是基于FSP的蒸馏方法(参考论文:[A Gift from Knowledge Distillation:
-Fast Optimization, Network Minimization and Transfer Learning](http://openaccess.thecvf.com/content_cvpr_2017/papers/Yim_A_Gift_From_CVPR_2017_paper.pdf))
+ 第二种是基于FSP的蒸馏方法(参考论文:[A Gift from Knowledge Distillation: Fast Optimization, Network Minimization and Transfer Learning](http://openaccess.thecvf.com/content_cvpr_2017/papers/Yim_A_Gift_From_CVPR_2017_paper.pdf))
相比传统的蒸馏方法直接用小模型去拟合大模型的输出,该方法用小模型去拟合大模型不同层特征之间的转换关系,其用一个FSP矩阵(特征的内积)来表示不同层特征之间的关系,大模型和小模型不同层之间分别获得多个FSP矩阵,然后使用L2 loss让小模型的对应层FSP矩阵和大模型对应层的FSP矩阵尽量一致,具体如下图所示。这种方法的优势,通俗的解释是,比如将蒸馏类比成teacher(大模型)教student(小模型)解决一个问题,传统的蒸馏是直接告诉小模型问题的答案,让小模型学习,而学习FSP矩阵是让小模型学习解决问题的中间过程和方法,因此其学到的信息更多。
-
+
图9
@@ -235,6 +236,7 @@ $$
T_k = T_0*\theta^k
$$
+$$
\begin{equation}
P(r_k) =
\begin{cases}
@@ -242,6 +244,7 @@ e^{\frac{(r_k-r)}{T_k}} & r_k < r\\
1 & r_k>=r
\end{cases}
\end{equation}
+$$
在第k次迭代,搜到的网络为$N_k$, 对$N_k$训练若干epoch后,在测试集上得到reward为$r_k$, 以概率$P(r_k)$接受$r_k$,即执行$r=r_k$。$r$在搜索过程起始时被初始化为0. $T_0$为初始化温度,$\theta$为温度衰减系数,$T_k$为第k次迭代的温度。
@@ -256,7 +259,7 @@ e^{\frac{(r_k-r)}{T_k}} & r_k < r\\
因为要搜索出在移动端运行速度快的模型,我们参考了MobileNetV2中的Linear Bottlenecks和Inverted residuals结构,搜索每一个Inverted residuals中的具体参数,包括kernelsize、channel扩张倍数、重复次数、channels number。如图10所示:
-
+
图10
diff --git a/docs/zh_cn/api_cn/analysis_api.rst b/docs/zh_cn/api_cn/analysis_api.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0f0bebf4dccf09114d607a8d02179a88cdb53cd1
--- /dev/null
+++ b/docs/zh_cn/api_cn/analysis_api.rst
@@ -0,0 +1,181 @@
+模型分析
+=======
+
+FLOPs
+-----
+
+.. py:function:: paddleslim.analysis.flops(program, only_conv=True, detail=False)
+
+`源代码 `_
+
+获得指定网络的浮点运算次数(FLOPs)。
+
+**参数:**
+
+- **program(paddle.fluid.Program)** - 待分析的目标网络。更多关于Program的介绍请参考:`Program概念介绍 `_。
+
+- **detail(bool)** - 是否返回每个卷积层的FLOPs。默认为False。
+
+- **only_conv(bool)** - 如果设置为True,则仅计算卷积层和全连接层的FLOPs,即浮点数的乘加(multiplication-adds)操作次数。如果设置为False,则也会计算卷积和全连接层之外的操作的FLOPs。
+
+**返回值:**
+
+- **flops(float)** - 整个网络的FLOPs。
+
+- **params2flops(dict)** - 每层卷积对应的FLOPs,其中key为卷积层参数名称,value为FLOPs值。
+
+**示例:**
+
+.. code-block:: python
+
+ import paddle.fluid as fluid
+ from paddle.fluid.param_attr import ParamAttr
+ from paddleslim.analysis import flops
+
+ def conv_bn_layer(input,
+ num_filters,
+ filter_size,
+ name,
+ stride=1,
+ groups=1,
+ act=None):
+ conv = fluid.layers.conv2d(
+ input=input,
+ num_filters=num_filters,
+ filter_size=filter_size,
+ stride=stride,
+ padding=(filter_size - 1) // 2,
+ groups=groups,
+ act=None,
+ param_attr=ParamAttr(name=name + "_weights"),
+ bias_attr=False,
+ name=name + "_out")
+ bn_name = name + "_bn"
+ return fluid.layers.batch_norm(
+ input=conv,
+ act=act,
+ name=bn_name + '_output',
+ param_attr=ParamAttr(name=bn_name + '_scale'),
+ bias_attr=ParamAttr(bn_name + '_offset'),
+ moving_mean_name=bn_name + '_mean',
+ moving_variance_name=bn_name + '_variance', )
+
+ main_program = fluid.Program()
+ startup_program = fluid.Program()
+ # X X O X O
+ # conv1-->conv2-->sum1-->conv3-->conv4-->sum2-->conv5-->conv6
+ # | ^ | ^
+ # |____________| |____________________|
+ #
+ # X: prune output channels
+ # O: prune input channels
+ with fluid.program_guard(main_program, startup_program):
+ input = fluid.data(name="image", shape=[None, 3, 16, 16])
+ conv1 = conv_bn_layer(input, 8, 3, "conv1")
+ conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+ sum1 = conv1 + conv2
+ conv3 = conv_bn_layer(sum1, 8, 3, "conv3")
+ conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
+ sum2 = conv4 + sum1
+ conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
+ conv6 = conv_bn_layer(conv5, 8, 3, "conv6")
+
+ print("FLOPs: {}".format(flops(main_program)))
+
+model_size
+----------
+
+.. py:function:: paddleslim.analysis.model_size(program)
+
+`源代码 `_
+
+获得指定网络的参数数量。
+
+**参数:**
+
+- **program(paddle.fluid.Program)** - 待分析的目标网络。更多关于Program的介绍请参考:`Program概念介绍 `_。
+
+**返回值:**
+
+- **model_size(int)** - 整个网络的参数数量。
+
+**示例:**
+
+.. code-block:: python
+
+ import paddle.fluid as fluid
+ from paddle.fluid.param_attr import ParamAttr
+ from paddleslim.analysis import model_size
+
+ def conv_layer(input,
+ num_filters,
+ filter_size,
+ name,
+ stride=1,
+ groups=1,
+ act=None):
+ conv = fluid.layers.conv2d(
+ input=input,
+ num_filters=num_filters,
+ filter_size=filter_size,
+ stride=stride,
+ padding=(filter_size - 1) // 2,
+ groups=groups,
+ act=None,
+ param_attr=ParamAttr(name=name + "_weights"),
+ bias_attr=False,
+ name=name + "_out")
+ return conv
+
+ main_program = fluid.Program()
+ startup_program = fluid.Program()
+ # X X O X O
+ # conv1-->conv2-->sum1-->conv3-->conv4-->sum2-->conv5-->conv6
+ # | ^ | ^
+ # |____________| |____________________|
+ #
+ # X: prune output channels
+ # O: prune input channels
+ with fluid.program_guard(main_program, startup_program):
+ input = fluid.data(name="image", shape=[None, 3, 16, 16])
+ conv1 = conv_layer(input, 8, 3, "conv1")
+ conv2 = conv_layer(conv1, 8, 3, "conv2")
+ sum1 = conv1 + conv2
+ conv3 = conv_layer(sum1, 8, 3, "conv3")
+ conv4 = conv_layer(conv3, 8, 3, "conv4")
+ sum2 = conv4 + sum1
+ conv5 = conv_layer(sum2, 8, 3, "conv5")
+ conv6 = conv_layer(conv5, 8, 3, "conv6")
+
+ print("FLOPs: {}".format(model_size(main_program)))
+
+TableLatencyEvaluator
+---------------------
+
+.. py:class:: paddleslim.analysis.TableLatencyEvaluator(table_file, delimiter=",")
+
+`源代码 `_
+
+基于硬件延时表的模型延时评估器。
+
+**参数:**
+
+- **table_file(str)** - 所使用的延时评估表的绝对路径。关于延时评估表格式请参考:PaddleSlim硬件延时评估表格式
+
+- **delimiter(str)** - 在硬件延时评估表中,操作信息之间所使用的分隔符,默认为英文逗号。
+
+**返回值:**
+
+- **Evaluator** - 硬件延时评估器的实例。
+
+ .. py:method:: latency(graph)
+
+ 获得指定网络的预估延时。
+
+ **参数:**
+
+ - **graph(Program)** - 待预估的目标网络。
+
+ **返回值:**
+
+ - **latency** - 目标网络的预估延时。
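+
+**示例:**
+
+下面给出延时评估器的一个用法示意(此处的latency_table.txt为假设已存在且符合格式要求的延时评估表,``main_program`` 沿用上文FLOPs示例中构建的网络):
+
+.. code-block:: python
+
+    from paddleslim.analysis import TableLatencyEvaluator
+
+    evaluator = TableLatencyEvaluator("./latency_table.txt", delimiter=",")
+    print("Latency: {}".format(evaluator.latency(main_program)))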
diff --git a/docs/zh_cn/api_cn/common_index.rst b/docs/zh_cn/api_cn/common_index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3d6e4221c010f118b5ba2de92771858316b7ef36
--- /dev/null
+++ b/docs/zh_cn/api_cn/common_index.rst
@@ -0,0 +1,7 @@
+通用方法 Common
+======================================
+
+.. toctree::
+ :maxdepth: 1
+
+ analysis_api.rst
diff --git a/docs/zh_cn/api_cn/custom_rl_controller.md b/docs/zh_cn/api_cn/custom_rl_controller.md
new file mode 100644
index 0000000000000000000000000000000000000000..f61b1324d851b3185be9153d4e89ea848cbae44e
--- /dev/null
+++ b/docs/zh_cn/api_cn/custom_rl_controller.md
@@ -0,0 +1,54 @@
+# 外部如何自定义强化学习Controller
+
+首先导入必要的依赖:
+```python
+### 引入强化学习Controller基类函数和注册类函数
+from paddleslim.common.rl_controller.utils import RLCONTROLLER
+from paddleslim.common.rl_controller import RLBaseController
+```
+
+通过装饰器的方式把自定义强化学习Controller注册到PaddleSlim,继承基类之后需要重写基类中的`next_tokens`和`update`两个函数。注意:本示例仅说明一些必不可少的步骤,并不能直接运行,完整代码请参考[这里]()
+
+```python
+### 注意: 类名一定要全部大写
+@RLCONTROLLER.register
+class LSTM(RLBaseController):
+    def __init__(self, range_tables, use_gpu=False, **kwargs):
+        ### range_tables 表示tokens的取值范围
+        self.range_tables = range_tables
+        ### use_gpu 表示是否使用gpu来训练controller
+        self.use_gpu = use_gpu
+        ### 定义一些强化学习算法中需要的参数
+        ...
+        ### 构造相应的program, _build_program这个函数会构造两个program,一个是pred_program,一个是learn_program, 并初始化参数
+        self._build_program()
+        self.place = fluid.CUDAPlace(0) if self.use_gpu else fluid.CPUPlace()
+        self.exe = fluid.Executor(self.place)
+        self.exe.run(fluid.default_startup_program())
+
+        ### 保存参数到一个字典中,这个字典由server端统一维护更新,因为可能有多个client同时更新一份参数,所以这一步必不可少。由于pred_program和learn_program使用的是同一份参数,所以只需要把learn_program中的参数放入字典中即可
+        self.param_dicts = {}
+        self.param_dicts.update({self.learn_program: self.get_params(self.learn_program)})
+
+    def next_tokens(self, states, params_dict):
+        ### 把从server端获取的参数字典赋值给当前要用到的program
+        self.set_params(self.pred_program, params_dict, self.place)
+        ### 根据states构造输入
+        self.num_archs = states
+        feed_dict = self._create_input()
+        ### 获取当前token
+        actions = self.exe.run(self.pred_program, feed=feed_dict, fetch_list=self.tokens)
+        ...
+        return actions
+
+    def update(self, rewards, params_dict=None):
+        ### 把从server端获取的参数字典赋值给当前要用到的program
+        self.set_params(self.learn_program, params_dict, self.place)
+        ### 根据`next_tokens`中的states和`update`中的rewards构造输入
+        feed_dict = self._create_input(is_test=False, actual_rewards=rewards)
+        ### 计算当前step的loss
+        loss = self.exe.run(self.learn_program, feed=feed_dict, fetch_list=[self.loss])
+        ### 获取当前program的参数并返回,client会把本轮的参数传给server端进行参数更新
+        params_dict = self.get_params(self.learn_program)
+        return params_dict
+```
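+
+注册完成后,自定义的Controller就可以像内置Controller一样通过 `RLNAS` 使用。下面是一个调用示意(假设上面的 `LSTM` 已成功注册,`key` 与注册的类名保持一致):
+
+```python
+from paddleslim.nas import RLNAS
+
+config = [('MobileNetV2Space')]
+rlnas = RLNAS(key='LSTM', configs=config)
+```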
diff --git a/docs/zh_cn/api_cn/darts.rst b/docs/zh_cn/api_cn/darts.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c4fd72562a5dc2b330e0fc0c030eb301a1b1d7eb
--- /dev/null
+++ b/docs/zh_cn/api_cn/darts.rst
@@ -0,0 +1,106 @@
+可微分模型架构搜索DARTS
+=======================
+
+DARTSearch
+----------
+
+.. py:class:: paddleslim.nas.DARTSearch(model, train_reader, valid_reader, place, learning_rate=0.025, batchsize=64, num_imgs=50000, arch_learning_rate=3e-4, unrolled=False, num_epochs=50, epochs_no_archopt=0, use_multiprocess=False, use_data_parallel=False, save_dir='./', log_freq=50)
+
+`源代码 `_
+
+定义一个DARTS搜索示例,用于在特定数据集和搜索空间上启动模型架构搜索。
+
+**参数:**
+
+- **model** (Paddle Dygraph model)-用于搜索的超网络模型,需要以PaddlePaddle动态图的形式定义。
+- **train_reader** (Python Generator)-输入train数据的 `batch generator `_
+- **valid_reader** (Python Generator)-输入valid数据的 `batch generator `_
+- **place** (fluid.CPUPlace()|fluid.CUDAPlace(N))-该参数表示程序运行在何种设备上,这里的N为GPU对应的ID
+- **learning_rate** (float)-模型参数的初始学习率。默认值:0.025。
+- **batchsize** (int)-搜索过程数据的批大小。默认值:64。
+- **num_imgs** (int)-数据集总样本数。默认值:50000。
+- **arch_learning_rate** (float)-架构参数的学习率。默认值:3e-4。
+- **unrolled** (bool)-是否使用二阶搜索算法。默认值:False。
+- **num_epochs** (int)-搜索训练的轮数。默认值:50。
+- **epochs_no_archopt** (int)-跳过前若干轮的模型架构参数优化。默认值:0。
+- **use_multiprocess** (bool)-是否使用多进程的dataloader。默认值:False。
+- **use_data_parallel** (bool)-是否使用数据并行的多卡训练。默认值:False。
+- **save_dir** (str)-模型参数保存目录。默认值:'./'。
+- **log_freq** (int)-每多少步输出一条log。默认值:50。
+
+
+ .. py:method:: paddleslim.nas.DARTSearch.train()
+
+ 对以上定义好的目标网络和数据进行DARTS搜索
+
+
+**使用示例:**
+
+.. code-block:: python
+
+    import paddle
+    import paddle.fluid as fluid
+    import numpy as np
+    from paddleslim.nas.darts import DARTSearch
+
+
+    class SuperNet(fluid.dygraph.Layer):
+        def __init__(self):
+            super(SuperNet, self).__init__()
+            self._method = 'DARTS'
+            self._steps = 1
+            self.stem = fluid.dygraph.nn.Conv2D(
+                num_channels=1,
+                num_filters=3,
+                filter_size=3,
+                padding=1)
+            self.classifier = fluid.dygraph.nn.Linear(
+                input_dim=3072,
+                output_dim=10)
+            self._multiplier = 4
+            self._primitives = ['none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5']
+            self._initialize_alphas()
+
+        def _initialize_alphas(self):
+            self.alphas_normal = fluid.layers.create_parameter(
+                shape=[14, 8],
+                dtype="float32")
+            self.alphas_reduce = fluid.layers.create_parameter(
+                shape=[14, 8],
+                dtype="float32")
+            self._arch_parameters = [
+                self.alphas_normal,
+                self.alphas_reduce,
+            ]
+
+        def arch_parameters(self):
+            return self._arch_parameters
+
+        def forward(self, input):
+            out = self.stem(input) * self.alphas_normal[0][0] * self.alphas_reduce[0][0]
+            out = fluid.layers.reshape(out, [0, -1])
+            logits = self.classifier(out)
+            return logits
+
+        def _loss(self, input, label):
+            logits = self.forward(input)
+            return fluid.layers.reduce_mean(fluid.layers.softmax_with_cross_entropy(logits, label))
+
+    def batch_generator_creator():
+        def __reader__():
+            for _ in range(1024):
+                batch_image = np.random.random(size=[64, 1, 32, 32]).astype('float32')
+                batch_label = np.random.random(size=[64, 1]).astype('int64')
+                yield batch_image, batch_label
+
+        return __reader__
+
+    place = fluid.CUDAPlace(0)
+    with fluid.dygraph.guard(place):
+        model = SuperNet()
+        train_reader = batch_generator_creator()
+        valid_reader = batch_generator_creator()
+        searcher = DARTSearch(model, train_reader, valid_reader, place)
+        searcher.train()
+
+..
diff --git a/docs/zh_cn/api_cn/distill_index.rst b/docs/zh_cn/api_cn/distill_index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8f5c6639fa60d0f108ee4a36229877c6523a51ac
--- /dev/null
+++ b/docs/zh_cn/api_cn/distill_index.rst
@@ -0,0 +1,9 @@
+
+知识蒸馏 Distill
+======================================
+
+.. toctree::
+ :maxdepth: 1
+
+ single_distiller_api.rst
+ pantheon_api.md
diff --git a/docs/zh_cn/api_cn/early_stop.rst b/docs/zh_cn/api_cn/early_stop.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e92a6067a5d6e2d7c1865eb64da1e4010253e8f2
--- /dev/null
+++ b/docs/zh_cn/api_cn/early_stop.rst
@@ -0,0 +1,64 @@
+早停算法
+========
+早停算法接口在实验中如何使用
+
+MedianStop
+----------
+
+.. py:class:: paddleslim.nas.early_stop.MedianStop(strategy, start_epoch, mode)
+
+`源代码 `_
+
+MedianStop是利用历史较好实验的中间结果来判断当前实验是否有运行完成的必要,如果当前实验在中间步骤的结果差于历史记录的实验列表中相同步骤的结果的中值,则代表当前实验是较差的实验,可以提前终止。参考 `Google Vizier: A Service for Black-Box Optimization `_.
+
+**参数:**
+
+- **strategy** - 搜索策略的实例,例如是SANAS的实例。
+- **start_epoch** - 起始epoch,代表从第几个epoch开始监控实验中间结果。
+- **mode** - 中间结果是越大越好还是越小越好,在'minimize'和'maximize'之间选择。默认:'maximize'。
+
+**返回:**
+一个MedianStop的实例
+
+**示例代码:**
+
+.. code-block:: python
+
+ from paddleslim.nas import SANAS
+ from paddleslim.nas.early_stop import MedianStop
+ config = [('MobileNetV2Space')]
+ sanas = SANAS(config, server_addr=("", 8732), save_checkpoint=None)
+ earlystop = MedianStop(sanas, start_epoch = 2)
+
+.. py:method:: get_status(step, result, epochs):
+
+获取当前实验当前result的状态。
+
+**参数:**
+
+- **step** - 当前实验是当前client中的第几个实验。
+- **result** - 当前实验的中间步骤的result,可以为损失值,也可以为准确率等指标,只要和`mode`对应即可。
+- **epochs** - 在搜索过程中每个实验需要运行的总的epoch数量。
+
+**返回:**
+返回当前实验在当前epoch的状态,为`GOOD`或者`BAD`,如果为`BAD`,则代表当前实验可以早停。
+
+**示例代码:**
+
+.. code-block:: python
+
+ import paddle
+ from paddleslim.nas import SANAS
+ from paddleslim.nas.early_stop import MedianStop
+ steps = 10
+ epochs = 7
+
+ config = [('MobileNetV2Space')]
+ sanas = SANAS(config, server_addr=("", 8732), save_checkpoint=None)
+ earlystop = MedianStop(sanas, 2)
+ ### 假设网络中计算出来的loss是1.0,实际使用时需要获取真实的loss或者rewards。
+ avg_loss = 1.0
+
+ ### 假设我们要获取的是当前实验第7个epoch的状态,实际使用时需要传入真实要获取的steps和实验真实所处的epochs。
+ status = earlystop.get_status(steps, avg_loss, epochs)
+ print(status)
diff --git a/docs/zh_cn/api_cn/index.rst b/docs/zh_cn/api_cn/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e6697a1d94380b5d8432add0711fad5c48a07393
--- /dev/null
+++ b/docs/zh_cn/api_cn/index.rst
@@ -0,0 +1,16 @@
+.. PaddleSlim documentation master file, created by
+ sphinx-quickstart on Wed Feb 5 14:04:52 2020.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+API文档
+======================================
+
+.. toctree::
+ :maxdepth: 1
+
+ common_index.rst
+ quant_index.rst
+ prune_index.rst
+ distill_index.rst
+ nas_index.rst
diff --git a/docs/zh_cn/api_cn/nas_api.rst b/docs/zh_cn/api_cn/nas_api.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9cb0938f1406b7ee70c22730d879eec3f49b992e
--- /dev/null
+++ b/docs/zh_cn/api_cn/nas_api.rst
@@ -0,0 +1,325 @@
+NAS
+========
+
+搜索空间参数的配置
+----------------------
+
+
+通过参数配置搜索空间。更多搜索空间的使用可以参考: `search_space `_
+
+**参数:**
+
+- **input_size(int|None)**:- ``input_size`` 表示输入 ``feature map`` 的大小。 ``input_size`` 和 ``output_size`` 用来计算整个模型结构中下采样次数。
+
+- **output_size(int|None)**:- ``output_size`` 表示输出feature map的大小。 ``input_size`` 和 ``output_size`` 用来计算整个模型结构中下采样次数。
+
+- **block_num(int|None)**:- ``block_num`` 表示搜索空间中block的数量。
+
+- **block_mask(list|None)**:- ``block_mask`` 是一组由0、1组成的列表,0表示当前block是normal block,1表示当前block是reduction block。reduction block表示经过这个block之后的feature map大小下降为之前的一半,normal block表示经过这个block之后feature map大小不变。如果设置了 ``block_mask`` ,则主要以 ``block_mask`` 为主要配置, ``input_size`` , ``output_size`` 和 ``block_num`` 三种配置是无效的。
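+
+例如,下面的配置使用 ``block_mask`` 描述一个含4个block、其中第2个block做下采样的搜索空间(此处的 ``MobileNetV2BlockSpace`` 仅为示意,实际可用的搜索空间名称请参考上文的search_space文档):
+
+.. code-block:: python
+
+    config = [('MobileNetV2BlockSpace', {'block_mask': [0, 1, 0, 0]})]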
+
+SANAS
+------
+
+.. py:class:: paddleslim.nas.SANAS(configs, server_addr=("", 8881), init_temperature=None, reduce_rate=0.85, init_tokens=None, search_steps=300, save_checkpoint='./nas_checkpoint', load_checkpoint=None, is_server=True)
+
+`源代码 `_
+
+SANAS(Simulated Annealing Neural Architecture Search)是基于模拟退火算法进行模型结构搜索的算法,一般用于离散搜索任务。
+
+**参数:**
+
+- **configs(list)** - 搜索空间配置列表,格式是 ``[(key, {input_size, output_size, block_num, block_mask})]`` 或者 ``[(key)]`` (MobileNetV2、MobilenetV1和ResNet的搜索空间使用和原本网络结构相同的搜索空间,所以仅需指定 ``key`` 即可), ``input_size`` 和 ``output_size`` 表示输入和输出的特征图的大小, ``block_num`` 是指搜索网络中的block数量, ``block_mask`` 是一组由0和1组成的列表,0代表不进行下采样的block,1代表下采样的block。 更多paddleslim提供的搜索空间配置可以参考[Search Space](../search_space.md)。
+- **server_addr(tuple)** - SANAS的地址,包括server的ip地址和端口号,如果ip地址为None或者为""的话则默认使用本机ip。默认:("", 8881)。
+- **init_temperature(float)** - 基于模拟退火进行搜索的初始温度。如果init_temperature为None而且init_tokens为None,则默认初始温度为10.0;如果init_temperature为None且init_tokens不为None,则默认初始温度为1.0。详细的温度设置可以参考下面的Note。默认:None。
+- **reduce_rate(float)** - 基于模拟退火进行搜索的衰减率。详细的退火率设置可以参考下面的Note。默认:0.85。
+- **init_tokens(list|None)** - 初始化token,若init_tokens为空,则SA算法随机生成初始化tokens。默认:None。
+- **search_steps(int)** - 搜索过程迭代的次数。默认:300。
+- **save_checkpoint(str|None)** - 保存checkpoint的文件目录,如果设置为None的话则不保存checkpoint。默认: ``./nas_checkpoint`` 。
+- **load_checkpoint(str|None)** - 加载checkpoint的文件目录,如果设置为None的话则不加载checkpoint。默认:None。
+- **is_server(bool)** - 当前实例是否要启动一个server。默认:True。
+
+**返回:**
+一个SANAS类的实例
+
+**示例代码:**
+
+.. code-block:: python
+
+ from paddleslim.nas import SANAS
+ config = [('MobileNetV2Space')]
+ sanas = SANAS(configs=config)
+
+.. note::
+
+ - 初始化温度和退火率的意义:
+
+ - SA算法内部会保存一个基础token(初始化token可以自己传入也可以随机生成)和基础score(初始化score为-1),下一个token会在当前SA算法保存的token的基础上产生。在SA的搜索过程中,如果本轮的token训练得到的score大于SA算法中保存的score,则本轮的token一定会被SA算法接收保存为下一轮token产生的基础token。
+
+ - 初始温度越高表示SA算法当前处的阶段越不稳定,本轮的token训练得到的score小于SA算法中保存的score的话,本轮的token和score被SA算法接收的可能性越大。
+
+ - 初始温度越低表示SA算法当前处的阶段越稳定,本轮的token训练得到的score小于SA算法中保存的score的话,本轮的token和score被SA算法接收的可能性越小。
+
+ - 退火率越大,表示SA算法收敛的越慢,即SA算法越慢到稳定阶段。
+
+ - 退火率越低,表示SA算法收敛的越快,即SA算法越快到稳定阶段。
+
+ - 初始化温度和退火率的设置:
+
+ - 如果原本就有一个较好的初始化token,想要基于这个较好的token来进行搜索的话,SA算法可以处于一个较为稳定的状态进行搜索。这种情况下初始温度可以设置得低一些,例如设置为1.0,退火率设置得大一些,例如设置为0.85。如果想要基于这个较好的token利用贪心算法进行搜索,即只有当本轮token训练得到的score大于SA算法中保存的score,SA算法才接收本轮token,则退火率可设置为一个极小的数字,例如设置为0.85 ** 10。
+
+ - 初始化token如果是随机生成的话,代表初始化token是一个比较差的token,SA算法可以处于一种不稳定的阶段进行搜索,尽可能的随机探索所有可能得token,从而找到一个较好的token。初始温度可以设置的高一些,例如设置为1000,退火率相对设置的小一些。
+
+..
+
+ .. py:method:: next_archs()
+
+ 获取下一组模型结构。
+
+ **返回:**
+ 返回模型结构实例的列表,形式为list。
+
+ **示例代码:**
+
+ .. code-block:: python
+
+ import paddle.fluid as fluid
+ from paddleslim.nas import SANAS
+ config = [('MobileNetV2Space')]
+ sanas = SANAS(configs=config)
+ input = fluid.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
+ archs = sanas.next_archs()
+ for arch in archs:
+ output = arch(input)
+ input = output
+ print(output)
+
+ .. py:method:: reward(score)
+
+ 把当前模型结构的得分情况回传。
+
+ **参数:**
+
+ - **score:** - 当前模型的得分,分数越大越好。
+
+ **返回:**
+ 模型结构更新成功或者失败,成功则返回 ``True`` ,失败则返回 ``False`` 。
+
+ **示例代码:**
+
+ .. code-block:: python
+
+ import paddle.fluid as fluid
+ from paddleslim.nas import SANAS
+ config = [('MobileNetV2Space')]
+ sanas = SANAS(configs=config)
+ archs = sanas.next_archs()
+
+ ### 假设网络计算出来的score是1,实际代码中使用时需要返回真实score。
+ score=float(1.0)
+ sanas.reward(float(score))
+
+
+ .. py:method:: tokens2arch(tokens)
+
+ 通过一组tokens得到实际的模型结构,一般用来把搜索到最优的token转换为模型结构用来做最后的训练。tokens的形式是一个列表,tokens映射到搜索空间转换成相应的网络结构,一组tokens对应唯一的一个网络结构。
+
+ **参数:**
+
+ - **tokens(list):** - 一组tokens。tokens的长度和范围取决于搜索空间。
+
+ **返回:**
+ 根据传入的token得到一个模型结构实例列表。
+
+ **示例代码:**
+
+ .. code-block:: python
+
+ import paddle.fluid as fluid
+ from paddleslim.nas import SANAS
+ config = [('MobileNetV2Space')]
+ sanas = SANAS(configs=config)
+ input = fluid.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
+ tokens = ([0] * 25)
+ archs = sanas.tokens2arch(tokens)[0]
+ print(archs(input))
+
+ .. py:method:: current_info()
+
+ 返回当前token和搜索过程中最好的token和reward。
+
+ **返回:**
+ 搜索过程中最好的token,reward和当前训练的token,形式为dict。
+
+ **示例代码:**
+
+ .. code-block:: python
+
+ import paddle.fluid as fluid
+ from paddleslim.nas import SANAS
+ config = [('MobileNetV2Space')]
+ sanas = SANAS(configs=config)
+ print(sanas.current_info())
+
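+将上述接口串联起来,一个最简化的搜索主循环示意如下(仅演示调用顺序;实际使用时 score 应由对每组结构的真实训练和评估得到,这里以常数代替):
+
+.. code-block:: python
+
+    from paddleslim.nas import SANAS
+
+    config = [('MobileNetV2Space')]
+    sanas = SANAS(configs=config, search_steps=3)
+    for step in range(3):
+        archs = sanas.next_archs()
+        ### 实际使用时:基于archs构建Program并训练、评估,得到真实score
+        score = 1.0
+        sanas.reward(score)
+    print(sanas.current_info())
+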
+
+
+RLNAS
+------
+
+.. py:class:: paddleslim.nas.RLNAS(key, configs, use_gpu=False, server_addr=("", 8881), is_server=True, is_sync=False, save_controller=None, load_controller=None, **kwargs)
+
+`源代码 `_
+
+RLNAS (Reinforcement Learning Neural Architecture Search)是基于强化学习算法进行模型结构搜索的算法。
+
+**参数:**
+
+- **key** - 使用的强化学习Controller名称,目前paddleslim支持的有`LSTM`和`DDPG`,自定义强化学习Controller请参考 `自定义强化学习Controller `_
+- **configs(list)** - 搜索空间配置列表,格式是 ``[(key, {input_size, output_size, block_num, block_mask})]`` 或者 ``[(key)]`` (MobileNetV2、MobilenetV1和ResNet的搜索空间使用和原本网络结构相同的搜索空间,所以仅需指定 ``key`` 即可), ``input_size`` 和 ``output_size`` 表示输入和输出的特征图的大小, ``block_num`` 是指搜索网络中的block数量, ``block_mask`` 是一组由0和1组成的列表,0代表不进行下采样的block,1代表下采样的block。 更多paddleslim提供的搜索空间配置可以参考[Search Space](../search_space.md)。
+- **use_gpu(bool)** - 是否使用GPU来训练Controller。默认:False。
+- **server_addr(tuple)** - RLNAS中Controller的地址,包括server的ip地址和端口号,如果ip地址为None或者为""的话则默认使用本机ip。默认:("", 8881)。
+- **is_server(bool)** - 当前实例是否要启动一个server。默认:True。
+- **is_sync(bool)** - 是否使用同步模式更新Controller,该模式仅在多client下有差别。默认:False。
+- **save_controller(str|None|False)** - 保存Controller的checkpoint的文件目录,如果设置为None的话则保存checkpoint到默认路径 ``./.rlnas_controller`` ,如果设置为False的话则不保存checkpoint。默认:None 。
+- **load_controller(str|None)** - 加载Controller的checkpoint的文件目录,如果设置为None的话则不加载checkpoint。默认:None。
+- **\*\*kwargs** - 附加的参数,由具体强化学习算法决定,`LSTM`和`DDPG`的附加参数请参考note。
+
+.. note::
+
+ - **`LSTM`算法的附加参数:**
+
+ - lstm_num_layers(int, optional): - Controller中堆叠的LSTM的层数。默认:1.
+ - hidden_size(int, optional): - LSTM中隐藏层的大小。默认:100.
+ - temperature(float, optional): - 是否在计算每个token过程中做温度平均。默认:None.
+ - tanh_constant(float, optional): 是否在计算每个token过程中做tanh激活,并乘上`tanh_constant`值。 默认:None。
+ - decay(float, optional): LSTM中记录rewards的baseline的平滑率。默认:0.99.
+ - weight_entropy(float, optional): 在更新controller参数时是否为接收到的rewards加上计算token过程中的带权重的交叉熵值。默认:None。
+ - controller_batch_size(int, optional): controller的batch_size,即每运行一次controller可以拿到几组token。默认:1.
+ - controller_lr(float, optional): controller的学习率,默认:1e-4。
+ - controller_decay_steps(int, optional): controller学习率下降步长,设置为None的时候学习率不下降。默认:None。
+ - controller_decay_rate(float, optional): controller学习率衰减率,默认:None。
+
+
+ - **`DDPG`算法的附加参数:**
+
+ **注意:** 使用`DDPG`算法的话必须安装parl。安装方法: `pip install parl`
+
+ - obs_dim(int): observation的维度。
+ - model(class,optional): DDPG算法中使用的具体的模型,一般是个类,包含actor_model和critic_model,需要实现两个方法,一个是policy用来获得策略,另一个是value,需要获得Q值。可以参考默认的 `default_model `_ 实现您自己的model。默认:`default_ddpg_model`.
+ - actor_lr(float, optional): actor网络的学习率。默认:1e-4.
+ - critic_lr(float, optional): critic网络的学习率。默认:1e-3.
+ - gamma(float, optional): 接收到rewards之后的折扣因子。默认:0.99.
+ - tau(float, optional): DDPG中把models的参数同步累积到target_model上时的折扣因子。默认:0.001.
+ - memory_size(int, optional): DDPG中记录历史信息的池子大小。默认:10.
+ - reward_scale(float, optional): 记录历史信息时,对rewards信息进行的折扣因子。默认:0.1.
+ - controller_batch_size(int, optional): controller的batch_size,即每运行一次controller可以拿到几个token。默认:1.
+ - actions_noise(class, optional): 通过DDPG拿到action之后添加的噪声,设置为False或者None时不添加噪声。默认:default_noise.
+..
+
+**返回:**
+一个RLNAS类的实例
+
+**示例代码:**
+
+.. code-block:: python
+
+ from paddleslim.nas import RLNAS
+ config = [('MobileNetV2Space')]
+ rlnas = RLNAS(key='lstm', configs=config)
+
+
+.. py:method:: next_archs(obs=None)
+
+获取下一组模型结构。
+
+**参数:**
+
+- **obs** - 需要获取的模型结构数量或者当前模型的observations。
+
+**返回:**
+返回模型结构实例的列表,形式为list。
+
+**示例代码:**
+
+.. code-block:: python
+
+ import paddle.fluid as fluid
+ from paddleslim.nas import RLNAS
+ config = [('MobileNetV2Space')]
+ rlnas = RLNAS(key='lstm', configs=config)
+ input = fluid.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
+ archs = rlnas.next_archs(1)[0]
+ for arch in archs:
+ output = arch(input)
+ input = output
+ print(output)
+
+.. py:method:: reward(rewards, **kwargs):
+
+把当前模型结构的rewards回传。
+
+**参数:**
+
+- **rewards:** - 当前模型的rewards,分数越大越好。
+- **\*\*kwargs:** - 附加的参数,取决于具体的强化学习算法。
+
+**示例代码:**
+
+.. code-block:: python
+
+ import paddle.fluid as fluid
+ from paddleslim.nas import RLNAS
+ config = [('MobileNetV2Space')]
+ rlnas = RLNAS(key='lstm', configs=config)
+ rlnas.next_archs(1)
+ rlnas.reward(1.0)
+
+.. note::
+ reward这一步必须在 ``next_archs`` 之后执行。
+..
+
+.. py:method:: final_archs(batch_obs):
+
+获取最终的模型结构。一般在controller训练完成之后会获取几十个模型结构进行完整的实验。
+
+**参数:**
+
+- **batch_obs** - 需要获取的模型结构数量或者当前模型的observations。
+
+**返回:**
+返回模型结构实例的列表,形式为list。
+
+**示例代码:**
+
+.. code-block:: python
+
+ import paddle.fluid as fluid
+ from paddleslim.nas import RLNAS
+ config = [('MobileNetV2Space')]
+ rlnas = RLNAS(key='lstm', configs=config)
+ archs = rlnas.final_archs(1)
+ print(archs)
+
+.. py:method:: tokens2arch(tokens):
+
+通过一组tokens得到实际的模型结构,一般用来把搜索到最优的token转换为模型结构用来做最后的训练。tokens的形式是一个列表,tokens映射到搜索空间转换成相应的网络结构,一组tokens对应唯一的一个网络结构。
+
+**参数:**
+
+- **tokens(list):** - 一组tokens。tokens的长度和范围取决于搜索空间。
+
+**返回:**
+根据传入的token得到一个模型结构实例列表。
+
+**示例代码:**
+
+.. code-block:: python
+
+ import paddle.fluid as fluid
+ from paddleslim.nas import RLNAS
+ config = [('MobileNetV2Space')]
+ rlnas = RLNAS(key='lstm', configs=config)
+ input = fluid.data(name='input', shape=[None, 3, 32, 32], dtype='float32')
+ tokens = ([0] * 25)
+ archs = rlnas.tokens2arch(tokens)[0]
+ print(archs(input))
+
diff --git a/docs/zh_cn/api_cn/nas_index.rst b/docs/zh_cn/api_cn/nas_index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a42d781d84772f1ca24acc9e4d83f22976ff9c0a
--- /dev/null
+++ b/docs/zh_cn/api_cn/nas_index.rst
@@ -0,0 +1,13 @@
+
+模型搜索 NAS
+======================================
+
+.. toctree::
+ :maxdepth: 1
+
+ nas_api.rst
+ search_space.md
+ table_latency.md
+ early_stop.rst
+ one_shot_api.rst
+ darts.rst
diff --git a/docs/zh_cn/api_cn/one_shot_api.rst b/docs/zh_cn/api_cn/one_shot_api.rst
new file mode 100644
index 0000000000000000000000000000000000000000..57ab5e5b936de5d800cb1dd9657a87d0b2fc2232
--- /dev/null
+++ b/docs/zh_cn/api_cn/one_shot_api.rst
@@ -0,0 +1,157 @@
+OneShotNAS
+=============
+
+OneShotSearch
+------------------
+
+.. py:function:: paddleslim.nas.one_shot.OneShotSearch(model, eval_func, strategy='sa', search_steps=100)
+
+从超级网络中搜索出一个最佳的子网络。
+
+**参数:**
+
+- **model(fluid.dygraph.layer):** 由在 ``OneShotSuperNet`` 前后添加若干模块构建而成的动态图模块。因为 ``OneShotSuperNet`` 是一个超网络,所以 ``model`` 也是一个超网络。换句话说,在 ``model`` 模块的子模块中,至少有一个是 ``OneShotSuperNet`` 的实例。该方法从 ``model`` 超网络中搜索得到一个最佳的子网络。超网络 ``model`` 需要先被训练,具体细节请参考[OneShotSuperNet]()。
+
+- **eval_func:** 用于评估子网络性能的回调函数。该回调函数需要接受 ``model`` 为参数,并调用 ``model`` 的 ``forward`` 方法进行性能评估。
+
+- **strategy(str):** 搜索策略的名称。默认为 ``sa`` , 当前仅支持 ``sa`` .
+
+- **search_steps(int):** 搜索轮次数。默认为100。
+
+**返回:**
+
+- **best_tokens:** 表示最佳子网络的编码信息(tokens)。
+
+**示例代码:**
+
+请参考[one-shot NAS示例]()
+
+
+OneShotSuperNet
+-----------------
+
+.. py:class:: paddleslim.nas.one_shot.OneShotSuperNet(name_scope)
+
+用于`OneShot`搜索策略的超级网络的基类,所有超级网络的实现要继承该类。
+
+**参数:**
+
+- **name_scope(str):** 超级网络的命名空间。
+
+**返回:**
+
+- **super_net:** 一个`OneShotSuperNet`实例。
+
+
+ .. py:method:: init_tokens()
+
+ 获得当前超级网络的初始化子网络的编码,主要用于搜索。
+
+ **返回:**
+
+ - **tokens(list):** 一个子网络的编码。
+
+ .. py:method:: range_table()
+
+ 超级网络中各个子网络由一组整型数字编码表示,该方法返回编码每个位置的取值范围。
+
+ **返回:**
+
+ - **range_table(tuple):** 子网络编码每一位的取值范围。 ``range_table`` 格式为 ``(min_values, max_values)`` ,其中, ``min_values`` 为一个整型数组,表示每个编码位置可选取的最小值; ``max_values`` 表示每个编码位置可选取的最大值。
+
+ .. py:method:: _forward_impl(input, tokens)
+
+ Forward computation; subclasses of ``OneShotSuperNet`` must implement this method.
+
+ **Parameters:**
+
+ - **input(Variable):** Input of the super network.
+
+ - **tokens(list):** Encoding of the sub-network used for the forward pass. Default: ``None``, meaning a randomly sampled sub-network is used.
+
+ **Returns:**
+
+ - **output(Variable):** Output of the forward pass.
+
+ .. py:method:: forward(self, input, tokens=None)
+
+ Run the forward computation.
+
+ **Parameters:**
+
+ - **input(Variable):** Input of the super network.
+
+ - **tokens(list):** Encoding of the sub-network used for the forward pass. Default: ``None``, meaning a randomly sampled sub-network is used.
+
+ **Returns:**
+
+ - **output(Variable):** Output of the forward pass.
+
+
+ .. py:method:: _random_tokens()
+
+ Randomly sample a sub-network and return its encoding.
+
+ **Returns:**
+
+ - **tokens(list):** The encoding of one sub-network.
+
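+A minimal sketch of a custom super network, under assumed settings: a single searchable conv layer whose width is picked from a hypothetical ``channels`` list (import paths follow the directives above):
+
+.. code-block:: python
+
+    from paddle.fluid.dygraph import Conv2D
+    from paddleslim.nas.one_shot import OneShotSuperNet
+
+    class TinySuperNet(OneShotSuperNet):
+        def __init__(self, name_scope, in_channels=16):
+            super(TinySuperNet, self).__init__(name_scope)
+            self.channels = [8, 16, 24, 32]  # hypothetical candidate widths
+            # pre-build one conv per candidate width so that weights are
+            # shared across forward passes selecting the same token
+            self.convs = []
+            for i, c in enumerate(self.channels):
+                conv = Conv2D(in_channels, c, 3, padding=1)
+                self.add_sublayer("conv_%d" % i, conv)
+                self.convs.append(conv)
+
+        def init_tokens(self):
+            return [0]  # start from the narrowest candidate
+
+        def range_table(self):
+            # (min_values, max_values) for every token position
+            return ([0], [len(self.channels)])
+
+        def _forward_impl(self, input, tokens):
+            return self.convs[tokens[0]](input)
+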
+SuperMnasnet
+--------------
+
+
+.. py:class:: paddleslim.nas.one_shot.SuperMnasnet(name_scope, input_channels=3, out_channels=1280, repeat_times=[6, 6, 6, 6, 6, 6], stride=[1, 1, 1, 1, 2, 1], channels=[16, 24, 40, 80, 96, 192, 320], use_auxhead=False)
+
+A super network adapted from `Mnasnet `_; this class inherits from ``OneShotSuperNet``.
+
+**Parameters:**
+
+- **name_scope(str):** Name scope.
+
+- **input_channels(int):** Number of channels of the super network's input feature map.
+
+- **out_channels(int):** Number of channels of the super network's output feature map.
+
+- **repeat_times(list):** Number of repetitions of each kind of ``block``.
+
+- **stride(list):** Blocks of one kind are stacked into a ``repeat_block``; ``stride`` gives the down-sampling ratio of each ``repeat_block``.
+
+- **channels(list):** ``channels[i]`` and ``channels[i+1]`` are the numbers of input and output channels of the i-th ``repeat_block``, respectively.
+
+- **use_auxhead(bool):** Whether to produce an auxiliary feature map. If ``True``, ``SuperMnasnet`` returns an auxiliary feature map in addition to the output feature map. Default: False.
+
+**Returns:**
+
+- **instance(SuperMnasnet):** A ``SuperMnasnet`` instance.
+
+**Example:**
+
+.. code-block:: python
+
+    import paddle
+    import paddle.fluid as fluid
+    from paddle.fluid.dygraph import Linear
+    from paddleslim.nas.one_shot import SuperMnasnet
+
+    class MNIST(fluid.dygraph.Layer):
+        def __init__(self):
+            super(MNIST, self).__init__()
+            self.arch = SuperMnasnet(
+                name_scope="super_net", input_channels=20, out_channels=20)
+            self.pool_2_shape = 50 * 13 * 13
+            SIZE = 10
+            scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5
+            self._fc = Linear(
+                self.pool_2_shape,
+                10,
+                param_attr=fluid.param_attr.ParamAttr(
+                    initializer=fluid.initializer.NormalInitializer(
+                        loc=0.0, scale=scale)),
+                act="softmax")
+
+        def forward(self, inputs, label=None, tokens=None):
+            x = self.arch(inputs, tokens=tokens)
+            x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape])
+            x = self._fc(x)
+            if label is not None:
+                acc = fluid.layers.accuracy(input=x, label=label)
+                return x, acc
+            else:
+                return x
diff --git a/docs/zh_cn/api_cn/pantheon_api.md b/docs/zh_cn/api_cn/pantheon_api.md
new file mode 100644
index 0000000000000000000000000000000000000000..87fc67249c5dd91368e0c256cf552fb09aa48685
--- /dev/null
+++ b/docs/zh_cn/api_cn/pantheon_api.md
@@ -0,0 +1,268 @@
+# Pantheon: A Large-scale Scalable Knowledge Distillation Framework
+
+## Teacher
+
+pantheon.Teacher() [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/teacher.py#L78)
+
+: The class defined for the teacher model. Generates knowledge data and transfers it to the student model.
+
+**Args:**
+
+- **out\_path (str|None)** - The path to dump knowledge data for offline mode.
+
+- **out\_port (int|None)** - The IP port number to send out knowledge for online mode, should be unique when launching multiple teachers in the same node.
+
+**Return:** An object of class Teacher
+
+
+pantheon.Teacher.start() [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/teacher.py#L133)
+
+: Start the teacher service, synchronize with the student, and launch the thread
+ to monitor commands from the student.
+
+**Args:** None
+
+**Return:** None
+
+
+pantheon.Teacher.send(data) [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/teacher.py#L181)
+
+: Send one data object to the student.
+
+**Args:**
+
+- **data (Python data):** - The data to be sent, which can be any type of Python data object.
+
+**Return:** None
+
+
+pantheon.Teacher.recv() [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/teacher.py#L196)
+
+: Receive one data object from the student.
+
+**Args:** None
+
+**Return:**
+
+- The received data, which can be any type of Python data object.
+
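+A hedged illustration of pairing `send` and `recv` for a simple command round-trip (it assumes a `teacher` that has already called `start()`, as in the example further below):
+
+```python
+teacher.send({"cmd": "ready"})  # push one object to the student
+reply = teacher.recv()          # block until the student responds
+```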
+
+pantheon.Teacher.dump(knowledge) [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/teacher.py#L214)
+
+: Dump one batch of knowledge data into the output file; only used in offline mode.
+
+**Args:**
+
+- **knowledge (dict):** - The knowledge data to be dumped.
+
+**Return:** None
+
+
+pantheon.Teacher.start\_knowledge\_service(feed\_list, schema, program, reader\_config, exe, buf\_size=10, times=1) [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/teacher.py#L259)
+
+: Start the knowledge service to generate and transfer knowledge data. In GPU mode, the devices used for knowledge prediction are determined by the
+ environment variable **FLAGS\_selected\_gpus**, or by **CUDA\_VISIBLE\_DEVICES** if it is not set; in CPU mode they are determined by **CPU\_NUM** (default 1). Only supported in static graph mode.
+
+ **Args:**
+
+ - **feed\_list (list):** - A list of feed Variables or their names for the
+ input teacher Program.
+ - **schema (dict):** - A dict to specify keys and fetched Variables
+ to generate knowledge.
+ - **program (fluid.Program):** - Inference Program of the teacher model.
+ - **reader\_config (dict):** - The config for the data reader. It supports all three types of generators used by [fluid.io.PyReader](https://www.paddlepaddle.org.cn/documentation/docs/en/api/io/PyReader.html) and [fluid.io.DataLoader](https://www.paddlepaddle.org.cn/documentation/docs/en/api/io/DataLoader.html#dataloader); each config contains a key-value pair of the generator type and a generator object, plus other necessary arguments. See the following:
+
+ 1) **sample generator:**
+
+ ```
+ reader_config={"sample_generator": some_sample_generator,
+ "batch_size": batch_size, "drop_last": drop_last}
+ # drop_last set to True by default
+ ```
+
+ 2) **sample list generator:**
+
+ ```
+ reader_config={"sample_list_generator": some_sample_list_generator}
+ ```
+
+ 3) **batch generator:**
+
+ ```
+    reader_config={"batch_generator": some_batch_generator}
+ ```
+
+ The config is parsed by trying these keys in the order 1) -> 2) -> 3); any other unrelated keys in the config are ignored.
+
+- **exe (fluid.Executor):** The executor to run the input program.
+- **buf\_size (int):** The size of buffers for data reader and knowledge
+ writer on each device.
+- **times (int):** The maximum number of times the service may repeat, default 1. Each
+ call to the public method **get\_knowledge\_generator()** of the **Student**
+ object increases the serving count by one; when the maximum is
+ reached, the service ends. Only valid in online mode; ignored
+ in offline mode.
+
+**Return:** None
+
+**Examples:**
+
+```python
+import paddle
+import paddle.fluid as fluid
+from paddleslim.pantheon import Teacher
+
+startup = fluid.Program()
+program = fluid.Program()
+with fluid.program_guard(program, startup):
+ images = fluid.data(
+ name='pixel', shape=[None, 3 * 32 * 32], dtype='float32')
+ labels = fluid.data(name='label', shape=[None, 1], dtype='int64')
+ logits = fluid.layers.fc(input=images, size=10)
+ loss = fluid.layers.softmax_with_cross_entropy(logits, labels)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+exe.run(startup)
+
+train_reader = paddle.fluid.io.batch(
+ paddle.dataset.cifar.train10(), batch_size=32)
+
+teacher = Teacher(out_path="example_knowledge.dat", # offline mode
+ #out_port=5000 # online mode
+ )
+teacher.start()
+
+teacher.start_knowledge_service(
+ feed_list=[images, labels],
+ schema={"logits": logits,
+ "labels": labels},
+ program=program,
+ reader_config={"sample_list_generator": train_reader},
+ exe=exe)
+```
+
+!!! note "Note"
+ This example should be run with the example of class **Student**.
+
+
+## Student
+
+pantheon.Student(merge_strategy=None) [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/student.py#L34)
+
+: The class defined for the student model. Receives knowledge data from
+ teacher models and carries out knowledge merging.
+
+ **Args:**
+
+ - **merge\_strategy (dict|None):** - A dict whose keys are the common schemas shared by different teachers, and whose values specify the merging strategy for the corresponding schema; **sum** and **mean** are supported now.
+
+**Return:** An object of class Student.
+
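+A hedged illustration of `merge_strategy`: if two registered teachers both produce a schema named "logits", the following would average their knowledge tensors batch by batch (the schema name is an assumption for illustration):
+
+```python
+from paddleslim.pantheon import Student
+
+# average the "logits" schema shared by multiple teachers
+student = Student(merge_strategy={"logits": "mean"})
+```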
+
+pantheon.Student.register\_teacher(in\_path=None, in\_address=None) [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/student.py#L72)
+
+: Register one teacher model and assign it an id according to its registration order, together with the file path (offline mode) or IP address (online mode) that the teacher model writes knowledge data to.
+
+**Args:**
+
+- **in\_path (str|None):** The input file path. Default None.
+- **in\_address (str|None):** The input IP address, in the format "<IP\_address>:<IP\_port>" (e.g. "127.0.0.1:8080"). Default None.
+
+**Return:** None
+
+
+pantheon.Student.start() [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/student.py#L213)
+
+: End teachers' registration and synchronize with all of them.
+
+**Args:** None
+
+**Return:** None
+
+pantheon.Student.send(self, data, teacher_ids=None) [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/student.py#L240)
+
+: Send data to teachers.
+
+**Args:**
+
+- **data (Python data):** - A Python data object to be sent.
+- **teacher_ids (list|None):** - A list of teacher ids to send data. If set to None, send the data to all teachers. Default None.
+
+**Return:** None
+
+pantheon.Student.recv(teacher_id) [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/student.py#L262)
+
+: Receive data from one teacher.
+
+ **Args:**
+
+- **teacher\_id (int):** - The id of the teacher to receive data from.
+
+**Return:**
+
+- The received data object.
+
+pantheon.Student.get\_knowledge\_desc() [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/student.py#L283)
+
+ : Get the description of the knowledge, including the shape, data type and lod level of each schema.
+
+ **Args:** None
+
+ **Return:**
+
+ - Knowledge description, which is a dict.
+
+
+pantheon.Student.get\_knowledge\_qsize() [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/student.py#L318)
+
+ : Get the real-time size of the knowledge queue. If this size is denoted as
+ **qsize**, it means that **qsize** batches of knowledge data have already
+ been pushed into the knowledge queue and are waiting for the knowledge
+ generator to pop them out. The size is dynamic and capped at 100, the
+ capacity of the knowledge queue.
+
+ **Args:** None
+
+ **Return:**
+
+ - The real-time size of knowledge queue.
+
+pantheon.Student.get\_knowledge\_generator(batch\_size, drop\_last=False) [source](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/pantheon/student.py#L334)
+
+: Get the generator for knowledge data; returns None if the last generator has not finished yet.
+
+**Args:**
+
+- **batch\_size (int):** - The batch size of returned knowledge data.
+- **drop\_last (bool):** - Whether to drop the last batch if its size is less than batch size.
+
+**Return:**
+
+- The wrapper of knowledge data generator.
+
+**Examples:**
+
+```python
+from paddleslim.pantheon import Student
+
+student = Student()
+
+student.register_teacher(in_path="example_knowledge.dat", # offline mode
+ #in_address="127.0.0.1:5000" # online mode
+ )
+student.start()
+
+knowledge_desc = student.get_knowledge_desc()
+data_generator = student.get_knowledge_generator(
+ batch_size=128, drop_last=True)
+
+# get knowledge data
+for knowledge in data_generator():
+ print("knowledge queue size: {}".format(student.get_knowledge_qsize()))
+
+ # do something else
+```
+
+!!! note "Note"
+ This example should be run with the example of class **Teacher**.
diff --git a/docs/zh_cn/api_cn/prune_api.rst b/docs/zh_cn/api_cn/prune_api.rst
new file mode 100644
index 0000000000000000000000000000000000000000..540dae043d9c81dd3d9f5146cbaf318bbd4e091c
--- /dev/null
+++ b/docs/zh_cn/api_cn/prune_api.rst
@@ -0,0 +1,430 @@
+Channel Pruning for Convolution Layers
+=======================================
+
+Pruner
+----------
+
+.. py:class:: paddleslim.prune.Pruner(criterion="l1_norm")
+
+`Source code `_
+
+Prune the channels of a convolutional network once. Pruning a convolution layer's channels means pruning that layer's output channels. A convolution layer's weight has the shape ``[output_channel, input_channel, kernel_size, kernel_size]``; pruning the first dimension of this weight reduces the number of output channels.
+
+**Parameters:**
+
+- **criterion** - The metric used to evaluate the importance of the channels within a convolution layer. Currently supports ``l1_norm`` , ``bn_scale`` and ``geometry_median`` . Default: ``l1_norm`` . With ``bn_scale`` , the pruning algorithm evaluates channel importance by the absolute value of the Scale parameter of the BatchNorm layer following the convolution. With ``geometry_median`` , channel importance is evaluated based on the geometric median of the channels within the convolution layer. If this argument is omitted when constructing a Pruner() instance, the default ``l1_norm`` is used; pass criterion explicitly to change the pruning strategy.
+- **idx_selector** - The strategy for choosing which channel indices to prune, given the channel importance scores. Currently supports ``default_idx_selector`` and ``optimal_threshold`` . Default: ``default_idx_selector`` . The ``default_idx_selector`` strategy selects the channels to prune according to their importance scores. The ``optimal_threshold`` strategy is used together with the ``bn_scale`` criterion: set ``criterion`` to ``bn_scale`` and this argument to ``optimal_threshold`` , and an optimal pruning threshold is computed from the Scale parameters of the BatchNorm layer following the convolution, against which channels are then pruned. If this argument is omitted when constructing a Pruner() instance, the default ``default_idx_selector`` is used.
+
+**Returns:** An instance of the Pruner class
+
+**Example:**
+
+.. code-block:: python
+
+ from paddleslim.prune import Pruner
+ pruner = Pruner()
+..
+
+ .. py:method:: paddleslim.prune.Pruner.prune(program, scope, params, ratios, place=None, lazy=False, only_graph=False, param_backup=False, param_shape_backup=False)
+
+ Prune the weights of a group of convolution layers in the target network.
+
+ **Parameters:**
+
+ - **program(paddle.fluid.Program)** - The target network to prune. For more about Program, see `Introduction to Program `_.
+
+ - **scope(paddle.fluid.Scope)** - The ``scope`` holding the weights to prune; in Paddle, a ``scope`` instance stores parameter values and runtime variables. Parameter values in the Scope are pruned ``inplace`` . For more, see `Introduction to Scope <>`_
+
+ - **params(list)** - Names of the parameters of the convolution layers to prune. All parameter names in a model can be listed as follows:
+
+ .. code-block:: python
+
+ for block in program.blocks:
+ for param in block.all_parameters():
+ print("param: {}; shape: {}".format(param.name, param.shape))
+
+ - **ratios(list)** - Pruning ratios applied to ``params`` , given as a list whose length must equal that of ``params`` .
+
+ - **place(paddle.fluid.Place)** - The device where the parameters to prune reside; either ``CUDAPlace`` or ``CPUPlace`` . [Introduction to Place]()
+
+ - **lazy(bool)** - If ``lazy`` is True, pruning is performed by zeroing out the selected channels, so the parameter ``shape`` stays unchanged; if ``lazy`` is False, the selected channels are removed and the parameter ``shape`` changes.
+
+ - **only_graph(bool)** - Whether to prune only the network structure. In Paddle, a Program defines the network structure while a Scope stores parameter values, and one Scope instance can be shared by several Programs; for example, the training Program and the test Program may use the same Scope. If ``only_graph`` is True, only the convolution channels defined in the Program are pruned; if False, the convolution parameter values in the Scope are pruned as well. Default: False.
+
+ - **param_backup(bool)** - Whether to return a backup of the parameter values. Default: False.
+
+ - **param_shape_backup(bool)** - Whether to return a backup of the parameter ``shape`` . Default: False.
+
+ **Returns:**
+
+ - **pruned_program(paddle.fluid.Program)** - The pruned Program.
+
+ - **param_backup(dict)** - A backup of the parameter values, used to restore them in the Scope.
+
+ - **param_shape_backup(dict)** - A backup of the parameter shapes.
+
+ **Example:**
+
+ Open `AIStudio `_ to run the following example.
+
+ .. code-block:: python
+
+ import paddle.fluid as fluid
+ from paddle.fluid.param_attr import ParamAttr
+ from paddleslim.prune import Pruner
+
+ def conv_bn_layer(input,
+ num_filters,
+ filter_size,
+ name,
+ stride=1,
+ groups=1,
+ act=None):
+ conv = fluid.layers.conv2d(
+ input=input,
+ num_filters=num_filters,
+ filter_size=filter_size,
+ stride=stride,
+ padding=(filter_size - 1) // 2,
+ groups=groups,
+ act=None,
+ param_attr=ParamAttr(name=name + "_weights"),
+ bias_attr=False,
+ name=name + "_out")
+ bn_name = name + "_bn"
+ return fluid.layers.batch_norm(
+ input=conv,
+ act=act,
+ name=bn_name + '_output',
+ param_attr=ParamAttr(name=bn_name + '_scale'),
+ bias_attr=ParamAttr(bn_name + '_offset'),
+ moving_mean_name=bn_name + '_mean',
+ moving_variance_name=bn_name + '_variance', )
+
+ main_program = fluid.Program()
+ startup_program = fluid.Program()
+ # X X O X O
+ # conv1-->conv2-->sum1-->conv3-->conv4-->sum2-->conv5-->conv6
+ # | ^ | ^
+ # |____________| |____________________|
+ #
+ # X: prune output channels
+ # O: prune input channels
+ with fluid.program_guard(main_program, startup_program):
+ input = fluid.data(name="image", shape=[None, 3, 16, 16])
+ conv1 = conv_bn_layer(input, 8, 3, "conv1")
+ conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+ sum1 = conv1 + conv2
+ conv3 = conv_bn_layer(sum1, 8, 3, "conv3")
+ conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
+ sum2 = conv4 + sum1
+ conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
+ conv6 = conv_bn_layer(conv5, 8, 3, "conv6")
+
+ place = fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ scope = fluid.Scope()
+ exe.run(startup_program, scope=scope)
+ # Initiallize Pruner() instance with default criterion and idx_selector
+ pruner = Pruner()
+ # Set criterion
+ # criterion = 'geometry_median'
+ # pruner = Pruner(criterion=criterion)
+ # Set criterion and idx_selector
+ # criterion = 'bn_scale'
+ # idx_selector = 'optimal_threshold'
+ # pruner = Pruner(criterion=criterion, idx_selector=idx_selector)
+
+ main_program, _, _ = pruner.prune(
+ main_program,
+ scope,
+ params=["conv4_weights"],
+ ratios=[0.5],
+ place=place,
+ lazy=False,
+ only_graph=False,
+ param_backup=False,
+ param_shape_backup=False)
+
+ for param in main_program.global_block().all_parameters():
+ if "weights" in param.name:
+ print("param name: {}; param shape: {}".format(param.name, param.shape))
+
+
+sensitivity
+--------------
+
+.. py:function:: paddleslim.prune.sensitivity(program, place, param_names, eval_func, sensitivities_file=None, pruned_ratios=None)
+
+`Source code `_
+
+Compute the sensitivity of every convolution layer in the network. The sensitivity of a layer is measured by pruning different ratios of its output channels in turn and computing the resulting accuracy loss on the test set. Given the sensitivity information, the pruning ratio of each convolution layer can then be decided by inspection or by other means.
+
+**Parameters:**
+
+- **program(paddle.fluid.Program)** - The target network to analyze. For more about Program, see `Introduction to Program `_.
+
+- **place(paddle.fluid.Place)** - The device where the parameters to analyze reside; either ``CUDAPlace`` or ``CPUPlace`` . [Introduction to Place]()
+
+- **param_names(list)** - Names of the parameters of the convolution layers to analyze. All parameter names in a model can be listed as follows:
+
+.. code-block:: python
+
+    for block in program.blocks:
+        for param in block.all_parameters():
+            print("param: {}; shape: {}".format(param.name, param.shape))
+
+- **eval_func(function)** - Callback used to evaluate the pruned model. It takes the pruned ``program`` as its argument and returns a score representing the accuracy of that program, which is used to compute the accuracy loss caused by pruning.
+
+- **sensitivities_file(str)** - A local file for storing the sensitivity information. During computation, newly computed entries are continuously appended to this file; after a job restart, entries already in the file are not recomputed. The file can be loaded with ``pickle`` .
+
+- **pruned_ratios(list)** - The channel pruning ratios applied in turn when computing each layer's sensitivity. Default: ``[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]`` .
+
+**Returns:**
+
+- **sensitivities(dict)** - A dict holding the sensitivity information, in the format:
+
+.. code-block:: python
+
+ {"weight_0":
+ {0.1: 0.22,
+ 0.2: 0.33
+ },
+ "weight_1":
+ {0.1: 0.21,
+ 0.2: 0.4
+ }
+ }
+
+where ``weight_0`` is the name of a convolution layer's parameter; within ``sensitivities['weight_0']``, each key is a pruning ratio and the corresponding value is the fractional accuracy loss.
+
+**Example:**
+
+Open `AIStudio `_ to run the following example.
+
+.. code-block:: python
+
+ import paddle
+ import numpy as np
+ import paddle.fluid as fluid
+ from paddle.fluid.param_attr import ParamAttr
+ from paddleslim.prune import sensitivity
+ import paddle.dataset.mnist as reader
+
+ def conv_bn_layer(input,
+ num_filters,
+ filter_size,
+ name,
+ stride=1,
+ groups=1,
+ act=None):
+ conv = fluid.layers.conv2d(
+ input=input,
+ num_filters=num_filters,
+ filter_size=filter_size,
+ stride=stride,
+ padding=(filter_size - 1) // 2,
+ groups=groups,
+ act=None,
+ param_attr=ParamAttr(name=name + "_weights"),
+ bias_attr=False,
+ name=name + "_out")
+ bn_name = name + "_bn"
+ return fluid.layers.batch_norm(
+ input=conv,
+ act=act,
+ name=bn_name + '_output',
+ param_attr=ParamAttr(name=bn_name + '_scale'),
+ bias_attr=ParamAttr(bn_name + '_offset'),
+ moving_mean_name=bn_name + '_mean',
+ moving_variance_name=bn_name + '_variance', )
+
+ main_program = fluid.Program()
+ startup_program = fluid.Program()
+ # X X O X O
+ # conv1-->conv2-->sum1-->conv3-->conv4-->sum2-->conv5-->conv6
+ # | ^ | ^
+ # |____________| |____________________|
+ #
+ # X: prune output channels
+ # O: prune input channels
+ image_shape = [1,28,28]
+ with fluid.program_guard(main_program, startup_program):
+ image = fluid.data(name='image', shape=[None]+image_shape, dtype='float32')
+ label = fluid.data(name='label', shape=[None, 1], dtype='int64')
+ conv1 = conv_bn_layer(image, 8, 3, "conv1")
+ conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+ sum1 = conv1 + conv2
+ conv3 = conv_bn_layer(sum1, 8, 3, "conv3")
+ conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
+ sum2 = conv4 + sum1
+ conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
+ conv6 = conv_bn_layer(conv5, 8, 3, "conv6")
+ out = fluid.layers.fc(conv6, size=10, act="softmax")
+ # cost = fluid.layers.cross_entropy(input=out, label=label)
+ # avg_cost = fluid.layers.mean(x=cost)
+ acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+ # acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+
+
+ place = fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(startup_program)
+
+ val_reader = paddle.fluid.io.batch(reader.test(), batch_size=128)
+ val_feeder = feeder = fluid.DataFeeder(
+ [image, label], place, program=main_program)
+
+ def eval_func(program):
+
+ acc_top1_ns = []
+ for data in val_reader():
+ acc_top1_n = exe.run(program,
+ feed=val_feeder.feed(data),
+ fetch_list=[acc_top1.name])
+ acc_top1_ns.append(np.mean(acc_top1_n))
+ return np.mean(acc_top1_ns)
+ param_names = []
+ for param in main_program.global_block().all_parameters():
+ if "weights" in param.name:
+ param_names.append(param.name)
+ sensitivities = sensitivity(main_program,
+ place,
+ param_names,
+ eval_func,
+ sensitivities_file="./sensitive.data",
+ pruned_ratios=[0.1, 0.2, 0.3])
+ print(sensitivities)
+
+
+merge_sensitive
+----------------
+
+.. py:function:: paddleslim.prune.merge_sensitive(sensitivities)
+
+`Source code `_
+
+Merge multiple pieces of sensitivity information.
+
+Parameters:
+
+- **sensitivities(list)** - The sensitivity information to merge: either a list of dicts, or a list of paths to files storing sensitivity information.
+
+Returns:
+
+- **sensitivities(dict)** - The merged sensitivity information, in the format:
+
+.. code-block:: python
+
+ {"weight_0":
+ {0.1: 0.22,
+ 0.2: 0.33
+ },
+ "weight_1":
+ {0.1: 0.21,
+ 0.2: 0.4
+ }
+ }
+
+
+where ``weight_0`` is the name of a convolution layer's parameter; within ``sensitivities['weight_0']``, each key is a pruning ratio and the corresponding value is the fractional accuracy loss.
+
+Example:
+
+.. code-block:: python
+
+ from paddleslim.prune import merge_sensitive
+ sen0 = {"weight_0":
+ {0.1: 0.22,
+ 0.2: 0.33
+ },
+ "weight_1":
+ {0.1: 0.21,
+ 0.2: 0.4
+ }
+ }
+ sen1 = {"weight_0":
+ {0.3: 0.41,
+ },
+ "weight_2":
+ {0.1: 0.10,
+ 0.2: 0.35
+ }
+ }
+ sensitivities = merge_sensitive([sen0, sen1])
+ print(sensitivities)
+
+
+load_sensitivities
+---------------------
+
+.. py:function:: paddleslim.prune.load_sensitivities(sensitivities_file)
+
+`Source code `_
+
+Load sensitivity information from a file.
+
+Parameters:
+
+- **sensitivities_file(str)** - The local file storing the sensitivity information.
+
+Returns:
+
+- **sensitivities(dict)** - The sensitivity information.
+
+Example:
+
+.. code-block:: python
+
+ import pickle
+ from paddleslim.prune import load_sensitivities
+ sen = {"weight_0":
+ {0.1: 0.22,
+ 0.2: 0.33
+ },
+ "weight_1":
+ {0.1: 0.21,
+ 0.2: 0.4
+ }
+ }
+ sensitivities_file = "sensitive_api_demo.data"
+ with open(sensitivities_file, 'wb') as f:
+ pickle.dump(sen, f)
+ sensitivities = load_sensitivities(sensitivities_file)
+ print(sensitivities)
+
+get_ratios_by_loss
+-------------------
+
+.. py:function:: paddleslim.prune.get_ratios_by_loss(sensitivities, loss)
+
+`Source code `_
+
+Compute a set of pruning ratios from the sensitivities and an accuracy-loss threshold. For a parameter ``w`` , its pruning ratio is the largest ratio that keeps the accuracy loss below ``loss`` .
+
+**Parameters:**
+
+- **sensitivities(dict)** - The sensitivity information.
+
+- **loss** - The accuracy-loss threshold.
+
+**Returns:**
+
+- **ratios(dict)** - A set of pruning ratios. Each ``key`` is the name of a parameter to prune, and the ``value`` is its pruning ratio.
+
+**Example:**
+
+.. code-block:: python
+
+ from paddleslim.prune import get_ratios_by_loss
+ sen = {"weight_0":
+ {0.1: 0.22,
+ 0.2: 0.33
+ },
+ "weight_1":
+ {0.1: 0.21,
+ 0.2: 0.4
+ }
+ }
+
+ ratios = get_ratios_by_loss(sen, 0.3)
+ print(ratios)
diff --git a/docs/zh_cn/api_cn/prune_index.rst b/docs/zh_cn/api_cn/prune_index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ad2c8ffd3cf7784aeb3a3578c63efcb937775254
--- /dev/null
+++ b/docs/zh_cn/api_cn/prune_index.rst
@@ -0,0 +1,8 @@
+
+Model Pruning (Prune)
+======================================
+
+.. toctree::
+ :maxdepth: 1
+
+ prune_api.rst
diff --git a/docs/zh_cn/api_cn/quant_index.rst b/docs/zh_cn/api_cn/quant_index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..27fa8f298594188457a9c85e514862d7db95b6eb
--- /dev/null
+++ b/docs/zh_cn/api_cn/quant_index.rst
@@ -0,0 +1,8 @@
+
+Model Quantization (Quant)
+======================================
+
+.. toctree::
+ :maxdepth: 1
+
+ quantization_api.rst
diff --git a/docs/zh_cn/api_cn/quantization_api.rst b/docs/zh_cn/api_cn/quantization_api.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1b3358b16cf772550990e2f824872f901bc2c4bb
--- /dev/null
+++ b/docs/zh_cn/api_cn/quantization_api.rst
@@ -0,0 +1,431 @@
+Quantization
+============
+
+Model quantization includes three methods: dynamic post-training quantization, static post-training quantization, and quantization-aware training.
+
+The figure below shows how to choose among these quantization methods.
+
+
+.. image:: https://user-images.githubusercontent.com/52520497/83991261-cbe55800-a97e-11ea-880c-d83fb7924454.png
+ :scale: 80 %
+ :alt: Figure 1: Choosing a quantization method
+ :align: center
+
+The figure below compares the quantization methods in terms of prerequisites, ease of use, accuracy loss, and expected benefits.
+
+.. image:: https://user-images.githubusercontent.com/52520497/83991268-cee04880-a97e-11ea-9ecd-2d0f04a15205.png
+ :scale: 80 %
+ :alt: Figure 2: Comparison of quantization methods
+ :align: center
+
+quant_post_dynamic
+-------------------
+
+.. py:function:: paddleslim.quant.quant_post_dynamic(model_dir, save_model_dir, model_filename=None, params_filename=None, save_model_filename=None, save_params_filename=None, quantizable_op_type=["conv2d", "mul"], weight_bits=8, generate_test_model=False)
+
+`Source code `_
+
+Dynamic post-training quantization quantizes the weights of specific OPs in a model from FP32 to INT8/16.
+
+The quantized model can be used for prediction in two ways. The first dequantizes the INT8/16 weights back to FP32 and then runs prediction with FP32 floating-point operations. The second dynamically computes the quantization parameters of the quantized OPs' inputs during prediction and performs INT8 integer arithmetic on the quantized inputs and weights.
+
+Note that currently only PaddleLite supports the first (dequantization) prediction mode; server-side prediction (PaddleInference) cannot load such quantized models.
+
+**Prerequisites:**
+
+* A trained inference model
+
+**Steps:**
+
+* Produce the quantized model: call the dynamic post-training quantization API of PaddlePaddle to produce the quantized model
+* Predict with the quantized model: load the quantized model with PaddleLite for inference
+
+**Pros:**
+
+* With weights quantized to INT16, model accuracy is unaffected and model size is halved
+* With weights quantized to INT8, model accuracy is affected and model size shrinks to 1/4
+
+**Cons:**
+
+* Since PaddleLite currently only supports the dequantization mode, the main benefit is a smaller model; for models whose weight loading is slow it can also bring some speed-up
+
+
+**Parameters:**
+
+- **model_dir(str)** - Path of the model to quantize.
+- **save_model_dir(str)** - Path where the quantized model is saved.
+- **model_filename(str, optional)** - Name of the model file. If the model's topology is stored in a single file, set ``model_filename`` to that file's name; otherwise set it to ``None`` . Default: ``None`` .
+- **params_filename(str, optional)** - Name of the parameters file. If the model's parameters are stored in a single file, set ``params_filename`` to that file's name; otherwise set it to ``None`` . Default: ``None`` .
+- **save_model_filename(str, optional)** - Model file name used when saving the quantized model. If the model should be stored in a single file, set ``save_model_filename`` to that file's name; otherwise set it to ``None`` . Default: None.
+- **save_params_filename(str, optional)** - Parameters file name used when saving the quantized model. If the parameters should be stored in a single file, set ``save_params_filename`` to that file's name; otherwise set it to ``None`` . Default: None.
+- **quantizable_op_type(list[str])** - Types of ops to quantize. Selectable from ``["conv2d", "depthwise_conv2d", "mul"]`` . Default: ``["conv2d", "mul"]`` .
+- **weight_bits(int)** - Bit width used to quantize weights, either 8 or 16. Default: 8.
+- **generate_test_model(bool)** - If True, a fake-quantized model is also saved; it can be loaded with PaddlePaddle to test accuracy. Default: False.
+
+**Returns**
+
+None
+
+**Return type**
+
+None
+
+**Example**
+
+.. warning::
+
+    This example cannot be run directly, because it needs to load a model from ``${model_dir}``.
+
+.. code-block:: python
+
+ import paddle.fluid as fluid
+ import paddle.dataset.mnist as reader
+ from paddleslim.quant import quant_post_dynamic
+
+ quant_post_dynamic(
+ model_dir='./model_path',
+ save_model_dir='./save_path',
+ model_filename='__model__',
+ params_filename='__params__',
+ save_model_filename='__model__',
+ save_params_filename='__params__')
+
+
+
+
+
+quant_post_static
+------------------
+
+.. py:function:: paddleslim.quant.quant_post_static(executor,model_dir, quantize_model_path, batch_generator=None, sample_generator=None, model_filename=None, params_filename=None, save_model_filename='__model__', save_params_filename='__params__', batch_size=16, batch_nums=None, scope=None, algo='KL', quantizable_op_type=["conv2d","depthwise_conv2d","mul"], is_full_quantize=False, weight_bits=8, activation_bits=8, activation_quantize_type='range_abs_max', weight_quantize_type='channel_wise_abs_max', is_use_cache_file=False, cache_dir="./temp_post_training")
+
+`Source code `_
+
+Static post-training quantization computes quantization scales from a small amount of calibration data, producing a quantized model quickly. Predicting with the quantized model reduces computation, memory usage, and model size.
+
+Note: in PaddleSlim 1.1.0, `quant_post` was renamed to `quant_post_static`. The former still works but will be deprecated; please use `quant_post_static`.
+
+**Prerequisites:**
+
+* A trained inference model
+* A small amount of calibration data, e.g. 100~500 images
+
+**Steps:**
+
+* Produce the quantized model: call the static post-training quantization API of PaddleSlim to produce the quantized model
+* Predict with the quantized model: load the quantized model with PaddleLite or PaddleInference for inference
+
+**Pros:**
+
+* Reduces computation, memory usage, and model size
+* Requires no large training set
+* Produces a quantized model quickly and is easy to use
+
+**Cons:**
+
+* For a small fraction of models, especially small, lean ones, accuracy may degrade after quantization
+
+**Parameters:**
+
+- **executor (fluid.Executor)** - The executor that runs the model, on CPU or GPU.
+- **model_dir(str)** - Directory containing the model to quantize.
+- **quantize_model_path(str)** - Path where the quantized model is saved.
+- **batch_generator(python generator)** - A generator that yields one batch of samples per call. Mutually exclusive with `sample_generator`.
+- **sample_generator(python generator)** - A generator that yields one sample per call.
+- **model_filename(str, optional)** - Name of the model file. If the model's topology is stored in a single file, set ``model_filename`` to that file's name; otherwise set it to ``None`` . Default: ``None`` .
+- **params_filename(str, optional)** - Name of the parameters file. If the model's parameters are stored in a single file, set ``params_filename`` to that file's name; otherwise set it to ``None`` . Default: ``None`` .
+- **save_model_filename(str)** - Model file name used when saving the quantized model. If the model should be stored in a single file, set ``save_model_filename`` to that file's name; otherwise set it to ``None`` . Default: ``__model__`` .
+- **save_params_filename(str)** - Parameters file name used when saving the quantized model. If the parameters should be stored in a single file, set ``save_params_filename`` to that file's name; otherwise set it to ``None`` . Default: ``__params__`` .
+- **batch_size(int)** - Number of images per batch. Default: 16.
+- **batch_nums(int, optional)** - Number of iterations. If ``None`` , runs until ``sample_generator`` is exhausted; otherwise iterates ``batch_nums`` times, i.e. ``batch_nums * batch_size`` samples take part in calibrating the ``Scale`` .
+- **scope(fluid.Scope, optional)** - The scope used to read and write ``Variable`` . If ``None`` , `fluid.global_scope() `_ is used. Default: ``None`` .
+- **algo(str)** - Name of the algorithm used for quantization, either ``'KL'`` or ``'abs_max'`` . It only concerns activation quantization, since weights are always quantized with ``'channel_wise_abs_max'`` . With ``'abs_max'`` , the maximum absolute activation value over the calibration data is used as the ``Scale`` ; with ``'KL'`` , the ``Scale`` is computed via KL divergence. Default: ``'KL'`` .
+- **quantizable_op_type(list[str])** - Types of ops to quantize. Default: ``["conv2d", "depthwise_conv2d", "mul"]`` .
+- **is_full_quantize(bool)** - Whether to quantize all supported op types. If False, quantization follows the ``'quantizable_op_type'`` setting; if True, the ops defined by ``TRANSFORM_PASS_OP_TYPES + QUANT_DEQUANT_PASS_OP_TYPES`` in the `quantization config <#id2>`_ are quantized.
+- **weight_bits(int)** - Bit width used to quantize weights. Default: 8.
+- **activation_bits(int)** - Bit width used to quantize activations. Default: 8.
+- **weight_quantize_type(str)** - Weight quantization method, either `abs_max` or `channel_wise_abs_max` ; `channel_wise_abs_max` usually gives higher quantized-model accuracy.
+- **activation_quantize_type(str)** - Activation quantization method, either `range_abs_max` or `moving_average_abs_max` . It does not affect how scales are computed, only which operator is used when saving the model.
+- **is_use_cache_file(bool)** - Whether to store intermediate results on disk; if False, they are kept in memory. Default: False.
+- **cache_dir(str)** - If ``'is_use_cache_file'`` is True, intermediate results are stored under this path. Default: ``./temp_post_training`` .
+
+**Returns**
+
+None.
+
+.. note::
+
+    - Since this API collects all activation values over the calibration data, set ``'is_use_cache_file'`` to True to keep intermediate results on disk when there are many calibration images. Also note that computing the ``'KL'`` divergence is time-consuming.
+    - Currently ``Paddle-Lite`` only has int8 kernels to accelerate ``['conv2d', 'depthwise_conv2d', 'mul']`` ; int8 kernels for other ops will be supported gradually.
+
+**Example**
+
+.. warning::
+
+    This example cannot be run directly, because it needs to load a model from ``${model_dir}``.
+
+.. code-block:: python
+
+ import paddle.fluid as fluid
+ import paddle.dataset.mnist as reader
+ from paddleslim.quant import quant_post_static
+ val_reader = reader.train()
+ use_gpu = True
+ place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+
+ exe = fluid.Executor(place)
+ quant_post_static(
+ executor=exe,
+ model_dir='./model_path',
+ quantize_model_path='./save_path',
+ sample_generator=val_reader,
+ model_filename='__model__',
+ params_filename='__params__',
+ batch_size=16,
+ batch_nums=10)
+
+For more detailed usage, see the `post-training quantization demo `_ .
+
+
+
+
+quant_aware
+------------
+
+.. py:function:: paddleslim.quant.quant_aware(program, place, config, scope=None, for_test=False, weight_quantize_func=None, act_quantize_func=None, weight_preprocess_func=None, act_preprocess_func=None, optimizer_func=None, executor=None)
+
+`Source code `_
+
+Insert quantization and dequantization ops into the program, for quantization-aware training.
+
+
+**Parameters:**
+
+- **program (fluid.Program)** - The training or test program.
+- **place(fluid.CPUPlace | fluid.CUDAPlace)** - The device on which the ``Executor`` runs.
+- **config(dict)** - The quantization configuration.
+- **scope(fluid.Scope, optional)** - The ``scope`` storing the ``Variable`` ; pass the ``scope`` used by ``program`` , usually `fluid.global_scope() `_ . If set to ``None`` , `fluid.global_scope() `_ is used. Default: ``None`` .
+- **for_test(bool)** - If the ``program`` argument is a test ``program`` , set ``for_test`` to True; otherwise set it to False.
+- **weight_quantize_func(function)** - Custom weight quantization function. Its input is a weight to quantize and its output is the weight after dequantization, which makes it easy to verify quickly whether a quantization function is effective. When set, it replaces the method defined by `weight_quantize_type` in the quantization config; when unset, the `weight_quantize_type` method is used. Default: None.
+- **act_quantize_func(function)** - Custom activation quantization function. Its input is an activation to quantize and its output is the activation after dequantization, which makes it easy to verify quickly whether a quantization function is effective. When set, it replaces the method defined by `activation_quantize_type` in the quantization config; when unset, the `activation_quantize_type` method is used. Default: None.
+- **weight_preprocess_func(function)** - Custom function applied to weights before they are quantized. The rationale is that network parameters are not necessarily well suited for direct quantization; preprocessing the parameter distribution before quantizing may improve quantization accuracy. Default: None.
+
+- **act_preprocess_func(function)** - Custom function applied to activations before they are quantized. The rationale is that activation values are not necessarily well suited for direct quantization; preprocessing them before quantizing may improve quantization accuracy. Default: None.
+
+- **optimizer_func(function)** - A function that returns an optimizer, used to optimize the parameters introduced by the custom functions above. Default: None.
+- **executor(fluid.Executor)** - Used to initialize the variables introduced by the custom functions above. Default: None.
+
+**Returns**
+
+A program containing quantization and dequantization operators.
+
+**Return type**
+
+- When ``for_test=False`` , the return type is ``fluid.CompiledProgram`` ; **note that this return value cannot be used to save parameters**.
+- When ``for_test=True`` , the return type is ``fluid.Program`` .
+
+.. note::
+
+    - This API changes the program structure and may add persistable variables, so take care to pair loaded parameters with the corresponding program.
+    - Internally this API goes through the transformation fluid.Program -> fluid.framework.IrGraph -> fluid.Program. ``fluid.framework.IrGraph`` has no notion of ``Parameter`` ; a ``Variable`` is only either persistable or not persistable. Therefore, use the ``fluid.io.save_persistables`` and ``fluid.io.load_persistables`` interfaces to save and load parameters; a short sketch follows.
+    - Because this API adds ops according to the program structure and the quantization config, some ``Paddle`` strategies that speed up training via ``fuse op`` cannot be used. The following are known to require False when quantizing: ``fuse_all_reduce_ops, sync_batch_norm`` .
+    - Any ``Variable`` in the program that is not connected to any op is optimized away during quantization.
+
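+A minimal sketch of the save/load pairing described in the note above (the checkpoint path is illustrative, and ``exe``/``quant_eval_program`` refer to the example further below):
+
+.. code-block:: python
+
+    import paddle.fluid as fluid
+
+    # save the persistable variables of the quantized test program
+    fluid.io.save_persistables(exe, "./quant_ckpt", main_program=quant_eval_program)
+    # later, load them back into a program with the same structure
+    fluid.io.load_persistables(exe, "./quant_ckpt", main_program=quant_eval_program)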
+
+
+convert
+---------
+
+.. py:function:: paddleslim.quant.convert(program, place, config, scope=None, save_int8=False)
+
+`Source code `_
+
+
+Convert a trained quantization program into a program that can be saved as an ``inference model`` .
+
+**Parameters:**
+
+- **program (fluid.Program)** - The test program.
+- **place(fluid.CPUPlace | fluid.CUDAPlace)** - The device on which the ``Executor`` runs.
+- **config(dict)** - The quantization configuration.
+- **scope(fluid.Scope)** - The ``scope`` storing the ``Variable`` ; pass the ``scope`` used by ``program`` , usually `fluid.global_scope() `_ . If set to ``None`` , `fluid.global_scope() `_ is used. Default: ``None`` .
+- **save_int8(bool)** - Whether to additionally return a program whose parameters are ``int8`` . Currently only useful for checking model size. Default: ``False`` .
+
+**Returns**
+
+- **program (fluid.Program)** - The frozen program, usable for saving the inference model; its parameters are ``float32`` but their values fit in the int8 range.
+- **int8_program (fluid.Program)** - The frozen program with ``int8`` parameters, also usable for saving the inference model. Not returned when ``save_int8`` is False.
+
+.. note::
+
+    Because this API deletes and modifies ops and Variables accordingly, it can only be called after training finishes. To convert an intermediate checkpoint, load the corresponding parameters first and then call this API.
+
+**Example**
+
+.. code-block:: python
+
+ #encoding=utf8
+ import paddle.fluid as fluid
+ import paddleslim.quant as quant
+
+
+ train_program = fluid.Program()
+
+ with fluid.program_guard(train_program):
+ image = fluid.data(name='x', shape=[None, 1, 28, 28])
+ label = fluid.data(name='label', shape=[None, 1], dtype='int64')
+ conv = fluid.layers.conv2d(image, 32, 1)
+ feat = fluid.layers.fc(conv, 10, act='softmax')
+ cost = fluid.layers.cross_entropy(input=feat, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+
+ use_gpu = True
+ place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+ eval_program = train_program.clone(for_test=True)
+    # configuration
+ config = {'weight_quantize_type': 'abs_max',
+ 'activation_quantize_type': 'moving_average_abs_max'}
+ build_strategy = fluid.BuildStrategy()
+ exec_strategy = fluid.ExecutionStrategy()
+    # call the APIs
+ quant_train_program = quant.quant_aware(train_program, place, config, for_test=False)
+ quant_eval_program = quant.quant_aware(eval_program, place, config, for_test=True)
+    # disable fused strategies
+ build_strategy.fuse_all_reduce_ops = False
+ build_strategy.sync_batch_norm = False
+ quant_train_program = quant_train_program.with_data_parallel(
+ loss_name=avg_cost.name,
+ build_strategy=build_strategy,
+ exec_strategy=exec_strategy)
+
+ inference_prog = quant.convert(quant_eval_program, place, config)
+
+For more detailed usage, see the `quantization-aware training demo `_ .
+
+
+Configuration of quantization-aware training
+--------------------------------------------
+
+The quantization parameters are configured through a dict:
+
+.. code-block:: python
+
+
+ TENSORRT_OP_TYPES = [
+ 'mul', 'conv2d', 'pool2d', 'depthwise_conv2d', 'elementwise_add',
+ 'leaky_relu'
+ ]
+ TRANSFORM_PASS_OP_TYPES = ['conv2d', 'depthwise_conv2d', 'mul']
+
+ QUANT_DEQUANT_PASS_OP_TYPES = [
+ "pool2d", "elementwise_add", "concat", "softmax", "argmax", "transpose",
+ "equal", "gather", "greater_equal", "greater_than", "less_equal",
+ "less_than", "mean", "not_equal", "reshape", "reshape2",
+ "bilinear_interp", "nearest_interp", "trilinear_interp", "slice",
+ "squeeze", "elementwise_sub", "relu", "relu6", "leaky_relu", "tanh", "swish"
+ ]
+
+ _quant_config_default = {
+ # weight quantize type, default is 'channel_wise_abs_max'
+ 'weight_quantize_type': 'channel_wise_abs_max',
+ # activation quantize type, default is 'moving_average_abs_max'
+ 'activation_quantize_type': 'moving_average_abs_max',
+ # weight quantize bit num, default is 8
+ 'weight_bits': 8,
+ # activation quantize bit num, default is 8
+ 'activation_bits': 8,
+ # ops of name_scope in not_quant_pattern list, will not be quantized
+ 'not_quant_pattern': ['skip_quant'],
+ # ops of type in quantize_op_types, will be quantized
+ 'quantize_op_types': ['conv2d', 'depthwise_conv2d', 'mul'],
+ # data type after quantization, such as 'uint8', 'int8', etc. default is 'int8'
+ 'dtype': 'int8',
+        # window size for 'range_abs_max' quantization. default is 10000
+ 'window_size': 10000,
+ # The decay coefficient of moving average, default is 0.9
+ 'moving_rate': 0.9,
+ # if True, 'quantize_op_types' will be TENSORRT_OP_TYPES
+ 'for_tensorrt': False,
+        # if True, 'quantize_op_types' will be TRANSFORM_PASS_OP_TYPES + QUANT_DEQUANT_PASS_OP_TYPES
+ 'is_full_quantize': False
+ }
+
+**Parameters:**
+
+- **weight_quantize_type(str)** - Weight quantization method. Options: ``'abs_max'`` , ``'channel_wise_abs_max'`` , ``'range_abs_max'`` , ``'moving_average_abs_max'`` . If the quantized model will be loaded with ``TensorRT`` for prediction, use ``'channel_wise_abs_max'`` . Default: ``'channel_wise_abs_max'`` .
+- **activation_quantize_type(str)** - Activation quantization method. Options: ``'abs_max'`` , ``'range_abs_max'`` , ``'moving_average_abs_max'`` . If the quantized model will be loaded with ``TensorRT`` for prediction, use ``'range_abs_max'`` or ``'moving_average_abs_max'`` . Default: ``'moving_average_abs_max'`` .
+- **weight_bits(int)** - Bit width for weight quantization. Default 8; values 1-8 are selectable, and 8 is recommended since the quantized data type is ``int8`` .
+- **activation_bits(int)** - Bit width for activation quantization. Default 8; values 1-8 are selectable, and 8 is recommended since the quantized data type is ``int8`` .
+- **not_quant_pattern(str | list[str])** - Any op whose ``name_scope`` contains a string from ``'not_quant_pattern'`` is left unquantized. For how to set it, see `fluid.name_scope `_ ; a short sketch follows this list.
+- **quantize_op_types(list[str])** - Op types to quantize; selectable from ``TRANSFORM_PASS_OP_TYPES + QUANT_DEQUANT_PASS_OP_TYPES`` .
+- **dtype(str)** - Data type of the quantized parameters. Default ``int8`` ; currently only ``int8`` is supported.
+- **window_size(int)** - ``window size`` for ``'range_abs_max'`` quantization. Default: 10000.
+- **moving_rate(float)** - Decay coefficient for ``'moving_average_abs_max'`` quantization. Default: 0.9.
+- **for_tensorrt(bool)** - Whether the quantized model will be predicted with ``TensorRT`` . If so, the quantized op types are ``TENSORRT_OP_TYPES`` . Default: False.
+- **is_full_quantize(bool)** - Whether to quantize all supported op types, namely ``TRANSFORM_PASS_OP_TYPES + QUANT_DEQUANT_PASS_OP_TYPES`` . Default: False.
+
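+A hedged sketch of skipping quantization for one layer via ``not_quant_pattern`` (the scope name ``skip_quant`` matches the default config above; the tiny network is illustrative):
+
+.. code-block:: python
+
+    import paddle.fluid as fluid
+
+    image = fluid.data(name='image', shape=[None, 1, 28, 28], dtype='float32')
+    conv1 = fluid.layers.conv2d(image, 32, 3)      # will be quantized
+    with fluid.name_scope('skip_quant'):
+        conv2 = fluid.layers.conv2d(conv1, 32, 3)  # scope matches not_quant_pattern; left in float
+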
+.. note::
+
+    Currently ``Paddle-Lite`` only has int8 kernels to accelerate ``['conv2d', 'depthwise_conv2d', 'mul']`` ; int8 kernels for other ops will be supported gradually.
+
+
+quant_embedding
+-------------------
+
+.. py:function:: paddleslim.quant.quant_embedding(program, place, config=None, scope=None)
+
+`Source code `_
+
+Quantize the ``Embedding`` parameters.
+
+**Parameters:**
+
+- **program(fluid.Program)** - The program to quantize
+- **scope(fluid.Scope, optional)** - The scope used to read and write ``Variable`` ; if set to ``None`` , `fluid.global_scope() `_ is used.
+- **place(fluid.CPUPlace | fluid.CUDAPlace)** - The device that runs the program
+- **config(dict, optional)** - The quantization configuration. The configurable key `'quantize_op_types'` specifies which ops to quantize; if unspecified it defaults to `['lookup_table', 'fused_embedding_seq_pool', 'pyramid_hash']` , and currently only these three ops are supported. For each op, the following can be configured: ``'quantize_type'`` (str, optional): the quantization type, currently ``'abs_max', 'log'`` , default ``'abs_max'`` ; ``'quantize_bits'`` (int, optional): the quantization bit width, currently only 8, default 8; ``'dtype'`` (str, optional): the data type after quantization, currently only ``'int8'`` , default ``int8`` . An example config is `{'quantize_op_types': ['lookup_table'], 'lookup_table': {'quantize_type': 'abs_max'}}` .
+
+**Returns**
+
+The quantized program
+
+**Return type**
+
+fluid.Program
+
+**Example**
+
+.. code-block:: python
+
+ import paddle.fluid as fluid
+ import paddleslim.quant as quant
+
+ train_program = fluid.Program()
+ with fluid.program_guard(train_program):
+ input_word = fluid.data(name="input_word", shape=[None, 1], dtype='int64')
+ input_emb = fluid.embedding(
+ input=input_word,
+ is_sparse=False,
+ size=[100, 128],
+ param_attr=fluid.ParamAttr(name='emb',
+ initializer=fluid.initializer.Uniform(-0.005, 0.005)))
+
+ infer_program = train_program.clone(for_test=True)
+
+ use_gpu = True
+ place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+
+ config = {
+ 'quantize_op_types': ['lookup_table'],
+ 'lookup_table': {
+ 'quantize_type': 'abs_max'
+ }
+ }
+ quant_program = quant.quant_embedding(infer_program, place, config)
+
+For more detailed usage, see the `Embedding quantization demo `_
+
+
diff --git a/docs/docs/api/search_space.md b/docs/zh_cn/api_cn/search_space.md
similarity index 53%
rename from docs/docs/api/search_space.md
rename to docs/zh_cn/api_cn/search_space.md
index 682b0eac801bae4ae59b523475e8fa3c66586190..d566c259a2915669cc04e8a1fb86df0943f5eb45 100644
--- a/docs/docs/api/search_space.md
+++ b/docs/zh_cn/api_cn/search_space.md
@@ -1,53 +1,51 @@
-# paddleslim.nas 提供的搜索空间:
+# Search Space
+A search space is a concept in neural architecture search: it is the collection of candidate model architectures. SANAS uses the idea of simulated annealing to find, within the search space, a relatively small architecture or one with relatively high accuracy.
-1. 根据原本模型结构构造搜索空间:
+## Search spaces provided by paddleslim.nas
- 1.1 MobileNetV2Space
-
- 1.2 MobileNetV1Space
-
- 1.3 ResNetSpace
+#### Search spaces built from a full initial architecture:
+1. MobileNetV2Space
+ For the MobileNetV2 architecture, see: [code](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/image_classification/models/mobilenet_v2.py#L29), [paper](https://arxiv.org/abs/1801.04381)
-2. 根据相应模型的block构造搜索空间
+2. MobileNetV1Space
+ For the MobileNetV1 architecture, see: [code](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/image_classification/models/mobilenet_v1.py#L29), [paper](https://arxiv.org/abs/1704.04861)
- 2.1 MobileNetV1BlockSpace
-
- 2.2 MobileNetV2BlockSpace
-
- 2.3 ResNetBlockSpace
-
- 2.4 InceptionABlockSpace
-
- 2.5 InceptionCBlockSpace
+3. ResNetSpace
+ For the ResNet architecture used by ResNetSpace, see: [code](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/image_classification/models/resnet.py#L30), [paper](https://arxiv.org/pdf/1512.03385.pdf)
-##搜索空间的配置介绍:
+#### Search spaces built from the blocks of the corresponding models:
+1. MobileNetV1BlockSpace
+ For the MobileNetV1 block structure, see: [code](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/image_classification/models/mobilenet_v1.py#L173)
-**input_size(int|None)**:`input_size`表示输入feature map的大小。
-**output_size(int|None)**:`output_size`表示输出feature map的大小。
-**block_num(int|None)**:`block_num`表示搜索空间中block的数量。
-**block_mask(list|None)**:`block_mask`表示当前的block是一个reduction block还是一个normal block,是一组由0、1组成的列表,0表示当前block是normal block,1表示当前block是reduction block。如果设置了`block_mask`,则主要以`block_mask`为主要配置,`input_size`,`output_size`和`block_num`三种配置是无效的。
+2. MobileNetV2BlockSpace
+ For the MobileNetV2 block structure, see: [code](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/image_classification/models/mobilenet_v2.py#L174)
-**Note:**
-1. reduction block表示经过这个block之后的feature map大小下降为之前的一半,normal block表示经过这个block之后feature map大小不变。
-2. `input_size`和`output_size`用来计算整个模型结构中reduction block数量。
+3. ResNetBlockSpace
+ For the ResNet block structure, see: [code](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/image_classification/models/resnet.py#L148)
+4. InceptionABlockSpace
+ For the InceptionA block structure, see: [code](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/image_classification/models/inception_v4.py#L140)
-##搜索空间示例:
+5. InceptionCBlockSpace
+ For the InceptionC block structure, see: [code](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/image_classification/models/inception_v4.py#L291)
-1. 使用paddleslim中提供用原本的模型结构来构造搜索空间的话,仅需要指定搜索空间名字即可。例如:如果使用原本的MobileNetV2的搜索空间进行搜索的话,传入SANAS中的config直接指定为[('MobileNetV2Space')]。
-2. 使用paddleslim中提供的block搜索空间构造搜索空间:
- 2.1 使用`input_size`, `output_size`和`block_num`来构造搜索空间。例如:传入SANAS的config可以指定为[('MobileNetV2BlockSpace', {'input_size': 224, 'output_size': 32, 'block_num': 10})]。
- 2.2 使用`block_mask`构造搜索空间。例如:传入SANAS的config可以指定为[('MobileNetV2BlockSpace', {'block_mask': [0, 1, 1, 1, 1, 0, 1, 0]})]。
+## Search space usage examples
-# 自定义搜索空间(search space)
+1. To use a search space built from a full initial architecture provided by paddleslim, only the search-space name needs to be specified. For example, to search with the original MobileNetV2 search space, set the configs passed to SANAS directly to [('MobileNetV2Space')].
+2. To build a search space from the block search spaces provided by paddleslim:
+ 2.1 Use `input_size`, `output_size` and `block_num`. For example, the configs passed to SANAS can be [('MobileNetV2BlockSpace', {'input_size': 224, 'output_size': 32, 'block_num': 10})].
+ 2.2 Use `block_mask`. For example, the configs passed to SANAS can be [('MobileNetV2BlockSpace', {'block_mask': [0, 1, 1, 1, 1, 0, 1, 0]})].
-自定义搜索空间类需要继承搜索空间基类并重写以下几部分:
- 1. 初始化的tokens(`init_tokens`函数),可以设置为自己想要的tokens列表, tokens列表中的每个数字指的是当前数字在相应的搜索列表中的索引。例如本示例中若tokens=[0, 3, 5],则代表当前模型结构搜索到的通道数为[8, 40, 128]。
- 2. token中每个数字的搜索列表长度(`range_table`函数),tokens中每个token的索引范围。
- 3. 根据token产生模型结构(`token2arch`函数),根据搜索到的tokens列表产生模型结构。
+
+## Custom search space
+
+A custom search-space class must inherit from the search-space base class and override the following parts:
+ 1. The initial tokens (the `init_tokens` function); this can be set to any token list you like. Each number in the token list is an index into the corresponding search list. For example, in this demo tokens=[0, 3, 5] means the channel numbers found for the current architecture are [8, 40, 128].
+ 2. The length of the search list for each number in the tokens (the `range_table` function), i.e. the index range of each token.
+ 3. Generating an architecture from the tokens (the `token2arch` function), which builds the model architecture from the searched token list.
以新增reset block为例说明如何构造自己的search space。自定义的search space不能和已有的search space同名。
@@ -70,17 +68,18 @@ class ResNetBlockSpace2(SearchSpaceBase):
def init_tokens(self):
return [0] * 3 * len(self.block_mask)
- ### 定义
+ ### Define the value range of each token index
def range_table(self):
return [len(self.filter_num)] * 3 * len(self.block_mask)
+ ### Convert tokens into a network architecture
def token2arch(self, tokens=None):
if tokens == None:
tokens = self.init_tokens()
self.bottleneck_params_list = []
for i in range(len(self.block_mask)):
- self.bottleneck_params_list.append(self.filter_num[tokens[i * 3 + 0]],
+ self.bottleneck_params_list.append(self.filter_num[tokens[i * 3 + 0]],
self.filter_num[tokens[i * 3 + 1]],
self.filter_num[tokens[i * 3 + 2]],
2 if self.block_mask[i] == 1 else 1)
@@ -113,4 +112,4 @@ class ResNetBlockSpace2(SearchSpaceBase):
conv = fluid.layers.conv2d(input, num_filters, filter_size, stride, name=name+'_conv')
bn = fluid.layers.batch_norm(conv, act=act, name=name+'_bn')
return bn
-```
+```
diff --git a/docs/zh_cn/api_cn/single_distiller_api.rst b/docs/zh_cn/api_cn/single_distiller_api.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dce99541cbcede508d9953f2f308c664e71bdc56
--- /dev/null
+++ b/docs/zh_cn/api_cn/single_distiller_api.rst
@@ -0,0 +1,251 @@
+Single-process Distillation
+===========================
+
+merge
+---------
+
+.. py:function:: paddleslim.dist.merge(teacher_program, student_program, data_name_map, place, scope=None, name_prefix='teacher_')
+
+`[Source code] `_
+
+Merge the teacher_program into the student_program.
+
+In the merged program, Tensors from the two original Programs can conveniently be combined in computation.
+
+**Parameters:**
+
+- **teacher_program** (Program) - The `paddle program `_ defining the teacher model
+- **student_program** (Program) - The `paddle program `_ defining the student model
+- **data_name_map** (dict) - Mapping from teacher input names to student input names; each dict *key* is a teacher input name and the *value* is the corresponding student input name
+- **place** (fluid.CPUPlace()|fluid.CUDAPlace(N)) - The device the program runs on, where N is the GPU id
+- **scope** (Scope) - The variable scope used by the program; if unspecified, the default `global_scope `_ is used. Default: None
+- **name_prefix** (str) - To avoid clashes between identically named parameters, merge uniformly prepends the prefix name_prefix to the teacher's `Variables `_ . Default: teacher_
+
+**Returns:** None
+
+**Example:**
+
+.. code-block:: python
+
+ import paddle.fluid as fluid
+ import paddleslim.dist as dist
+ student_program = fluid.Program()
+ with fluid.program_guard(student_program):
+ x = fluid.layers.data(name='x', shape=[1, 28, 28])
+ conv = fluid.layers.conv2d(x, 32, 1)
+ out = fluid.layers.conv2d(conv, 64, 3, padding=1)
+ teacher_program = fluid.Program()
+ with fluid.program_guard(teacher_program):
+ y = fluid.layers.data(name='y', shape=[1, 28, 28])
+ conv = fluid.layers.conv2d(y, 32, 1)
+ conv = fluid.layers.conv2d(conv, 32, 3, padding=1)
+ out = fluid.layers.conv2d(conv, 64, 3, padding=1)
+ data_name_map = {'y':'x'}
+ USE_GPU = False
+ place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
+ dist.merge(teacher_program, student_program,
+ data_name_map, place)
+
+
+fsp_loss
+---------
+
+.. py:function:: paddleslim.dist.fsp_loss(teacher_var1_name, teacher_var2_name, student_var1_name, student_var2_name, program=None)
+
+`[Source code] `_
+
+Add fsp_loss between teacher vars and student vars in the program.
+
+fsp_loss comes from the paper `A Gift from Knowledge Distillation: Fast Optimization, Network Minimization and Transfer Learning `_
+
+**Parameters:**
+
+- **teacher_var1_name** (str): Name of teacher_var1. The corresponding variable is a 4-D feature-map Tensor of shape ``[batch_size, x_channel, height, width]`` with data type float32 or float64
+- **teacher_var2_name** (str): Name of teacher_var2. The corresponding variable is a 4-D feature-map Tensor of shape ``[batch_size, y_channel, height, width]`` with data type float32 or float64. Only y_channel may differ from teacher_var1's x_channel; all other dimensions must equal teacher_var1's
+- **student_var1_name** (str): Name of student_var1. The corresponding variable must match teacher_var1 in size: a 4-D feature-map Tensor of shape ``[batch_size, x_channel, height, width]`` with data type float32 or float64
+- **student_var2_name** (str): Name of student_var2. The corresponding variable must match teacher_var2 in size: a 4-D feature-map Tensor of shape ``[batch_size, y_channel, height, width]`` with data type float32 or float64. Only y_channel may differ from student_var1's x_channel; all other dimensions must equal student_var1's
+- **program** (Program): The fluid program used for distillation training; if unspecified, `fluid.default_main_program() `_ is used. Default: None
+
+**Returns:**
+
+- (Variable): The fsp_loss built from teacher_var1, teacher_var2, student_var1 and student_var2
+
+**Example:**
+
+.. code-block:: python
+
+ import paddle.fluid as fluid
+ import paddleslim.dist as dist
+ student_program = fluid.Program()
+ with fluid.program_guard(student_program):
+ x = fluid.layers.data(name='x', shape=[1, 28, 28])
+ conv = fluid.layers.conv2d(x, 32, 1, name='s1')
+ out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='s2')
+ teacher_program = fluid.Program()
+ with fluid.program_guard(teacher_program):
+ y = fluid.layers.data(name='y', shape=[1, 28, 28])
+ conv = fluid.layers.conv2d(y, 32, 1, name='t1')
+ conv = fluid.layers.conv2d(conv, 32, 3, padding=1)
+ out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='t2')
+ data_name_map = {'y':'x'}
+ USE_GPU = False
+ place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
+ dist.merge(teacher_program, student_program, data_name_map, place)
+ with fluid.program_guard(student_program):
+ distillation_loss = dist.fsp_loss('teacher_t1.tmp_1', 'teacher_t2.tmp_1',
+ 's1.tmp_1', 's2.tmp_1', student_program)
+
+
+
+l2_loss
+------------
+
+.. py:function:: paddleslim.dist.l2_loss(teacher_var_name, student_var_name, program=None)
+
+`[Source code] `_
+
+Add l2 loss between a teacher var and a student var in the program
+
+**Parameters:**
+
+- **teacher_var_name** (str): Name of the teacher_var.
+- **student_var_name** (str): Name of the student_var.
+- **program** (Program): The fluid program used for distillation training; if unspecified, `fluid.default_main_program() `_ is used. Default: None
+
+**Returns:**
+
+- (Variable): The l2_loss built from teacher_var and student_var
+
+**Example:**
+
+.. code-block:: python
+
+ import paddle.fluid as fluid
+ import paddleslim.dist as dist
+ student_program = fluid.Program()
+ with fluid.program_guard(student_program):
+ x = fluid.layers.data(name='x', shape=[1, 28, 28])
+ conv = fluid.layers.conv2d(x, 32, 1, name='s1')
+ out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='s2')
+ teacher_program = fluid.Program()
+ with fluid.program_guard(teacher_program):
+ y = fluid.layers.data(name='y', shape=[1, 28, 28])
+ conv = fluid.layers.conv2d(y, 32, 1, name='t1')
+ conv = fluid.layers.conv2d(conv, 32, 3, padding=1)
+ out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='t2')
+ data_name_map = {'y':'x'}
+ USE_GPU = False
+ place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
+ dist.merge(teacher_program, student_program, data_name_map, place)
+ with fluid.program_guard(student_program):
+ distillation_loss = dist.l2_loss('teacher_t2.tmp_1', 's2.tmp_1',
+ student_program)
+
+
+
+soft_label_loss
+-------------------
+
+.. py:function:: paddleslim.dist.soft_label_loss(teacher_var_name, student_var_name, program=None, teacher_temperature=1., student_temperature=1.)
+
+`[Source code] `_
+
+Add soft label loss between a teacher var and a student var in the program
+
+soft_label_loss comes from the paper `Distilling the Knowledge in a Neural Network `_
+
+**Parameters:**
+
+- **teacher_var_name** (str): Name of the teacher_var.
+- **student_var_name** (str): Name of the student_var.
+- **program** (Program): The fluid program used for distillation training; if unspecified, `fluid.default_main_program() `_ is used. Default: None
+- **teacher_temperature** (float): Temperature used to soften teacher_var; the higher the temperature, the smoother the resulting feature map
+- **student_temperature** (float): Temperature used to soften student_var; the higher the temperature, the smoother the resulting feature map
+
+**Returns:**
+
+- (Variable): The soft_label_loss built from teacher_var and student_var
+
+**Example:**
+
+.. code-block:: python
+
+ import paddle.fluid as fluid
+ import paddleslim.dist as dist
+ student_program = fluid.Program()
+ with fluid.program_guard(student_program):
+ x = fluid.layers.data(name='x', shape=[1, 28, 28])
+ conv = fluid.layers.conv2d(x, 32, 1, name='s1')
+ out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='s2')
+ teacher_program = fluid.Program()
+ with fluid.program_guard(teacher_program):
+ y = fluid.layers.data(name='y', shape=[1, 28, 28])
+ conv = fluid.layers.conv2d(y, 32, 1, name='t1')
+ conv = fluid.layers.conv2d(conv, 32, 3, padding=1)
+ out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='t2')
+ data_name_map = {'y':'x'}
+ USE_GPU = False
+ place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
+ dist.merge(teacher_program, student_program, data_name_map, place)
+ with fluid.program_guard(student_program):
+ distillation_loss = dist.soft_label_loss('teacher_t2.tmp_1',
+ 's2.tmp_1', student_program, 1., 1.)
+
+
+
+loss
+--------
+
+.. py:function:: paddleslim.dist.loss(loss_func, program=None, **kwargs)
+
+`[Source code] `_
+
+Supports applying an arbitrary custom loss function to teacher vars and student vars
+
+**Parameters:**
+
+- **loss_func** (python function): The custom loss function; its inputs are teacher vars and student vars, and its output is the custom loss
+- **program** (Program): The fluid program used for distillation training; if unspecified, `fluid.default_main_program() `_ is used. Default: None
+- **kwargs** : Mapping from loss_func input names to the corresponding variable names
+
+**Returns:**
+
+- (Variable): The loss computed by the custom loss function
+
+**Example:**
+
+.. code-block:: python
+
+ import paddle.fluid as fluid
+ import paddleslim.dist as dist
+ student_program = fluid.Program()
+ with fluid.program_guard(student_program):
+ x = fluid.layers.data(name='x', shape=[1, 28, 28])
+ conv = fluid.layers.conv2d(x, 32, 1, name='s1')
+ out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='s2')
+ teacher_program = fluid.Program()
+ with fluid.program_guard(teacher_program):
+ y = fluid.layers.data(name='y', shape=[1, 28, 28])
+ conv = fluid.layers.conv2d(y, 32, 1, name='t1')
+ conv = fluid.layers.conv2d(conv, 32, 3, padding=1)
+ out = fluid.layers.conv2d(conv, 64, 3, padding=1, name='t2')
+ data_name_map = {'y':'x'}
+ USE_GPU = False
+ place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
+ dist.merge(teacher_program, student_program, data_name_map, place)
+ def adaptation_loss(t_var, s_var):
+ teacher_channel = t_var.shape[1]
+ s_hint = fluid.layers.conv2d(s_var, teacher_channel, 1)
+ hint_loss = fluid.layers.reduce_mean(fluid.layers.square(s_hint - t_var))
+ return hint_loss
+ with fluid.program_guard(student_program):
+ distillation_loss = dist.loss(adaptation_loss, student_program,
+ t_var='teacher_t2.tmp_1', s_var='s2.tmp_1')
+
+.. note::
+
+    Adding a distillation loss introduces new variables; take care that they do not clash with existing student variable names. Two practices are recommended (either one suffices):
+
+    1. Use the same name scope as student_program, so that variables without explicit names (e.g. tmp_0, tmp_1, ...) are not defined twice under the same name
+
+    2. Specify a name-scope prefix when adding the distillation loss; for details see the Paddle documentation on `fluid.name_scope `_ . A short sketch follows.
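+
+A hedged sketch of the second practice, reusing the merged ``student_program`` and variable names from the examples above (the scope name ``distill`` is illustrative):
+
+.. code-block:: python
+
+    # continuing from the merge example above
+    with fluid.program_guard(student_program):
+        with fluid.name_scope('distill'):
+            distillation_loss = dist.l2_loss('teacher_t2.tmp_1',
+                                             's2.tmp_1', student_program)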
diff --git a/docs/docs/table_latency.md b/docs/zh_cn/api_cn/table_latency.md
similarity index 99%
rename from docs/docs/table_latency.md
rename to docs/zh_cn/api_cn/table_latency.md
index cc8d524b0999eed2d3c030f87a3f6d0cd4c246a5..125e2ad87712f1e5bbaec847ec15c2840b25ae40 100644
--- a/docs/docs/table_latency.md
+++ b/docs/zh_cn/api_cn/table_latency.md
@@ -32,7 +32,7 @@
**格式**
-```
+```text
op_type,flag_bias,flag_relu,n_in,c_in,h_in,w_in,c_out,groups,kernel,padding,stride,dilation\tlatency
```
@@ -57,7 +57,7 @@ op_type,flag_bias,flag_relu,n_in,c_in,h_in,w_in,c_out,groups,kernel,padding,stri
**格式**
-```
+```text
op_type,n_in,c_in,h_in,w_in\tlatency
```
@@ -74,7 +74,7 @@ op_type,n_in,c_in,h_in,w_in\tlatency
**格式**
-```
+```text
op_type,active_type,n_in,c_in,h_in,w_in\tlatency
```
@@ -92,7 +92,7 @@ op_type,active_type,n_in,c_in,h_in,w_in\tlatency
**格式**
-```
+```text
op_type,n_in,c_in,h_in,w_in\tlatency
```
@@ -109,7 +109,7 @@ op_type,n_in,c_in,h_in,w_in\tlatency
**格式**
-```
+```text
op_type,flag_global_pooling,n_in,c_in,h_in,w_in,kernel,padding,stride,ceil_mode,pool_type\tlatency
```
@@ -132,7 +132,7 @@ op_type,flag_global_pooling,n_in,c_in,h_in,w_in,kernel,padding,stride,ceil_mode,
**格式**
-```
+```text
op_type,axis,n_in,c_in,h_in,w_in\tlatency
```
diff --git a/docs/zh_cn/conf.py b/docs/zh_cn/conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9425e93f98789efd68f29d25f7e7d9e83953940
--- /dev/null
+++ b/docs/zh_cn/conf.py
@@ -0,0 +1,170 @@
+# -*- coding: utf-8 -*-
+#
+# Configuration file for the Sphinx documentation builder.
+#
+# This file does only contain a selection of the most common options. For a
+# full list see the documentation:
+# http://www.sphinx-doc.org/en/master/config
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+import sys
+sys.path.insert(0, os.path.abspath('../../../'))
+
+# -- Project information -----------------------------------------------------
+
+project = u'PaddleSlim'
+copyright = u'2020, paddleslim'
+author = u'paddleslim'
+
+# The short X.Y version
+version = u''
+# The full version, including alpha/beta/rc tags
+release = u''
+
+# -- General configuration ---------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+ 'sphinx.ext.autodoc',
+ 'sphinx.ext.doctest',
+ 'sphinx.ext.coverage',
+ 'sphinx.ext.mathjax',
+ 'sphinx.ext.githubpages',
+ 'sphinx.ext.napoleon',
+ 'recommonmark',
+ 'sphinx_markdown_tables',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+source_suffix = ['.rst', '.md']
+#source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'index'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = u'zh_CN'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = None
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'sphinx_rtd_theme'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# The default sidebars (for documents that don't match any pattern) are
+# defined by theme itself. Builtin themes are using these templates by
+# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
+# 'searchbox.html']``.
+#
+# html_sidebars = {}
+
+# -- Options for HTMLHelp output ---------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'PaddleSlimdoc'
+
+# -- Options for LaTeX output ------------------------------------------------
+
+latex_elements = {
+ # The paper size ('letterpaper' or 'a4paper').
+ #
+ # 'papersize': 'letterpaper',
+
+ # The font size ('10pt', '11pt' or '12pt').
+ #
+ # 'pointsize': '10pt',
+
+ # Additional stuff for the LaTeX preamble.
+ #
+ # 'preamble': '',
+
+ # Latex figure (float) alignment
+ #
+ # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+# author, documentclass [howto, manual, or own class]).
+latex_documents = [(master_doc, 'PaddleSlim.tex', u'PaddleSlim Documentation',
+ u'paddleslim', 'manual'), ]
+
+# -- Options for manual page output ------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [(master_doc, 'paddleslim', u'PaddleSlim Documentation', [author],
+ 1)]
+
+# -- Options for Texinfo output ----------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ (master_doc, 'PaddleSlim', u'PaddleSlim Documentation', author,
+ 'PaddleSlim', 'One line description of project.', 'Miscellaneous'),
+]
+
+# -- Options for Epub output -------------------------------------------------
+
+# Bibliographic Dublin Core info.
+epub_title = project
+
+# The unique identifier of the text. This can be a ISBN number
+# or the project homepage.
+#
+# epub_identifier = ''
+
+# A unique identification for the text.
+#
+# epub_uid = ''
+
+# A list of files that should not be packed into the epub file.
+epub_exclude_files = ['search.html']
+
+# -- Extension configuration -------------------------------------------------
diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8c6debe0a5e1768a3c1edeabbbcb43eb638bd496
--- /dev/null
+++ b/docs/zh_cn/index.rst
@@ -0,0 +1,22 @@
+.. PaddleSlim documentation master file, created by
+ sphinx-quickstart on Wed Feb 5 14:04:52 2020.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+首页
+==================
+
+.. toctree::
+ :maxdepth: 1
+
+ intro.md
+ install.md
+ quick_start/index
+ tutorials/index
+ api_cn/index
+ FAQ/index
+ model_zoo/index
+ algo/algo.md
+ CHANGELOG.md
+
+.. mdinclude:: ./intro.md
diff --git a/docs/zh_cn/install.md b/docs/zh_cn/install.md
new file mode 100644
index 0000000000000000000000000000000000000000..cc39f6311692184532659bc302e252e5a59281ac
--- /dev/null
+++ b/docs/zh_cn/install.md
@@ -0,0 +1,23 @@
+# 安装
+
+安装PaddleSlim前,请确认已正确安装Paddle1.6版本或更新版本。Paddle安装请参考:[Paddle安装教程](https://www.paddlepaddle.org.cn/install/quick)。
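+
+可以用如下代码确认当前环境中的Paddle版本(示意):
+
+```python
+import paddle
+# 打印已安装的Paddle版本,应不低于1.6
+print(paddle.__version__)
+```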
+
+
+- 安装develop版本
+
+
+```bash
+git clone https://github.com/PaddlePaddle/PaddleSlim.git
+cd PaddleSlim
+python setup.py install
+```
+
+- 安装官方发布的最新版本
+
+```bash
+pip install paddleslim -i https://pypi.org/simple
+```
+
+- 安装历史版本
+
+请点击[pypi.org](https://pypi.org/project/paddleslim/#history)查看可安装历史版本。
diff --git a/docs/zh_cn/intro.md b/docs/zh_cn/intro.md
new file mode 100644
index 0000000000000000000000000000000000000000..814f1eeef837c0400abfdf1c6264114bb9c5b3c2
--- /dev/null
+++ b/docs/zh_cn/intro.md
@@ -0,0 +1,77 @@
+# 介绍
+
+PaddleSlim是一个模型压缩工具库,包含模型剪裁、定点量化、知识蒸馏、超参搜索和模型结构搜索等一系列模型压缩策略。
+
+对于业务用户,PaddleSlim提供完整的模型压缩解决方案,可用于图像分类、检测、分割等各种类型的视觉场景。
+同时也在持续探索NLP领域模型的压缩方案。另外,PaddleSlim提供且在不断完善各种压缩策略在经典开源任务的benchmark,
+以便业务用户参考。
+
+对于模型压缩算法研究者或开发者,PaddleSlim提供各种压缩策略的底层辅助接口,方便用户复现、调研和使用最新论文方法。
+PaddleSlim会从底层能力、技术咨询合作和业务场景等角度支持开发者进行模型压缩策略相关的创新工作。
+
+
+## 功能
+
+- 模型剪裁
+ - 卷积通道均匀剪裁
+ - 基于敏感度的卷积通道剪裁
+ - 基于进化算法的自动剪裁
+
+- 定点量化
+ - 在线量化训练(training aware)
+ - 静态离线量化(static post training)
+ - 动态离线量化(dynamic post training)
+
+- 知识蒸馏
+ - 支持单进程知识蒸馏
+ - 支持多进程分布式知识蒸馏
+
+- 神经网络结构自动搜索(NAS)
+ - 支持基于进化算法的轻量神经网络结构自动搜索
+ - 支持One-Shot网络结构自动搜索
+ - 支持基于梯度的DARTS网络结构自动搜索
+ - 支持 FLOPS / 硬件延时约束
+ - 支持多平台模型延时评估
+ - 支持用户自定义搜索算法和搜索空间
+
+
+## 部分压缩策略效果
+
+### 分类模型
+
+数据: ImageNet2012; 模型: MobileNetV1;
+
+|压缩策略 |精度收益(baseline: 70.91%) |模型大小(baseline: 17.0M)|
+|:---:|:---:|:---:|
+| 知识蒸馏(ResNet50)| **+1.06%** | |
+| 知识蒸馏(ResNet50) + int8量化训练 |**+1.10%**| **-71.76%**|
+| 剪裁(FLOPs-50%) + int8量化训练|**-1.71%**|**-86.47%**|
+
+
+### 图像检测模型
+
+#### 数据:Pascal VOC;模型:MobileNet-V1-YOLOv3
+
+| 压缩方法 | mAP(baseline: 76.2%) | 模型大小(baseline: 94MB) |
+| :---------------------: | :------------: | :------------:|
+| 知识蒸馏(ResNet34-YOLOv3) | **+2.8%** | |
+| 剪裁 FLOPs -52.88% | **+1.4%** | **-67.76%** |
+|知识蒸馏(ResNet34-YOLOv3)+剪裁(FLOPs-69.57%)| **+2.6%**|**-67.00%**|
+
+
+#### 数据:COCO;模型:MobileNet-V1-YOLOv3
+
+| 压缩方法 | mAP(baseline: 29.3%) | 模型大小|
+| :---------------------: | :------------: | :------:|
+| 知识蒸馏(ResNet34-YOLOv3) | **+2.1%** | |
+| 知识蒸馏(ResNet34-YOLOv3)+剪裁(FLOPs-67.56%) | **-0.3%** | **-66.90%**|
+
+### 搜索
+
+数据:ImageNet2012; 模型:MobileNetV2
+
+|硬件环境 | 推理耗时 | Top1准确率(baseline:71.90%) |
+|:---------------:|:---------:|:--------------------:|
+| RK3288 | **-23%** | +0.07% |
+| Android cellphone | **-20%** | +0.16% |
+| iPhone 6s | **-17%** | +0.32% |
diff --git a/docs/zh_cn/model_zoo.md b/docs/zh_cn/model_zoo.md
new file mode 100644
index 0000000000000000000000000000000000000000..a662fb2ac05a78f49dce6fb976777abf494103a4
--- /dev/null
+++ b/docs/zh_cn/model_zoo.md
@@ -0,0 +1,260 @@
+# 模型库
+
+## 1. 图像分类
+
+数据集:ImageNet1000类
+
+### 1.1 量化
+
+| 模型 | 压缩方法 | Top-1/Top-5 Acc | 模型体积(MB) | TensorRT时延(V100, ms) | 下载 |
+|:--:|:---:|:--:|:--:|:--:|:--:|
+|MobileNetV1|-|70.99%/89.68%| 17 | -| [下载链接](http://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV1_pretrained.tar) |
+|MobileNetV1|quant_post|70.18%/89.25% (-0.81%/-0.43%)| 4.4 | - | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/MobileNetV1_quant_post.tar) |
+|MobileNetV1|quant_aware|70.60%/89.57% (-0.39%/-0.11%)| 4.4 | -| [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/MobileNetV1_quant_aware.tar) |
+| MobileNetV2 | - |72.15%/90.65%| 15 | - | [下载链接](https://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV2_pretrained.tar) |
+| MobileNetV2 | quant_post | 71.15%/90.11% (-1%/-0.54%)| 4.0 | - | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/MobileNetV2_quant_post.tar) |
+| MobileNetV2 | quant_aware |72.05%/90.63% (-0.1%/-0.02%)| 4.0 | - | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/MobileNetV2_quant_aware.tar) |
+|ResNet50|-|76.50%/93.00%| 99 | 2.71 | [下载链接](http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_pretrained.tar) |
+|ResNet50|quant_post|76.33%/93.02% (-0.17%/+0.02%)| 25.1| 1.19 | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/ResNet50_quant_post.tar) |
+|ResNet50|quant_aware| 76.48%/93.11% (-0.02%/+0.11%)| 25.1 | 1.17 | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/ResNet50_quant_awre.tar) |
+
+分类模型Lite时延(ms)
+
+| 设备 | 模型类型 | 压缩策略 | armv7 Thread 1 | armv7 Thread 2 | armv7 Thread 4 | armv8 Thread 1 | armv8 Thread 2 | armv8 Thread 4 |
+| ------- | ----------- | ------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- |
+| 高通835 | MobileNetV1 | FP32 baseline | 96.1942 | 53.2058 | 32.4468 | 88.4955 | 47.95 | 27.5189 |
+| 高通835 | MobileNetV1 | quant_aware | 60.8186 | 32.1931 | 16.4275 | 56.4311 | 29.5446 | 15.1053 |
+| 高通835 | MobileNetV1 | quant_post | 60.5615 | 32.4016 | 16.6596 | 56.5266 | 29.7178 | 15.1459 |
+| 高通835 | MobileNetV2 | FP32 baseline | 65.715 | 38.1346 | 25.155 | 61.3593 | 36.2038 | 22.849 |
+| 高通835 | MobileNetV2 | quant_aware | 48.3655 | 30.2021 | 21.9303 | 46.1487 | 27.3146 | 18.3053 |
+| 高通835 | MobileNetV2 | quant_post | 48.3495 | 30.3069 | 22.1506 | 45.8715 | 27.4105 | 18.2223 |
+| 高通835 | ResNet50 | FP32 baseline | 526.811 | 319.6486 | 205.8345 | 506.1138 | 335.1584 | 214.8936 |
+| 高通835 | ResNet50 | quant_aware | 475.4538 | 256.8672 | 139.699 | 461.7344 | 247.9506 | 145.9847 |
+| 高通835 | ResNet50 | quant_post | 476.0507 | 256.5963 | 139.7266 | 461.9176 | 248.3795 | 149.353 |
+| 高通855 | MobileNetV1 | FP32 baseline | 33.5086 | 19.5773 | 11.7534 | 31.3474 | 18.5382 | 10.0811 |
+| 高通855 | MobileNetV1 | quant_aware | 36.7067 | 21.628 | 11.0372 | 14.0238 | 8.199 | 4.2588 |
+| 高通855 | MobileNetV1 | quant_post | 37.0498 | 21.7081 | 11.0779 | 14.0947 | 8.1926 | 4.2934 |
+| 高通855 | MobileNetV2 | FP32 baseline | 25.0396 | 15.2862 | 9.6609 | 22.909 | 14.1797 | 8.8325 |
+| 高通855 | MobileNetV2 | quant_aware | 28.1583 | 18.3317 | 11.8103 | 16.9158 | 11.1606 | 7.4148 |
+| 高通855 | MobileNetV2 | quant_post | 28.1631 | 18.3917 | 11.8333 | 16.9399 | 11.1772 | 7.4176 |
+| 高通855 | ResNet50 | FP32 baseline | 185.3705 | 113.0825 | 87.0741 | 177.7367 | 110.0433 | 74.4114 |
+| 高通855 | ResNet50 | quant_aware | 327.6883 | 202.4536 | 106.243 | 243.5621 | 150.0542 | 78.4205 |
+| 高通855 | ResNet50 | quant_post | 328.2683 | 201.9937 | 106.744 | 242.6397 | 150.0338 | 79.8659 |
+| 麒麟970 | MobileNetV1 | FP32 baseline | 101.2455 | 56.4053 | 35.6484 | 94.8985 | 51.7251 | 31.9511 |
+| 麒麟970 | MobileNetV1 | quant_aware | 62.5012 | 32.1863 | 16.6018 | 57.7477 | 29.2116 | 15.0703 |
+| 麒麟970 | MobileNetV1 | quant_post | 62.4412 | 32.2585 | 16.6215 | 57.825 | 29.2573 | 15.1206 |
+| 麒麟970 | MobileNetV2 | FP32 baseline | 70.4176 | 42.0795 | 25.1939 | 68.9597 | 39.2145 | 22.6617 |
+| 麒麟970 | MobileNetV2 | quant_aware | 52.9961 | 31.5323 | 22.1447 | 49.4858 | 28.0856 | 18.7287 |
+| 麒麟970 | MobileNetV2 | quant_post | 53.0961 | 31.7987 | 21.8334 | 49.383 | 28.2358 | 18.3642 |
+| 麒麟970 | ResNet50 | FP32 baseline | 586.8943 | 344.0858 | 228.2293 | 573.3344 | 351.4332 | 225.8006 |
+| 麒麟970 | ResNet50 | quant_aware | 488.361 | 260.1697 | 142.416 | 479.5668 | 249.8485 | 138.1742 |
+| 麒麟970 | ResNet50 | quant_post | 489.6188 | 258.3279 | 142.6063 | 480.0064 | 249.5339 | 138.5284 |
+
+
+
+
+
+### 1.2 剪裁
+
+
+PaddleLite推理耗时说明:
+
+环境:Qualcomm SnapDragon 845 + armv8
+
+速度指标:Thread1/Thread2/Thread4耗时
+
+PaddleLite版本: v2.3
+
+
+| 模型 | 压缩方法 | Top-1/Top-5 Acc | 模型体积(MB) | GFLOPs |PaddleLite推理耗时|TensorRT推理速度(FPS)| 下载 |
+|:--:|:---:|:--:|:--:|:--:|:--:|:--:|:--:|
+| MobileNetV1 | Baseline | 70.99%/89.68% | 17 | 1.11 |66.052\35.8014\19.5762|-| [下载链接](http://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV1_pretrained.tar) |
+| MobileNetV1 | uniform -50% | 69.4%/88.66% (-1.59%/-1.02%) | 9 | 0.56 | 33.5636\18.6834\10.5076|-|[下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/MobileNetV1_uniform-50.tar) |
+| MobileNetV1 | sensitive -30% | 70.4%/89.3% (-0.59%/-0.38%) | 12 | 0.74 | 46.5958\25.3098\13.6982|-|[下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/MobileNetV1_sensitive-30.tar) |
+| MobileNetV1 | sensitive -50% | 69.8% / 88.9% (-1.19%/-0.78%) | 9 | 0.56 |37.9892\20.7882\11.3144|-| [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/MobileNetV1_sensitive-50.tar) |
+| MobileNetV2 | - | 72.15%/90.65% | 15 | 0.59 |41.7874\23.375\13.3998|-| [下载链接](https://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV2_pretrained.tar) |
+| MobileNetV2 | uniform -50% | 65.79%/86.11% (-6.35%/-4.47%) | 11 | 0.296 |23.8842\13.8698\8.5572|-| [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/MobileNetV2_uniform-50.tar) |
+| ResNet34 | - | 74.57%/92.14% | 84 | 7.36 |217.808\139.943\96.7504|342.32| [下载链接](https://paddle-imagenet-models-name.bj.bcebos.com/ResNet34_pretrained.tar) |
+| ResNet34 | uniform -50% | 70.99%/89.95% (-3.58%/-2.19%) | 41 | 3.67 |114.787\75.0332\51.8438|452.41| [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/ResNet34_uniform-50.tar) |
+| ResNet34 | auto -55.05% | 70.24%/89.63% (-4.33%/-2.51%) | 33 | 3.31 |105.924\69.3222\48.0246|457.25| [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/ResNet34_auto-55.tar) |
+
+
+### 1.3 蒸馏
+
+| 模型 | 压缩方法 | Top-1/Top-5 Acc | 模型体积(MB) | 下载 |
+|:--:|:---:|:--:|:--:|:--:|
+| MobileNetV1 | student | 70.99%/89.68% | 17 | [下载链接](http://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV1_pretrained.tar) |
+|ResNet50_vd|teacher|79.12%/94.44%| 99 | [下载链接](https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_pretrained.tar) |
+|MobileNetV1|ResNet50_vd[1](#trans1) distill|72.77%/90.68% (+1.78%/+1.00%)| 17 | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/MobileNetV1_distilled.tar) |
+| MobileNetV2 | student | 72.15%/90.65% | 15 | [下载链接](https://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV2_pretrained.tar) |
+| MobileNetV2 | ResNet50_vd distill | 74.28%/91.53% (+2.13%/+0.88%) | 15 | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/MobileNetV2_distilled.tar) |
+| ResNet50 | student | 76.50%/93.00% | 99 | [下载链接](http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_pretrained.tar) |
+|ResNet101|teacher|77.56%/93.64%| 173 | [下载链接](http://paddle-imagenet-models-name.bj.bcebos.com/ResNet101_pretrained.tar) |
+| ResNet50 | ResNet101 distill | 77.29%/93.65% (+0.79%/+0.65%) | 99 | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/ResNet50_distilled.tar) |
+
+注意:带"_vd"后缀代表该预训练模型使用了Mixup,Mixup相关介绍参考[mixup: Beyond Empirical Risk Minimization](https://arxiv.org/abs/1710.09412)
+
+### 1.4 搜索
+
+数据集: ImageNet1000
+
+| 模型 | 压缩方法 | Top-1/Top-5 Acc | 模型体积(MB) | GFLOPs | 下载 |
+|:--:|:---:|:--:|:--:|:--:|:--:|
+| MobileNetV2 | - | 72.15%/90.65% | 15 | 0.59 | [下载链接](https://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV2_pretrained.tar) |
+| MobileNetV2 | SANAS | 71.518%/90.208% (-0.632%/-0.442%) | 14 | 0.295 | [下载链接](https://paddlemodels.cdn.bcebos.com/PaddleSlim/MobileNetV2_sanas.tar) |
+
+数据集: Cifar10
+
+| 模型 |压缩方法 | Acc | 模型参数(MB) | 下载 |
+|:---:|:--:|:--:|:--:|:--:|
+| Darts | - | 97.135% | 3.767 | - |
+| Darts_SA(基于Darts搜索空间) | SANAS | 97.276%(+0.141%) | 3.344(-11.2%) | - |
+
+Note: MobileNetV2_NAS 的token是:[4, 4, 5, 1, 1, 2, 1, 1, 0, 2, 6, 2, 0, 3, 4, 5, 0, 4, 5, 5, 1, 4, 8, 0, 0]. Darts_SA的token是:[5, 5, 0, 5, 5, 10, 7, 7, 5, 7, 7, 11, 10, 12, 10, 0, 5, 3, 10, 8].
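+
+如需根据上述token复现对应的网络结构,可参考如下示意代码(`tokens2arch`接口的具体签名以实际安装的PaddleSlim版本为准):
+
+```python
+import paddleslim as slim
+
+# 示意:将模型库给出的token还原为网络结构
+tokens = [4, 4, 5, 1, 1, 2, 1, 1, 0, 2, 6, 2, 0, 3, 4, 5, 0, 4, 5, 5, 1, 4, 8, 0, 0]
+sanas = slim.nas.SANAS(configs=[('MobileNetV2Space')], server_addr=("", 8337), save_checkpoint=None)
+archs = sanas.tokens2arch(tokens)[0]
+```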
+
+
+
+## 2. 目标检测
+
+### 2.1 量化
+
+数据集: COCO 2017
+
+| 模型 | 压缩方法 | 数据集 | Image/GPU | 输入608 Box AP | 输入416 Box AP | 输入320 Box AP | 模型体积(MB) | TensorRT时延(V100, ms) | 下载 |
+| :----------------------------: | :---------: | :----: | :-------: | :------------: | :------------: | :------------: | :------------: | :----------: |:----------: |
+| MobileNet-V1-YOLOv3 | - | COCO | 8 | 29.3 | 29.3 | 27.1 | 95 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) |
+| MobileNet-V1-YOLOv3 | quant_post | COCO | 8 | 27.9 (-1.4)| 28.0 (-1.3) | 26.0 (-1.0) | 25 | - | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_coco_quant_post.tar) |
+| MobileNet-V1-YOLOv3 | quant_aware | COCO | 8 | 28.1 (-1.2)| 28.2 (-1.1) | 25.8 (-1.2) | 26.3 | - | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenet_coco_quant_aware.tar) |
+| R34-YOLOv3 | - | COCO | 8 | 36.2 | 34.3 | 31.4 | 162 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34.tar) |
+| R34-YOLOv3 | quant_post | COCO | 8 | 35.7 (-0.5) | - | - | 42.7 | - | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r34_coco_quant_post.tar) |
+| R34-YOLOv3 | quant_aware | COCO | 8 | 35.2 (-1.0) | 33.3 (-1.0) | 30.3 (-1.1)| 44 | - | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r34_coco_quant_aware.tar) |
+| R50-dcn-YOLOv3 obj365_pretrain | - | COCO | 8 | 41.4 | - | - | 177 | 18.56 |[下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r50vd_dcn_obj365_pretrained_coco.tar) |
+| R50-dcn-YOLOv3 obj365_pretrain | quant_aware | COCO | 8 | 40.6 (-0.8) | 37.5 | 34.1 | 66 | 14.64 | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r50vd_dcn_obj365_pretrained_coco_quant_aware.tar) |
+
+
+
+数据集:WIDER-FACE
+
+
+
+| 模型 | 压缩方法 | Image/GPU | 输入尺寸 | Easy/Medium/Hard | 模型体积(MB) | 下载 |
+| :------------: | :---------: | :-------: | :------: | :-----------------------------: | :------------: | :----------------------------------------------------------: |
+| BlazeFace | - | 8 | 640 | 91.5/89.2/79.7 | 815 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/blazeface_original.tar) |
+| BlazeFace | quant_post | 8 | 640 | 87.8/85.1/74.9 (-3.7/-4.1/-4.8) | 228 | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/blazeface_origin_quant_post.tar) |
+| BlazeFace | quant_aware | 8 | 640 | 90.5/87.9/77.6 (-1.0/-1.3/-2.1) | 228 | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/blazeface_origin_quant_aware.tar) |
+| BlazeFace-Lite | - | 8 | 640 | 90.9/88.5/78.1 | 711 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/blazeface_lite.tar) |
+| BlazeFace-Lite | quant_post | 8 | 640 | 89.4/86.7/75.7 (-1.5/-1.8/-2.4) | 211 | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/blazeface_lite_quant_post.tar) |
+| BlazeFace-Lite | quant_aware | 8 | 640 | 89.7/87.3/77.0 (-1.2/-1.2/-1.1) | 211 | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/blazeface_lite_quant_aware.tar) |
+| BlazeFace-NAS | - | 8 | 640 | 83.7/80.7/65.8 | 244 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/blazeface_nas.tar) |
+| BlazeFace-NAS | quant_post | 8 | 640 | 81.6/78.3/63.6 (-2.1/-2.4/-2.2) | 71 | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/blazeface_nas_quant_post.tar) |
+| BlazeFace-NAS | quant_aware | 8 | 640 | 83.1/79.7/64.2 (-0.6/-1.0/-1.6) | 71 | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/blazeface_nas_quant_aware.tar) |
+
+### 2.2 剪裁
+
+
+数据集:Pascal VOC & COCO 2017
+
+PaddleLite推理耗时说明:
+
+环境:Qualcomm SnapDragon 845 + armv8
+
+速度指标:Thread1/Thread2/Thread4耗时
+
+PaddleLite版本: v2.3
+
+| 模型 | 压缩方法 | 数据集 | Image/GPU | 输入608 Box AP | 输入416 Box AP | 输入320 Box AP | 模型体积(MB) | GFLOPs (608*608) | PaddleLite推理耗时(ms)(608*608) | TensorRT推理速度(FPS)(608*608) | 下载 |
+| :----------------------------: | :---------------: | :--------: | :-------: | :------------: | :------------: | :------------: | :----------: | :--------------: | :--------------: | :--------------: | :-----------------------------------: |
+| MobileNet-V1-YOLOv3 | Baseline | Pascal VOC | 8 | 76.2 | 76.7 | 75.3 | 94 | 40.49 | 1238\796.943\520.101|60.04| [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) |
+| MobileNet-V1-YOLOv3 | sensitive -52.88% | Pascal VOC | 8 | 77.6 (+1.4) | 77.7 (+1.0) | 75.5 (+0.2) | 31 | 19.08 | 602.497\353.759\222.427 |99.36| [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenet_v1_voc_prune.tar) |
+| MobileNet-V1-YOLOv3 | - | COCO | 8 | 29.3 | 29.3 | 27.0 | 95 | 41.35 |-|-| [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) |
+| MobileNet-V1-YOLOv3 | sensitive -51.77% | COCO | 8 | 26.0 (-3.3) | 25.1 (-4.2) | 22.6 (-4.4) | 32 | 19.94 |-|73.93| [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenet_v1_prune.tar) |
+| R50-dcn-YOLOv3 | - | COCO | 8 | 39.1 | - | - | 177 | 89.60 |-|27.68| [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r50vd_dcn.tar) |
+| R50-dcn-YOLOv3 | sensitive -9.37% | COCO | 8 | 39.3 (+0.2) | - | - | 150 | 81.20 |-|30.08| [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r50vd_dcn_prune.tar) |
+| R50-dcn-YOLOv3 | sensitive -24.68% | COCO | 8 | 37.3 (-1.8) | - | - | 113 | 67.48 |-|34.32| [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r50vd_dcn_prune578.tar) |
+| R50-dcn-YOLOv3 obj365_pretrain | - | COCO | 8 | 41.4 | - | - | 177 | 89.60 |-|-| [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r50vd_dcn_obj365_pretrained_coco.tar) |
+| R50-dcn-YOLOv3 obj365_pretrain | sensitive -9.37% | COCO | 8 | 40.5 (-0.9) | - | - | 150 | 81.20 |-|-| [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r50vd_dcn_obj365_pretrained_coco_prune.tar) |
+| R50-dcn-YOLOv3 obj365_pretrain | sensitive -24.68% | COCO | 8 | 37.8 (-3.3) | - | - | 113 | 67.48 |-|-| [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r50vd_dcn_obj365_pretrained_coco_prune578.tar) |
+
+### 2.3 蒸馏
+
+数据集:Pascal VOC & COCO 2017
+
+
+| 模型 | 压缩方法 | 数据集 | Image/GPU | 输入608 Box AP | 输入416 Box AP | 输入320 Box AP | 模型体积(MB) | 下载 |
+| :-----------------: | :---------------------: | :--------: | :-------: | :------------: | :------------: | :------------: | :------------: | :----------------------------------------------------------: |
+| MobileNet-V1-YOLOv3 | - | Pascal VOC | 8 | 76.2 | 76.7 | 75.3 | 94 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) |
+| ResNet34-YOLOv3 | - | Pascal VOC | 8 | 82.6 | 81.9 | 80.1 | 162 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34_voc.tar) |
+| MobileNet-V1-YOLOv3 | ResNet34-YOLOv3 distill | Pascal VOC | 8 | 79.0 (+2.8) | 78.2 (+1.5) | 75.5 (+0.2) | 94 | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_voc_distilled.tar) |
+| MobileNet-V1-YOLOv3 | - | COCO | 8 | 29.3 | 29.3 | 27.0 | 95 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) |
+| ResNet34-YOLOv3 | - | COCO | 8 | 36.2 | 34.3 | 31.4 | 163 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34.tar) |
+| MobileNet-V1-YOLOv3 | ResNet34-YOLOv3 distill | COCO | 8 | 31.4 (+2.1) | 30.0 (+0.7) | 27.1 (+0.1) | 95 | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_coco_distilled.tar) |
+
+
+### 2.4 搜索
+
+数据集:WIDER-FACE
+
+| 模型 | 压缩方法 | Image/GPU | 输入尺寸 | Easy/Medium/Hard | 模型体积(KB) | 硬件延时(ms)| 下载 |
+| :------------: | :---------: | :-------: | :------: | :-----------------------------: | :------------: | :------------: | :----------------------------------------------------------: |
+| BlazeFace | - | 8 | 640 | 91.5/89.2/79.7 | 815 | 71.862 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/blazeface_original.tar) |
+| BlazeFace-NAS | - | 8 | 640 | 83.7/80.7/65.8 | 244 | 21.117 |[下载链接](https://paddlemodels.bj.bcebos.com/object_detection/blazeface_nas.tar) |
+| BlazeFace-NASV2 | SANAS | 8 | 640 | 87.0/83.7/68.5 | 389 | 22.558 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/blazeface_nas2.tar) |
+
+Note: 硬件延时时间是利用提供的硬件延时表得到的,硬件延时表是在855芯片上基于PaddleLite测试的结果。BlazeFace-NASV2的详细配置在[这里](https://github.com/PaddlePaddle/PaddleDetection/blob/master/configs/face_detection/blazeface_nas_v2.yml).
+
+## 3. 图像分割
+
+数据集:Cityscapes
+
+### 3.1 量化
+
+| 模型 | 压缩方法 | mIoU | 模型体积(MB) | 下载 |
+| :--------------------: | :---------: | :-----------: | :------------: | :----------------------------------------------------------: |
+| DeepLabv3+/MobileNetv1 | - | 63.26 | 6.6 | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/deeplabv3_mobilenetv1.tar ) |
+| DeepLabv3+/MobileNetv1 | quant_post | 58.63 (-4.63) | 1.8 | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/deeplabv3_mobilenetv1_2049x1025_quant_post.tar) |
+| DeepLabv3+/MobileNetv1 | quant_aware | 62.03 (-1.23) | 1.8 | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/deeplabv3_mobilenetv1_2049x1025_quant_aware.tar) |
+| DeepLabv3+/MobileNetv2 | - | 69.81 | 7.4 | [下载链接](https://paddleseg.bj.bcebos.com/models/mobilenet_cityscapes.tgz) |
+| DeepLabv3+/MobileNetv2 | quant_post | 67.59 (-2.22) | 2.1 | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/deeplabv3_mobilenetv2_2049x1025_quant_post.tar) |
+| DeepLabv3+/MobileNetv2 | quant_aware | 68.33 (-1.48) | 2.1 | [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/deeplabv3_mobilenetv2_2049x1025_quant_aware.tar) |
+
+图像分割模型Lite时延(ms), 输入尺寸769x769
+
+| 设备 | 模型类型 | 压缩策略 | armv7 Thread 1 | armv7 Thread 2 | armv7 Thread 4 | armv8 Thread 1 | armv8 Thread 2 | armv8 Thread 4 |
+| ------- | ---------------------- | ------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- |
+| 高通835 | Deeplabv3- MobileNetV1 | FP32 baseline | 1227.9894 | 734.1922 | 527.9592 | 1109.96 | 699.3818 | 479.0818 |
+| 高通835 | Deeplabv3- MobileNetV1 | quant_aware | 848.6544 | 512.785 | 382.9915 | 752.3573 | 455.0901 | 307.8808 |
+| 高通835 | Deeplabv3- MobileNetV1 | quant_post | 840.2323 | 510.103 | 371.9315 | 748.9401 | 452.1745 | 309.2084 |
+| 高通835 | Deeplabv3-MobileNetV2 | FP32 baseline | 1282.8126 | 793.2064 | 653.6538 | 1193.9908 | 737.1827 | 593.4522 |
+| 高通835 | Deeplabv3-MobileNetV2 | quant_aware | 976.0495 | 659.0541 | 513.4279 | 892.1468 | 582.9847 | 484.7512 |
+| 高通835 | Deeplabv3-MobileNetV2 | quant_post | 981.44 | 658.4969 | 538.6166 | 885.3273 | 586.1284 | 484.0018 |
+| 高通855 | Deeplabv3- MobileNetV1 | FP32 baseline | 568.8748 | 339.8578 | 278.6316 | 420.6031 | 281.3197 | 217.5222 |
+| 高通855 | Deeplabv3- MobileNetV1 | quant_aware | 608.7578 | 347.2087 | 260.653 | 241.2394 | 177.3456 | 143.9178 |
+| 高通855 | Deeplabv3- MobileNetV1 | quant_post | 609.0142 | 347.3784 | 259.9825 | 239.4103 | 180.1894 | 139.9178 |
+| 高通855 | Deeplabv3-MobileNetV2 | FP32 baseline | 639.4425 | 390.1851 | 322.7014 | 477.7667 | 339.7411 | 262.2847 |
+| 高通855 | Deeplabv3-MobileNetV2 | quant_aware | 703.7275 | 497.689 | 417.1296 | 394.3586 | 300.2503 | 239.9204 |
+| 高通855 | Deeplabv3-MobileNetV2 | quant_post | 705.7589 | 474.4076 | 427.2951 | 394.8352 | 297.4035 | 264.6724 |
+| 麒麟970 | Deeplabv3- MobileNetV1 | FP32 baseline | 1682.1792 | 1437.9774 | 1181.0246 | 1261.6739 | 1068.6537 | 690.8225 |
+| 麒麟970 | Deeplabv3- MobileNetV1 | quant_aware | 1062.3394 | 1248.1014 | 878.3157 | 774.6356 | 710.6277 | 528.5376 |
+| 麒麟970 | Deeplabv3- MobileNetV1 | quant_post | 1109.1917 | 1339.6218 | 866.3587 | 771.5164 | 716.5255 | 500.6497 |
+| 麒麟970 | Deeplabv3-MobileNetV2 | FP32 baseline | 1771.1301 | 1746.0569 | 1222.4805 | 1448.9739 | 1192.4491 | 760.606 |
+| 麒麟970 | Deeplabv3-MobileNetV2 | quant_aware | 1320.2905 | 921.4522 | 676.0732 | 1145.8801 | 821.5685 | 590.1713 |
+| 麒麟970 | Deeplabv3-MobileNetV2 | quant_post | 1320.386 | 918.5328 | 672.2481 | 1020.753 | 820.094 | 591.4114 |
+
+
+
+
+
+### 3.2 剪裁
+
+PaddleLite推理耗时说明:
+
+环境:Qualcomm SnapDragon 845 + armv8
+
+速度指标:Thread1/Thread2/Thread4耗时
+
+PaddleLite版本: v2.3
+
+| 模型 | 压缩方法 | mIoU | 模型体积(MB) | GFLOPs | PaddleLite推理耗时 | TensorRT推理速度(FPS) | 下载 |
+| :-------: | :---------------: | :-----------: | :------------: | :----: | :------------: | :----: | :--------------------------------------: |
+| fast-scnn | baseline | 69.64 | 11 | 14.41 | 1226.36\682.96\415.664 |39.53| [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/fast_scnn_cityscape.tar) |
+| fast-scnn | uniform -17.07% | 69.58 (-0.06) | 8.5 | 11.95 | 1140.37\656.612\415.888 |42.01| [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/fast_scnn_cityscape_uniform-17.tar) |
+| fast-scnn | sensitive -47.60% | 66.68 (-2.96) | 5.7 | 7.55 | 866.693\494.467\291.748 |51.48| [下载链接](https://paddlemodels.bj.bcebos.com/PaddleSlim/fast_scnn_cityscape_sensitive-47.tar) |
diff --git a/docs/zh_cn/model_zoo/distillation_model_zoo.md b/docs/zh_cn/model_zoo/distillation_model_zoo.md
new file mode 100644
index 0000000000000000000000000000000000000000..bba7a41d2334fa5292cb7f97c3ba3708ff316f3e
--- /dev/null
+++ b/docs/zh_cn/model_zoo/distillation_model_zoo.md
@@ -0,0 +1 @@
+# 蒸馏模型库
diff --git a/docs/zh_cn/model_zoo/index.rst b/docs/zh_cn/model_zoo/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d38a3ac11934ddf98cef5e5787b4de7c5b46bc85
--- /dev/null
+++ b/docs/zh_cn/model_zoo/index.rst
@@ -0,0 +1,13 @@
+
+模型库
+========
+
+.. toctree::
+ :maxdepth: 1
+
+ model_zoo.md
+ quant_model_zoo.md
+ distillation_model_zoo.md
+ prune_model_zoo.md
+ nas_model_zoo.md
+
diff --git a/docs/zh_cn/model_zoo/model_zoo.md b/docs/zh_cn/model_zoo/model_zoo.md
new file mode 100644
index 0000000000000000000000000000000000000000..a990f4db22e28fa327f355fa2af6a8ee892fcc9b
--- /dev/null
+++ b/docs/zh_cn/model_zoo/model_zoo.md
@@ -0,0 +1,27 @@
+## 模型库概览
+
+### 量化主要结论
+
+| 任务 | 模型 | 数据集 | 结论 | 更多细节和模型下载 |
+|:--:|:---:|:--:|:--:|:--:|
+| 分类| MobileNetV1 | ImageNet | top1 -0.39%<br>高通835 1.6-2倍加速<br>高通855 armv8 2倍加速<br>麒麟970 1.6-2倍加速 |[详细数据和模型下载]() |
+
+### 蒸馏主要结论
+
+| 任务 | 模型 | 数据集 | 结论 | 更多细节和模型下载 |
+|:--:|:---:|:--:|:--:|:--:|
+| 分类| MobileNetV1 | ImageNet | top1 -0.39%<br>高通835 1.6-2倍加速<br>高通855 armv8 2倍加速<br>麒麟970 1.6-2倍加速 |[详细数据和模型下载]() |
+
+### 剪裁主要结论
+
+
+| 任务 | 模型 | 数据集 | 结论 | 更多细节和模型下载 |
+|:--:|:---:|:--:|:--:|:--:|
+| 分类| MobileNetV1 | ImageNet | top1 -0.39%<br>高通835 1.6-2倍加速<br>高通855 armv8 2倍加速<br>麒麟970 1.6-2倍加速 |[详细数据和模型下载]() |
+
+### nas 主要结论
+
+
+| 任务 | 模型 | 数据集 | 结论 | 更多细节和模型下载 |
+|:--:|:---:|:--:|:--:|:--:|
+| 分类| MobileNetV1 | ImageNet | top1 -0.39%<br>高通835 1.6-2倍加速<br>高通855 armv8 2倍加速<br>麒麟970 1.6-2倍加速 |[详细数据和模型下载]() |
diff --git a/docs/zh_cn/model_zoo/nas_model_zoo.md b/docs/zh_cn/model_zoo/nas_model_zoo.md
new file mode 100644
index 0000000000000000000000000000000000000000..f924d4167f0045878343d0afa9647b1bd0605144
--- /dev/null
+++ b/docs/zh_cn/model_zoo/nas_model_zoo.md
@@ -0,0 +1 @@
+# 模型结构搜索模型库
diff --git a/docs/zh_cn/model_zoo/prune_model_zoo.md b/docs/zh_cn/model_zoo/prune_model_zoo.md
new file mode 100644
index 0000000000000000000000000000000000000000..c9e4c16f193923b838cf220317a3cef8f4693570
--- /dev/null
+++ b/docs/zh_cn/model_zoo/prune_model_zoo.md
@@ -0,0 +1 @@
+# 剪裁模型库
diff --git a/docs/zh_cn/model_zoo/quant_model_zoo.md b/docs/zh_cn/model_zoo/quant_model_zoo.md
new file mode 100644
index 0000000000000000000000000000000000000000..c5062253a7ac12032e3b2e6df38d25758cc7fd10
--- /dev/null
+++ b/docs/zh_cn/model_zoo/quant_model_zoo.md
@@ -0,0 +1 @@
+# 量化模型库
diff --git a/docs/zh_cn/quick_start/distillation_tutorial.md b/docs/zh_cn/quick_start/distillation_tutorial.md
new file mode 100755
index 0000000000000000000000000000000000000000..aa2dd3e9f33d6cb8dc87251c600eab1e88bfac59
--- /dev/null
+++ b/docs/zh_cn/quick_start/distillation_tutorial.md
@@ -0,0 +1,113 @@
+# 图像分类模型知识蒸馏-快速开始
+
+该教程以图像分类模型MobileNetV1为例,说明如何快速使用[PaddleSlim的知识蒸馏接口](https://paddlepaddle.github.io/PaddleSlim/api/single_distiller_api/)。
+该示例包含以下步骤:
+
+1. 导入依赖
+2. 定义student_program和teacher_program
+3. 选择特征图
+4. 合并program(merge)并添加蒸馏loss
+5. 模型训练
+
+以下章节依次介绍每个步骤的内容。
+
+## 1. 导入依赖
+
+PaddleSlim依赖Paddle1.7版本,请确认已正确安装Paddle,然后按以下方式导入Paddle和PaddleSlim:
+
+```python
+import paddle
+import paddle.fluid as fluid
+import paddleslim as slim
+```
+
+## 2. 定义student_program和teacher_program
+
+本教程在MNIST数据集上进行知识蒸馏的训练和验证,输入图片尺寸为`[1, 28, 28]`,输出类别数为10。
+选择`ResNet50`作为teacher对`MobileNet`结构的student进行蒸馏训练。
+
+```python
+model = slim.models.MobileNet()
+student_program = fluid.Program()
+student_startup = fluid.Program()
+with fluid.program_guard(student_program, student_startup):
+ image = fluid.data(
+ name='image', shape=[None] + [1, 28, 28], dtype='float32')
+ label = fluid.data(name='label', shape=[None, 1], dtype='int64')
+ out = model.net(input=image, class_dim=10)
+ cost = fluid.layers.cross_entropy(input=out, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+ acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+ acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+```
+
+接着,同样定义teacher网络,并使用Executor初始化teacher_program中的参数:
+
+```python
+teacher_model = slim.models.ResNet50()
+teacher_program = fluid.Program()
+teacher_startup = fluid.Program()
+with fluid.program_guard(teacher_program, teacher_startup):
+ with fluid.unique_name.guard():
+ image = fluid.data(
+ name='image', shape=[None] + [1, 28, 28], dtype='float32')
+ predict = teacher_model.net(image, class_dim=10)
+exe = fluid.Executor(fluid.CPUPlace())
+exe.run(teacher_startup)
+```
+
+## 3. 选择特征图
+
+我们可以用student_program的list_vars方法来观察其中全部的Variables,从中选出一个或多个变量(Variable)来拟合teacher相应的变量。
+
+```python
+# get all student variables
+student_vars = []
+for v in student_program.list_vars():
+ student_vars.append((v.name, v.shape))
+#uncomment the following lines to observe student's variables for distillation
+#print("="*50+"student_model_vars"+"="*50)
+#print(student_vars)
+
+# get all teacher variables
+teacher_vars = []
+for v in teacher_program.list_vars():
+ teacher_vars.append((v.name, v.shape))
+#uncomment the following lines to observe teacher's variables for distillation
+#print("="*50+"teacher_model_vars"+"="*50)
+#print(teacher_vars)
+```
+
+经过筛选我们可以看到,teacher_program中的'bn5c_branch2b.output.1.tmp_3'和student_program的'depthwise_conv2d_11.tmp_0'尺寸一致,可以组成蒸馏损失函数。
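+
+上述筛选也可以用代码自动完成,例如(仅为示意,延续上文的student_vars与teacher_vars):
+
+```python
+# 示意:筛选出teacher与student中形状一致的变量对,作为蒸馏损失的候选
+matched_pairs = [(t_name, s_name)
+                 for t_name, t_shape in teacher_vars
+                 for s_name, s_shape in student_vars
+                 if t_shape == s_shape]
+```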
+
+## 4. 合并program (merge)并添加蒸馏loss
+merge操作会将student_program和teacher_program中的所有Variables和Op添加到同一个Program中。同时,为了避免两个program中的同名变量引起命名冲突,merge会为teacher_program中的Variables统一添加命名前缀name_prefix,其默认值是'teacher_'。
+
+为了确保teacher网络和student网络输入的数据是一样的,merge操作也会对两个program的输入数据层进行合并,因此需要指定数据层名称的映射关系data_name_map,其中key是teacher的输入数据名称,value是student的输入数据名称。
+
+```python
+data_name_map = {'image': 'image'}
+slim.dist.merge(teacher_program, student_program, data_name_map, fluid.CPUPlace())
+with fluid.program_guard(student_program, student_startup):
+ l2_loss = slim.dist.l2_loss('teacher_bn5c_branch2b.output.1.tmp_3', 'depthwise_conv2d_11.tmp_0', student_program)
+ loss = l2_loss + avg_cost
+ opt = fluid.optimizer.Momentum(0.01, 0.9)
+ opt.minimize(loss)
+exe.run(student_startup)
+```
+
+## 5. 模型训练
+
+为了快速执行该示例,我们选取简单的MNIST数据,Paddle框架的`paddle.dataset.mnist`包定义了MNIST数据的下载和读取。 代码如下:
+
+```python
+train_reader = paddle.fluid.io.batch(
+ paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
+train_feeder = fluid.DataFeeder(['image', 'label'], fluid.CPUPlace(), student_program)
+```
+
+```python
+for data in train_reader():
+ acc1, acc5, loss_np = exe.run(student_program, feed=train_feeder.feed(data), fetch_list=[acc_top1.name, acc_top5.name, loss.name])
+ print("Acc1: {:.6f}, Acc5: {:.6f}, Loss: {:.6f}".format(acc1.mean(), acc5.mean(), loss_np.mean()))
+```
diff --git a/docs/zh_cn/quick_start/index.rst b/docs/zh_cn/quick_start/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..713218aa032c81885e08346df3fa6136d8aa96cf
--- /dev/null
+++ b/docs/zh_cn/quick_start/index.rst
@@ -0,0 +1,13 @@
+
+快速开始
+========
+
+.. toctree::
+ :maxdepth: 1
+
+ pruning_tutorial.md
+ distillation_tutorial.md
+ quant_aware_tutorial.md
+ quant_post_static_tutorial.md
+ nas_tutorial.md
+
diff --git a/docs/zh_cn/quick_start/nas_tutorial.md b/docs/zh_cn/quick_start/nas_tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..8fc0d6d084770c65a13f66932239bf65810a44fc
--- /dev/null
+++ b/docs/zh_cn/quick_start/nas_tutorial.md
@@ -0,0 +1,156 @@
+# 图像分类网络结构搜索-快速开始
+
+该教程以图像分类模型MobileNetV2为例,说明如何在cifar10数据集上快速使用[网络结构搜索接口](../api/nas_api.md)。
+该示例包含以下步骤:
+
+1. 导入依赖
+2. 初始化SANAS搜索实例
+3. 构建网络
+4. 定义输入数据函数
+5. 定义训练函数
+6. 定义评估函数
+7. 启动搜索实验
+ 7.1 获取模型结构
+ 7.2 构造program
+ 7.3 定义输入数据
+ 7.4 训练模型
+ 7.5 评估模型
+ 7.6 回传当前模型的得分
+8. 完整示例
+
+
+以下章节依次介绍每个步骤的内容。
+
+## 1. 导入依赖
+请确认已正确安装Paddle,导入需要的依赖包。
+```python
+import paddle
+import paddle.fluid as fluid
+import paddleslim as slim
+import numpy as np
+```
+
+## 2. 初始化SANAS搜索实例
+```python
+sanas = slim.nas.SANAS(configs=[('MobileNetV2Space')], server_addr=("", 8337), save_checkpoint=None)
+```
+
+## 3. 构建网络
+根据传入的网络结构构造训练program和测试program。
+```python
+def build_program(archs):
+ train_program = fluid.Program()
+ startup_program = fluid.Program()
+ with fluid.program_guard(train_program, startup_program):
+ data = fluid.data(name='data', shape=[None, 3, 32, 32], dtype='float32')
+ label = fluid.data(name='label', shape=[None, 1], dtype='int64')
+ output = archs(data)
+ output = fluid.layers.fc(input=output, size=10)
+
+ softmax_out = fluid.layers.softmax(input=output, use_cudnn=False)
+ cost = fluid.layers.cross_entropy(input=softmax_out, label=label)
+ avg_cost = fluid.layers.mean(cost)
+ acc_top1 = fluid.layers.accuracy(input=softmax_out, label=label, k=1)
+ acc_top5 = fluid.layers.accuracy(input=softmax_out, label=label, k=5)
+ test_program = fluid.default_main_program().clone(for_test=True)
+
+ optimizer = fluid.optimizer.Adam(learning_rate=0.1)
+ optimizer.minimize(avg_cost)
+
+ place = fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(startup_program)
+ return exe, train_program, test_program, (data, label), avg_cost, acc_top1, acc_top5
+```
+
+## 4. 定义输入数据函数
+使用的数据集为cifar10,paddle框架中`paddle.dataset.cifar`包括了cifar数据集的下载和读取,代码如下:
+```python
+def input_data(inputs):
+ train_reader = paddle.fluid.io.batch(paddle.reader.shuffle(paddle.dataset.cifar.train10(cycle=False), buf_size=1024),batch_size=256)
+ train_feeder = fluid.DataFeeder(inputs, fluid.CPUPlace())
+ eval_reader = paddle.fluid.io.batch(paddle.dataset.cifar.test10(cycle=False), batch_size=256)
+ eval_feeder = fluid.DataFeeder(inputs, fluid.CPUPlace())
+ return train_reader, train_feeder, eval_reader, eval_feeder
+```
+
+## 5. 定义训练函数
+根据训练program和训练数据进行训练。
+```python
+def start_train(program, data_reader, data_feeder):
+ outputs = [avg_cost.name, acc_top1.name, acc_top5.name]
+ for data in data_reader():
+ batch_reward = exe.run(program, feed=data_feeder.feed(data), fetch_list = outputs)
+ print("TRAIN: loss: {}, acc1: {}, acc5:{}".format(batch_reward[0], batch_reward[1], batch_reward[2]))
+```
+
+## 6. 定义评估函数
+根据评估program和评估数据进行评估。
+```python
+def start_eval(program, data_reader, data_feeder):
+ reward = []
+ outputs = [avg_cost.name, acc_top1.name, acc_top5.name]
+ for data in data_reader():
+ batch_reward = exe.run(program, feed=data_feeder.feed(data), fetch_list = outputs)
+ reward_avg = np.mean(np.array(batch_reward), axis=1)
+ reward.append(reward_avg)
+ print("TEST: loss: {}, acc1: {}, acc5:{}".format(batch_reward[0], batch_reward[1], batch_reward[2]))
+ finally_reward = np.mean(np.array(reward), axis=0)
+ print("FINAL TEST: avg_cost: {}, acc1: {}, acc5: {}".format(finally_reward[0], finally_reward[1], finally_reward[2]))
+ return finally_reward
+```
+
+## 7. 启动搜索实验
+以下步骤拆解说明了如何获取当前模型结构,以及获取模型结构之后需要执行的操作;完整的搜索实验示例请参考步骤8。
+
+### 7.1 获取模型结构
+调用`next_archs()`函数获取到下一个模型结构。
+```python
+archs = sanas.next_archs()[0]
+```
+
+### 7.2 构造program
+调用步骤3中的函数,根据7.1中的模型结构构造相应的program。
+```python
+exe, train_program, eval_program, inputs, avg_cost, acc_top1, acc_top5 = build_program(archs)
+```
+
+### 7.3 定义输入数据
+```python
+train_reader, train_feeder, eval_reader, eval_feeder = input_data(inputs)
+```
+
+### 7.4 训练模型
+根据上面得到的训练program和评估数据启动训练。
+```python
+start_train(train_program, train_reader, train_feeder)
+```
+### 7.5 评估模型
+根据上面得到的评估program和评估数据启动评估。
+```python
+finally_reward = start_eval(eval_program, eval_reader, eval_feeder)
+```
+### 7.6 回传当前模型的得分
+```python
+sanas.reward(float(finally_reward[1]))
+```
+
+## 8. 完整示例
+以下是一个完整的搜索实验示例,示例中使用FLOPs作为约束条件,搜索实验一共搜索3个step,表示搜索到3个满足条件的模型结构进行训练,每搜索到一个网络结构训练7个epoch。
+```python
+for step in range(3):
+ archs = sanas.next_archs()[0]
+ exe, train_program, eval_program, inputs, avg_cost, acc_top1, acc_top5 = build_program(archs)
+ train_reader, train_feeder, eval_reader, eval_feeder = input_data(inputs)
+
+ current_flops = slim.analysis.flops(train_program)
+ if current_flops > 321208544:
+ continue
+
+ for epoch in range(7):
+ start_train(train_program, train_reader, train_feeder)
+
+ finally_reward = start_eval(eval_program, eval_reader, eval_feeder)
+
+ sanas.reward(float(finally_reward[1]))
+```
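+
+搜索结束后,还可以通过`current_info()`查看当前token与搜索过程中的最优token等信息(返回字段以实际PaddleSlim版本为准):
+
+```python
+# 示意:打印当前搜索状态信息
+print(sanas.current_info())
+```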
diff --git a/docs/zh_cn/quick_start/pruning_tutorial.md b/docs/zh_cn/quick_start/pruning_tutorial.md
new file mode 100755
index 0000000000000000000000000000000000000000..051a740f4602f2175537efe5eaf22b62a06c10e7
--- /dev/null
+++ b/docs/zh_cn/quick_start/pruning_tutorial.md
@@ -0,0 +1,89 @@
+# 图像分类模型通道剪裁-快速开始
+
+该教程以图像分类模型MobileNetV1为例,说明如何快速使用[PaddleSlim的卷积通道剪裁接口]()。
+该示例包含以下步骤:
+
+1. 导入依赖
+2. 构建模型
+3. 剪裁
+4. 训练剪裁后的模型
+
+以下章节依次介绍每个步骤的内容。
+
+## 1. 导入依赖
+
+PaddleSlim依赖Paddle1.7版本,请确认已正确安装Paddle,然后按以下方式导入Paddle和PaddleSlim:
+
+```python
+import paddle
+import paddle.fluid as fluid
+import paddleslim as slim
+```
+
+## 2. 构建网络
+
+该章节构造一个用于对MNIST数据进行分类的分类模型,选用`MobileNetV1`,并将输入大小设置为`[1, 28, 28]`,输出类别数为10。
+为了方便展示示例,我们在`paddleslim.models`下预定义了用于构建分类模型的方法,执行以下代码构建分类模型:
+
+```python
+exe, train_program, val_program, inputs, outputs = \
+    slim.models.image_classification("MobileNet", [1, 28, 28], 10, use_gpu=False)
+```
+
+>注意:paddleslim.models下的API并非PaddleSlim常规API,是为了简化示例而封装预定义的一系列方法,比如:模型结构的定义、Program的构建等。
+
+## 3. 剪裁卷积层通道
+
+### 3.1 计算剪裁之前的FLOPs
+
+```python
+FLOPs = slim.analysis.flops(train_program)
+print("FLOPs: {}".format(FLOPs))
+```
+
+### 3.2 剪裁
+
+我们这里对参数名为`conv2_1_sep_weights`和`conv2_2_sep_weights`的卷积层进行剪裁,各剪掉33%的通道数(与代码中`ratios=[0.33] * 2`对应)。
+代码如下所示:
+
+```python
+pruner = slim.prune.Pruner()
+pruned_program, _, _ = pruner.prune(
+ train_program,
+ fluid.global_scope(),
+ params=["conv2_1_sep_weights", "conv2_2_sep_weights"],
+ ratios=[0.33] * 2,
+ place=fluid.CPUPlace())
+```
+
+以上操作会修改`train_program`中对应卷积层参数的定义,同时对`fluid.global_scope()`中存储的参数数组进行裁剪。
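+
+可以读取`fluid.global_scope()`中的参数张量来确认剪裁已生效(示意代码,剪裁后输出通道数应减少约33%):
+
+```python
+import numpy as np
+
+# 示意:查看剪裁后卷积参数的形状
+param = fluid.global_scope().find_var("conv2_1_sep_weights").get_tensor()
+print(np.array(param).shape)
+```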
+
+### 3.3 计算剪裁之后的FLOPs
+
+```python
+FLOPs = slim.analysis.flops(train_program)
+print("FLOPs: {}".format(FLOPs))
+```
+
+## 4. 训练剪裁后的模型
+
+### 4.1 定义输入数据
+
+为了快速执行该示例,我们选取简单的MNIST数据,Paddle框架的`paddle.dataset.mnist`包定义了MNIST数据的下载和读取。
+代码如下:
+
+```python
+import paddle.dataset.mnist as reader
+train_reader = paddle.fluid.io.batch(
+ reader.train(), batch_size=128, drop_last=True)
+train_feeder = fluid.DataFeeder(inputs, fluid.CPUPlace())
+```
+
+### 4.2 执行训练
+以下代码执行了一个`epoch`的训练:
+
+```python
+for data in train_reader():
+ acc1, acc5, loss = exe.run(pruned_program, feed=train_feeder.feed(data), fetch_list=outputs)
+ print(acc1, acc5, loss)
+```
diff --git a/docs/zh_cn/quick_start/quant_aware_tutorial.md b/docs/zh_cn/quick_start/quant_aware_tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..bfafa2f4995bea098da7e365100c1aa87f4be451
--- /dev/null
+++ b/docs/zh_cn/quick_start/quant_aware_tutorial.md
@@ -0,0 +1,140 @@
+# 图像分类模型量化训练-快速开始
+
+该教程以图像分类模型MobileNetV1为例,说明如何快速使用PaddleSlim的[量化训练接口](../api_cn/quantization_api.html)。 该示例包含以下步骤:
+
+1. 导入依赖
+2. 构建模型
+3. 训练模型
+4. 量化
+5. 训练和测试量化后的模型
+6. 保存量化后的模型
+
+## 1. 导入依赖
+PaddleSlim依赖Paddle1.7版本,请确认已正确安装Paddle,然后按以下方式导入Paddle和PaddleSlim:
+
+
+```python
+import paddle
+import paddle.fluid as fluid
+import paddleslim as slim
+import numpy as np
+```
+
+## 2. 构建网络
+该章节构造一个用于对MNIST数据进行分类的分类模型,选用`MobileNetV1`,并将输入大小设置为`[1, 28, 28]`,输出类别数为10。 为了方便展示示例,我们在`paddleslim.models`下预定义了用于构建分类模型的方法,执行以下代码构建分类模型:
+
+>注意:paddleslim.models下的API并非PaddleSlim常规API,是为了简化示例而封装预定义的一系列方法,比如:模型结构的定义、Program的构建等。
+
+
+```python
+exe, train_program, val_program, inputs, outputs = \
+ slim.models.image_classification("MobileNet", [1, 28, 28], 10, use_gpu=True)
+```
+
+## 3. 训练模型
+该章节介绍了如何定义输入数据和如何训练和测试分类模型。先训练分类模型的原因是量化训练过程是在训练好的模型上进行的,也就是说是在训练好的模型的基础上加入量化反量化op之后,用小学习率进行参数微调。
+
+### 3.1 定义输入数据
+
+为了快速执行该示例,我们选取简单的MNIST数据,Paddle框架的`paddle.dataset.mnist`包定义了MNIST数据的下载和读取。
+代码如下:
+
+
+```python
+import paddle.dataset.mnist as reader
+train_reader = paddle.fluid.io.batch(
+ reader.train(), batch_size=128, drop_last=True)
+test_reader = paddle.fluid.io.batch(
+ reader.test(), batch_size=128, drop_last=True)
+train_feeder = fluid.DataFeeder(inputs, fluid.CPUPlace())
+```
+
+### 3.2 训练和测试
+先定义训练和测试函数,正常训练和量化训练时只需要调用函数即可。在训练函数中执行了一个epoch的训练,因为MNIST数据集数据较少,一个epoch就可将top1精度训练到95%以上。
+
+
+```python
+def train(prog):
+ iter = 0
+ for data in train_reader():
+ acc1, acc5, loss = exe.run(prog, feed=train_feeder.feed(data), fetch_list=outputs)
+ if iter % 100 == 0:
+ print('train iter={}, top1={}, top5={}, loss={}'.format(iter, acc1.mean(), acc5.mean(), loss.mean()))
+ iter += 1
+
+def test(prog):
+ iter = 0
+ res = [[], []]
+ for data in test_reader():
+ acc1, acc5, loss = exe.run(prog, feed=train_feeder.feed(data), fetch_list=outputs)
+ if iter % 100 == 0:
+ print('test iter={}, top1={}, top5={}, loss={}'.format(iter, acc1.mean(), acc5.mean(), loss.mean()))
+ res[0].append(acc1.mean())
+ res[1].append(acc5.mean())
+ iter += 1
+ print('final test result top1={}, top5={}'.format(np.array(res[0]).mean(), np.array(res[1]).mean()))
+```
+
+调用``train``函数训练分类网络,``train_program``是在第2步:构建网络中定义的。
+
+
+```python
+train(train_program)
+```
+
+
+调用``test``函数测试分类网络,``val_program``是在第2步:构建网络中定义的。
+
+
+```python
+test(val_program)
+```
+
+
+## 4. 量化
+
+按照[默认配置](https://paddlepaddle.github.io/PaddleSlim/api_cn/quantization_api.html#id2)在``train_program``和``val_program``中加入量化和反量化op.
+
+
+```python
+quant_program = slim.quant.quant_aware(train_program, exe.place, for_test=False)
+val_quant_program = slim.quant.quant_aware(val_program, exe.place, for_test=True)
+```
+
+
+## 5. 训练和测试量化后的模型
+微调量化后的模型,训练一个epoch后测试。
+
+
+```python
+train(quant_program)
+```
+
+
+测试量化后的模型,和``3.2 训练和测试``中得到的测试结果相比,精度相近,达到了无损量化。
+
+
+```python
+test(val_quant_program)
+```
+
+
+## 6. 保存量化后的模型
+
+在``4. 量化``中使用``slim.quant.quant_aware``接口得到的模型只适合训练时使用,为了得到最终使用时的模型,需要使用[slim.quant.convert](https://paddlepaddle.github.io/PaddleSlim/api_cn/quantization_api.html#convert)接口,然后使用[fluid.io.save_inference_model](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/io_cn/save_inference_model_cn.html#save-inference-model)保存模型。``float_prog``的参数数据类型是float32,但是数据范围是int8, 保存之后可使用fluid或者paddle-lite加载,paddle-lite在使用时会先将参数类型转换为int8。``int8_prog``的参数数据类型是int8, 保存后可查看量化后模型的大小,但不可加载使用。
+
+
+```python
+float_prog, int8_prog = slim.quant.convert(val_quant_program, exe.place, save_int8=True)
+target_vars = [float_prog.global_block().var(outputs[-1])]
+fluid.io.save_inference_model(dirname='./inference_model/float',
+ feeded_var_names=[inputs[0].name],
+ target_vars=target_vars,
+ executor=exe,
+ main_program=float_prog)
+fluid.io.save_inference_model(dirname='./inference_model/int8',
+ feeded_var_names=[inputs[0].name],
+ target_vars=target_vars,
+ executor=exe,
+ main_program=int8_prog)
+```
diff --git a/docs/zh_cn/quick_start/quant_post_static_tutorial.md b/docs/zh_cn/quick_start/quant_post_static_tutorial.md
new file mode 100755
index 0000000000000000000000000000000000000000..5f042f80a0d3c34ce14dc4d526484499707b5a13
--- /dev/null
+++ b/docs/zh_cn/quick_start/quant_post_static_tutorial.md
@@ -0,0 +1,130 @@
+# 图像分类模型静态离线量化-快速开始
+
+该教程以图像分类模型MobileNetV1为例,说明如何快速使用PaddleSlim的[静态离线量化接口](../api_cn/quantization_api.html#quant-post-static)。 该示例包含以下步骤:
+
+1. 导入依赖
+2. 构建模型
+3. 训练模型
+4. 静态离线量化
+
+## 1. 导入依赖
+PaddleSlim依赖Paddle1.7版本,请确认已正确安装Paddle,然后按以下方式导入Paddle和PaddleSlim:
+
+
+```python
+import paddle
+import paddle.fluid as fluid
+import paddleslim as slim
+import numpy as np
+```
+
+## 2. 构建网络
+该章节构造一个用于对MNIST数据进行分类的分类模型,选用`MobileNetV1`,并将输入大小设置为`[1, 28, 28]`,输出类别数为10。为了方便展示示例,我们在`paddleslim.models`下预定义了用于构建分类模型的方法,执行以下代码构建分类模型:
+
+>注意:paddleslim.models下的API并非PaddleSlim常规API,是为了简化示例而封装预定义的一系列方法,比如:模型结构的定义、Program的构建等。
+
+
+```python
+exe, train_program, val_program, inputs, outputs = \
+ slim.models.image_classification("MobileNet", [1, 28, 28], 10, use_gpu=True)
+```
+
+## 3. 训练模型
+该章节介绍了如何定义输入数据和如何训练和测试分类模型。先训练分类模型的原因是离线量化需要一个训练好的模型。
+
+### 3.1 定义输入数据
+
+为了快速执行该示例,我们选取简单的MNIST数据,Paddle框架的`paddle.dataset.mnist`包定义了MNIST数据的下载和读取。
+代码如下:
+
+
+```python
+import paddle.dataset.mnist as reader
+train_reader = paddle.fluid.io.batch(
+ reader.train(), batch_size=128, drop_last=True)
+test_reader = paddle.fluid.io.batch(
+ reader.test(), batch_size=128, drop_last=True)
+train_feeder = fluid.DataFeeder(inputs, fluid.CPUPlace())
+```
+
+### 3.2 训练和测试
+先定义训练和测试函数。在训练函数中执行了一个epoch的训练,因为MNIST数据集数据较少,一个epoch就可将top1精度训练到95%以上。
+
+
+
+```python
+def train(prog):
+ iter = 0
+ for data in train_reader():
+ acc1, acc5, loss = exe.run(prog, feed=train_feeder.feed(data), fetch_list=outputs)
+ if iter % 100 == 0:
+ print('train', acc1.mean(), acc5.mean(), loss.mean())
+ iter += 1
+
+def test(prog, outputs=outputs):
+ iter = 0
+ res = [[], []]
+ for data in test_reader():
+ acc1, acc5, loss = exe.run(prog, feed=train_feeder.feed(data), fetch_list=outputs)
+ if iter % 100 == 0:
+ print('test', acc1.mean(), acc5.mean(), loss.mean())
+ res[0].append(acc1.mean())
+ res[1].append(acc5.mean())
+ iter += 1
+ print('final test result', np.array(res[0]).mean(), np.array(res[1]).mean())
+```
+
+调用``train``函数训练分类网络,``train_program``是在第2步:构建网络中定义的。
+
+
+```python
+train(train_program)
+```
+
+
+调用``test``函数测试分类网络,``val_program``是在第2步:构建网络中定义的。
+
+
+```python
+test(val_program)
+```
+
+
+保存inference model,将训练好的分类模型保存在``'./inference_model'``下,后续进行静态离线量化时将加载保存在此处的模型。
+
+
+```python
+target_vars = [val_program.global_block().var(name) for name in outputs]
+fluid.io.save_inference_model(dirname='./inference_model',
+ feeded_var_names=[var.name for var in inputs],
+ target_vars=target_vars,
+ executor=exe,
+ main_program=val_program)
+```
+
+## 4. 静态离线量化
+
+调用静态离线量化接口,加载文件夹``'./inference_model'``训练好的分类模型,并使用10个batch的数据进行参数校正。此过程无需训练,只需跑前向过程来计算量化所需参数。静态离线量化后的模型保存在文件夹``'./quant_post_static_model'``下。
+
+
+```python
+slim.quant.quant_post_static(
+ executor=exe,
+ model_dir='./inference_model',
+ quantize_model_path='./quant_post_static_model',
+ sample_generator=reader.test(),
+ batch_nums=10)
+```
+
+
+加载保存在文件夹``'./quant_post_static_model'``下的量化后的模型进行测试,可看到精度和``3.2 训练和测试``中得到的测试精度相近,因此静态离线量化过程对于此分类模型几乎无损。
+
+
+```python
+quant_post_static_prog, feed_target_names, fetch_targets = fluid.io.load_inference_model(
+ dirname='./quant_post_static_model',
+ model_filename='__model__',
+ params_filename='__params__',
+ executor=exe)
+test(quant_post_static_prog, fetch_targets)
+```
diff --git a/docs/zh_cn/tutorials/image_classification_mkldnn_quant_tutorial.md b/docs/zh_cn/tutorials/image_classification_mkldnn_quant_tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..2a8eaa2e269cbe1a8a949ce14fa871b878d9d5dc
--- /dev/null
+++ b/docs/zh_cn/tutorials/image_classification_mkldnn_quant_tutorial.md
@@ -0,0 +1,43 @@
+# Intel CPU上部署量化模型教程
+
+在Intel Cascade Lake机器上(如:Intel(R) Xeon(R) Gold 6271),经过量化和DNNL加速,INT8模型在单线程上性能为FP32模型的3~3.7倍;在Intel SkyLake机器上(如:Intel(R) Xeon(R) Gold 6148),单线程性能为FP32模型的1.5倍,而精度仅有极小下降。图像分类量化的样例教程请参考[图像分类INT8模型在CPU优化部署和预测](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/demo/mkldnn_quant/)。自然语言处理模型的量化请参考[ERNIE INT8 模型精度与性能复现](https://github.com/PaddlePaddle/benchmark/tree/master/Inference/c%2B%2B/ernie/mkldnn)。
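+
+部署时可使用Paddle原生预测库加载量化模型并开启DNNL加速,示意代码如下(`./int8_model`为假设的模型目录,接口以Paddle 1.7的预测API为准):
+
+```python
+from paddle.fluid.core import AnalysisConfig, create_paddle_predictor
+
+# 示意:加载INT8模型并开启MKL-DNN(DNNL)加速,单线程推理
+config = AnalysisConfig('./int8_model')  # 假设的模型目录
+config.enable_mkldnn()
+config.set_cpu_math_library_num_threads(1)
+predictor = create_paddle_predictor(config)
+```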
+
+## 图像分类INT8模型在 Xeon(R) 6271 上的精度和性能
+
+>**图像分类INT8模型在 Intel(R) Xeon(R) Gold 6271 上精度**
+
+| Model | FP32 Top1 Accuracy | INT8 Top1 Accuracy | Top1 Diff | FP32 Top5 Accuracy | INT8 Top5 Accuracy | Top5 Diff |
+|:------------:|:------------------:|:------------------:|:---------:|:------------------:|:------------------:|:---------:|
+| MobileNet-V1 | 70.78% | 70.74% | -0.04% | 89.69% | 89.43% | -0.26% |
+| MobileNet-V2 | 71.90% | 72.21% | 0.31% | 90.56% | 90.62% | 0.06% |
+| ResNet101 | 77.50% | 77.60% | 0.10% | 93.58% | 93.55% | -0.03% |
+| ResNet50 | 76.63% | 76.50% | -0.13% | 93.10% | 92.98% | -0.12% |
+| VGG16 | 72.08% | 71.74% | -0.34% | 90.63% | 89.71% | -0.92% |
+| VGG19 | 72.57% | 72.12% | -0.45% | 90.84% | 90.15% | -0.69% |
+
+>**图像分类INT8模型在 Intel(R) Xeon(R) Gold 6271 单核上性能**
+
+| Model | FP32 (images/s) | INT8 (images/s) | Ratio (INT8/FP32) |
+|:------------:|:---------------:|:---------------:|:-----------------:|
+| MobileNet-V1 | 74.05 | 216.36 | 2.92 |
+| MobileNet-V2 | 88.60 | 205.84 | 2.32 |
+| ResNet101 | 7.20 | 26.48 | 3.68 |
+| ResNet50 | 13.23 | 50.02 | 3.78 |
+| VGG16 | 3.47 | 10.67 | 3.07 |
+| VGG19 | 2.83 | 9.09 | 3.21 |
+
+## 自然语言处理INT8模型在 Xeon(R) 6271 上的精度和性能
+
+>**I. Ernie INT8 DNNL 在 Intel(R) Xeon(R) Gold 6271 的精度结果**
+
+| Model | FP32 Accuracy | INT8 Accuracy | Accuracy Diff |
+| :---: | :-----------: | :-----------: | :-----------: |
+| Ernie | 80.20% | 79.44% | -0.76% |
+
+
+>**II. Ernie INT8 DNNL 在 Intel(R) Xeon(R) Gold 6271 上单样本耗时**
+
+| Threads | FP32 Latency (ms) | INT8 Latency (ms) | Ratio (FP32/INT8) |
+| :--------: | :---------------: | :---------------: | :---------------: |
+| 1 thread | 237.21 | 79.26 | 2.99X |
+| 20 threads | 22.08 | 12.57 | 1.76X |
diff --git a/docs/zh_cn/tutorials/image_classification_nas_quick_start.ipynb b/docs/zh_cn/tutorials/image_classification_nas_quick_start.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..fceccb98fbb201bd873cf34478a7f396110e427a
--- /dev/null
+++ b/docs/zh_cn/tutorials/image_classification_nas_quick_start.ipynb
@@ -0,0 +1,407 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 图像分类网络结构搜索-快速开始\n",
+ "\n",
+ "该教程以图像分类模型MobileNetV2为例,说明如何在cifar10数据集上快速使用[网络结构搜索接口](../api/nas_api.md)。\n",
+ "该示例包含以下步骤:\n",
+ "\n",
+ "1. 导入依赖\n",
+ "2. 初始化SANAS搜索实例\n",
+ "3. 构建网络\n",
+ "4. 定义输入数据函数\n",
+ "5. 定义训练函数\n",
+ "6. 定义评估函数\n",
+ "7. 启动搜索实验\n",
+ " 7.1 获取模型结构\n",
+ " 7.2 构造program\n",
+ " 7.3 定义输入数据\n",
+ " 7.4 训练模型\n",
+ " 7.5 评估模型\n",
+ " 7.6 回传当前模型的得分\n",
+ "8. 完整示例\n",
+ "\n",
+ "\n",
+ "以下章节依次介绍每个步骤的内容。"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. 导入依赖\n",
+ "请确认已正确安装Paddle,导入需要的依赖包。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import paddle\n",
+ "import paddle.fluid as fluid\n",
+ "import paddleslim as slim\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. 初始化SANAS搜索实例"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-07 08:42:37,895-INFO: range table: ([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [7, 5, 8, 6, 2, 5, 8, 6, 2, 5, 8, 6, 2, 5, 10, 6, 2, 5, 10, 6, 2, 5, 12, 6, 2])\n",
+ "2020-02-07 08:42:37,897-INFO: ControllerServer - listen on: [10.255.125.38:8339]\n",
+ "2020-02-07 08:42:37,899-INFO: Controller Server run...\n"
+ ]
+ }
+ ],
+ "source": [
+ "sanas = slim.nas.SANAS(configs=[('MobileNetV2Space')], server_addr=(\"\", 8339), save_checkpoint=None)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. 构建网络\n",
+ "根据传入的网络结构构造训练program和测试program。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def build_program(archs):\n",
+ " train_program = fluid.Program()\n",
+ " startup_program = fluid.Program()\n",
+ " with fluid.program_guard(train_program, startup_program):\n",
+ " data = fluid.data(name='data', shape=[None, 3, 32, 32], dtype='float32')\n",
+ " label = fluid.data(name='label', shape=[None, 1], dtype='int64')\n",
+ " output = archs(data)\n",
+ " output = fluid.layers.fc(input=output, size=10)\n",
+ "\n",
+ " softmax_out = fluid.layers.softmax(input=output, use_cudnn=False)\n",
+ " cost = fluid.layers.cross_entropy(input=softmax_out, label=label)\n",
+ " avg_cost = fluid.layers.mean(cost)\n",
+ " acc_top1 = fluid.layers.accuracy(input=softmax_out, label=label, k=1)\n",
+ " acc_top5 = fluid.layers.accuracy(input=softmax_out, label=label, k=5)\n",
+ " test_program = fluid.default_main_program().clone(for_test=True)\n",
+ "\n",
+ " optimizer = fluid.optimizer.Adam(learning_rate=0.1)\n",
+ " optimizer.minimize(avg_cost)\n",
+ "\n",
+ " place = fluid.CPUPlace()\n",
+ " exe = fluid.Executor(place)\n",
+ " exe.run(startup_program)\n",
+ " return exe, train_program, test_program, (data, label), avg_cost, acc_top1, acc_top5"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. 定义输入数据函数\n",
+ "使用的数据集为cifar10,paddle框架中`paddle.dataset.cifar`包括了cifar数据集的下载和读取,代码如下:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def input_data(inputs):\n",
+ " train_reader = paddle.fluid.io.batch(paddle.reader.shuffle(paddle.dataset.cifar.train10(cycle=False), buf_size=1024),batch_size=256)\n",
+ " train_feeder = fluid.DataFeeder(inputs, fluid.CPUPlace())\n",
+ " eval_reader = paddle.fluid.io.batch(paddle.dataset.cifar.test10(cycle=False), batch_size=256)\n",
+ " eval_feeder = fluid.DataFeeder(inputs, fluid.CPUPlace())\n",
+ " return train_reader, train_feeder, eval_reader, eval_feeder"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5. 定义训练函数\n",
+ "根据训练program和训练数据进行训练。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def start_train(program, data_reader, data_feeder):\n",
+ " outputs = [avg_cost.name, acc_top1.name, acc_top5.name]\n",
+ " for data in data_reader():\n",
+ " batch_reward = exe.run(program, feed=data_feeder.feed(data), fetch_list = outputs)\n",
+ " print(\"TRAIN: loss: {}, acc1: {}, acc5:{}\".format(batch_reward[0], batch_reward[1], batch_reward[2]))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6. 定义评估函数\n",
+ "根据评估program和评估数据进行评估。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def start_eval(program, data_reader, data_feeder):\n",
+ " reward = []\n",
+ " outputs = [avg_cost.name, acc_top1.name, acc_top5.name]\n",
+ " for data in data_reader():\n",
+ " batch_reward = exe.run(program, feed=data_feeder.feed(data), fetch_list = outputs)\n",
+ " reward_avg = np.mean(np.array(batch_reward), axis=1)\n",
+ " reward.append(reward_avg)\n",
+ " print(\"TEST: loss: {}, acc1: {}, acc5:{}\".format(batch_reward[0], batch_reward[1], batch_reward[2]))\n",
+ " finally_reward = np.mean(np.array(reward), axis=0)\n",
+ " print(\"FINAL TEST: avg_cost: {}, acc1: {}, acc5: {}\".format(finally_reward[0], finally_reward[1], finally_reward[2]))\n",
+ " return finally_reward"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 7. 启动搜索实验\n",
+ "以下步骤拆解说明了如何获得当前模型结构以及获得当前模型结构之后应该有的步骤,如果想要看如何启动搜索实验的完整示例可以看步骤9。\n",
+ "\n",
+ "### 7.1 获取模型结构\n",
+ "调用`next_archs()`函数获取到下一个模型结构。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-07 08:42:45,035-INFO: current tokens: [4, 4, 5, 1, 0, 4, 4, 2, 0, 4, 4, 3, 0, 4, 5, 2, 0, 4, 7, 2, 0, 4, 9, 0, 0]\n"
+ ]
+ }
+ ],
+ "source": [
+ "archs = sanas.next_archs()[0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 7.2 构造program"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "exe, train_program, eval_program, inputs, avg_cost, acc_top1, acc_top5 = build_program(archs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 7.3 定义输入数据"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_reader, train_feeder, eval_reader, eval_feeder = input_data(inputs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 7.4 训练模型\n",
+ "据上面得到的训练program和评估数据启动训练。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "TRAIN: loss: [2.7999306], acc1: [0.1015625], acc5:[0.44140625]\n"
+ ]
+ }
+ ],
+ "source": [
+ "start_train(train_program, train_reader, train_feeder)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 7.5 评估模型\n",
+ "根据上面得到的评估program和评估数据启动评估。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "TEST: loss: [49.99942], acc1: [0.078125], acc5:[0.46484375]\n",
+ "FINAL TEST: avg_cost: 49.999420166, acc1: 0.078125, acc5: 0.46484375\n"
+ ]
+ }
+ ],
+ "source": [
+ "finally_reward = start_eval(eval_program, eval_reader, eval_feeder)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 7.6 回传当前模型的得分"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-07 08:44:26,774-INFO: Controller - iter: 1; best_reward: 0.078125, best tokens: [4, 4, 5, 1, 0, 4, 4, 2, 0, 4, 4, 3, 0, 4, 5, 2, 0, 4, 7, 2, 0, 4, 9, 0, 0], current_reward: 0.078125; current tokens: [4, 4, 5, 1, 0, 4, 4, 2, 0, 4, 4, 3, 0, 4, 5, 2, 0, 4, 7, 2, 0, 4, 9, 0, 0]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sanas.reward(float(finally_reward[1]))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 8. 完整示例\n",
+ "以下是一个完整的搜索实验示例,示例中使用FLOPs作为约束条件,搜索实验一共搜索3个step,表示搜索到3个满足条件的模型结构进行训练,每搜>索到一个网络结构训练7个epoch。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-07 08:45:06,927-INFO: current tokens: [4, 4, 5, 1, 0, 4, 4, 2, 0, 4, 4, 3, 1, 4, 5, 2, 0, 4, 7, 2, 0, 4, 9, 0, 0]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "TRAIN: loss: [2.6932292], acc1: [0.08203125], acc5:[0.51953125]\n",
+ "TRAIN: loss: [42.387478], acc1: [0.078125], acc5:[0.47265625]\n"
+ ]
+ }
+ ],
+ "source": [
+ "for step in range(3):\n",
+ " archs = sanas.next_archs()[0]\n",
+ " exe, train_program, eval_progarm, inputs, avg_cost, acc_top1, acc_top5 = build_program(archs)\n",
+ " train_reader, train_feeder, eval_reader, eval_feeder = input_data(inputs)\n",
+ "\n",
+ " current_flops = slim.analysis.flops(train_program)\n",
+ " if current_flops > 321208544:\n",
+ " continue\n",
+ "\n",
+ " for epoch in range(7):\n",
+ " start_train(train_program, train_reader, train_feeder)\n",
+ "\n",
+ " finally_reward = start_eval(eval_program, eval_reader, eval_feeder)\n",
+ "\n",
+ " sanas.reward(float(finally_reward[1]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/zh_cn/tutorials/image_classification_sensitivity_analysis_tutorial.md b/docs/zh_cn/tutorials/image_classification_sensitivity_analysis_tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..6bde2f8ca4615554152aacf2983c8c5d6f369ff0
--- /dev/null
+++ b/docs/zh_cn/tutorials/image_classification_sensitivity_analysis_tutorial.md
@@ -0,0 +1,269 @@
+# Channel Pruning for Image Classification Models - Sensitivity Analysis
+
+This tutorial uses the image classification model MobileNetV1 as an example to show how to quickly use the [PaddleSlim sensitivity analysis API](https://paddlepaddle.github.io/PaddleSlim/api/prune_api/#sensitivity).
+The example includes the following steps:
+
+1. Import dependencies
+2. Build the model
+3. Define the input data
+4. Define the model evaluation method
+5. Train the model
+6. Get the names of the convolution parameters to analyze
+7. Analyze sensitivity
+8. Prune the model
+
+The following sections describe each step in turn.
+
+## 1. Import dependencies
+
+PaddleSlim requires Paddle 1.7. Make sure Paddle is installed correctly, then import Paddle and PaddleSlim as follows:
+
+
+```python
+import paddle
+import paddle.fluid as fluid
+import paddleslim as slim
+```
+
+## 2. Build the model
+
+This section builds a model that classifies MNIST data. We use `MobileNetV1` with input size `[1, 28, 28]` and 10 output classes.
+For convenience, methods for building classification models are predefined under `paddleslim.models`; run the following code to build the model:
+
+
+```python
+exe, train_program, val_program, inputs, outputs = slim.models.image_classification("MobileNet", [1, 28, 28], 10, use_gpu=True)
+place = fluid.CUDAPlace(0)
+```
+
+## 3. Define the input data
+
+To keep this example fast, we use the simple MNIST dataset; the `paddle.dataset.mnist` package in the Paddle framework handles downloading and reading the MNIST data.
+The code is as follows:
+
+
+```python
+import paddle.dataset.mnist as reader
+train_reader = paddle.fluid.io.batch(
+ reader.train(), batch_size=128, drop_last=True)
+test_reader = paddle.fluid.io.batch(
+ reader.test(), batch_size=128, drop_last=True)
+data_feeder = fluid.DataFeeder(inputs, place)
+```
+
+## 4. Define the model evaluation method
+
+Computing sensitivity requires measuring, on the test data, the accuracy of the model after a single convolution layer has been pruned. We define the following method for that purpose:
+
+
+```python
+import numpy as np
+def test(program):
+ acc_top1_ns = []
+ acc_top5_ns = []
+ for data in test_reader():
+ acc_top1_n, acc_top5_n, _ = exe.run(
+ program,
+ feed=data_feeder.feed(data),
+ fetch_list=outputs)
+ acc_top1_ns.append(np.mean(acc_top1_n))
+ acc_top5_ns.append(np.mean(acc_top5_n))
+ print("Final eva - acc_top1: {}; acc_top5: {}".format(
+ np.mean(np.array(acc_top1_ns)), np.mean(np.array(acc_top5_ns))))
+ return np.mean(np.array(acc_top1_ns))
+```
+
+## 5. Train the model
+
+Sensitivity analysis is only meaningful on a trained model. Because this example task is fairly simple, we use the model produced by one `epoch` of training for the analysis. For models that are more expensive to train, you can load pretrained weights instead.
+
+The model training code is as follows:
+
+
+```python
+for data in train_reader():
+ acc1, acc5, loss = exe.run(train_program, feed=data_feeder.feed(data), fetch_list=outputs)
+print(np.mean(acc1), np.mean(acc5), np.mean(loss))
+```
+
+Use the evaluation method defined above to measure the accuracy of the current model on the test set:
+
+
+```python
+test(val_program)
+```
+
+## 6. Get the convolution parameters to analyze
+
+```python
+params = []
+for param in train_program.global_block().all_parameters():
+ if "_sep_weights" in param.name:
+ params.append(param.name)
+print(params)
+params = params[:5]
+```
+
+## 7. Analyze sensitivity
+
+### 7.1 Basic sensitivity computation
+
+Call the [sensitivity API](https://paddlepaddle.github.io/PaddleSlim/api/prune_api/#sensitivity) to run sensitivity analysis on the trained model.
+
+During the computation, sensitivity results are continuously appended to the file given by the `sensitivities_file` option, and ratios already recorded in that file are not recomputed.
+
+First remove any `sensitivities_0.data` file that may exist in the current directory:
+
+
+```python
+!rm -rf sensitivities_0.data
+```
+
+Besides specifying which convolution parameters to analyze, we can also specify the granularity and range of the analysis, i.e. the set of ratios by which each convolution layer is pruned in turn.
+
+If the model under analysis is very sensitive, say pruning 40% of the channels of a single convolution layer already costs 90% of the test accuracy, then capping `pruned_ratios` at 0.4 is enough, for example:
+`[0.1, 0.2, 0.3, 0.4]`
+
+For more precise sensitivity information, we can make the granularity of `pruned_ratios` finer, e.g. `[0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]`
+
+The finer the granularity of `pruned_ratios`, the slower the sensitivity computation.
+
+
+```python
+sens_0 = slim.prune.sensitivity(
+ val_program,
+ place,
+ params,
+ test,
+ sensitivities_file="sensitivities_0.data",
+ pruned_ratios=[0.1, 0.2])
+print(sens_0)
+```
+
+### 7.2 Extend the sensitivity information
+
+Section 7.1 computed sensitivity with `pruned_ratios=[0.1, 0.2]`; we can extend it to `[0.1, 0.2, 0.3]` on top of that:
+
+
+```python
+sens_0 = slim.prune.sensitivity(
+ val_program,
+ place,
+ params,
+ test,
+ sensitivities_file="sensitivities_0.data",
+ pruned_ratios=[0.3])
+print(sens_0)
+```
+
+### 7.3 Speed up sensitivity computation with multiple processes
+
+The time sensitivity analysis takes depends on the number of convolution layers analyzed and the speed of model evaluation; we can speed up the computation with multiple processes.
+
+Set a different `pruned_ratios` in each process, then merge the results.
+
+#### 7.3.1 Compute sensitivity in multiple processes
+
+In the sections above we computed sensitivity for `pruned_ratios=[0.1, 0.2, 0.3]` and saved the results to the file `sensitivities_0.data`.
+
+In another process we can set `pruned_ratios=[0.4]` and save the results to the file `sensitivities_1.data`. The code is as follows:
+
+
+```python
+sens_1 = slim.prune.sensitivity(
+ val_program,
+ place,
+ params,
+ test,
+ sensitivities_file="sensitivities_1.data",
+ pruned_ratios=[0.4])
+print(sens_1)
+```
+
+#### 7.3.2 Load the sensitivity files produced by multiple processes
+
+```python
+s_0 = slim.prune.load_sensitivities("sensitivities_0.data")
+s_1 = slim.prune.load_sensitivities("sensitivities_1.data")
+print(s_0)
+print(s_1)
+```
+
+#### 7.3.3 Merge the sensitivity information
+
+
+```python
+s = slim.prune.merge_sensitive([s_0, s_1])
+print(s)
+```
+
+## 8. Prune the model
+
+Prune the model using the sensitivity information produced above.
+
+### 8.1 Compute the pruning ratios
+
+First, call PaddleSlim's [get_ratios_by_loss](https://paddlepaddle.github.io/PaddleSlim/api/prune_api/#get_ratios_by_loss) method to compute pruning ratios from the sensitivity; adjust the `loss` argument to obtain a suitable set of ratios (the selection rule is sketched after the snippet below):
+
+
+```python
+loss = 0.01
+ratios = slim.prune.get_ratios_by_loss(s_0, loss)
+print(ratios)
+```
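+
+Conceptually, `get_ratios_by_loss` picks, for each analyzed parameter, the largest pruning ratio whose measured accuracy loss stays within `loss`. A pure-Python sketch of that selection rule (for intuition only; the real implementation also interpolates between the measured ratios):
+
+```python
+def ratios_by_loss_sketch(sensitivities, max_loss):
+    # sensitivities: {param_name: {pruned_ratio: acc_loss}}, the format stored in the data file.
+    ratios = {}
+    for name, ratio2loss in sensitivities.items():
+        admissible = [r for r, l in ratio2loss.items() if l <= max_loss]
+        if admissible:
+            ratios[name] = max(admissible)
+    return ratios
+```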
+
+### 8.2 Prune the training network
+
+
+```python
+pruner = slim.prune.Pruner()
+print("FLOPs before pruning: {}".format(slim.analysis.flops(train_program)))
+pruned_program, _, _ = pruner.prune(
+ train_program,
+ fluid.global_scope(),
+ params=ratios.keys(),
+ ratios=ratios.values(),
+ place=place)
+print("FLOPs after pruning: {}".format(slim.analysis.flops(pruned_program)))
+```
+
+### 8.3 Prune the test network
+
+>Note: when pruning the test network, `only_graph` must be set to True; for the reason, see the [Pruner API docs](https://paddlepaddle.github.io/PaddleSlim/api/prune_api/#pruner)
+
+
+```python
+pruner = slim.prune.Pruner()
+print("FLOPs before pruning: {}".format(slim.analysis.flops(val_program)))
+pruned_val_program, _, _ = pruner.prune(
+ val_program,
+ fluid.global_scope(),
+ params=ratios.keys(),
+ ratios=ratios.values(),
+ place=place,
+ only_graph=True)
+print("FLOPs after pruning: {}".format(slim.analysis.flops(pruned_val_program)))
+```
+
+Test the accuracy of the pruned model on the test set:
+
+```python
+test(pruned_val_program)
+```
+
+### 8.4 Train the pruned model
+
+Train the pruned model on the training set for one `epoch`:
+
+
+```python
+for data in train_reader():
+ acc1, acc5, loss = exe.run(pruned_program, feed=data_feeder.feed(data), fetch_list=outputs)
+print(np.mean(acc1), np.mean(acc5), np.mean(loss))
+```
+
+Test the accuracy of the model after training:
+
+```python
+test(pruned_val_program)
+```
diff --git a/docs/zh_cn/tutorials/index.rst b/docs/zh_cn/tutorials/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8e8036add2c96d031def2ea6a4a3cf3e07a97b8f
--- /dev/null
+++ b/docs/zh_cn/tutorials/index.rst
@@ -0,0 +1,16 @@
+
+Advanced Tutorials
+==================
+
+.. toctree::
+ :maxdepth: 1
+
+ image_classification_sensitivity_analysis_tutorial.md
+ darts_nas_turorial.md
+ paddledetection_slim_distillation_tutorial.md
+ paddledetection_slim_nas_tutorial.md
+ paddledetection_slim_pruing_tutorial.md
+ paddledetection_slim_prune_dist_tutorial.md
+ paddledetection_slim_quantization_tutorial.md
+ image_classification_mkldnn_quant_tutorial.md
+ paddledetection_slim_sensitivy_tutorial.md
diff --git a/docs/zh_cn/tutorials/paddledetection_slim_distillation_tutorial.md b/docs/zh_cn/tutorials/paddledetection_slim_distillation_tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..dbdd30fff6b4b014e907a3af234735597a20db63
--- /dev/null
+++ b/docs/zh_cn/tutorials/paddledetection_slim_distillation_tutorial.md
@@ -0,0 +1,32 @@
+# Object Detection Model Distillation Tutorial
+
+For the full tutorial, see: https://github.com/PaddlePaddle/PaddleDetection/blob/release/0.2/slim/distillation/README.md
+
+
+## Results
+
+### MobileNetV1-YOLO-V3-VOC
+
+| Model |Input size|Images per GPU|Inference speed (FPS)|Box AP|Download|
+|:-:|:-:|:-:|:-:|:-:|:-:|
+|baseline|608 |16|104.291|76.2|[link](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar)|
+|distilled|608 |16|106.914|79.0|[link](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_voc_distilled.tar)|
+|baseline|416 |16|-|76.7|[link](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar)|
+|distilled|416 |16|-|78.2|[link](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_voc_distilled.tar)|
+|baseline|320 |16|-|75.3|[link](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar)|
+|distilled|320 |16|-|75.5|[link](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_voc_distilled.tar)|
+
+> The distilled results use ResNet34-YOLO-V3 as the teacher, trained for 90000 iterations on 4 GPUs with a total batch size of 64.
+
+### MobileNetV1-YOLO-V3-COCO
+
+| Model |Input size|Images per GPU|Inference speed (FPS)|Box AP|Download|
+|:-:|:-:|:-:|:-:|:-:|:-:|
+|baseline|608 |16|78.302|29.3|[link](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar)|
+|distilled|608 |16|78.523|31.4|[link](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_coco_distilled.tar)|
+|baseline|416 |16|-|29.3|[link](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar)|
+|distilled|416 |16|-|30.0|[link](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_coco_distilled.tar)|
+|baseline|320 |16|-|27.0|[link](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar)|
+|distilled|320 |16|-|27.1|[link](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_coco_distilled.tar)|
+
+> The distilled results use ResNet34-YOLO-V3 as the teacher, trained for 600000 iterations on 4 GPUs with a total batch size of 64.
diff --git a/docs/zh_cn/tutorials/paddledetection_slim_nas_tutorial.md b/docs/zh_cn/tutorials/paddledetection_slim_nas_tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..b4aa6ad133f7cadc744c11912850c66f5868dc19
--- /dev/null
+++ b/docs/zh_cn/tutorials/paddledetection_slim_nas_tutorial.md
@@ -0,0 +1,8 @@
+# NAS Tutorial for a Compact Face Detection Model
+
+For the full tutorial, see: https://github.com/PaddlePaddle/PaddleDetection/blob/release/0.2/slim/nas/README.md
+
+## Overview
+
+We take the BlazeFace face detection model as the NAS example; PaddleSlim assists in running the search experiment.
+When searching with PaddleSlim, the search constraint can be either a FLOPs limit or a hardware latency limit; the latency constraint requires a latency table. This example ships a latency table for the blazeface search space, named latency_855.txt (latencies measured with PaddleLite on a Snapdragon 855), which can be used directly for latency-constrained blazeface search experiments; a usage sketch follows below.
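+
+A sketch of how such a latency table could be consumed with PaddleSlim's evaluator (an assumption on our part: the table format matches `TableLatencyEvaluator` from paddleslim/analysis/latency.py):
+
+```python
+from paddleslim.analysis import TableLatencyEvaluator
+
+# `program` is the fluid.Program of the candidate architecture.
+evaluator = TableLatencyEvaluator('latency_855.txt', delimiter=',')
+print("estimated latency on Snapdragon 855: {}".format(evaluator.latency(program)))
+```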
diff --git a/docs/zh_cn/tutorials/paddledetection_slim_pruing_tutorial.md b/docs/zh_cn/tutorials/paddledetection_slim_pruing_tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..defd371ccef152a00670cab2c93aa8745688bae0
--- /dev/null
+++ b/docs/zh_cn/tutorials/paddledetection_slim_pruing_tutorial.md
@@ -0,0 +1,3 @@
+# Object Detection Model Channel Pruning Tutorial
+
+See: https://github.com/PaddlePaddle/PaddleDetection/blob/release/0.2/slim/prune/README.md
diff --git a/docs/zh_cn/tutorials/paddledetection_slim_prune_dist_tutorial.md b/docs/zh_cn/tutorials/paddledetection_slim_prune_dist_tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..5697e968da17681d1f8b4f0cdb38eb2a8d19a43d
--- /dev/null
+++ b/docs/zh_cn/tutorials/paddledetection_slim_prune_dist_tutorial.md
@@ -0,0 +1,7 @@
+# Object Detection Model Pruning and Distillation Tutorial
+
+For the full tutorial, see: https://github.com/PaddlePaddle/PaddleDetection/blob/release/0.2/slim/extensions/distill_pruned_model/README.md
+
+## Overview
+
+This document describes how to use PaddleSlim's distillation API together with its channel pruning API to prune the convolution layers of a detection model and then distill the pruned model from a higher-accuracy model.
diff --git a/docs/zh_cn/tutorials/paddledetection_slim_quantization_tutorial.md b/docs/zh_cn/tutorials/paddledetection_slim_quantization_tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..63e17a15746caeab3f83eed934dee7b996e1c869
--- /dev/null
+++ b/docs/zh_cn/tutorials/paddledetection_slim_quantization_tutorial.md
@@ -0,0 +1,28 @@
+# Object Detection Model Quantization Tutorial
+
+For the full tutorial, see: https://github.com/PaddlePaddle/PaddleDetection/blob/release/0.2/slim/quantization/README.md
+
+
+## Results
+
+### Training strategies
+
+- Quantization strategy: `post` denotes models produced by static post-training quantization; `aware` denotes models produced by quantization-aware training. A sketch of both calls follows below.
+
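+A sketch of how the two strategies map onto PaddleSlim calls (names from the `paddleslim.quant` module; exact keyword lists vary by version, and the paths below are hypothetical):
+
+```python
+import paddleslim as slim
+
+# `aware`: insert fake-quant ops into the training program, then fine-tune.
+quant_program = slim.quant.quant_aware(train_program, place, for_test=False)
+
+# `post`: calibrate a saved FP32 inference model offline, without retraining.
+slim.quant.quant_post(executor=exe,
+                      model_dir='./fp32_model',
+                      quantize_model_path='./int8_model',
+                      sample_generator=sample_generator)
+```
+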
+### YOLOv3 on COCO
+
+| Backbone | Pretrained weights | Quantization strategy | Input size | Box AP | Download |
+| :----------------| :--------: | :------: | :------: |:------: | :-----------------------------------------------------: |
+| MobileNetV1 | ImageNet | post | 608 | 27.9 | [link](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_coco_quant_post.tar) |
+| MobileNetV1 | ImageNet | post | 416 | 28.0 | [link](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_coco_quant_post.tar) |
+| MobileNetV1 | ImageNet | post | 320 | 26.0 | [link](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_coco_quant_post.tar) |
+| MobileNetV1 | ImageNet | aware | 608 | 28.1 | [link](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_coco_quant_aware.tar) |
+| MobileNetV1 | ImageNet | aware | 416 | 28.2 | [link](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_coco_quant_aware.tar) |
+| MobileNetV1 | ImageNet | aware | 320 | 25.8 | [link](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_mobilenetv1_coco_quant_aware.tar) |
+| ResNet34 | ImageNet | post | 608 | 35.7 | [link](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r34_coco_quant_post.tar) |
+| ResNet34 | ImageNet | aware | 608 | 35.2 | [link](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r34_coco_quant_aware.tar) |
+| ResNet34 | ImageNet | aware | 416 | 33.3 | [link](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r34_coco_quant_aware.tar) |
+| ResNet34 | ImageNet | aware | 320 | 30.3 | [link](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r34_coco_quant_aware.tar) |
+| R50vd-dcn | object365 | aware | 608 | 40.6 | [link](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r50vd_dcn_obj365_pretrained_coco_quant_aware.tar) |
+| R50vd-dcn | object365 | aware | 416 | 37.5 | [link](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r50vd_dcn_obj365_pretrained_coco_quant_aware.tar) |
+| R50vd-dcn | object365 | aware | 320 | 34.1 | [link](https://paddlemodels.bj.bcebos.com/PaddleSlim/yolov3_r50vd_dcn_obj365_pretrained_coco_quant_aware.tar) |
diff --git a/docs/zh_cn/tutorials/paddledetection_slim_sensitivy_tutorial.md b/docs/zh_cn/tutorials/paddledetection_slim_sensitivy_tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..5e2c7bd54c8f5344f2348bc397a5da569548512c
--- /dev/null
+++ b/docs/zh_cn/tutorials/paddledetection_slim_sensitivy_tutorial.md
@@ -0,0 +1,3 @@
+# Object Detection Model Sensitivity Analysis Tutorial
+
+For the full tutorial, see: https://github.com/PaddlePaddle/PaddleDetection/blob/release/0.2/slim/sensitive/README.md
diff --git a/docs/zh_cn/tutorials/sanas_darts_space.ipynb b/docs/zh_cn/tutorials/sanas_darts_space.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..658124b5ceea1b0e48fe488faffd2fbfa9b6584e
--- /dev/null
+++ b/docs/zh_cn/tutorials/sanas_darts_space.ipynb
@@ -0,0 +1,324 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import paddle\n",
+ "import paddle.fluid as fluid\n",
+ "from paddleslim.nas import SANAS\n",
+ "import numpy as np\n",
+ "\n",
+ "BATCH_SIZE=96\n",
+ "SERVER_ADDRESS = \"\"\n",
+ "PORT = 8377\n",
+ "SEARCH_STEPS = 300\n",
+ "RETAIN_EPOCH=30\n",
+ "MAX_PARAMS=3.77\n",
+ "IMAGE_SHAPE=[3, 32, 32]\n",
+ "AUXILIARY = True\n",
+ "AUXILIARY_WEIGHT= 0.4\n",
+ "TRAINSET_NUM = 50000\n",
+ "LR = 0.025\n",
+ "MOMENTUM = 0.9\n",
+ "WEIGHT_DECAY = 0.0003\n",
+ "DROP_PATH_PROBILITY = 0.2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-23 12:28:09,752-INFO: range table: ([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14])\n",
+ "2020-02-23 12:28:09,754-INFO: ControllerServer - listen on: [127.0.0.1:8377]\n",
+ "2020-02-23 12:28:09,756-INFO: Controller Server run...\n"
+ ]
+ }
+ ],
+ "source": [
+ "config = [('DartsSpace')]\n",
+ "sa_nas = SANAS(config, server_addr=(SERVER_ADDRESS, PORT), search_steps=SEARCH_STEPS, is_server=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def count_parameters_in_MB(all_params, prefix='model'):\n",
+ " parameters_number = 0\n",
+ " for param in all_params:\n",
+ " if param.name.startswith(\n",
+ " prefix) and param.trainable and 'aux' not in param.name:\n",
+ " parameters_number += np.prod(param.shape)\n",
+ " return parameters_number / 1e6"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def create_data_loader(IMAGE_SHAPE, is_train):\n",
+ " image = fluid.data(\n",
+ " name=\"image\", shape=[None] + IMAGE_SHAPE, dtype=\"float32\")\n",
+ " label = fluid.data(name=\"label\", shape=[None, 1], dtype=\"int64\")\n",
+ " data_loader = fluid.io.DataLoader.from_generator(\n",
+ " feed_list=[image, label],\n",
+ " capacity=64,\n",
+ " use_double_buffer=True,\n",
+ " iterable=True)\n",
+ " drop_path_prob = ''\n",
+ " drop_path_mask = ''\n",
+ " if is_train:\n",
+ " drop_path_prob = fluid.data(\n",
+ " name=\"drop_path_prob\", shape=[BATCH_SIZE, 1], dtype=\"float32\")\n",
+ " drop_path_mask = fluid.data(\n",
+ " name=\"drop_path_mask\",\n",
+ " shape=[BATCH_SIZE, 20, 4, 2],\n",
+ " dtype=\"float32\")\n",
+ "\n",
+ " return data_loader, image, label, drop_path_prob, drop_path_mask"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def build_program(main_program, startup_program, IMAGE_SHAPE, archs, is_train):\n",
+ " with fluid.program_guard(main_program, startup_program):\n",
+ " data_loader, data, label, drop_path_prob, drop_path_mask = create_data_loader(\n",
+ " IMAGE_SHAPE, is_train)\n",
+ " logits, logits_aux = archs(data, drop_path_prob, drop_path_mask,\n",
+ " is_train, 10)\n",
+ " top1 = fluid.layers.accuracy(input=logits, label=label, k=1)\n",
+ " top5 = fluid.layers.accuracy(input=logits, label=label, k=5)\n",
+ " loss = fluid.layers.reduce_mean(\n",
+ " fluid.layers.softmax_with_cross_entropy(logits, label))\n",
+ "\n",
+ " if is_train:\n",
+ " if AUXILIARY:\n",
+ " loss_aux = fluid.layers.reduce_mean(\n",
+ " fluid.layers.softmax_with_cross_entropy(logits_aux, label))\n",
+ " loss = loss + AUXILIARY_WEIGHT * loss_aux\n",
+ " step_per_epoch = int(TRAINSET_NUM / BATCH_SIZE)\n",
+ " learning_rate = fluid.layers.cosine_decay(LR, step_per_epoch, RETAIN_EPOCH)\n",
+ " fluid.clip.set_gradient_clip(\n",
+ " clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0))\n",
+ " optimizer = fluid.optimizer.MomentumOptimizer(\n",
+ " learning_rate,\n",
+ " MOMENTUM,\n",
+ " regularization=fluid.regularizer.L2DecayRegularizer(\n",
+ " WEIGHT_DECAY))\n",
+ " optimizer.minimize(loss)\n",
+ " outs = [loss, top1, top5, learning_rate]\n",
+ " else:\n",
+ " outs = [loss, top1, top5]\n",
+ " return outs, data_loader"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def train(main_prog, exe, epoch_id, train_loader, fetch_list):\n",
+ " loss = []\n",
+ " top1 = []\n",
+ " top5 = []\n",
+ " for step_id, data in enumerate(train_loader()):\n",
+ " devices_num = len(data)\n",
+ " if DROP_PATH_PROBILITY > 0:\n",
+ " feed = []\n",
+ " for device_id in range(devices_num):\n",
+ " image = data[device_id]['image']\n",
+ " label = data[device_id]['label']\n",
+ " drop_path_prob = np.array(\n",
+ " [[DROP_PATH_PROBILITY * epoch_id / RETAIN_EPOCH]\n",
+ " for i in range(BATCH_SIZE)]).astype(np.float32)\n",
+ " drop_path_mask = 1 - np.random.binomial(\n",
+ " 1, drop_path_prob[0],\n",
+ " size=[BATCH_SIZE, 20, 4, 2]).astype(np.float32)\n",
+ " feed.append({\n",
+ " \"image\": image,\n",
+ " \"label\": label,\n",
+ " \"drop_path_prob\": drop_path_prob,\n",
+ " \"drop_path_mask\": drop_path_mask\n",
+ " })\n",
+ " else:\n",
+ " feed = data\n",
+ " loss_v, top1_v, top5_v, lr = exe.run(\n",
+ " main_prog, feed=feed, fetch_list=[v.name for v in fetch_list])\n",
+ " loss.append(loss_v)\n",
+ " top1.append(top1_v)\n",
+ " top5.append(top5_v)\n",
+ " if step_id % 10 == 0:\n",
+ " print(\n",
+ " \"Train Epoch {}, Step {}, Lr {:.8f}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}\".\n",
+ " format(epoch_id, step_id, lr[0], np.mean(loss), np.mean(top1), np.mean(top5)))\n",
+ " return np.mean(top1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def valid(main_prog, exe, epoch_id, valid_loader, fetch_list):\n",
+ " loss = []\n",
+ " top1 = []\n",
+ " top5 = []\n",
+ " for step_id, data in enumerate(valid_loader()):\n",
+ " loss_v, top1_v, top5_v = exe.run(\n",
+ " main_prog, feed=data, fetch_list=[v.name for v in fetch_list])\n",
+ " loss.append(loss_v)\n",
+ " top1.append(top1_v)\n",
+ " top5.append(top5_v)\n",
+ " if step_id % 10 == 0:\n",
+ " print(\n",
+ " \"Valid Epoch {}, Step {}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}\".\n",
+ " format(epoch_id, step_id, np.mean(loss), np.mean(top1), np.mean(top5)))\n",
+ " return np.mean(top1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2020-02-23 12:28:57,462-INFO: current tokens: [5, 5, 5, 5, 5, 12, 7, 7, 7, 7, 7, 7, 7, 10, 10, 10, 10, 10, 10, 10]\n"
+ ]
+ }
+ ],
+ "source": [
+ "archs = sa_nas.next_archs()[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_program = fluid.Program()\n",
+ "test_program = fluid.Program()\n",
+ "startup_program = fluid.Program()\n",
+ "train_fetch_list, train_loader = build_program(train_program, startup_program, IMAGE_SHAPE, archs, is_train=True)\n",
+ "test_fetch_list, test_loader = build_program(test_program, startup_program, IMAGE_SHAPE, archs, is_train=False)\n",
+ "test_program = test_program.clone(for_test=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[]"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "place = fluid.CPUPlace()\n",
+ "exe = fluid.Executor(place)\n",
+ "exe.run(startup_program)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_reader = paddle.fluid.io.batch(paddle.reader.shuffle(paddle.dataset.cifar.train10(cycle=False), buf_size=1024), batch_size=BATCH_SIZE, drop_last=True)\n",
+ "test_reader = paddle.fluid.io.batch(paddle.dataset.cifar.test10(cycle=False), batch_size=BATCH_SIZE, drop_last=False)\n",
+ "train_loader.set_sample_list_generator(train_reader, places=place)\n",
+ "test_loader.set_sample_list_generator(test_reader, places=place)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train Epoch 0, Step 0, Lr 0.02500000, loss 3.310467, acc_1 0.062500, acc_5 0.468750\n"
+ ]
+ }
+ ],
+ "source": [
+ "for epoch_id in range(RETAIN_EPOCH):\n",
+ " train_top1 = train(train_program, exe, epoch_id, train_loader, train_fetch_list)\n",
+ " print(\"TRAIN: Epoch {}, train_acc {:.6f}\".format(epoch_id, train_top1))\n",
+ " valid_top1 = valid(test_program, exe, epoch_id, test_loader, test_fetch_list)\n",
+ " print(\"TEST: Epoch {}, valid_acc {:.6f}\".format(epoch_id, valid_top1))\n",
+ " valid_top1_list.append(valid_top1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/zh_cn/tutorials/sanas_darts_space.md b/docs/zh_cn/tutorials/sanas_darts_space.md
new file mode 100644
index 0000000000000000000000000000000000000000..b280db4ef88c5e2e63e398379f771976c03a3d7b
--- /dev/null
+++ b/docs/zh_cn/tutorials/sanas_darts_space.md
@@ -0,0 +1,275 @@
+# Advanced SANAS Tutorial: Compressing the Model Produced by DARTS
+
+## Gains
+Starting from the final architecture found by DARTS (DARTS_model below), we build a matching search space and run a search experiment with PaddleSlim's SANAS. The resulting architecture (DARTS_SA below) improves accuracy by 0.141% and reduces model size by 11.2% compared with DARTS_model.
+
+## Search Tutorial
+This tutorial shows how to run a SANAS search experiment on top of DARTS_model and obtain the DARTS_SA result.
+
+The tutorial includes the following steps:
+1. Build the search space
+2. Import dependencies and define global variables
+3. Initialize the SANAS instance
+4. Define a function that counts model parameters
+5. Define a function for the network's input data
+6. Define a function that builds the programs
+7. Define the training function
+8. Define the evaluation function
+9. Start the search
+   9.1 Get the next model architecture
+   9.2 Build the corresponding training and test programs
+   9.3 Add the search constraint
+   9.4 Set up the environment
+   9.5 Define the input data
+   9.6 Start training and evaluation
+   9.7 Send the current model's reward back
+10. Start the search with the script under demo
+11. Run the final experiment with the script under demo
+
+### 1. Build the search space
+Before running the search experiment we need a search space built around the characteristics of DARTS_model. This experiment only searches over DARTS_model's channel counts; the goal is a model with higher accuracy and fewer parameters.
+The search space is defined as follows:
+- Channel count `filter_num`: the range of channel counts for each convolution operation, namely `[4, 8, 12, 16, 20, 36, 54, 72, 90, 108, 144, 180, 216, 252]`
+
+Grouping the cells of DARTS_model into blocks by channel count gives 3 blocks: the first contains only 6 normal cells, and each of the following two blocks contains one reduction cell and 6 normal cells, 20 cells in total. When building the search space, all convolution operations within a cell share the same channel count, so there are 20 tokens in all; a toy mapping is sketched below.
+
+For the complete search space, see [the DARTS_model-based search space](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/nas/search_space/darts_space.py)
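+
+For intuition, a toy sketch of how a 20-token list maps to per-cell channel counts (illustrative only; `FILTER_NUM` mirrors the list above, and the real mapping lives in darts_space.py):
+
+```python
+FILTER_NUM = [4, 8, 12, 16, 20, 36, 54, 72, 90, 108, 144, 180, 216, 252]
+
+def tokens_to_channels(tokens):
+    # One token per cell; every convolution inside a cell shares the channel count.
+    return [FILTER_NUM[t] for t in tokens]
+
+print(tokens_to_channels([5, 5, 0, 5, 5, 10, 7, 7, 5, 7, 7, 11, 10, 12, 10, 0, 5, 3, 10, 8]))
+```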
+
+### 2. Import dependencies and define global variables
+```python
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddleslim.nas import SANAS
+
+BATCH_SIZE=96
+SERVER_ADDRESS = ""
+PORT = 8377
+SEARCH_STEPS = 300
+RETAIN_EPOCH=30
+MAX_PARAMS=3.77
+IMAGE_SHAPE=[3, 32, 32]
+AUXILIARY = True
+AUXILIARY_WEIGHT= 0.4
+TRAINSET_NUM = 50000
+LR = 0.025
+MOMENTUM = 0.9
+WEIGHT_DECAY = 0.0003
+DROP_PATH_PROBILITY = 0.2
+```
+
+### 3. Initialize the SANAS instance
+First initialize the SANAS instance.
+```python
+config = [('DartsSpace')]
+sa_nas = SANAS(config, server_addr=(SERVER_ADDRESS, PORT), search_steps=SEARCH_STEPS, is_server=True)
+```
+
+### 4. Define a function that counts model parameters
+Compute the parameter count of the current model from its parameter list. This tutorial uses the parameter count as the search constraint.
+```python
+def count_parameters_in_MB(all_params, prefix='model'):
+ parameters_number = 0
+ for param in all_params:
+ if param.name.startswith(
+ prefix) and param.trainable and 'aux' not in param.name:
+ parameters_number += np.prod(param.shape)
+ return parameters_number / 1e6
+```
+
+### 5. Define a function for the network's input data
+Define the network inputs from the input image size: the image input, the label input, and, for training, the drop-path probability and mask used to randomly drop units.
+```python
+def create_data_loader(IMAGE_SHAPE, is_train):
+ image = fluid.data(
+ name="image", shape=[None] + IMAGE_SHAPE, dtype="float32")
+ label = fluid.data(name="label", shape=[None, 1], dtype="int64")
+ data_loader = fluid.io.DataLoader.from_generator(
+ feed_list=[image, label],
+ capacity=64,
+ use_double_buffer=True,
+ iterable=True)
+ drop_path_prob = ''
+ drop_path_mask = ''
+ if is_train:
+ drop_path_prob = fluid.data(
+ name="drop_path_prob", shape=[BATCH_SIZE, 1], dtype="float32")
+ drop_path_mask = fluid.data(
+ name="drop_path_mask",
+ shape=[BATCH_SIZE, 20, 4, 2],
+ dtype="float32")
+
+ return data_loader, image, label, drop_path_prob, drop_path_mask
+```
+
+### 6. Define a function that builds the programs
+Build a program from the model architecture, the input image size, and a flag saying whether the program is in training mode.
+```python
+def build_program(main_program, startup_program, IMAGE_SHAPE, archs, is_train):
+ with fluid.program_guard(main_program, startup_program):
+ data_loader, data, label, drop_path_prob, drop_path_mask = create_data_loader(
+ IMAGE_SHAPE, is_train)
+ logits, logits_aux = archs(data, drop_path_prob, drop_path_mask,
+ is_train, 10)
+ top1 = fluid.layers.accuracy(input=logits, label=label, k=1)
+ top5 = fluid.layers.accuracy(input=logits, label=label, k=5)
+ loss = fluid.layers.reduce_mean(
+ fluid.layers.softmax_with_cross_entropy(logits, label))
+
+ if is_train:
+ if AUXILIARY:
+ loss_aux = fluid.layers.reduce_mean(
+ fluid.layers.softmax_with_cross_entropy(logits_aux, label))
+ loss = loss + AUXILIARY_WEIGHT * loss_aux
+ step_per_epoch = int(TRAINSET_NUM / BATCH_SIZE)
+ learning_rate = fluid.layers.cosine_decay(LR, step_per_epoch, RETAIN_EPOCH)
+ fluid.clip.set_gradient_clip(
+ clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0))
+ optimizer = fluid.optimizer.MomentumOptimizer(
+ learning_rate,
+ MOMENTUM,
+ regularization=fluid.regularizer.L2DecayRegularizer(
+ WEIGHT_DECAY))
+ optimizer.minimize(loss)
+ outs = [loss, top1, top5, learning_rate]
+ else:
+ outs = [loss, top1, top5]
+ return outs, data_loader
+
+```
+
+### 7. Define the training function
+```python
+def train(main_prog, exe, epoch_id, train_loader, fetch_list):
+ loss = []
+ top1 = []
+ top5 = []
+ for step_id, data in enumerate(train_loader()):
+ devices_num = len(data)
+ if DROP_PATH_PROBILITY > 0:
+ feed = []
+ for device_id in range(devices_num):
+ image = data[device_id]['image']
+ label = data[device_id]['label']
+ drop_path_prob = np.array(
+ [[DROP_PATH_PROBILITY * epoch_id / RETAIN_EPOCH]
+ for i in range(BATCH_SIZE)]).astype(np.float32)
+ drop_path_mask = 1 - np.random.binomial(
+ 1, drop_path_prob[0],
+ size=[BATCH_SIZE, 20, 4, 2]).astype(np.float32)
+ feed.append({
+ "image": image,
+ "label": label,
+ "drop_path_prob": drop_path_prob,
+ "drop_path_mask": drop_path_mask
+ })
+ else:
+ feed = data
+ loss_v, top1_v, top5_v, lr = exe.run(
+ main_prog, feed=feed, fetch_list=[v.name for v in fetch_list])
+ loss.append(loss_v)
+ top1.append(top1_v)
+ top5.append(top5_v)
+ if step_id % 10 == 0:
+ print(
+ "Train Epoch {}, Step {}, Lr {:.8f}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}".
+ format(epoch_id, step_id, lr[0], np.mean(loss), np.mean(top1), np.mean(top5)))
+ return np.mean(top1)
+```
+
+### 8. Define the evaluation function
+```python
+def valid(main_prog, exe, epoch_id, valid_loader, fetch_list):
+ loss = []
+ top1 = []
+ top5 = []
+ for step_id, data in enumerate(valid_loader()):
+ loss_v, top1_v, top5_v = exe.run(
+ main_prog, feed=data, fetch_list=[v.name for v in fetch_list])
+ loss.append(loss_v)
+ top1.append(top1_v)
+ top5.append(top5_v)
+ if step_id % 10 == 0:
+ print(
+ "Valid Epoch {}, Step {}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}".
+ format(epoch_id, step_id, np.mean(loss), np.mean(top1), np.mean(top5)))
+ return np.mean(top1)
+```
+
+### 9. Start the search experiment
+The following steps show how to get the current model architecture and what to do once you have it.
+
+#### 9.1 Get the next model architecture
+Get the next model architecture from the SANAS instance created above.
+```python
+archs = sa_nas.next_archs()[0]
+```
+
+#### 9.2 Build the training and test programs
+Build the training program and the test program from the architecture obtained in the previous step.
+```python
+train_program = fluid.Program()
+test_program = fluid.Program()
+startup_program = fluid.Program()
+train_fetch_list, train_loader = build_program(train_program, startup_program, IMAGE_SHAPE, archs, is_train=True)
+test_fetch_list, test_loader = build_program(test_program, startup_program, IMAGE_SHAPE, archs, is_train=False)
+test_program = test_program.clone(for_test=True)
+```
+
+#### 9.3 Add the search constraint
+This tutorial uses the model parameter count as the constraint. First compute the parameter count of the current program; if it exceeds the constraint, stop training this architecture and fetch the next one (see the sketch after the snippet below).
+```python
+current_params = count_parameters_in_MB(
+ train_program.global_block().all_parameters(), 'cifar10')
+```
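+
+The check itself is then a couple of lines (a sketch; inside the search loop, the demo script simply moves on to the next architecture):
+
+```python
+if current_params > MAX_PARAMS:
+    continue  # over the parameter budget: skip training and fetch the next architecture
+```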
+
+#### 9.4 Set up the environment
+Define the place and executor for the data and the model, and initialize the parameters.
+```python
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+exe.run(startup_program)
+```
+
+#### 9.5 Define the input data
+Because this example applies some extra preprocessing to the cifar10 images, unlike the reader in the [quick start](https://paddlepaddle.github.io/PaddleSlim/quick_start/nas_tutorial.html) example, a custom cifar10 reader is needed; the `paddle.dataset.cifar10` reader bundled with Paddle cannot be used directly. The custom cifar10 reader lives in [demo/nas](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/demo/nas/darts_cifar10_reader.py).
+
+**Note:** to keep the code short, this example calls `paddle.dataset.cifar10` directly to define the training and test data; real training should use the reader from the custom cifar10 file.
+```python
+train_reader = paddle.fluid.io.batch(paddle.reader.shuffle(paddle.dataset.cifar.train10(cycle=False), buf_size=1024), batch_size=BATCH_SIZE, drop_last=True)
+test_reader = paddle.fluid.io.batch(paddle.dataset.cifar.test10(cycle=False), batch_size=BATCH_SIZE, drop_last=False)
+train_loader.set_sample_list_generator(train_reader, places=place)
+test_loader.set_sample_list_generator(test_reader, places=place)
+```
+
+#### 9.6 Start training and evaluation
+```python
+valid_top1_list = []
+for epoch_id in range(RETAIN_EPOCH):
+ train_top1 = train(train_program, exe, epoch_id, train_loader, train_fetch_list)
+ print("TRAIN: Epoch {}, train_acc {:.6f}".format(epoch_id, train_top1))
+ valid_top1 = valid(test_program, exe, epoch_id, test_loader, test_fetch_list)
+ print("TEST: Epoch {}, valid_acc {:.6f}".format(epoch_id, valid_top1))
+ valid_top1_list.append(valid_top1)
+```
+
+#### 9.7 Send the current model's reward back
+This tutorial uses the mean accuracy of the last two epochs as the final score returned to SANAS.
+```python
+sa_nas.reward(float(valid_top1_list[-1] + valid_top1_list[-2]) / 2)
+```
+
+
+### 10. Start the search with the script under demo
+
+The search script lives at [darts_sanas_demo](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/demo/nas/sanas_darts_space.py); during the search, the model parameter count is constrained to at most 3.77M.
+```shell
+cd demo/nas/
+python darts_nas.py
+```
+
+### 11. Run the final experiment with the script under demo
+The final-experiment script lives at [darts_sanas_demo](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/demo/nas/sanas_darts_space.py); the final experiment trains for 600 epochs. The example below uses the token `[5, 5, 0, 5, 5, 10, 7, 7, 5, 7, 7, 11, 10, 12, 10, 0, 5, 3, 10, 8]`.
+```shell
+cd demo/nas/
+python darts_nas.py --token 5 5 0 5 5 10 7 7 5 7 7 11 10 12 10 0 5 3 10 8 --retain_epoch 600
+```
diff --git a/docs/zh_cn/tutorials/sensitivity_tutorial.md b/docs/zh_cn/tutorials/sensitivity_tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..7b23ee91d0f1c43039226d88d387b8a1d7b59330
--- /dev/null
+++ b/docs/zh_cn/tutorials/sensitivity_tutorial.md
@@ -0,0 +1,25 @@
+# Channel Pruning for Image Classification Models - Sensitivity Analysis
+
+This tutorial uses the image classification model MobileNetV1 as an example to show how to quickly use the [PaddleSlim sensitivity analysis API](https://paddlepaddle.github.io/PaddleSlim/api/prune_api/#sensitivity).
+The example includes the following steps:
+
+1. Import dependencies
+2. Build the model
+3. Define the input data
+4. Define the model evaluation method
+5. Train the model
+6. Get the names of the convolution parameters to analyze
+7. Analyze sensitivity
+8. Prune the model
+
+The following sections describe each step in turn.
+
+## 1. Import dependencies
+
+PaddleSlim requires Paddle 1.7. Make sure Paddle is installed correctly, then import Paddle and PaddleSlim as follows:
+
+```python
+import paddle
+import paddle.fluid as fluid
+import paddleslim as slim
+```
diff --git a/paddleslim/__init__.py b/paddleslim/__init__.py
index 9d0531501ca43921438ee5b2fb58ac0ad2396d1b..357e1a293b4471eb6aca8c26c8ac852cae25f365 100644
--- a/paddleslim/__init__.py
+++ b/paddleslim/__init__.py
@@ -11,3 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+
+from __future__ import absolute_import
+from paddleslim import models
+from paddleslim import prune
+from paddleslim import nas
+from paddleslim import analysis
+from paddleslim import dist
+from paddleslim import quant
+from paddleslim import pantheon
+__all__ = ['models', 'prune', 'nas', 'analysis', 'dist', 'quant', 'pantheon']
diff --git a/paddleslim/analysis/flops.py b/paddleslim/analysis/flops.py
index fed377db89e69ef09dc9600243765f5b4ec6c717..120ab9545da3cc869d66e2d0848856eb40738532 100644
--- a/paddleslim/analysis/flops.py
+++ b/paddleslim/analysis/flops.py
@@ -19,24 +19,22 @@ __all__ = ["flops"]
def flops(program, only_conv=True, detail=False):
- """
- Get FLOPS of target graph.
+ """Get FLOPs of target graph.
+
Args:
program(Program): The program used to calculate FLOPS.
only_conv(bool): Just return number of mul-adds in convolution and FC layer if `only_conv` is true.
default: True.
detail(bool): Whether to return detail of each convolution layer.
- Return:
- If `detail` is true, then return a tuple in format `(FLOPs, details)`, otherwise it will just return `FlOPs`
- FLOPs(int): The FLOPs of target network.
- details(dict): The key is the parameter name of convlution layer and the value is the FLOPs of each convolution layer.
+ Returns:
+        int|tuple: If `detail` is true, return a tuple in format `(FLOPs, details)`; otherwise just return `FLOPs`. The details is a dict whose key is the parameter name of a convolution layer and whose value is the FLOPs of that layer.
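+
+    Examples:
+        .. code-block:: python
+
+            # A minimal usage sketch: count the mul-adds of a single conv layer.
+            import paddle.fluid as fluid
+            from paddleslim.analysis import flops
+
+            main = fluid.Program()
+            with fluid.program_guard(main, fluid.Program()):
+                x = fluid.data(name='x', shape=[1, 3, 32, 32], dtype='float32')
+                y = fluid.layers.conv2d(x, num_filters=8, filter_size=3)
+            print(flops(main))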
"""
graph = GraphWrapper(program)
return _graph_flops(graph, only_conv=only_conv, detail=detail)
-def _graph_flops(graph, only_conv=False, detail=False):
+def _graph_flops(graph, only_conv=True, detail=False):
assert isinstance(graph, GraphWrapper)
flops = 0
params2flops = {}
@@ -71,7 +69,8 @@ def _graph_flops(graph, only_conv=False, detail=False):
flops += op_flops
params2flops[op.inputs("Y")[0].name()] = op_flops
- elif op.type() in ['relu', 'sigmoid', 'batch_norm', 'relu6'] and not only_conv:
+ elif op.type() in ['relu', 'sigmoid', 'batch_norm', 'relu6'
+ ] and not only_conv:
input_shape = list(op.inputs("X")[0].shape())
if input_shape[0] == -1:
input_shape[0] = 1
diff --git a/paddleslim/analysis/latency.py b/paddleslim/analysis/latency.py
index fea255c7152df58274a013bd030d6ac63c7b43a8..3c9211e4e189e2d56d8cf6b2cd0d8501039b946c 100644
--- a/paddleslim/analysis/latency.py
+++ b/paddleslim/analysis/latency.py
@@ -1,3 +1,5 @@
+"""Define latency evaluators that evaluate the performance of mode on devices.
+"""
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
@@ -18,13 +20,21 @@ __all__ = ["LatencyEvaluator", "TableLatencyEvaluator"]
class LatencyEvaluator(object):
- def __init__(self):
- pass
+ """Base class of latency evaluator.
+ """
def latency(self, graph):
- pass
+ """Get latency of graph. It is an abstract method.
- def _get_ops_from_graph(self, graph):
+ Args:
+            graph(GraphWrapper|Program): The graph to be evaluated.
+
+ Returns:
+ latency(float): The latency of given graph on current evaluator.
+ """
+ raise NotImplementedError('Abstract method.')
+
+ def _get_ops_from_graph(self, graph, only_conv):
assert isinstance(graph, GraphWrapper)
ops = []
i = 0
@@ -33,22 +43,20 @@ class LatencyEvaluator(object):
tmp = self._conv_op_args(op)
elif op.type() in [
'elementwise_add', 'elementwise_mul', 'elementwise_max'
- ]:
+            ] and not only_conv:
tmp = self._eltwise_op_args(op)
elif op.type() in [
'relu', 'prelu', 'sigmoid', 'relu6', 'elu', 'brelu',
'leaky_relu'
- ]:
+            ] and not only_conv:
tmp = self._activation_op_args(op)
- elif op.type() == 'batch_norm':
+            elif op.type() == 'batch_norm' and not only_conv:
tmp = self._batch_norm_op_args(op)
- elif op.type() == 'pool2d':
+            elif op.type() == 'pool2d' and not only_conv:
tmp = self._pooling_op_args(op)
- elif op.type() == 'batch_norm':
- tmp = self._batch_norm_op_args(op)
- elif op.type() == 'softmax':
+            elif op.type() == 'softmax' and not only_conv:
tmp = self._softmax_op_args(op)
- elif op.type() == 'mul':
+            elif op.type() == 'mul' and not only_conv:
tmp = self._fc_op_args(op)
else:
tmp = None
@@ -243,13 +251,14 @@ class LatencyEvaluator(object):
class TableLatencyEvaluator(LatencyEvaluator):
+ """The evaluator used to get graph's latency on some devices and infer engines.
+
+ Args:
+ table_file(str): The path of file that records the devices latency of operators.
+ delimiter(str): The delimiter used in `table_file`.
+ """
+
def __init__(self, table_file, delimiter=","):
- """
- The evaluator used to get graph's latency on some devices and infer engines.
- Args:
- - table_file(str): The path of file that records the devices latency of operators.
- - delimiter(str): The delimiter used in `table_file`.
- """
self._table = self._load_table(table_file)
self._delimiter = delimiter
@@ -268,11 +277,13 @@ class TableLatencyEvaluator(LatencyEvaluator):
assert op_str in self._table
return self._table[op_str]
- def latency(self, graph):
- """
- Get latency of target graph.
+ def latency(self, graph, only_conv=True):
+ """Get latency of target graph.
+
Args:
- - graph(GrapWrapper | Program): The graph to be evaluated.
+            graph(GraphWrapper|Program): The graph to be evaluated.
+            only_conv(bool): Only evaluate convolution layers if `only_conv` is true. Default: True.
+
Returns:
latency(float): The latency of given graph on current evaluator.
"""
@@ -280,7 +291,7 @@ class TableLatencyEvaluator(LatencyEvaluator):
if isinstance(graph, Program):
graph = GraphWrapper(graph)
assert isinstance(graph, GraphWrapper)
- for op in self._get_ops_from_graph(graph):
+ for op in self._get_ops_from_graph(graph, only_conv):
total_latency += self._op_latency(
self._delimiter.join(map(lambda x: str(x), op)))
return total_latency
diff --git a/paddleslim/analysis/model_size.py b/paddleslim/analysis/model_size.py
index 34574d5d53764810185112d7122aeb5b99d74682..55a1595e55512a3c64dae0aa63a9a0e5dc06a83a 100644
--- a/paddleslim/analysis/model_size.py
+++ b/paddleslim/analysis/model_size.py
@@ -21,8 +21,12 @@ __all__ = ["model_size"]
def model_size(program):
"""
Get total value numbers of all parameters.
+
Args:
- program(Program): The program used to calculate model size.
+ program(fluid.Program): The program used to calculate model size.
+
+ Returns:
+ int: The total count of all parameters.
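+
+    Examples:
+        .. code-block:: python
+
+            # A minimal usage sketch: one fc layer holds 16*10 weights + 10 biases = 170 values.
+            import paddle.fluid as fluid
+            from paddleslim.analysis import model_size
+
+            main = fluid.Program()
+            with fluid.program_guard(main, fluid.Program()):
+                x = fluid.data(name='x', shape=[None, 16], dtype='float32')
+                y = fluid.layers.fc(x, size=10)
+            print(model_size(main))  # 170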
"""
size = 0
for block in program.blocks:
diff --git a/paddleslim/common/__init__.py b/paddleslim/common/__init__.py
index ccf9b76d41a5ad38f029d20593aa23470e2f60b9..894d5d5a1a13d9fd1aa1fabcfc6e849df6fa17ca 100644
--- a/paddleslim/common/__init__.py
+++ b/paddleslim/common/__init__.py
@@ -11,15 +11,20 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from .controller import EvolutionaryController
+from .controller import EvolutionaryController, RLBaseController
from .sa_controller import SAController
from .log_helper import get_logger
from .controller_server import ControllerServer
from .controller_client import ControllerClient
-from .lock_utils import lock, unlock
+from .lock import lock, unlock
from .cached_reader import cached_reader
+from .server import Server
+from .client import Client
+from .meter import AvgrageMeter
+from .analyze_helper import pdf
__all__ = [
'EvolutionaryController', 'SAController', 'get_logger', 'ControllerServer',
- 'ControllerClient', 'lock', 'unlock', 'cached_reader'
+ 'ControllerClient', 'lock', 'unlock', 'cached_reader', 'AvgrageMeter',
+ 'Server', 'Client', 'RLBaseController', 'pdf'
]
diff --git a/paddleslim/common/analyze_helper.py b/paddleslim/common/analyze_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5883bb597d027ce2c836e098a45af1383684c64
--- /dev/null
+++ b/paddleslim/common/analyze_helper.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import matplotlib
+matplotlib.use('Agg')
+import logging
+import numpy as np
+from matplotlib.backends.backend_pdf import PdfPages
+import matplotlib.pyplot as plt
+import os
+
+import paddle
+import paddle.fluid as fluid
+
+from ..common import get_logger
+_logger = get_logger(__name__, level=logging.INFO)
+
+
+def pdf(program,
+ var_names,
+ executor=None,
+ batch_generator=None,
+ data_loader=None,
+ feed_vars=None,
+ fetch_list=None,
+ scope=None,
+ pdf_save_dir='tmp_pdf'):
+ """
+ Draw hist for distributtion of variables in that name is in var_names
+
+ Args:
+ program(fluid.Program): program to analyze.
+ var_names(list): name of variables to analyze. When there is activation name in var_names,
+ you should set executor, one of batch_generator and data_loader, feed_list.
+ executor(fluid.Executor, optional): The executor to run program. Default is None.
+ batch_generator(Python Generator, optional): The batch generator provides calibrate data for DataLoader,
+ and it returns a batch every time. For data_loader and batch_generator,
+ only one can be set. Default is None.
+ data_loader(fluid.io.DataLoader, optional): The data_loader provides calibrate data to run program.
+ Default is None.
+ feed_vars(list): feed variables for program. When you use batch_generator to provide data,
+ you should set feed_vars. Default is None.
+ fetch_list(list): fetch list for program. Default is None.
+ scope(fluid.Scope, optional): The scope to run program, use it to load variables.
+ If scope is None, will use fluid.global_scope().
+ pdf_save_dir(str): dirname to save pdf. Default is 'tmp_pdf'
+
+ Returns:
+ dict: numpy array of variables that name in var_names
+ """
+ scope = fluid.global_scope() if scope is None else scope
+ assert isinstance(var_names, list), 'var_names is a list of variable name'
+ real_names = []
+ weight_only = True
+ for var in program.list_vars():
+ if var.name in var_names:
+ if var.persistable == False:
+ weight_only = False
+ var.persistable = True
+ real_names.append(var.name)
+
+ if weight_only == False:
+ if batch_generator is not None:
+ assert feed_vars is not None, "When using batch_generator, feed_vars must be set"
+ dataloader = fluid.io.DataLoader.from_generator(
+ feed_list=feed_vars, capacity=512, iterable=True)
+ dataloader.set_batch_generator(batch_generator, executor.place)
+ elif data_loader is not None:
+ dataloader = data_loader
+ else:
+ _logger.info(
+ "When both batch_generator and data_loader is None, var_names can only include weight names"
+ )
+ return
+
+        assert executor is not None, "when var_names include activation names, executor must be set"
+        assert fetch_list is not None, "when var_names include activation names, fetch_list must be set"
+
+ for data in dataloader:
+ executor.run(program=program,
+ feed=data,
+ fetch_list=fetch_list,
+ return_numpy=False)
+ break
+
+ res_np = {}
+ for name in real_names:
+ var = fluid.global_scope().find_var(name)
+ if var is not None:
+ res_np[name] = np.array(var.get_tensor())
+ else:
+ _logger.info(
+ "can't find var {}. Maybe you should set one of batch_generator and data_loader".
+ format(name))
+ numbers = len(real_names)
+ if pdf_save_dir is not None:
+ if not os.path.exists(pdf_save_dir):
+ os.mkdir(pdf_save_dir)
+ pdf_path = os.path.join(pdf_save_dir, 'result.pdf')
+ with PdfPages(pdf_path) as pdf:
+ idx = 1
+ for name in res_np.keys():
+ if idx % 10 == 0:
+ _logger.info("plt {}/{}".format(idx, numbers))
+ arr = res_np[name]
+ arr = arr.flatten()
+ weights = np.ones_like(arr) / len(arr)
+ plt.hist(arr, bins=1000, weights=weights)
+ plt.xlabel(name)
+ plt.ylabel("frequency")
+ plt.title("Hist of variable {}".format(name))
+ plt.show()
+ pdf.savefig()
+ plt.close()
+ idx += 1
+ return res_np
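+
+
+# Minimal usage sketch (hypothetical variable names; weight-only analysis needs no executor):
+#
+#     res = pdf(val_program, ['conv1_weights', 'fc_0.w_0'], pdf_save_dir='tmp_pdf')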
diff --git a/paddleslim/common/cached_reader.py b/paddleslim/common/cached_reader.py
index 55f27054efe55d9df90352b3e707fe51c8996023..c297fe9dfdd56b92b4e46f79ad014f0f4261b022 100644
--- a/paddleslim/common/cached_reader.py
+++ b/paddleslim/common/cached_reader.py
@@ -25,6 +25,7 @@ _logger = get_logger(__name__, level=logging.INFO)
def cached_reader(reader, sampled_rate, cache_path, cached_id):
"""
Sample partial data from reader and cache them into local file system.
+
Args:
reader: Iterative data source.
sampled_rate(float): The sampled rate used to sample partial data for evaluation. None means using all data in eval_reader. default: None.
diff --git a/paddleslim/common/client.py b/paddleslim/common/client.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0feacf1618dc6e61f5b1baf32ebd0679955dbf2
--- /dev/null
+++ b/paddleslim/common/client.py
@@ -0,0 +1,148 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import signal
+import zmq
+import socket
+import logging
+import time
+import threading
+import six
+if six.PY2:
+ import cPickle as pickle
+else:
+ import pickle
+from .log_helper import get_logger
+from .rl_controller.utils import compute_grad, ConnectMessage
+
+_logger = get_logger(__name__, level=logging.INFO)
+
+
+class Client(object):
+ def __init__(self, controller, address, client_name):
+ self._controller = controller
+ self._address = address
+ self._ip = self._address[0]
+ self._port = self._address[1]
+ self._client_name = client_name
+ self._params_dict = None
+ self.init_wait = False
+ self._connect_server()
+
+ def _connect_server(self):
+ self._ctx = zmq.Context()
+ self._client_socket = self._ctx.socket(zmq.REQ)
+ ### NOTE: change the method to exit client when server is dead if there are better solutions
+ self._client_socket.setsockopt(zmq.RCVTIMEO,
+ ConnectMessage.TIMEOUT * 1000)
+ client_address = "{}:{}".format(self._ip, self._port)
+ self._client_socket.connect("tcp://{}".format(client_address))
+ self._client_socket.send_multipart([
+ pickle.dumps(ConnectMessage.INIT), pickle.dumps(self._client_name)
+ ])
+ message = self._client_socket.recv_multipart()
+ if pickle.loads(message[0]) != ConnectMessage.INIT_DONE:
+ _logger.error("Client {} init failure, Please start it again".
+ format(self._client_name))
+ pid = os.getpid()
+ os.kill(pid, signal.SIGTERM)
+ _logger.info("Client {}: connect to server success!!!".format(
+ self._client_name))
+ _logger.debug("Client {}: connect to server {}".format(
+ self._client_name, client_address))
+
+ def _connect_wait_socket(self, port):
+ self._wait_socket = self._ctx.socket(zmq.REQ)
+ wait_address = "{}:{}".format(self._ip, port)
+ self._wait_socket.connect("tcp://{}".format(wait_address))
+ self._wait_socket.send_multipart([
+ pickle.dumps(ConnectMessage.WAIT_PARAMS),
+ pickle.dumps(self._client_name)
+ ])
+ message = self._wait_socket.recv_multipart()
+ return pickle.loads(message[0])
+
+ def next_tokens(self, obs, is_inference=False):
+ _logger.debug("Client: requests for weight {}".format(
+ self._client_name))
+ self._client_socket.send_multipart([
+ pickle.dumps(ConnectMessage.GET_WEIGHT),
+ pickle.dumps(self._client_name)
+ ])
+ try:
+ message = self._client_socket.recv_multipart()
+ except zmq.error.Again as e:
+            _logger.error(
+                "Cannot receive params from server in next_tokens, please check whether the server is alive! {}".
+                format(e))
+ os._exit(0)
+ self._params_dict = pickle.loads(message[0])
+ tokens = self._controller.next_tokens(
+ obs, params_dict=self._params_dict, is_inference=is_inference)
+ _logger.debug("Client: client_name is {}, current token is {}".format(
+ self._client_name, tokens))
+ return tokens
+
+ def update(self, rewards, **kwargs):
+        assert self._params_dict is not None, "Please call next_tokens to get tokens first, then call update"
+ current_params_dict = self._controller.update(
+ rewards, self._params_dict, **kwargs)
+ params_grad = compute_grad(current_params_dict, self._params_dict)
+ _logger.debug("Client: update weight {}".format(self._client_name))
+ self._client_socket.send_multipart([
+ pickle.dumps(ConnectMessage.UPDATE_WEIGHT),
+ pickle.dumps(self._client_name), pickle.dumps(params_grad)
+ ])
+ _logger.debug("Client: update done {}".format(self._client_name))
+
+ try:
+ message = self._client_socket.recv_multipart()
+ except zmq.error.Again as e:
+            _logger.error(
+                "Cannot receive response from server in update, please check whether the server is alive! {}".
+                format(e))
+ os._exit(0)
+
+ if pickle.loads(message[0]) == ConnectMessage.WAIT:
+ _logger.debug("Client: self.init_wait: {}".format(self.init_wait))
+ if not self.init_wait:
+ wait_port = pickle.loads(message[1])
+ wait_signal = self._connect_wait_socket(wait_port)
+ self.init_wait = True
+ else:
+ wait_signal = pickle.loads(message[0])
+ while wait_signal != ConnectMessage.OK:
+ time.sleep(1)
+ self._wait_socket.send_multipart([
+ pickle.dumps(ConnectMessage.WAIT_PARAMS),
+ pickle.dumps(self._client_name)
+ ])
+ wait_signal = self._wait_socket.recv_multipart()
+ wait_signal = pickle.loads(wait_signal[0])
+ _logger.debug("Client: {} {}".format(self._client_name,
+ wait_signal))
+
+ return pickle.loads(message[0])
+
+ def __del__(self):
+ try:
+ self._client_socket.send_multipart([
+ pickle.dumps(ConnectMessage.EXIT),
+ pickle.dumps(self._client_name)
+ ])
+ _ = self._client_socket.recv_multipart()
+ except:
+ pass
+ self._client_socket.close()
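The Client above drives a pickled multipart REQ/REP protocol: every request is [command, client_name] and the server answers with at least one pickled frame. Below is a toy in-process version of the INIT handshake, assuming only pyzmq; command names mirror ConnectMessage, and the port and client name are illustrative:

```python
import pickle
import threading
import zmq

INIT, INIT_DONE = 'INIT', 'INIT_DONE'

def toy_server(port):
    sock = zmq.Context.instance().socket(zmq.REP)
    sock.bind('tcp://127.0.0.1:{}'.format(port))
    cmd, name = [pickle.loads(f) for f in sock.recv_multipart()]
    assert cmd == INIT
    sock.send_multipart([pickle.dumps(INIT_DONE)])  # acknowledge registration

t = threading.Thread(target=toy_server, args=(18607, ))
t.setDaemon(True)
t.start()

client = zmq.Context.instance().socket(zmq.REQ)
client.setsockopt(zmq.RCVTIMEO, 10 * 1000)  # give up if the server is dead
client.connect('tcp://127.0.0.1:18607')
client.send_multipart([pickle.dumps(INIT), pickle.dumps('client_0')])
print(pickle.loads(client.recv_multipart()[0]) == INIT_DONE)  # True
```

The RCVTIMEO option is what lets the real client detect a dead server instead of blocking forever.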
diff --git a/paddleslim/common/controller.py b/paddleslim/common/controller.py
index 8c30f49c3aec27a326417554bac3163789342ff6..87c887bdc85a1e535e15daf03e27358f2c6a529e 100644
--- a/paddleslim/common/controller.py
+++ b/paddleslim/common/controller.py
@@ -16,19 +16,18 @@
import copy
import math
import numpy as np
+import paddle.fluid as fluid
-__all__ = ['EvolutionaryController']
+__all__ = ['EvolutionaryController', 'RLBaseController']
class EvolutionaryController(object):
"""Abstract controller for all evolutionary searching method.
"""
- def __init__(self, *args, **kwargs):
- pass
-
def update(self, tokens, reward):
"""Update the status of controller according current tokens and reward.
+
Args:
tokens(list): A solution of searching task.
reward(list): The reward of tokens.
@@ -37,6 +36,7 @@ class EvolutionaryController(object):
def reset(self, range_table, constrain_func=None):
"""Reset the controller.
+
Args:
range_table(list): It is used to define the searching space of controller.
The tokens[i] generated by controller should be in [0, range_table[i]).
@@ -47,5 +47,36 @@ class EvolutionaryController(object):
def next_tokens(self):
"""Generate new tokens.
+
+ Returns:
+ list: The next searched tokens.
"""
raise NotImplementedError('Abstract method.')
+
+
+class RLBaseController(object):
+ """ Base Controller for reforcement learning"""
+
+ def next_tokens(self, *args, **kwargs):
+ raise NotImplementedError('Abstract method.')
+
+ def update(self, *args, **kwargs):
+ raise NotImplementedError('Abstract method.')
+
+ def save_controller(self, program, output_dir):
+ fluid.save(program, output_dir)
+
+ def load_controller(self, program, load_dir):
+ fluid.load(program, load_dir)
+
+ def get_params(self, program):
+ var_dict = {}
+ for var in program.global_block().all_parameters():
+ var_dict[var.name] = np.array(fluid.global_scope().find_var(
+ var.name).get_tensor())
+ return var_dict
+
+ def set_params(self, program, params_dict, place):
+ for var in program.global_block().all_parameters():
+ fluid.global_scope().find_var(var.name).get_tensor().set(
+ params_dict[var.name], place)
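get_params snapshots every parameter of a program into a name-to-ndarray dict, and set_params writes such a dict back into the scope. The distributed RL search builds on this: a client computes new parameters locally and ships only the difference. A dict-level sketch of that round trip, with plain numpy standing in for the fluid scope (all names illustrative):

```python
import numpy as np

def get_params(scope):
    # name -> a copy of the current tensor value
    return {name: value.copy() for name, value in scope.items()}

def set_params(scope, params_dict):
    for name in scope:
        scope[name][...] = params_dict[name]  # write back in place

scope = {'w': np.zeros(3), 'b': np.zeros(1)}
snapshot = get_params(scope)
scope['w'] += 1.0                                  # a local update
grad = {k: scope[k] - snapshot[k] for k in scope}  # what a client ships
set_params(scope, snapshot)                        # roll back to the snapshot
print(grad['w'], scope['w'])                       # [1. 1. 1.] [0. 0. 0.]
```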
diff --git a/paddleslim/common/controller_client.py b/paddleslim/common/controller_client.py
index 15c36386f1f6a754a0e0f50f119410287ed14642..ceb6ecf54ac82035b71ec92d3e2b796f82f2fdcf 100644
--- a/paddleslim/common/controller_client.py
+++ b/paddleslim/common/controller_client.py
@@ -12,6 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import os
+import ast
+import time
import logging
import socket
from .log_helper import get_logger
@@ -24,45 +26,103 @@ _logger = get_logger(__name__, level=logging.INFO)
class ControllerClient(object):
"""
Controller client.
+ Args:
+ server_ip(str): The ip that controller server listens on. None means getting the ip automatically. Default: None.
+ server_port(int): The port that controller server listens on. 0 means getting usable port automatically. Default: 0.
+ key(str): The key used to identify legal agent for controller server. Default: "light-nas"
+        client_name(str): The name of current client, randomly generated to count the number of clients. Default: None.
"""
- def __init__(self, server_ip=None, server_port=None, key=None):
+ START = True
+
+ def __init__(self,
+ server_ip=None,
+ server_port=None,
+ key=None,
+ client_name=None):
"""
- Args:
- server_ip(str): The ip that controller server listens on. None means getting the ip automatically. Default: None.
- server_port(int): The port that controller server listens on. 0 means getting usable port automatically. Default: 0.
- key(str): The key used to identify legal agent for controller server. Default: "light-nas"
"""
self.server_ip = server_ip
self.server_port = server_port
- self.socket_client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
self._key = key
+ self._client_name = client_name
def update(self, tokens, reward, iter):
"""
Update the controller according to latest tokens and reward.
+
Args:
tokens(list): The tokens generated in last step.
reward(float): The reward of tokens.
+ iter(int): The iteration number of current client.
"""
+ ControllerClient.START = False
socket_client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- socket_client.connect((self.server_ip, self.server_port))
- tokens = ",".join([str(token) for token in tokens])
- socket_client.send("{}\t{}\t{}\t{}".format(self._key, tokens, reward,
- iter).encode())
- response = socket_client.recv(1024).decode()
- if response.strip('\n').split("\t") == "ok":
- return True
+ errno = socket_client.connect_ex((self.server_ip, self.server_port))
+ if errno != 0:
+ _logger.info("Server is closed!!!")
+ os._exit(0)
else:
- return False
+ tokens = ",".join([str(token) for token in tokens])
+ socket_client.send("{}\t{}\t{}\t{}\t{}".format(
+ self._key, tokens, reward, iter, self._client_name).encode())
+ try:
+ response = socket_client.recv(1024).decode()
+ if "ok" in response.strip('\n').split("\t"):
+ return True
+ else:
+ return False
+ except Exception as err:
+ _logger.error(err)
+ os._exit(0)
def next_tokens(self):
"""
Get next tokens.
"""
- socket_client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- socket_client.connect((self.server_ip, self.server_port))
+ retry_cnt = 0
+
+        if ControllerClient.START:
+            while True:
+                socket_client = socket.socket(socket.AF_INET,
+                                              socket.SOCK_STREAM)
+                errno = socket_client.connect_ex(
+                    (self.server_ip, self.server_port))
+                if errno == 0:
+                    break
+                retry_cnt += 1
+                if retry_cnt == 6:
+                    _logger.error(
+                        "Server is NOT ready in 1 minute, please check whether it has started"
+                    )
+                    os._exit(errno)
+                _logger.info(
+                    "Server is NOT ready, waiting 10 seconds to retry")
+                time.sleep(10)
+
+ else:
+ socket_client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ errno = socket_client.connect_ex((self.server_ip, self.server_port))
+ if errno != 0:
+ _logger.info("Server is closed")
+ os._exit(0)
+
socket_client.send("next_tokens".encode())
tokens = socket_client.recv(1024).decode()
tokens = [int(token) for token in tokens.strip("\n").split(",")]
return tokens
+
+ def request_current_info(self):
+ """
+ Request for current information.
+ """
+ socket_client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ errno = socket_client.connect_ex((self.server_ip, self.server_port))
+ if errno != 0:
+ _logger.info("Server is closed")
+ return None
+ else:
+ socket_client.send("current_info".encode())
+ current_info = socket_client.recv(1024).decode()
+            return ast.literal_eval(current_info)
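update() now ships a five-field, tab-separated payload, and the server checks the key and field count before parsing. A sketch of composing and parsing that wire format (all values illustrative):

```python
key, tokens, reward, step, client_name = (
    'light-nas', [3, 1, 4], 0.87, 12, 'client_0')

# Client side: serialize exactly as ControllerClient.update does.
payload = '{}\t{}\t{}\t{}\t{}'.format(
    key, ','.join(str(t) for t in tokens), reward, step, client_name)

# Server side: split and validate, as in ControllerServer.run.
fields = payload.strip('\n').split('\t')
assert len(fields) >= 5 and fields[0] == key  # otherwise treated as noise
parsed = [int(t) for t in fields[1].split(',')]
print(parsed, float(fields[2]), int(fields[3]), fields[4])
```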
diff --git a/paddleslim/common/controller_server.py b/paddleslim/common/controller_server.py
index 6b198a5a129f9d90aa75a6b854329be90a6497ed..5103dfed8f371938fb151d45d388b23fc9a9f01a 100644
--- a/paddleslim/common/controller_server.py
+++ b/paddleslim/common/controller_server.py
@@ -15,9 +15,10 @@
import os
import logging
import socket
+import time
from .log_helper import get_logger
from threading import Thread
-from .lock_utils import lock, unlock
+from .lock import lock, unlock
__all__ = ['ControllerServer']
@@ -25,8 +26,14 @@ _logger = get_logger(__name__, level=logging.INFO)
class ControllerServer(object):
- """
- The controller wrapper with a socket server to handle the request of search agent.
+ """The controller wrapper with a socket server to handle the request of search agent.
+ Args:
+ controller(slim.searcher.Controller): The controller used to generate tokens.
+        address(tuple): The address that current server binds to, in the format (ip, port).
+                        Default: ('', 0), which means setting the ip and port automatically.
+ max_client_num(int): The maximum number of clients connecting to current server simultaneously. Default: 100.
+ search_steps(int|None): The total steps of searching. None means never stopping. Default: None
+ key(str|None): Config information. Default: None.
"""
def __init__(self,
@@ -36,12 +43,6 @@ class ControllerServer(object):
search_steps=None,
key=None):
"""
- Args:
- controller(slim.searcher.Controller): The controller used to generate tokens.
- address(tuple): The address of current server binding with format (ip, port). Default: ('', 0).
- which means setting ip automatically
- max_client_num(int): The maximum number of clients connecting to current server simultaneously. Default: 100.
- search_steps(int): The total steps of searching. None means never stopping. Default: None
"""
self._controller = controller
self._address = address
@@ -51,6 +52,9 @@ class ControllerServer(object):
self._port = address[1]
self._ip = address[0]
self._key = key
+ self._client_num = 0
+ self._client = dict()
+ self._compare_time = 172800 ### 48 hours
def start(self):
self._socket_server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
@@ -58,9 +62,11 @@ class ControllerServer(object):
self._socket_server.listen(self._max_client_num)
self._port = self._socket_server.getsockname()[1]
self._ip = self._socket_server.getsockname()[0]
- _logger.info("ControllerServer - listen on: [{}:{}]".format(
+ _logger.info("ControllerServer Start!!!")
+ _logger.debug("ControllerServer - listen on: [{}:{}]".format(
self._ip, self._port))
thread = Thread(target=self.run)
+ thread.setDaemon(True)
thread.start()
return str(thread)
@@ -78,6 +84,8 @@ class ControllerServer(object):
return self._ip
def run(self):
+ """Start the server.
+ """
_logger.info("Controller Server run...")
try:
while ((self._search_steps is None) or
@@ -85,23 +93,59 @@ class ControllerServer(object):
(self._search_steps))) and not self._closed:
conn, addr = self._socket_server.accept()
message = conn.recv(1024).decode()
+ _logger.debug(message)
if message.strip("\n") == "next_tokens":
tokens = self._controller.next_tokens()
tokens = ",".join([str(token) for token in tokens])
conn.send(tokens.encode())
+ elif message.strip("\n") == "current_info":
+ current_info = dict()
+ current_info['best_tokens'] = self._controller.best_tokens
+ current_info['best_reward'] = self._controller.max_reward
+ current_info[
+ 'current_tokens'] = self._controller.current_tokens
+ conn.send(str(current_info).encode())
else:
_logger.debug("recv message from {}: [{}]".format(addr,
message))
messages = message.strip('\n').split("\t")
- if (len(messages) < 4) or (messages[0] != self._key):
+ if (len(messages) < 5) or (messages[0] != self._key):
_logger.debug("recv noise from {}: [{}]".format(
addr, message))
continue
tokens = messages[1]
reward = messages[2]
iter = messages[3]
+ client_name = messages[4]
+
+ one_step_time = -1
+ if client_name in self._client.keys():
+ current_time = time.time() - self._client[client_name]
+ if current_time > one_step_time:
+ one_step_time = current_time
+ self._compare_time = 2 * one_step_time
+
+ if client_name not in self._client.keys():
+ self._client[client_name] = time.time()
+ self._client_num += 1
+
+ self._client[client_name] = time.time()
+
+ for key_client in self._client.keys():
+                        ### If a client has not sent a request within twice the time of one search step, assume it has stopped.
+ if (time.time() - self._client[key_client]
+ ) > self._compare_time and len(self._client.keys(
+ )) > 1:
+ self._client.pop(key_client)
+ self._client_num -= 1
+ _logger.debug(
+ "client: {}, client_num: {}, compare_time: {}".format(
+ self._client, self._client_num,
+ self._compare_time))
tokens = [int(token) for token in tokens.split(",")]
- self._controller.update(tokens, float(reward), int(iter))
+ self._controller.update(tokens,
+ float(reward),
+ int(iter), int(self._client_num))
response = "ok"
conn.send(response.encode())
_logger.debug("send message to {}: [{}]".format(addr,
diff --git a/paddleslim/prune/lock.py b/paddleslim/common/lock.py
similarity index 100%
rename from paddleslim/prune/lock.py
rename to paddleslim/common/lock.py
diff --git a/paddleslim/common/lock_utils.py b/paddleslim/common/lock_utils.py
deleted file mode 100644
index 9daf4f3f6e842609a39fd286dfa49eb705c631a7..0000000000000000000000000000000000000000
--- a/paddleslim/common/lock_utils.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-__all__ = ['lock', 'unlock']
-
-if os.name == 'nt':
-
- def lock(file):
- raise NotImplementedError('Windows is not supported.')
-
- def unlock(file):
- raise NotImplementedError('Windows is not supported.')
-
-elif os.name == 'posix':
- from fcntl import flock, LOCK_EX, LOCK_UN
-
- def lock(file):
- """Lock the file in local file system."""
- flock(file.fileno(), LOCK_EX)
-
- def unlock(file):
- """Unlock the file in local file system."""
- flock(file.fileno(), LOCK_UN)
-else:
- raise RuntimeError("File Locker only support NT and Posix platforms!")
diff --git a/paddleslim/common/log_helper.py b/paddleslim/common/log_helper.py
index 18000ce4ec6c472914de49a053e960c02cfd8e32..e0b38e893d74fee1f5ac2b5410498295994253f8 100644
--- a/paddleslim/common/log_helper.py
+++ b/paddleslim/common/log_helper.py
@@ -24,15 +24,20 @@ def get_logger(name, level, fmt='%(asctime)s-%(levelname)s: %(message)s'):
Get logger from logging with given name, level and format without
setting logging basicConfig. For setting basicConfig in paddle
will disable basicConfig setting after import paddle.
+
Args:
name (str): The logger name.
level (logging.LEVEL): The base level of the logger
fmt (str): Format of logger output
+
Returns:
logging.Logger: logging logger with given setttings
+
Examples:
- .. code-block:: python
- logger = log_helper.get_logger(__name__, logging.INFO,
+
+ .. code-block:: python
+
+ logger = log_helper.get_logger(__name__, logging.INFO,
fmt='%(asctime)s-%(levelname)s: %(message)s')
"""
diff --git a/paddleslim/common/meter.py b/paddleslim/common/meter.py
new file mode 100644
index 0000000000000000000000000000000000000000..6770257971a75018d011de444715729f23a6b778
--- /dev/null
+++ b/paddleslim/common/meter.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['AvgrageMeter']
+
+
+class AvgrageMeter(object):
+ def __init__(self):
+ self.reset()
+
+ def reset(self):
+ self.avg = 0
+ self.sum = 0
+ self.cnt = 0
+
+ def update(self, val, n=1):
+ self.sum += val * n
+ self.cnt += n
+ self.avg = self.sum / self.cnt
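A short usage sketch of the meter, assuming it is importable from paddleslim.common.meter once this lands: track a running average of per-batch losses, weighting each update by the batch size.

```python
from paddleslim.common.meter import AvgrageMeter

meter = AvgrageMeter()
for loss, batch_size in [(0.9, 32), (0.7, 32), (0.5, 16)]:
    meter.update(loss, n=batch_size)
print(meter.avg)  # (0.9*32 + 0.7*32 + 0.5*16) / 80 ≈ 0.74
```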
diff --git a/paddleslim/common/rl_controller/__init__.py b/paddleslim/common/rl_controller/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b29c563781d1867c38e071f8996835438e9c7fad
--- /dev/null
+++ b/paddleslim/common/rl_controller/__init__.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from ..log_helper import get_logger
+_logger = get_logger(__name__, level=logging.INFO)
+try:
+ import parl
+ from .ddpg import *
+except ImportError:
+ pass
+
+from .lstm import *
+from .utils import *
diff --git a/paddleslim/common/rl_controller/base_env.py b/paddleslim/common/rl_controller/base_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd20dc3f821603eb894f97610ee1c25c2ab2e405
--- /dev/null
+++ b/paddleslim/common/rl_controller/base_env.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Base environment used in reinforcement learning"""
+
+import numpy as np
+
+__all__ = ['BaseEnv']
+
+
+class BaseEnv:
+ def reset(self):
+ raise NotImplementedError('Abstract method.')
+
+ def step(self):
+ raise NotImplementedError('Abstract method.')
+
+ def _build_state_embedding(self):
+ raise NotImplementedError('Abstract method.')
diff --git a/paddleslim/common/rl_controller/ddpg/__init__.py b/paddleslim/common/rl_controller/ddpg/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5486a1a01630c965337edd7267f6be101abe2fae
--- /dev/null
+++ b/paddleslim/common/rl_controller/ddpg/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .ddpg_controller import *
diff --git a/paddleslim/common/rl_controller/ddpg/ddpg_controller.py b/paddleslim/common/rl_controller/ddpg/ddpg_controller.py
new file mode 100644
index 0000000000000000000000000000000000000000..50216adbdaec7f152d64e3d2f16007d059510efb
--- /dev/null
+++ b/paddleslim/common/rl_controller/ddpg/ddpg_controller.py
@@ -0,0 +1,157 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import parl
+from parl import layers
+from paddle import fluid
+from ..utils import RLCONTROLLER, action_mapping
+from ...controller import RLBaseController
+from .ddpg_model import DefaultDDPGModel as default_ddpg_model
+from .noise import AdaptiveNoiseSpec as default_noise
+from parl.utils import ReplayMemory
+
+__all__ = ['DDPG']
+
+
+class DDPGAgent(parl.Agent):
+ def __init__(self, algorithm, obs_dim, act_dim):
+ assert isinstance(obs_dim, int)
+ assert isinstance(act_dim, int)
+ self.obs_dim = obs_dim
+ self.act_dim = act_dim
+ super(DDPGAgent, self).__init__(algorithm)
+
+ # Attention: In the beginning, sync target model totally.
+ self.alg.sync_target(decay=0)
+
+ def build_program(self):
+ self.pred_program = fluid.Program()
+ self.learn_program = fluid.Program()
+
+ with fluid.program_guard(self.pred_program):
+ obs = fluid.data(
+ name='obs', shape=[None, self.obs_dim], dtype='float32')
+ self.pred_act = self.alg.predict(obs)
+
+ with fluid.program_guard(self.learn_program):
+ obs = fluid.data(
+ name='obs', shape=[None, self.obs_dim], dtype='float32')
+ act = fluid.data(
+ name='act', shape=[None, self.act_dim], dtype='float32')
+ reward = fluid.data(name='reward', shape=[None], dtype='float32')
+ next_obs = fluid.data(
+ name='next_obs', shape=[None, self.obs_dim], dtype='float32')
+ terminal = fluid.data(
+ name='terminal', shape=[None, 1], dtype='bool')
+ _, self.critic_cost = self.alg.learn(obs, act, reward, next_obs,
+ terminal)
+
+ def predict(self, obs):
+ act = self.fluid_executor.run(self.pred_program,
+ feed={'obs': obs},
+ fetch_list=[self.pred_act])[0]
+ return act
+
+ def learn(self, obs, act, reward, next_obs, terminal):
+ feed = {
+ 'obs': obs,
+ 'act': act,
+ 'reward': reward,
+ 'next_obs': next_obs,
+ 'terminal': terminal
+ }
+ critic_cost = self.fluid_executor.run(self.learn_program,
+ feed=feed,
+ fetch_list=[self.critic_cost])[0]
+ self.alg.sync_target()
+ return critic_cost
+
+
+@RLCONTROLLER.register
+class DDPG(RLBaseController):
+ def __init__(self, range_tables, use_gpu=False, **kwargs):
+ self.use_gpu = use_gpu
+ self.range_tables = range_tables - np.asarray(1)
+ self.act_dim = len(self.range_tables)
+ self.obs_dim = kwargs.get('obs_dim')
+        self.model = kwargs.get('model', default_ddpg_model)
+        self.actor_lr = kwargs.get('actor_lr', 1e-4)
+        self.critic_lr = kwargs.get('critic_lr', 1e-3)
+        self.gamma = kwargs.get('gamma', 0.99)
+        self.tau = kwargs.get('tau', 0.001)
+        self.memory_size = kwargs.get('memory_size', 10)
+        self.reward_scale = kwargs.get('reward_scale', 0.1)
+        self.batch_size = kwargs.get('controller_batch_size', 1)
+        self.actions_noise = kwargs.get('actions_noise', default_noise)
+ self.action_dist = 0.0
+ self.place = fluid.CUDAPlace(0) if self.use_gpu else fluid.CPUPlace()
+
+ model = self.model(self.act_dim)
+
+ if self.actions_noise:
+ self.actions_noise = self.actions_noise()
+
+ algorithm = parl.algorithms.DDPG(
+ model,
+ gamma=self.gamma,
+ tau=self.tau,
+ actor_lr=self.actor_lr,
+ critic_lr=self.critic_lr)
+ self.agent = DDPGAgent(algorithm, self.obs_dim, self.act_dim)
+ self.rpm = ReplayMemory(self.memory_size, self.obs_dim, self.act_dim)
+
+ self.pred_program = self.agent.pred_program
+ self.learn_program = self.agent.learn_program
+ self.param_dict = self.get_params(self.learn_program)
+
+ def next_tokens(self, obs, params_dict, is_inference=False):
+ batch_obs = np.expand_dims(obs, axis=0)
+ self.set_params(self.pred_program, params_dict, self.place)
+ actions = self.agent.predict(batch_obs.astype('float32'))
+ ### add noise to action
+        if self.actions_noise and not is_inference:
+ actions_noise = np.clip(
+ np.random.normal(
+ actions, scale=self.actions_noise.stdev_curr),
+ -1.0,
+ 1.0)
+ self.action_dist = np.mean(np.abs(actions_noise - actions))
+ else:
+ actions_noise = actions
+ actions_noise = action_mapping(actions_noise, self.range_tables)
+ return actions_noise
+
+ def _update_noise(self, actions_dist):
+ self.actions_noise.update(actions_dist)
+
+ def update(self, rewards, params_dict, obs, actions, obs_next, terminal):
+ self.set_params(self.learn_program, params_dict, self.place)
+ self.rpm.append(obs, actions, self.reward_scale * rewards, obs_next,
+ terminal)
+ if self.actions_noise:
+ self._update_noise(self.action_dist)
+        if self.rpm.size() >= self.memory_size:
+            obs, actions, rewards, obs_next, terminal = self.rpm.sample_batch(
+                self.batch_size)
+ self.agent.learn(obs, actions, rewards, obs_next, terminal)
+ params_dict = self.get_params(self.learn_program)
+ return params_dict
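In next_tokens above, exploration is Gaussian noise around the deterministic action, clipped back into the tanh range, and the mean absolute perturbation is what later drives the adaptive noise spec. A numpy-only sketch of that step (the stdev and actions are illustrative):

```python
import numpy as np

actions = np.array([0.2, -0.9, 0.95])   # deterministic policy output
stdev_curr = 0.5                        # current exploration scale
noisy = np.clip(np.random.normal(actions, scale=stdev_curr), -1.0, 1.0)
action_dist = np.mean(np.abs(noisy - actions))  # fed to the noise spec
print(noisy, action_dist)
```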
diff --git a/paddleslim/common/rl_controller/ddpg/ddpg_model.py b/paddleslim/common/rl_controller/ddpg/ddpg_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..6607f8b4204171fe99e852a40baf7c76c6dacb61
--- /dev/null
+++ b/paddleslim/common/rl_controller/ddpg/ddpg_model.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import parl
+from parl import layers
+
+
+class DefaultDDPGModel(parl.Model):
+ def __init__(self, act_dim):
+ self.actor_model = ActorModel(act_dim)
+ self.critic_model = CriticModel()
+
+ def policy(self, obs):
+ return self.actor_model.policy(obs)
+
+ def value(self, obs, act):
+ return self.critic_model.value(obs, act)
+
+ def get_actor_params(self):
+ return self.actor_model.parameters()
+
+
+class ActorModel(parl.Model):
+ def __init__(self, act_dim):
+ hid1_size = 400
+ hid2_size = 300
+
+ self.fc1 = layers.fc(size=hid1_size, act='relu')
+ self.fc2 = layers.fc(size=hid2_size, act='relu')
+ self.fc3 = layers.fc(size=act_dim, act='tanh')
+
+ def policy(self, obs):
+ hid1 = self.fc1(obs)
+ hid2 = self.fc2(hid1)
+ means = self.fc3(hid2)
+        return means
+
+
+class CriticModel(parl.Model):
+ def __init__(self):
+ hid1_size = 400
+ hid2_size = 300
+
+ self.fc1 = layers.fc(size=hid1_size, act='relu')
+ self.fc2 = layers.fc(size=hid2_size, act='relu')
+ self.fc3 = layers.fc(size=1, act=None)
+
+ def value(self, obs, act):
+ hid1 = self.fc1(obs)
+ concat = layers.concat([hid1, act], axis=1)
+ hid2 = self.fc2(concat)
+ Q = self.fc3(hid2)
+ Q = layers.squeeze(Q, axes=[1])
+ return Q
diff --git a/paddleslim/common/rl_controller/ddpg/noise.py b/paddleslim/common/rl_controller/ddpg/noise.py
new file mode 100644
index 0000000000000000000000000000000000000000..4efbf96dfb82d96a91cc2ad0b54b54e11348440a
--- /dev/null
+++ b/paddleslim/common/rl_controller/ddpg/noise.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['AdaptiveNoiseSpec']
+
+
+class AdaptiveNoiseSpec(object):
+ def __init__(self):
+ self.stdev_curr = 1.0
+
+ def reset(self):
+ self.stdev_curr = 1.0
+
+ def update(self, action_dist):
+ if action_dist > 1e-2:
+ self.stdev_curr /= 1.03
+ else:
+ self.stdev_curr *= 1.03
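A compact numeric sketch of this schedule's dynamics: while exploration still moves actions (distance above 1e-2) the scale decays by a factor of 1.03 per update; once actions barely change it grows back, keeping the perturbation size hovering near the threshold.

```python
stdev = 1.0
for dist in [0.5, 0.5, 0.5, 1e-3, 1e-3, 1e-3]:
    stdev = stdev / 1.03 if dist > 1e-2 else stdev * 1.03
print(stdev)  # ≈ 1.0 after three symmetric shrinks and growths
```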
diff --git a/paddleslim/common/rl_controller/lstm/__init__.py b/paddleslim/common/rl_controller/lstm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..12e43f774cb13bc1685f779f0157414012820448
--- /dev/null
+++ b/paddleslim/common/rl_controller/lstm/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .lstm_controller import *
diff --git a/paddleslim/common/rl_controller/lstm/lstm_controller.py b/paddleslim/common/rl_controller/lstm/lstm_controller.py
new file mode 100644
index 0000000000000000000000000000000000000000..920b29eac64c4eec2d8dca28d60c40c694a09512
--- /dev/null
+++ b/paddleslim/common/rl_controller/lstm/lstm_controller.py
@@ -0,0 +1,293 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import logging
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid import ParamAttr
+from paddle.fluid.layers import RNNCell, LSTMCell, rnn
+from paddle.fluid.contrib.layers import basic_lstm
+from ...controller import RLBaseController
+from ...log_helper import get_logger
+from ..utils import RLCONTROLLER
+
+_logger = get_logger(__name__, level=logging.INFO)
+
+uniform_initializer = lambda x: fluid.initializer.UniformInitializer(low=-x, high=x)
+
+
+class lstm_cell(RNNCell):
+ def __init__(self, num_layers, hidden_size):
+ self.num_layers = num_layers
+ self.hidden_size = hidden_size
+ self.lstm_cells = []
+
+ param_attr = ParamAttr(initializer=uniform_initializer(
+ 1.0 / math.sqrt(hidden_size)))
+ bias_attr = ParamAttr(initializer=uniform_initializer(
+ 1.0 / math.sqrt(hidden_size)))
+ for i in range(num_layers):
+ self.lstm_cells.append(
+ LSTMCell(hidden_size, param_attr, bias_attr))
+
+ def call(self, inputs, states):
+ new_states = []
+ for i in range(self.num_layers):
+ out, new_state = self.lstm_cells[i](inputs, states[i])
+ new_states.append(new_state)
+ return out, new_states
+
+ @property
+ def state_shape(self):
+ return [cell.state_shape for cell in self.lstm_cells]
+
+
+@RLCONTROLLER.register
+class LSTM(RLBaseController):
+ def __init__(self, range_tables, use_gpu=False, **kwargs):
+ self.use_gpu = use_gpu
+ self.range_tables = range_tables
+ self.lstm_num_layers = kwargs.get('lstm_num_layers') or 1
+ self.hidden_size = kwargs.get('hidden_size') or 100
+ self.temperature = kwargs.get('temperature') or None
+ self.controller_lr = kwargs.get('controller_lr') or 1e-4
+ self.decay_steps = kwargs.get('controller_decay_steps') or None
+ self.decay_rate = kwargs.get('controller_decay_rate') or None
+ self.tanh_constant = kwargs.get('tanh_constant') or None
+ self.decay = kwargs.get('decay') or 0.99
+ self.weight_entropy = kwargs.get('weight_entropy') or None
+ self.controller_batch_size = kwargs.get('controller_batch_size') or 1
+
+ self.max_range_table = max(self.range_tables) + 1
+
+ self._create_parameter()
+ self._build_program()
+
+ self.place = fluid.CUDAPlace(0) if self.use_gpu else fluid.CPUPlace()
+ self.exe = fluid.Executor(self.place)
+ self.exe.run(fluid.default_startup_program())
+
+ self.param_dict = self.get_params(self.learn_program)
+
+ def _lstm(self, inputs, hidden, cell, token_idx):
+ cells = lstm_cell(self.lstm_num_layers, self.hidden_size)
+ output, new_states = cells.call(inputs, states=([[hidden, cell]]))
+ logits = fluid.layers.fc(new_states[0], self.range_tables[token_idx])
+
+ if self.temperature is not None:
+ logits = logits / self.temperature
+ if self.tanh_constant is not None:
+ logits = self.tanh_constant * fluid.layers.tanh(logits)
+
+ return logits, output, new_states
+
+ def _create_parameter(self):
+ self.g_emb = fluid.layers.create_parameter(
+ name='emb_g',
+ shape=(self.controller_batch_size, self.hidden_size),
+ dtype='float32',
+ default_initializer=uniform_initializer(1.0))
+ self.baseline = fluid.layers.create_global_var(
+ shape=[1],
+ value=0.0,
+ dtype='float32',
+ persistable=True,
+ name='baseline')
+ self.baseline.stop_gradient = True
+
+ def _network(self, hidden, cell, init_actions=None, is_inference=False):
+ actions = []
+ entropies = []
+ sample_log_probs = []
+
+ with fluid.unique_name.guard('Controller'):
+ self._create_parameter()
+ inputs = self.g_emb
+
+ for idx in range(len(self.range_tables)):
+ logits, output, states = self._lstm(
+ inputs, hidden, cell, token_idx=idx)
+ hidden, cell = np.squeeze(states)
+ probs = fluid.layers.softmax(logits, axis=1)
+ if is_inference:
+ action = fluid.layers.argmax(probs, axis=1)
+ else:
+ if init_actions:
+ action = fluid.layers.slice(
+ init_actions,
+ axes=[1],
+ starts=[idx],
+ ends=[idx + 1])
+ action = fluid.layers.squeeze(action, axes=[1])
+ action.stop_gradient = True
+ else:
+ action = fluid.layers.sampling_id(probs)
+ actions.append(action)
+ log_prob = fluid.layers.softmax_with_cross_entropy(
+ logits,
+ fluid.layers.reshape(
+ action, shape=[fluid.layers.shape(action), 1]),
+ axis=1)
+ sample_log_probs.append(log_prob)
+
+ entropy = log_prob * fluid.layers.exp(-1 * log_prob)
+ entropy.stop_gradient = True
+ entropies.append(entropy)
+
+ action_emb = fluid.layers.cast(action, dtype=np.int64)
+ inputs = fluid.embedding(
+ action_emb,
+ size=(self.max_range_table, self.hidden_size),
+ param_attr=fluid.ParamAttr(
+ name='emb_w', initializer=uniform_initializer(1.0)))
+
+ self.sample_log_probs = fluid.layers.concat(
+ sample_log_probs, axis=0)
+
+ entropies = fluid.layers.stack(entropies)
+ self.sample_entropies = fluid.layers.reduce_sum(entropies)
+
+ return actions
+
+ def _build_program(self, is_inference=False):
+ self.pred_program = fluid.Program()
+ self.learn_program = fluid.Program()
+ with fluid.program_guard(self.pred_program):
+ self.g_emb = fluid.layers.create_parameter(
+ name='emb_g',
+ shape=(self.controller_batch_size, self.hidden_size),
+ dtype='float32',
+ default_initializer=uniform_initializer(1.0))
+
+ fluid.layers.assign(
+ fluid.layers.uniform_random(shape=self.g_emb.shape),
+ self.g_emb)
+ hidden = fluid.data(name='hidden', shape=[None, self.hidden_size])
+ cell = fluid.data(name='cell', shape=[None, self.hidden_size])
+ self.tokens = self._network(
+ hidden, cell, is_inference=is_inference)
+
+ with fluid.program_guard(self.learn_program):
+ hidden = fluid.data(name='hidden', shape=[None, self.hidden_size])
+ cell = fluid.data(name='cell', shape=[None, self.hidden_size])
+ init_actions = fluid.data(
+ name='init_actions',
+ shape=[None, len(self.range_tables)],
+ dtype='int64')
+ self._network(hidden, cell, init_actions=init_actions)
+
+ rewards = fluid.data(name='rewards', shape=[None])
+ self.rewards = fluid.layers.reduce_mean(rewards)
+
+ if self.weight_entropy is not None:
+ self.rewards += self.weight_entropy * self.sample_entropies
+
+ self.sample_log_probs = fluid.layers.reduce_sum(
+ self.sample_log_probs)
+
+ fluid.layers.assign(self.baseline - (1.0 - self.decay) *
+ (self.baseline - self.rewards), self.baseline)
+ self.loss = self.sample_log_probs * (self.rewards - self.baseline)
+ clip = fluid.clip.GradientClipByNorm(clip_norm=5.0)
+ if self.decay_steps is not None:
+ lr = fluid.layers.exponential_decay(
+ self.controller_lr,
+ decay_steps=self.decay_steps,
+ decay_rate=self.decay_rate)
+ else:
+ lr = self.controller_lr
+ optimizer = fluid.optimizer.Adam(learning_rate=lr, grad_clip=clip)
+ optimizer.minimize(self.loss)
+
+ def _create_input(self, is_test=True, actual_rewards=None):
+ feed_dict = dict()
+ np_init_hidden = np.zeros(
+ (self.controller_batch_size, self.hidden_size)).astype('float32')
+ np_init_cell = np.zeros(
+ (self.controller_batch_size, self.hidden_size)).astype('float32')
+
+ feed_dict["hidden"] = np_init_hidden
+ feed_dict["cell"] = np_init_cell
+
+        if not is_test:
+            assert actual_rewards is not None, "if you want to update the controller, you must input a reward"
+            if isinstance(actual_rewards, np.float32):
+                actual_rewards = np.expand_dims(actual_rewards, axis=0)
+            elif isinstance(actual_rewards, (float, np.float64)):
+                actual_rewards = np.float32(actual_rewards)
+                actual_rewards = np.expand_dims(actual_rewards, axis=0)
+            else:
+                actual_rewards = actual_rewards.astype(np.float32)
+
+ feed_dict['rewards'] = actual_rewards
+ feed_dict['init_actions'] = np.array(self.init_tokens).astype(
+ 'int64')
+
+ return feed_dict
+
+ def next_tokens(self, num_archs=1, params_dict=None, is_inference=False):
+ """ sample next tokens according current parameter and inputs"""
+ self.num_archs = num_archs
+
+ self.set_params(self.pred_program, params_dict, self.place)
+
+ batch_tokens = []
+ feed_dict = self._create_input()
+
+ for _ in range(
+ int(np.ceil(float(num_archs) / self.controller_batch_size))):
+ if is_inference:
+ self._build_program(is_inference=True)
+
+ actions = self.exe.run(self.pred_program,
+ feed=feed_dict,
+ fetch_list=self.tokens)
+
+ for idx in range(self.controller_batch_size):
+ each_token = {}
+ for i, action in enumerate(actions):
+ token = action[idx]
+ if idx in each_token:
+ each_token[idx].append(int(token))
+ else:
+ each_token[idx] = [int(token)]
+ batch_tokens.append(each_token[idx])
+
+ self.init_tokens = batch_tokens
+ mod_token = (self.controller_batch_size -
+ (num_archs % self.controller_batch_size)
+ ) % self.controller_batch_size
+ if mod_token != 0:
+ return batch_tokens[:-mod_token]
+ else:
+ return batch_tokens
+
+ def update(self, rewards, params_dict=None):
+ """train controller according reward"""
+ self.set_params(self.learn_program, params_dict, self.place)
+
+ feed_dict = self._create_input(is_test=False, actual_rewards=rewards)
+
+ loss = self.exe.run(self.learn_program,
+ feed=feed_dict,
+ fetch_list=[self.loss])
+ _logger.info("Controller: current reward is {}, loss is {}".format(
+ rewards, loss))
+ params_dict = self.get_params(self.learn_program)
+ return params_dict
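next_tokens samples in controller_batch_size chunks and then trims the overshoot so exactly num_archs token lists are returned. The trimming arithmetic, worked through standalone with illustrative sizes:

```python
import math

num_archs, controller_batch_size = 5, 2
n_batches = int(math.ceil(float(num_archs) / controller_batch_size))
batch_tokens = [[i] for i in range(n_batches * controller_batch_size)]  # 6 sampled
mod_token = (controller_batch_size -
             (num_archs % controller_batch_size)) % controller_batch_size
result = batch_tokens[:-mod_token] if mod_token != 0 else batch_tokens
print(len(result))  # 5: the one oversampled token list was trimmed
```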
diff --git a/paddleslim/common/rl_controller/utils.py b/paddleslim/common/rl_controller/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8363460ca977b110f82ed78003d03d33beef2068
--- /dev/null
+++ b/paddleslim/common/rl_controller/utils.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from ...core import Registry
+
+__all__ = [
+ "RLCONTROLLER", "action_mapping", "add_grad", "compute_grad",
+ "ConnectMessage"
+]
+
+RLCONTROLLER = Registry('RLController')
+
+
+class ConnectMessage:
+ INIT = 'INIT'
+ INIT_DONE = 'INIT_DONE'
+ GET_WEIGHT = 'GET_WEIGHT'
+ UPDATE_WEIGHT = 'UPDATE_WEIGHT'
+ OK = 'OK'
+ WAIT = 'WAIT'
+ WAIT_PARAMS = 'WAIT_PARAMS'
+ EXIT = 'EXIT'
+ TIMEOUT = 10
+
+
+def action_mapping(actions, range_table):
+ actions = (actions - (-1.0)) * (range_table / np.asarray(2.0))
+ return actions.astype('int64')
+
+
+def add_grad(dict1, dict2):
+ dict3 = dict()
+ for key, value in dict1.items():
+ dict3[key] = dict1[key] + dict2[key]
+ return dict3
+
+
+def compute_grad(dict1, dict2):
+ dict3 = dict()
+ for key, value in dict1.items():
+ dict3[key] = dict1[key] - dict2[key]
+ return dict3
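action_mapping rescales a continuous action in [-1, 1] into an integer token index: shift to [0, 2], scale by range_table / 2, truncate. Because DDPG passes range_tables - 1, an action of exactly 1.0 lands on the largest valid index. A worked example (table sizes illustrative):

```python
import numpy as np

def action_mapping(actions, range_table):
    actions = (actions - (-1.0)) * (range_table / np.asarray(2.0))
    return actions.astype('int64')

range_table = np.array([7, 3])  # table sizes 8 and 4, already minus 1
print(action_mapping(np.array([-1.0, 0.0]), range_table))  # [0 1]
print(action_mapping(np.array([1.0, 1.0]), range_table))   # [7 3]
```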
diff --git a/paddleslim/common/sa_controller.py b/paddleslim/common/sa_controller.py
index 8a081761178c0e9ad9a9477eebda7018561abcf1..3f6c1c98f1a469a0345976ea42b4daf6d5b63e89 100644
--- a/paddleslim/common/sa_controller.py
+++ b/paddleslim/common/sa_controller.py
@@ -29,12 +29,27 @@ _logger = get_logger(__name__, level=logging.INFO)
class SAController(EvolutionaryController):
- """Simulated annealing controller."""
+ """Simulated annealing controller.
+
+ Args:
+ range_table(list): Range table.
+ reduce_rate(float): The decay rate of temperature.
+ init_temperature(float): Init temperature.
+        max_try_times(int): The maximum number of attempts to generate legal tokens. Default: 300.
+        init_tokens(list): The initial tokens. Default: None.
+        reward(float): The reward of current tokens. Default: -1.
+        max_reward(float): The max reward during the search of sanas; in general, the best tokens get the max reward. Default: -1.
+        iters(int): The iteration of sa controller. Default: 0.
+        best_tokens(list): The best tokens during the search of sanas; in general, the best tokens get the max reward. Default: None.
+        constrain_func(function): The callback function used to check whether the tokens meet constraint. None means there is no constraint. Default: None.
+        checkpoints(str): If None, do not save checkpoints; otherwise, save the current scene to the checkpoints file. Default: None.
+        searched(dict): A dict that remembers the tokens which have been searched. Default: None.
+ """
def __init__(self,
range_table=None,
reduce_rate=0.85,
- init_temperature=1024,
+ init_temperature=None,
max_try_times=300,
init_tokens=None,
reward=-1,
@@ -44,21 +59,6 @@ class SAController(EvolutionaryController):
constrain_func=None,
checkpoints=None,
searched=None):
- """Initialize.
- Args:
- range_table(list): Range table.
- reduce_rate(float): The decay rate of temperature.
- init_temperature(float): Init temperature.
- max_try_times(int): max try times before get legal tokens. Default: 300.
- init_tokens(list): The initial tokens. Default: None.
- reward(float): The reward of current tokens. Default: -1.
- max_reward(float): The max reward in the search of sanas, in general, best tokens get max reward. Default: -1.
- iters(int): The iteration of sa controller. Default: 0.
- best_tokens(list): The best tokens in the search of sanas, in general, best tokens get max reward. Default: None.
- constrain_func(function): The callback function used to check whether the tokens meet constraint. None means there is no constraint. Default: None.
- checkpoints(str): if checkpoint is None, donnot save checkpoints, else save scene to checkpoints file.
- searched(dict): remember tokens which are searched.
- """
super(SAController, self).__init__()
self._range_table = range_table
assert isinstance(self._range_table, tuple) and (
@@ -68,12 +68,20 @@ class SAController(EvolutionaryController):
self._max_try_times = max_try_times
self._reward = reward
self._tokens = init_tokens
+
+        if init_temperature is None:
+            if init_tokens is None:
+                self._init_temperature = 10.0
+            else:
+                self._init_temperature = 1.0
+
self._constrain_func = constrain_func
self._max_reward = max_reward
self._best_tokens = best_tokens
self._iter = iters
self._checkpoints = checkpoints
+        self._searched = searched if searched is not None else dict()
+ self._current_tokens = init_tokens
def __getstate__(self):
d = {}
@@ -84,6 +92,11 @@ class SAController(EvolutionaryController):
@property
def best_tokens(self):
+ """Get current best tokens.
+
+ Returns:
+ list: The best tokens.
+ """
return self._best_tokens
@property
@@ -92,20 +105,30 @@ class SAController(EvolutionaryController):
@property
def current_tokens(self):
- return self._tokens
+ """Get tokens generated in current searching step.
- def update(self, tokens, reward, iter):
+ Returns:
+            list: The tokens generated in current searching step.
+ """
+
+ return self._current_tokens
+
+ def update(self, tokens, reward, iter, client_num=1):
"""
Update the controller according to latest tokens and reward.
+
Args:
- tokens(list): The tokens generated in last step.
+ tokens(list): The tokens generated in current step.
reward(float): The reward of tokens.
+            iter(int): The current searching step of the client.
+            client_num(int): The total number of searching clients.
"""
iter = int(iter)
if iter > self._iter:
self._iter = iter
self._searched[str(tokens)] = reward
- temperature = self._init_temperature * self._reduce_rate**self._iter
+ temperature = self._init_temperature * self._reduce_rate**(client_num *
+ self._iter)
if (reward > self._reward) or (np.random.random() <= math.exp(
(reward - self._reward) / temperature)):
self._reward = reward
@@ -117,6 +140,9 @@ class SAController(EvolutionaryController):
"Controller - iter: {}; best_reward: {}, best tokens: {}, current_reward: {}; current tokens: {}".
format(self._iter, self._max_reward, self._best_tokens, reward,
tokens))
+ _logger.debug(
+ 'Controller - iter: {}, controller current tokens: {}, controller current reward: {}'.
+ format(self._iter, self._tokens, self._reward))
if self._checkpoints != None:
self._save_checkpoint(self._checkpoints)
@@ -124,6 +150,12 @@ class SAController(EvolutionaryController):
def next_tokens(self, control_token=None):
"""
Get next tokens.
+
+ Args:
+ control_token: The tokens used to generate next tokens.
+
+ Returns:
+ list: The next tokens.
"""
if control_token:
tokens = control_token[:]
@@ -137,7 +169,7 @@ class SAController(EvolutionaryController):
_logger.debug("change index[{}] from {} to {}".format(
index, tokens[index], new_tokens[index]))
- if self._searched.has_key(str(new_tokens)):
+ if str(new_tokens) in self._searched.keys():
_logger.debug('get next tokens including searched tokens: {}'.
format(new_tokens))
continue
@@ -151,8 +183,7 @@ class SAController(EvolutionaryController):
)
sys.exit()
- if self._constrain_func is None or self._max_try_times is None:
- return new_tokens
+ self._current_tokens = new_tokens
return new_tokens
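The acceptance rule in update() is classic simulated annealing: improvements are always kept, while a regression is kept with probability exp((reward - last_reward) / T). The temperature now decays with client_num * iter, so several clients share one cooling schedule instead of each cooling independently. A numeric sketch with illustrative values:

```python
import math

reduce_rate, init_temperature, client_num, it = 0.85, 1.0, 2, 10
temperature = init_temperature * reduce_rate ** (client_num * it)
last_reward, new_reward = 0.80, 0.78  # a slight regression
p_accept = math.exp((new_reward - last_reward) / temperature)
print(temperature, p_accept)  # ≈ 0.0388 and ≈ 0.597
```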
diff --git a/paddleslim/common/server.py b/paddleslim/common/server.py
new file mode 100644
index 0000000000000000000000000000000000000000..abd34e29c7e398f397e215939b771d007bdc53d5
--- /dev/null
+++ b/paddleslim/common/server.py
@@ -0,0 +1,216 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import zmq
+import socket
+import signal
+import six
+import os
+if six.PY2:
+ import cPickle as pickle
+else:
+ import pickle
+import logging
+import time
+import threading
+from .log_helper import get_logger
+from .rl_controller.utils import add_grad, ConnectMessage
+
+_logger = get_logger(__name__, level=logging.INFO)
+
+
+class Server(object):
+ def __init__(self,
+ controller,
+ address,
+ is_sync=False,
+ load_controller=None,
+ save_controller=None):
+ self._controller = controller
+ self._address = address
+ self._ip = self._address[0]
+ self._port = self._address[1]
+ self._is_sync = is_sync
+ self._done = False
+ self._load_controller = load_controller
+ self._save_controller = save_controller
+ ### key-value : client_name-update_times
+ self._client_dict = dict()
+ self._client = list()
+ self._lock = threading.Lock()
+ self._server_alive = True
+ self._max_update_times = 0
+
+ def close(self):
+ self._server_alive = False
+ _logger.info("server closed")
+ pid = os.getpid()
+ os.kill(pid, signal.SIGTERM)
+
+ def start(self):
+ self._ctx = zmq.Context()
+ ### main socket
+ self._server_socket = self._ctx.socket(zmq.REP)
+ server_address = "{}:{}".format(self._ip, self._port)
+ self._server_socket.bind("tcp://{}".format(server_address))
+ self._server_socket.linger = 0
+ _logger.info("ControllerServer Start!!!")
+ _logger.debug("ControllerServer - listen on: [{}]".format(
+ server_address))
+ thread = threading.Thread(target=self.run, args=())
+ thread.setDaemon(True)
+ thread.start()
+
+ if self._load_controller:
+ assert os.path.exists(
+ self._load_controller
+ ), "controller checkpoint is not exist, please check your directory: {}".format(
+ self._load_controller)
+
+ with open(
+ os.path.join(self._load_controller, 'rlnas.params'),
+ 'rb') as f:
+ self._params_dict = pickle.load(f)
+ _logger.info("Load params done")
+
+ else:
+ self._params_dict = self._controller.param_dict
+
+ if self._is_sync:
+ self._wait_socket = self._ctx.socket(zmq.REP)
+ self._wait_port = self._wait_socket.bind_to_random_port(
+ addr="tcp://*")
+            self._wait_socket.linger = 0
+ wait_thread = threading.Thread(
+ target=self._wait_for_params, args=())
+ wait_thread.setDaemon(True)
+ wait_thread.start()
+
+ def _wait_for_params(self):
+ try:
+ while self._server_alive:
+ message = self._wait_socket.recv_multipart()
+ cmd = pickle.loads(message[0])
+ client_name = pickle.loads(message[1])
+ if cmd == ConnectMessage.WAIT_PARAMS:
+ _logger.debug("Server: wait for params")
+ self._lock.acquire()
+ self._wait_socket.send_multipart([
+ pickle.dumps(ConnectMessage.OK)
+ if self._done else pickle.dumps(ConnectMessage.WAIT)
+ ])
+ if self._done and client_name in self._client:
+ self._client.remove(client_name)
+ if len(self._client) == 0:
+                        if self._save_controller is not False:
+ self.save_params()
+ self._done = False
+ self._lock.release()
+ else:
+ _logger.error("Error message {}".format(message))
+ raise NotImplementedError
+ except Exception as err:
+            _logger.error(err)
+
+ def run(self):
+        try:
+            ### Gradients accumulated in sync mode must persist across
+            ### requests, so the sum is initialized once outside the loop.
+            sum_params_dict = dict()
+            while self._server_alive:
+                try:
+                    message = self._server_socket.recv_multipart()
+ cmd = pickle.loads(message[0])
+ client_name = pickle.loads(message[1])
+ if cmd == ConnectMessage.INIT:
+ self._server_socket.send_multipart(
+ [pickle.dumps(ConnectMessage.INIT_DONE)])
+ _logger.debug("Server: init client {}".format(
+ client_name))
+ self._client_dict[client_name] = 0
+ elif cmd == ConnectMessage.GET_WEIGHT:
+ self._lock.acquire()
+ _logger.debug("Server: get weight {}".format(
+ client_name))
+ self._server_socket.send_multipart(
+ [pickle.dumps(self._params_dict)])
+ _logger.debug("Server: send params done {}".format(
+ client_name))
+ self._lock.release()
+ elif cmd == ConnectMessage.UPDATE_WEIGHT:
+ _logger.info("Server: update {}".format(client_name))
+ params_dict_grad = pickle.loads(message[2])
+ if self._is_sync:
+ if not sum_params_dict:
+ sum_params_dict = self._params_dict
+ self._lock.acquire()
+ sum_params_dict = add_grad(sum_params_dict,
+ params_dict_grad)
+ self._client.append(client_name)
+ self._lock.release()
+
+                            if len(self._client) == len(self._client_dict):
+ self._done = True
+ self._params_dict = sum_params_dict
+                                sum_params_dict = dict()
+
+ self._server_socket.send_multipart([
+ pickle.dumps(ConnectMessage.WAIT),
+ pickle.dumps(self._wait_port)
+ ])
+ else:
+ self._lock.acquire()
+ self._params_dict = add_grad(self._params_dict,
+ params_dict_grad)
+ self._client_dict[client_name] += 1
+ if self._client_dict[
+ client_name] > self._max_update_times:
+ self._max_update_times = self._client_dict[
+ client_name]
+ self._lock.release()
+                            if self._save_controller is not False:
+ self.save_params()
+ self._server_socket.send_multipart(
+ [pickle.dumps(ConnectMessage.OK)])
+
+ elif cmd == ConnectMessage.EXIT:
+ self._client_dict.pop(client_name)
+ if client_name in self._client:
+ self._client.remove(client_name)
+ self._server_socket.send_multipart(
+ [pickle.dumps(ConnectMessage.EXIT)])
+ except zmq.error.Again as e:
+ _logger.error(e)
+ self.close()
+
+ except Exception as err:
+ _logger.error(err)
+ finally:
+ self._server_socket.close(0)
+ if self._is_sync:
+ self._wait_socket.close(0)
+ self.close()
+
+ def save_params(self):
+ if self._save_controller:
+ if not os.path.exists(self._save_controller):
+ os.makedirs(self._save_controller)
+ output_dir = self._save_controller
+ else:
+ if not os.path.exists('./.rlnas_controller'):
+ os.makedirs('./.rlnas_controller')
+ output_dir = './.rlnas_controller'
+
+ with open(os.path.join(output_dir, 'rlnas.params'), 'wb') as f:
+ pickle.dump(self._params_dict, f)
+ _logger.debug("Save params done")
diff --git a/paddleslim/core/graph_wrapper.py b/paddleslim/core/graph_wrapper.py
index dc01846a10feb8bf212f9e35b9cd585df47ba739..d35e6685636e69b3ddb48e646974f18119d94ea1 100644
--- a/paddleslim/core/graph_wrapper.py
+++ b/paddleslim/core/graph_wrapper.py
@@ -46,7 +46,7 @@ class VarWrapper(object):
"""
Overwrite this function for ...in... syntax in python.
"""
- return self._var.name == v._var.name
+ return (v is not None) and self._var.name == v._var.name
def name(self):
"""
@@ -72,6 +72,7 @@ class VarWrapper(object):
def inputs(self):
"""
Get all the operators that use this variable as output.
+
Returns:
list: A list of operators.
"""
@@ -84,6 +85,7 @@ class VarWrapper(object):
def outputs(self):
"""
Get all the operators that use this variable as input.
+
Returns:
list: A list of operators.
"""
@@ -93,6 +95,9 @@ class VarWrapper(object):
ops.append(op)
return ops
+ def is_parameter(self):
+ return isinstance(self._var, Parameter)
+
class OpWrapper(object):
def __init__(self, op, graph):
@@ -161,9 +166,7 @@ class OpWrapper(object):
"""
Get all the varibales by the output name.
"""
- return [
- self._graph.var(var_name) for var_name in self._op.output(name)
- ]
+ return [self._graph.var(var_name) for var_name in self._op.output(name)]
def set_attr(self, key, value):
"""
@@ -193,18 +196,19 @@ class GraphWrapper(object):
"""
It is a wrapper of paddle.fluid.framework.IrGraph with some special functions
for paddle slim framework.
+
+ Args:
+ program(framework.Program): The program to be wrapped.
+ in_nodes(dict): A dict to indicate the input nodes of the graph.
+ The key is user-defined and human-readable name.
+ The value is the name of Variable.
+ out_nodes(dict): A dict to indicate the output nodes of the graph.
+ The key is user-defined and human-readable name.
+ The value is the name of Variable.
"""
def __init__(self, program=None, in_nodes=[], out_nodes=[]):
"""
- Args:
- program(framework.Program): A program with
- in_nodes(dict): A dict to indicate the input nodes of the graph.
- The key is user-defined and human-readable name.
- The value is the name of Variable.
- out_nodes(dict): A dict to indicate the input nodes of the graph.
- The key is user-defined and human-readable name.
- The value is the name of Variable.
"""
super(GraphWrapper, self).__init__()
self.program = Program() if program is None else program
@@ -223,6 +227,7 @@ class GraphWrapper(object):
def all_parameters(self):
"""
Get all the parameters in this graph.
+
Returns:
list: A list of VarWrapper instances.
"""
@@ -235,6 +240,7 @@ class GraphWrapper(object):
def is_parameter(self, var):
"""
Whether the given variable is parameter.
+
Args:
var(VarWrapper): The given varibale.
"""
@@ -243,6 +249,7 @@ class GraphWrapper(object):
def is_persistable(self, var):
"""
Whether the given variable is persistable.
+
Args:
var(VarWrapper): The given varibale.
"""
@@ -268,11 +275,15 @@ class GraphWrapper(object):
"""
Get the variable by variable name.
"""
- return VarWrapper(self.program.global_block().var(name), self)
+ for block in self.program.blocks:
+ if block.has_var(name):
+ return VarWrapper(block.var(name), self)
+ return None
def clone(self, for_test=False):
"""
Clone a new graph from current graph.
+
Returns:
(GraphWrapper): The wrapper of a new graph.
"""
@@ -289,8 +300,10 @@ class GraphWrapper(object):
def pre_ops(self, op):
"""
Get all the previous operators of target operator.
+
Args:
- op(OpWrapper): Target operator..
+ op(OpWrapper): Target operator.
+
Returns:
list: A list of operators.
"""
@@ -304,8 +317,10 @@ class GraphWrapper(object):
def next_ops(self, op):
"""
Get all the next operators of target operator.
+
Args:
- op(OpWrapper): Target operator..
+ op(OpWrapper): Target operator.
+
Returns:
list: A list of operators.
"""
@@ -337,27 +352,46 @@ class GraphWrapper(object):
ret += np.product(param.shape())
return ret
- def update_param_shape(self, scope):
- """
- Update the shape of parameters in the graph according to tensors in scope.
- It is used after loading pruned parameters from file.
- """
- for param in self.all_parameters():
- tensor_shape = np.array(
- scope.find_var(param.name()).get_tensor()).shape
- param.set_shape(tensor_shape)
-
def infer_shape(self):
"""
Update the groups of convolution layer according to current filters.
It is used after loading pruned parameters from file.
"""
+ head_op = []
+ visited = []
for op in self.ops():
if op.type() != 'conditional_block':
+ if len(self.pre_ops(op)) == 0:
+ head_op.append(op)
+ candidate_op = self.ops()
+
+ def recursive_infer(op, infer=False):
+ if op in candidate_op:
+ if op.type() != 'conditional_block':
+ if infer:
+ op._op.desc.infer_shape(op._op.block.desc)
+ else:
+ visited.append(op)
+ candidate_op.remove(op)
+ for next_op in self.next_ops(op):
+ recursive_infer(next_op, infer)
+
+ # Find the ops that are not in the DAG; some ops, such as optimizer
+ # ops, should be inferred before the normal computation ops.
+ for op in head_op:
+ recursive_infer(op, infer=False)
+
+ # Infer the ops that are not in the DAG first.
+ candidate_op = self.ops()
+ for op in candidate_op:
+ if op not in visited and op.type() != 'conditional_block':
op._op.desc.infer_shape(op._op.block.desc)
+ # Infer the remaining ops in topological order.
+ for op in head_op:
+ recursive_infer(op, infer=True)
def update_groups_of_conv(self):
for op in self.ops():
- if op.type() == 'depthwise_conv2d' or op.type(
- ) == 'depthwise_conv2d_grad':
+ if 'conv2d' in op.type() and op.attr('groups') >= op.inputs(
+ 'Filter')[0].shape()[0]:
op.set_attr('groups', op.inputs('Filter')[0].shape()[0])
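+ # Example: if a depthwise conv2d was created with groups == 64 and its
+ # filters have since been pruned to 48 output channels, the condition
+ # above (groups >= pruned filter count) triggers and 'groups' is reset
+ # to 48, keeping the op a valid depthwise convolution.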
diff --git a/paddleslim/core/registry.py b/paddleslim/core/registry.py
index b746e5089f4c37a9d059d2c05fc7be44fc92a957..8d222cf0199b9d98646b261ad6b79bf3819e525c 100644
--- a/paddleslim/core/registry.py
+++ b/paddleslim/core/registry.py
@@ -25,9 +25,6 @@ class Registry(object):
return self._module_dict.get(key, None)
def _register_module(self, module_class):
- if not inspect.isclass(module_class):
- raise TypeError('module must be a class, but receive {}.'.format(
- type(module_class)))
module_name = module_class.__name__
if module_name in self._module_dict:
raise KeyError('{} is already registered in {}.'.format(
diff --git a/paddleslim/dist/__init__.py b/paddleslim/dist/__init__.py
old mode 100644
new mode 100755
index 9d0531501ca43921438ee5b2fb58ac0ad2396d1b..04b8ef113d70f54b664ed518e71cf0667d599831
--- a/paddleslim/dist/__init__.py
+++ b/paddleslim/dist/__init__.py
@@ -11,3 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+
+from .single_distiller import merge, fsp_loss, l2_loss, soft_label_loss, loss
+from .dml import DML
diff --git a/paddleslim/dist/dml.py b/paddleslim/dist/dml.py
new file mode 100755
index 0000000000000000000000000000000000000000..0eba61498fef52bb2072d20e360b4e75e6988b10
--- /dev/null
+++ b/paddleslim/dist/dml.py
@@ -0,0 +1,136 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import paddle
+import paddle.fluid as fluid
+
+PADDLE_VERSION = 1.8
+try:
+ from paddle.fluid.layers import log_softmax
+except ImportError:
+ from paddle.nn import LogSoftmax
+ PADDLE_VERSION = 2.0
+
+
+class DML(fluid.dygraph.Layer):
+ def __init__(self, model, use_parallel=False):
+ super(DML, self).__init__()
+ self.model = model
+ self.use_parallel = use_parallel
+ self.model_num = len(self.model)
+ if self.use_parallel:
+ strategy = fluid.dygraph.parallel.prepare_context()
+ self.model = [
+ fluid.dygraph.parallel.DataParallel(m, strategy)
+ for m in self.model
+ ]
+
+ def full_name(self):
+ return [m.full_name() for m in self.model]
+
+ def forward(self, input):
+ return [m(input) for m in self.model]
+
+ def opt(self, optimizer):
+ assert len(
+ optimizer
+ ) == self.model_num, "The number of optimizers must match the number of models"
+ optimizer = DMLOptimizers(self.model, optimizer, self.use_parallel)
+ return optimizer
+
+ def ce_loss(self, logits, labels):
+ assert len(
+ logits
+ ) == self.model_num, "The number of logits must match the number of models"
+ ce_losses = []
+ for i in range(self.model_num):
+ ce_losses.append(
+ fluid.layers.mean(
+ fluid.layers.softmax_with_cross_entropy(logits[i], labels)))
+ return ce_losses
+
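+ # A sketch of the combined objective assembled by loss() below: for
+ # model i,
+ # loss_i = CE(logits_i, labels)
+ # + mean over j != i of KL(softmax(logits_j) || softmax(logits_i)),
+ # i.e. every model additionally distills from all of its peers
+ # (deep mutual learning).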
+ def kl_loss(self, logits):
+ assert len(
+ logits
+ ) == self.model_num, "The number of logits must match the number of models"
+ if self.model_num == 1:
+ return []
+ kl_losses = []
+ for i in range(self.model_num):
+ cur_kl_loss = 0
+ for j in range(self.model_num):
+ if i != j:
+ if PADDLE_VERSION == 2.0:
+ log_softmax = LogSoftmax(axis=1)
+ x = log_softmax(logits[i])
+ else:
+ x = fluid.layers.log_softmax(logits[i], axis=1)
+ y = fluid.layers.softmax(logits[j], axis=1)
+ cur_kl_loss += fluid.layers.kldiv_loss(
+ x, y, reduction='batchmean')
+ kl_losses.append(cur_kl_loss / (self.model_num - 1))
+ return kl_losses
+
+ def loss(self, logits, labels):
+ gt_losses = self.ce_loss(logits, labels)
+ kl_losses = self.kl_loss(logits)
+ if self.model_num > 1:
+ return [a + b for a, b in zip(gt_losses, kl_losses)]
+ else:
+ return gt_losses
+
+ def acc(self, logits, labels, k):
+ accs = [
+ fluid.layers.accuracy(
+ input=l, label=labels, k=k) for l in logits
+ ]
+ return accs
+
+ def train(self):
+ for m in self.model:
+ m.train()
+
+ def eval(self):
+ for m in self.model:
+ m.eval()
+
+
+class DMLOptimizers(object):
+ def __init__(self, model, optimizer, use_parallel):
+ self.model = model
+ self.optimizer = optimizer
+ self.use_parallel = use_parallel
+
+ def minimize(self, losses):
+ assert len(losses) == len(
+ self.optimizer
+ ), "The number of losses must match the number of optimizers"
+ for i in range(len(losses)):
+ if self.use_parallel:
+ losses[i] = self.model[i].scale_loss(losses[i])
+ losses[i].backward()
+ self.model[i].apply_collective_grads()
+ else:
+ losses[i].backward()
+ self.optimizer[i].minimize(losses[i])
+ self.model[i].clear_gradients()
+
+ def get_lr(self):
+ current_step_lr = [opt.current_step_lr() for opt in self.optimizer]
+ return current_step_lr
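+
+
+# A minimal usage sketch (illustrative only): model_a/model_b and opt_a/opt_b
+# are assumed to be dygraph models and optimizers built by the caller inside
+# a fluid.dygraph.guard() context, with `images` and `labels` as variables.
+#
+# dml_model = DML([model_a, model_b])
+# optimizers = dml_model.opt([opt_a, opt_b])
+# logits = dml_model.forward(images)
+# losses = dml_model.loss(logits, labels)
+# optimizers.minimize(losses)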
diff --git a/paddleslim/dist/mp_distiller.py b/paddleslim/dist/mp_distiller.py
deleted file mode 100755
index ff15f5f17dd130edfd6fc5bfa1d8c358da2a5ae2..0000000000000000000000000000000000000000
--- a/paddleslim/dist/mp_distiller.py
+++ /dev/null
@@ -1,223 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import logging
-import numpy as np
-from six.moves.queue import Queue
-
-import paddle.fluid as fluid
-from paddle.fluid.framework import Variable
-from paddle.fluid.reader import DataLoaderBase
-from paddle.fluid.core import EOFException
-from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
-
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
-
-__all__ = ['Knowledge']
-
-
-class Knowledge(object):
- """
- The knowledge class describes how to extract and store the dark knowledge
- of the teacher model, and how the student model learns these dark knowledge.
- """
-
- def __init__(self,
- path,
- items,
- reduce_strategy={'type': 'sum',
- 'key': 'image'}):
- """Init a knowledge instance.
- Args:
- path(list, str, optional): Specifies the storage path of the knowledge,
- supports AFS/HDFS, local file system, and memory.
- items(list): Save the tensor of the specified name
- reduce_strategy(dict, optional): The policy for performing the reduce
- operation. If it is set to None,
- the reduce operation is not performed.
- reduce_strategy.type(str): Type of reduce operation.
- reduce_strategy.key(str): The key of the reduce operation.
- It is an element in the item.
- """
- assert (isinstance(path, list) or isinstance(path, str) or
- (path is None)), "path type should be list or str or None"
- assert (isinstance(items, list)), "items should be a list"
- assert (isinstance(reduce_strategy,
- dict)), "reduce_strategy should be a dict"
- self.path = path
- if isinstance(self.path, list):
- self.write_type = 'HDFS/AFS'
- assert (
- len(self.path) == 4 and isinstance(self.path[0], str) and
- isinstance(self.path[1], str) and
- isinstance(self.path[2], str) and isinstance(self.path[3], str)
- ), "path should contains four str, ['local hadoop home', 'fs.default.name', 'hadoop.job.ugi', 'FS path']"
-
- hadoop_home = self.path[0]
- configs = {
- "fs.default.name": self.path[1],
- "hadoop.job.ugi": self.path[2]
- }
- self.client = HDFSClient(hadoop_home, configs)
- assert (
- self.client.is_exist(self.path[3]) == True
- ), "Plese make sure your hadoop confiuration is correct and FS path exists"
-
- self.hdfs_local_path = "./teacher_knowledge"
- if not os.path.exists(self.hdfs_local_path):
- os.mkdir(self.hdfs_local_path)
- elif isinstance(self.path, str):
- self.write_type = "LocalFS"
- if not os.path.exists(path):
- raise ValueError("The local path [%s] does not exist." %
- (path))
- else:
- self.write_type = "MEM"
- self.knowledge_queue = Queue(64)
-
- self.items = items
- self.reduce_strategy = reduce_strategy
-
- def _write(self, data):
- if self.write_type == 'HDFS/AFS':
- file_name = 'knowledge_' + str(self.file_cnt)
- file_path = os.path.join(self.hdfs_local_path, file_name)
- file_path += ".npy"
- np.save(file_path, data)
- self.file_cnt += 1
- self.client.upload(self.path[3], file_path)
- logger.info('{}.npy pushed to HDFS/AFS: {}'.format(file_name,
- self.path[3]))
-
- elif self.write_type == 'LocalFS':
- file_name = 'knowledge_' + str(self.file_cnt)
- file_path = os.path.join(self.path, file_name)
- np.save(file_path, data)
- logger.info('{}.npy saved'.format(file_name))
- self.file_cnt += 1
-
- else:
- self.knowledge_queue.put(data)
- logger.info('{} pushed to Queue'.format(file_name))
-
- def run(self, teacher_program, exe, place, scope, reader, inputs, outputs,
- call_back):
- """Start teacher model to do information.
- Args:
- teacher_program(Program): teacher program.
- scope(Scope): The scope used to execute the teacher,
- which contains the initialized variables.
- reader(reader): The data reader used by the teacher.
- inputs(list): The name of variables to feed the teacher program.
- outputs(list): Need to write to the variable instance's names of
- the Knowledge instance, which needs to correspond
- to the Knowledge's items.
- call_back(func, optional): The callback function that handles the
- outputs of the teacher, which is none by default,
- that is, the output of the teacher is concat directly.
- Return:
- (bool): Whether the teacher task was successfully registered and started
- """
- assert (isinstance(
- teacher_program,
- fluid.Program)), "teacher_program should be a fluid.Program"
- assert (isinstance(inputs, list)), "inputs should be a list"
- assert (isinstance(outputs, list)), "outputs should be a list"
- assert (len(self.items) == len(outputs)
- ), "the length of outputs list should be equal with items list"
- assert (callable(call_back) or (call_back is None)
- ), "call_back should be a callable function or NoneType."
-
- for var in teacher_program.list_vars():
- var.stop_gradient = True
-
- compiled_teacher_program = fluid.compiler.CompiledProgram(
- teacher_program)
- self.file_cnt = 0
- if isinstance(reader, Variable) or (
- isinstance(reader, DataLoaderBase) and (not reader.iterable)):
- reader.start()
- try:
- while True:
- logits = exe.run(compiled_teacher_program,
- scope=scope,
- fetch_list=outputs,
- feed=None)
- knowledge = dict()
- for index, array in enumerate(logits):
- knowledge[self.items[index]] = array
- self._write(knowledge)
- except EOFException:
- reader.reset()
-
- else:
- if not isinstance(reader, DataLoaderBase):
- feeder = fluid.DataFeeder(
- feed_list=inputs, place=place, program=teacher_program)
- for batch_id, data in enumerate(reader()):
- if not isinstance(reader, DataLoaderBase):
- data = feeder.feed(data)
- logits = exe.run(compiled_teacher_program,
- scope=scope,
- fetch_list=outputs,
- feed=data)
- knowledge = dict()
- for index, array in enumerate(logits):
- knowledge[self.items[index]] = array
- self._write(knowledge)
- return True
-
- def dist(self, student_program, losses):
- """Building the distillation network
- Args:
- student_program(Program): student program.
- losses(list, optional): The losses need to add. If set to None
- does not add any loss.
- Return:
- (Program): Program for distillation.
- (startup_program): Program for initializing distillation network.
- (reader): Data reader for distillation training.
- (Variable): Loss of distillation training
- """
-
- def loss(self, loss_func, *variables):
- """User-defined loss
- Args:
- loss_func(func): Function used to define loss.
- *variables(list): Variable name list.
- Return:
- (Variable): Distillation loss.
- """
- pass
-
- def fsp_loss(self):
- """fsp loss
- """
- pass
-
- def l2_loss(self):
- """l2 loss
- """
- pass
-
- def softlabel_loss(self):
- """softlabel_loss
- """
- pass
diff --git a/paddleslim/dist/single_distiller.py b/paddleslim/dist/single_distiller.py
index 8f5dcaeb14a0f6a7aadd5c99de7bc3c144f21414..c5824851819afa8b0c5d410fe6c9c58843648842 100644
--- a/paddleslim/dist/single_distiller.py
+++ b/paddleslim/dist/single_distiller.py
@@ -20,22 +20,31 @@ def merge(teacher_program,
student_program,
data_name_map,
place,
- scope=fluid.global_scope(),
+ scope=None,
name_prefix='teacher_'):
- """
- Merge teacher program into student program and add a uniform prefix to the
+ """Merge teacher program into student program and add a uniform prefix to the
names of all vars in teacher program
+
Args:
teacher_program(Program): The input teacher model paddle program
student_program(Program): The input student model paddle program
- data_map_map(dict): Describe the mapping between the teacher var name
- and the student var name
+ data_name_map(dict): Mapping between teacher and student input
+ interface names, where each key is an input name of
+ teacher_program and the value is the corresponding
+ input name of student_program.
place(fluid.CPUPlace()|fluid.CUDAPlace(N)): This parameter represents
paddle run on which device.
- scope(Scope): The input scope
+ scope(Scope): This parameter indicates the variable scope used by
+ the program. If not specified, the default global scope
+ will be used. Default: None
name_prefix(str): Name prefix added for all vars of the teacher program.
- Return(Program): Merged program.
+ Default: 'teacher_'
+
+ Returns:
+ None
"""
+ if scope is None:
+ scope = fluid.global_scope()
teacher_program = teacher_program.clone(for_test=True)
for teacher_var in teacher_program.list_vars():
skip_rename = False
@@ -51,7 +60,7 @@ def merge(teacher_program,
old_var = scope.var(teacher_var.name).get_tensor()
renamed_var = scope.var(new_name).get_tensor()
renamed_var.set(np.array(old_var), place)
-
+
# program var rename
renamed_var = teacher_program.global_block()._rename_var(
teacher_var.name, new_name)
@@ -84,13 +93,15 @@ def merge(teacher_program,
attrs[attr_name] = op.attr(attr_name)
student_program.global_block().append_op(
type=op.type, inputs=inputs, outputs=outputs, attrs=attrs)
- return student_program
-def fsp_loss(teacher_var1_name, teacher_var2_name, student_var1_name,
- student_var2_name, program=fluid.default_main_program()):
- """
- Combine variables from student model and teacher model by fsp-loss.
+def fsp_loss(teacher_var1_name,
+ teacher_var2_name,
+ student_var1_name,
+ student_var2_name,
+ program=None):
+ """Combine variables from student model and teacher model by fsp-loss.
+
Args:
teacher_var1_name(str): The name of teacher_var1.
teacher_var2_name(str): The name of teacher_var2. Except for the
@@ -100,10 +111,14 @@ def fsp_loss(teacher_var1_name, teacher_var2_name, student_var1_name,
student_var2_name(str): The name of student_var2. Except for the
second dimension, all other dimensions should
be consistent with student_var1.
- program(Program): The input distiller program.
- default: fluid.default_main_program()
- Return(Variable): fsp distiller loss.
+ program(Program): The input distiller program. If not specified,
+ the default program will be used. Default: None
+
+ Returns:
+ Variable: fsp distiller loss.
"""
+ if program is None:
+ program = fluid.default_main_program()
teacher_var1 = program.global_block().var(teacher_var1_name)
teacher_var2 = program.global_block().var(teacher_var2_name)
student_var1 = program.global_block().var(student_var1_name)
@@ -115,17 +130,22 @@ def fsp_loss(teacher_var1_name, teacher_var2_name, student_var1_name,
return fsp_loss
-def l2_loss(teacher_var_name, student_var_name,
- program=fluid.default_main_program()):
- """
- Combine variables from student model and teacher model by l2-loss.
+def l2_loss(teacher_var_name,
+ student_var_name,
+ program=None):
+ """Combine variables from student model and teacher model by l2-loss.
+
Args:
teacher_var_name(str): The name of teacher_var.
student_var_name(str): The name of student_var.
- program(Program): The input distiller program.
- default: fluid.default_main_program()
- Return(Variable): l2 distiller loss.
+ program(Program): The input distiller program. If not specified,
+ the default program will be used. Default: None
+
+ Returns:
+ Variable: l2 distiller loss.
"""
+ if program is None:
+ program = fluid.default_main_program()
student_var = program.global_block().var(student_var_name)
teacher_var = program.global_block().var(teacher_var_name)
l2_loss = fluid.layers.reduce_mean(
@@ -135,22 +155,26 @@ def l2_loss(teacher_var_name, student_var_name,
def soft_label_loss(teacher_var_name,
student_var_name,
- program=fluid.default_main_program(),
+ program=None,
teacher_temperature=1.,
student_temperature=1.):
- """
- Combine variables from student model and teacher model by soft-label-loss.
+ """Combine variables from student model and teacher model by soft-label-loss.
+
Args:
teacher_var_name(str): The name of teacher_var.
student_var_name(str): The name of student_var.
- program(Program): The input distiller program.
- default: fluid.default_main_program()
+ program(Program): The input distiller program. If not specified,
+ the default program will be used. Default: None
teacher_temperature(float): Temperature used to divide
- teacher_feature_map before softmax. default: 1.0
+ teacher_feature_map before softmax. Default: 1.0
student_temperature(float): Temperature used to divide
- student_feature_map before softmax. default: 1.0
- Return(Variable): l2 distiller loss.
+ student_feature_map before softmax. Default: 1.0
+
+ Returns:
+ Variable: soft label distiller loss.
"""
+ if program is None:
+ program = fluid.default_main_program()
student_var = program.global_block().var(student_var_name)
teacher_var = program.global_block().var(teacher_var_name)
student_var = fluid.layers.softmax(student_var / student_temperature)
@@ -162,15 +186,19 @@ def soft_label_loss(teacher_var_name,
return soft_label_loss
-def loss(loss_func, program=fluid.default_main_program(), **kwargs):
- """
- Combine variables from student model and teacher model by self defined loss.
+def loss(loss_func, program=None, **kwargs):
+ """Combine variables from student model and teacher model by self defined loss.
+
Args:
- program(Program): The input distiller program.
- default: fluid.default_main_program()
+ program(Program): The input distiller program. If not specified,
+ the default program will be used. Default: None
loss_func(function): The user self defined loss function.
- Return(Variable): self defined distiller loss.
+
+ Returns:
+ Variable: self defined distiller loss.
"""
+ if program is None:
+ program = fluid.default_main_program()
func_parameters = {}
for item in kwargs.items():
if isinstance(item[1], str):
diff --git a/paddleslim/models/README.md b/paddleslim/models/README.md
new file mode 100755
index 0000000000000000000000000000000000000000..fe2715d8b571409fb77d46b12cc85b8c13ab941c
--- /dev/null
+++ b/paddleslim/models/README.md
@@ -0,0 +1,41 @@
+# SlimX Series Lightweight Models
+
+The PaddleSlim model compression toolkit has released SlimX series lightweight models for face recognition, OCR, general image classification, detection, and other tasks:
+
+- `SlimMobileNet series`
+- `SlimFaceNet series`
+
+## SlimMobileNet Benchmarks
+
+SlimMobileNet is built with Baidu's in-house [GP-NAS](https://openaccess.thecvf.com/content_CVPR_2020/papers/Li_GP-NAS_Gaussian_Process_Based_Neural_Architecture_Search_CVPR_2020_paper.pdf) (CVPR 2020) AutoDL technology together with in-house distillation techniques.
+
+Compared with MobileNetV3, SlimMobileNet_V1 raises top-1 accuracy by 1.7 points while compressing FLOPs by 138%.
+Since its accuracy starts 1.7 points above MobileNetV3, SlimMobileNet_V1 still beats MobileNetV3 after quantization; the quantized SlimMobileNet_V1 compresses FLOPs by 552% while remaining more accurate than MobileNetV3. SlimMobileNet_V4_x1_1 is the first published classification model with under 300M FLOPs to exceed 80% top-1 accuracy on ImageNet.
+
+|Method|Flops(M)|Top1 Acc|
+|------|-----|-----|
+|MobileNetV3_large_x1_0|225|75.2|
+|MobileNetV3_large_x1_25|357|76.6|
+|GhostNet_x1_3|220|75.7|
+|SlimMobileNet_V1|163|76.9|
+|SlimMobileNet_V4_x1_1|296|80.1|
+|SlimMobileNet_V5|390|80.4|
+
+## [SlimFaceNet](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/demo/slimfacenet/README.md) Benchmarks
+
+SlimFaceNet is likewise derived from Baidu's in-house GP-NAS AutoDL technology and an in-house self-supervised supernet training algorithm. Compared with MobileNetV2, SlimFaceNet_A_x0_60 compresses FLOPs by 216% and runs 428% faster on an RK3288. PaddleSlim's post-training quantization compresses the model further: relative to MobileNetV2, SlimFaceNet_A_x0_60_quant compresses FLOPs by 865% and speeds up by 643% on RK3288 hardware. To stay comparable with the paper, LFW accuracy is reported for 112x96 inputs; to match deployment settings, FLOPs and speed are reported for 112x112 inputs, with latency measured on the RK3288.
+
+|Method|LFW|Flops|speed|
+|------|-----|-----|-----|
+|MobileNetV2|98.58%|277M|270ms|
+|MobileFaceNet|99.18%|224M|102ms|
+|SlimFaceNet_A_x0_60|99.21%|128M|63ms|
+|SlimFaceNet_B_x0_75|99.22%|151M|70ms|
+|SlimFaceNet_A_x0_60_quant|99.17%|32M|42ms|
+|SlimFaceNet_B_x0_75_quant|99.21%|38M|45ms|
+
+## Industry-Leading AutoDL Technology
+
+GP-NAS models NAS from a Bayesian perspective and designs customized Gaussian-process mean and kernel functions for each search space. Concretely, given the GP-NAS hyperparameters, the performance of any architecture in the search space can be predicted efficiently, so the architecture search problem reduces to estimating the hyperparameters of the GP-NAS Gaussian process. A mutual-information-maximization sampling algorithm then samples architectures effectively, and the posterior distribution of the GP-NAS hyperparameters is updated step by step according to the performance of the sampled networks. With the estimated hyperparameters, the optimal architecture under a given latency constraint can be predicted; see the GP-NAS paper for further technical details.
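+
+## Quick Start
+
+A minimal sketch of building one of the released backbones (assuming a Paddle 1.8 static-graph environment; `SlimMobileNet_v1` is re-exported from `paddleslim.models` and is assumed here to return a configured `SlimMobileNet` instance with the searched token preset):
+
+```python
+import paddle.fluid as fluid
+from paddleslim.models import SlimMobileNet_v1
+
+image = fluid.data(name='image', shape=[None, 3, 224, 224], dtype='float32')
+out = SlimMobileNet_v1().net(image, class_dim=1000)
+```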
diff --git a/paddleslim/models/__init__.py b/paddleslim/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb308a71d30309ac893be8032acf34a661e35c5d
--- /dev/null
+++ b/paddleslim/models/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from .util import image_classification
+from .slimfacenet import SlimFaceNet_A_x0_60, SlimFaceNet_B_x0_75, SlimFaceNet_C_x0_75
+from .slim_mobilenet import SlimMobileNet_v1, SlimMobileNet_v2, SlimMobileNet_v3, SlimMobileNet_v4, SlimMobileNet_v5
+from .mobilenet import MobileNet
+from .resnet import ResNet50
+
+__all__ = ["image_classification", "MobileNet", "ResNet50"]
diff --git a/paddleslim/models/classification_models.py b/paddleslim/models/classification_models.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5e605ccbc788de17d0c954795ed9258b460d2d0
--- /dev/null
+++ b/paddleslim/models/classification_models.py
@@ -0,0 +1,6 @@
+from __future__ import absolute_import
+from .mobilenet import MobileNet
+from .resnet import ResNet34, ResNet50
+from .mobilenet_v2 import MobileNetV2
+__all__ = ["model_list", "MobileNet", "ResNet34", "ResNet50", "MobileNetV2"]
+model_list = ['MobileNet', 'ResNet34', 'ResNet50', 'MobileNetV2']
diff --git a/paddleslim/models/dygraph/__init__.py b/paddleslim/models/dygraph/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d618ee708b3c9d594dd5e6b02d9ee75504452c38
--- /dev/null
+++ b/paddleslim/models/dygraph/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from .mobilenet import MobileNetV1
+from .resnet import ResNet
+
+__all__ = ["MobileNetV1", "ResNet"]
diff --git a/paddleslim/models/dygraph/mobilenet.py b/paddleslim/models/dygraph/mobilenet.py
new file mode 100755
index 0000000000000000000000000000000000000000..16f0aef39ef59697f71b50bde15eb5eb3778e522
--- /dev/null
+++ b/paddleslim/models/dygraph/mobilenet.py
@@ -0,0 +1,238 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Import order: standard library, third party, local library.
+import os
+import time
+import sys
+import math
+import numpy as np
+import argparse
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
+from paddle.fluid.dygraph.base import to_variable
+from paddle.fluid import framework
+
+
+class ConvBNLayer(fluid.dygraph.Layer):
+ def __init__(self,
+ num_channels,
+ filter_size,
+ num_filters,
+ stride,
+ padding,
+ channels=None,
+ num_groups=1,
+ act='relu',
+ use_cudnn=True,
+ name=None):
+ super(ConvBNLayer, self).__init__()
+
+ self._conv = Conv2D(
+ num_channels=num_channels,
+ num_filters=num_filters,
+ filter_size=filter_size,
+ stride=stride,
+ padding=padding,
+ groups=num_groups,
+ act=None,
+ use_cudnn=use_cudnn,
+ param_attr=ParamAttr(
+ initializer=MSRA(), name=self.full_name() + "_weights"),
+ bias_attr=False)
+
+ self._batch_norm = BatchNorm(
+ num_filters,
+ act=act,
+ param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"),
+ bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"),
+ moving_mean_name=self.full_name() + "_bn" + '_mean',
+ moving_variance_name=self.full_name() + "_bn" + '_variance')
+
+ def forward(self, inputs):
+ y = self._conv(inputs)
+ y = self._batch_norm(y)
+ return y
+
+
+class DepthwiseSeparable(fluid.dygraph.Layer):
+ def __init__(self,
+ num_channels,
+ num_filters1,
+ num_filters2,
+ num_groups,
+ stride,
+ scale,
+ name=None):
+ super(DepthwiseSeparable, self).__init__()
+
+ self._depthwise_conv = ConvBNLayer(
+ num_channels=num_channels,
+ num_filters=int(num_filters1 * scale),
+ filter_size=3,
+ stride=stride,
+ padding=1,
+ num_groups=int(num_groups * scale),
+ use_cudnn=False)
+
+ self._pointwise_conv = ConvBNLayer(
+ num_channels=int(num_filters1 * scale),
+ filter_size=1,
+ num_filters=int(num_filters2 * scale),
+ stride=1,
+ padding=0)
+
+ def forward(self, inputs):
+ y = self._depthwise_conv(inputs)
+ y = self._pointwise_conv(y)
+ return y
+
+
+class MobileNetV1(fluid.dygraph.Layer):
+ def __init__(self, scale=1.0, class_dim=100):
+ super(MobileNetV1, self).__init__()
+ self.scale = scale
+ self.dwsl = []
+
+ self.conv1 = ConvBNLayer(
+ num_channels=3,
+ filter_size=3,
+ channels=3,
+ num_filters=int(32 * scale),
+ stride=1,
+ padding=1)
+
+ dws21 = self.add_sublayer(
+ sublayer=DepthwiseSeparable(
+ num_channels=int(32 * scale),
+ num_filters1=32,
+ num_filters2=64,
+ num_groups=32,
+ stride=1,
+ scale=scale),
+ name="conv2_1")
+ self.dwsl.append(dws21)
+
+ dws22 = self.add_sublayer(
+ sublayer=DepthwiseSeparable(
+ num_channels=int(64 * scale),
+ num_filters1=64,
+ num_filters2=128,
+ num_groups=64,
+ stride=1,
+ scale=scale),
+ name="conv2_2")
+ self.dwsl.append(dws22)
+
+ dws31 = self.add_sublayer(
+ sublayer=DepthwiseSeparable(
+ num_channels=int(128 * scale),
+ num_filters1=128,
+ num_filters2=128,
+ num_groups=128,
+ stride=1,
+ scale=scale),
+ name="conv3_1")
+ self.dwsl.append(dws31)
+
+ dws32 = self.add_sublayer(
+ sublayer=DepthwiseSeparable(
+ num_channels=int(128 * scale),
+ num_filters1=128,
+ num_filters2=256,
+ num_groups=128,
+ stride=2,
+ scale=scale),
+ name="conv3_2")
+ self.dwsl.append(dws32)
+
+ dws41 = self.add_sublayer(
+ sublayer=DepthwiseSeparable(
+ num_channels=int(256 * scale),
+ num_filters1=256,
+ num_filters2=256,
+ num_groups=256,
+ stride=1,
+ scale=scale),
+ name="conv4_1")
+ self.dwsl.append(dws41)
+
+ dws42 = self.add_sublayer(
+ sublayer=DepthwiseSeparable(
+ num_channels=int(256 * scale),
+ num_filters1=256,
+ num_filters2=512,
+ num_groups=256,
+ stride=2,
+ scale=scale),
+ name="conv4_2")
+ self.dwsl.append(dws42)
+
+ for i in range(5):
+ tmp = self.add_sublayer(
+ sublayer=DepthwiseSeparable(
+ num_channels=int(512 * scale),
+ num_filters1=512,
+ num_filters2=512,
+ num_groups=512,
+ stride=1,
+ scale=scale),
+ name="conv5_" + str(i + 1))
+ self.dwsl.append(tmp)
+
+ dws56 = self.add_sublayer(
+ sublayer=DepthwiseSeparable(
+ num_channels=int(512 * scale),
+ num_filters1=512,
+ num_filters2=1024,
+ num_groups=512,
+ stride=2,
+ scale=scale),
+ name="conv5_6")
+ self.dwsl.append(dws56)
+
+ dws6 = self.add_sublayer(
+ sublayer=DepthwiseSeparable(
+ num_channels=int(1024 * scale),
+ num_filters1=1024,
+ num_filters2=1024,
+ num_groups=1024,
+ stride=1,
+ scale=scale),
+ name="conv6")
+ self.dwsl.append(dws6)
+
+ self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
+
+ self.out = Linear(
+ int(1024 * scale),
+ class_dim,
+ param_attr=ParamAttr(
+ initializer=MSRA(), name=self.full_name() + "fc7_weights"),
+ bias_attr=ParamAttr(name=self.full_name() + "fc7_offset"))
+
+ def forward(self, inputs):
+ y = self.conv1(inputs)
+ for dws in self.dwsl:
+ y = dws(y)
+
+ y = self.pool2d_avg(y)
+ y = fluid.layers.reshape(y, shape=[-1, 1024])
+ y = self.out(y)
+
+ return y
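+
+
+# Minimal smoke test (illustrative; assumes a Paddle 1.8 dygraph environment
+# and `import numpy as np`):
+#
+# with fluid.dygraph.guard():
+# net = MobileNetV1(scale=1.0, class_dim=100)
+# x = to_variable(np.random.rand(1, 3, 224, 224).astype('float32'))
+# print(net(x).shape) # expect [1, 100]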
diff --git a/paddleslim/models/dygraph/resnet.py b/paddleslim/models/dygraph/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..a33f6f56decfdba28f93282eb62adbdb185ede4a
--- /dev/null
+++ b/paddleslim/models/dygraph/resnet.py
@@ -0,0 +1,161 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
+
+
+class ConvBNLayer(fluid.dygraph.Layer):
+ def __init__(self,
+ num_channels,
+ num_filters,
+ filter_size,
+ stride=1,
+ groups=1,
+ act=None):
+ super(ConvBNLayer, self).__init__()
+
+ self._conv = Conv2D(
+ num_channels=num_channels,
+ num_filters=num_filters,
+ filter_size=filter_size,
+ stride=stride,
+ padding=(filter_size - 1) // 2,
+ groups=groups,
+ act=None,
+ bias_attr=False)
+
+ self._batch_norm = BatchNorm(num_filters, act=act)
+
+ def forward(self, inputs):
+ y = self._conv(inputs)
+ y = self._batch_norm(y)
+
+ return y
+
+
+class BottleneckBlock(fluid.dygraph.Layer):
+ def __init__(self, num_channels, num_filters, stride, shortcut=True):
+ super(BottleneckBlock, self).__init__()
+
+ self.conv0 = ConvBNLayer(
+ num_channels=num_channels,
+ num_filters=num_filters,
+ filter_size=1,
+ act='relu')
+ self.conv1 = ConvBNLayer(
+ num_channels=num_filters,
+ num_filters=num_filters,
+ filter_size=3,
+ stride=stride,
+ act='relu')
+ self.conv2 = ConvBNLayer(
+ num_channels=num_filters,
+ num_filters=num_filters * 4,
+ filter_size=1,
+ act=None)
+
+ if not shortcut:
+ self.short = ConvBNLayer(
+ num_channels=num_channels,
+ num_filters=num_filters * 4,
+ filter_size=1,
+ stride=stride)
+
+ self.shortcut = shortcut
+
+ self._num_channels_out = num_filters * 4
+
+ def forward(self, inputs):
+ y = self.conv0(inputs)
+ conv1 = self.conv1(y)
+ conv2 = self.conv2(conv1)
+
+ if self.shortcut:
+ short = inputs
+ else:
+ short = self.short(inputs)
+
+ y = fluid.layers.elementwise_add(x=short, y=conv2)
+
+ layer_helper = LayerHelper(self.full_name(), act='relu')
+ return layer_helper.append_activation(y)
+
+
+class ResNet(fluid.dygraph.Layer):
+ def __init__(self, layers=50, class_dim=100):
+ super(ResNet, self).__init__()
+
+ self.layers = layers
+ # Only bottleneck-based depths are defined below, so 34 is not supported.
+ supported_layers = [50, 101, 152]
+ assert layers in supported_layers, \
+ "supported layers are {} but input layer is {}".format(supported_layers, layers)
+
+ if layers == 50:
+ depth = [3, 4, 6, 3]
+ elif layers == 101:
+ depth = [3, 4, 23, 3]
+ elif layers == 152:
+ depth = [3, 8, 36, 3]
+ num_channels = [64, 256, 512, 1024]
+ num_filters = [64, 128, 256, 512]
+
+ self.conv = ConvBNLayer(
+ num_channels=3,
+ num_filters=64,
+ filter_size=7,
+ stride=1,
+ act='relu')
+ self.pool2d_max = Pool2D(
+ pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+
+ self.bottleneck_block_list = []
+ for block in range(len(depth)):
+ shortcut = False
+ for i in range(depth[block]):
+ bottleneck_block = self.add_sublayer(
+ 'bb_%d_%d' % (block, i),
+ BottleneckBlock(
+ num_channels=num_channels[block]
+ if i == 0 else num_filters[block] * 4,
+ num_filters=num_filters[block],
+ stride=2 if i == 0 and block != 0 else 1,
+ shortcut=shortcut))
+ self.bottleneck_block_list.append(bottleneck_block)
+ shortcut = True
+
+ self.pool2d_avg = Pool2D(
+ pool_size=7, pool_type='avg', global_pooling=True)
+
+ self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 4 * 1 * 1
+
+ stdv = 1.0 / math.sqrt(2048 * 1.0)
+
+ self.out = Linear(
+ self.pool2d_avg_output,
+ class_dim,
+ param_attr=fluid.param_attr.ParamAttr(
+ initializer=fluid.initializer.Uniform(-stdv, stdv)))
+
+ def forward(self, inputs):
+ y = self.conv(inputs)
+ for bottleneck_block in self.bottleneck_block_list:
+ y = bottleneck_block(y)
+ y = self.pool2d_avg(y)
+ y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output])
+ y = self.out(y)
+ return y
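+
+
+# Minimal smoke test (illustrative; assumes a Paddle 1.8 dygraph environment
+# and `import numpy as np`):
+#
+# with fluid.dygraph.guard():
+# net = ResNet(layers=50, class_dim=100)
+# x = fluid.dygraph.to_variable(
+# np.random.rand(1, 3, 224, 224).astype('float32'))
+# print(net(x).shape) # expect [1, 100]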
diff --git a/paddleslim/models/mobilenet.py b/paddleslim/models/mobilenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..921d6226ca2a65d5c9b57e27bf6607c7376c51f6
--- /dev/null
+++ b/paddleslim/models/mobilenet.py
@@ -0,0 +1,197 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+
+__all__ = ['MobileNet']
+
+train_parameters = {
+ "input_size": [3, 224, 224],
+ "input_mean": [0.485, 0.456, 0.406],
+ "input_std": [0.229, 0.224, 0.225],
+ "learning_strategy": {
+ "name": "piecewise_decay",
+ "batch_size": 256,
+ "epochs": [10, 16, 30],
+ "steps": [0.1, 0.01, 0.001, 0.0001]
+ }
+}
+
+
+class MobileNet():
+ def __init__(self):
+ self.params = train_parameters
+
+ def net(self, input, class_dim=1000, scale=1.0):
+ # conv1: 112x112
+ input = self.conv_bn_layer(
+ input,
+ filter_size=3,
+ channels=3,
+ num_filters=int(32 * scale),
+ stride=2,
+ padding=1,
+ name="conv1")
+
+ # 56x56
+ input = self.depthwise_separable(
+ input,
+ num_filters1=32,
+ num_filters2=64,
+ num_groups=32,
+ stride=1,
+ scale=scale,
+ name="conv2_1")
+
+ input = self.depthwise_separable(
+ input,
+ num_filters1=64,
+ num_filters2=128,
+ num_groups=64,
+ stride=2,
+ scale=scale,
+ name="conv2_2")
+
+ # 28x28
+ input = self.depthwise_separable(
+ input,
+ num_filters1=128,
+ num_filters2=128,
+ num_groups=128,
+ stride=1,
+ scale=scale,
+ name="conv3_1")
+
+ input = self.depthwise_separable(
+ input,
+ num_filters1=128,
+ num_filters2=256,
+ num_groups=128,
+ stride=2,
+ scale=scale,
+ name="conv3_2")
+
+ # 14x14
+ input = self.depthwise_separable(
+ input,
+ num_filters1=256,
+ num_filters2=256,
+ num_groups=256,
+ stride=1,
+ scale=scale,
+ name="conv4_1")
+
+ input = self.depthwise_separable(
+ input,
+ num_filters1=256,
+ num_filters2=512,
+ num_groups=256,
+ stride=2,
+ scale=scale,
+ name="conv4_2")
+
+ # 14x14
+ for i in range(5):
+ input = self.depthwise_separable(
+ input,
+ num_filters1=512,
+ num_filters2=512,
+ num_groups=512,
+ stride=1,
+ scale=scale,
+ name="conv5" + "_" + str(i + 1))
+ # 7x7
+ input = self.depthwise_separable(
+ input,
+ num_filters1=512,
+ num_filters2=1024,
+ num_groups=512,
+ stride=2,
+ scale=scale,
+ name="conv5_6")
+
+ input = self.depthwise_separable(
+ input,
+ num_filters1=1024,
+ num_filters2=1024,
+ num_groups=1024,
+ stride=1,
+ scale=scale,
+ name="conv6")
+
+ input = fluid.layers.pool2d(
+ input=input,
+ pool_size=0,
+ pool_stride=1,
+ pool_type='avg',
+ global_pooling=True)
+
+ output = fluid.layers.fc(input=input,
+ size=class_dim,
+ act='softmax',
+ param_attr=ParamAttr(
+ initializer=MSRA(), name="fc7_weights"),
+ bias_attr=ParamAttr(name="fc7_offset"))
+
+ return output
+
+ def conv_bn_layer(self,
+ input,
+ filter_size,
+ num_filters,
+ stride,
+ padding,
+ channels=None,
+ num_groups=1,
+ act='relu',
+ use_cudnn=True,
+ name=None):
+ conv = fluid.layers.conv2d(
+ input=input,
+ num_filters=num_filters,
+ filter_size=filter_size,
+ stride=stride,
+ padding=padding,
+ groups=num_groups,
+ act=None,
+ use_cudnn=use_cudnn,
+ param_attr=ParamAttr(
+ initializer=MSRA(), name=name + "_weights"),
+ bias_attr=False)
+ bn_name = name + "_bn"
+ return fluid.layers.batch_norm(
+ input=conv,
+ act=act,
+ param_attr=ParamAttr(name=bn_name + "_scale"),
+ bias_attr=ParamAttr(name=bn_name + "_offset"),
+ moving_mean_name=bn_name + '_mean',
+ moving_variance_name=bn_name + '_variance')
+
+ def depthwise_separable(self,
+ input,
+ num_filters1,
+ num_filters2,
+ num_groups,
+ stride,
+ scale,
+ name=None):
+ depthwise_conv = self.conv_bn_layer(
+ input=input,
+ filter_size=3,
+ num_filters=int(num_filters1 * scale),
+ stride=stride,
+ padding=1,
+ num_groups=int(num_groups * scale),
+ use_cudnn=False,
+ name=name + "_dw")
+
+ pointwise_conv = self.conv_bn_layer(
+ input=depthwise_conv,
+ filter_size=1,
+ num_filters=int(num_filters2 * scale),
+ stride=1,
+ padding=0,
+ name=name + "_sep")
+ return pointwise_conv
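+
+
+# Minimal static-graph usage sketch (illustrative):
+#
+# image = fluid.data(
+# name='image', shape=[None, 3, 224, 224], dtype='float32')
+# out = MobileNet().net(image, class_dim=1000) # softmax probabilities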
diff --git a/paddleslim/models/mobilenet_v2.py b/paddleslim/models/mobilenet_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccfb250b79a5365d28470886624287fbc87be50c
--- /dev/null
+++ b/paddleslim/models/mobilenet_v2.py
@@ -0,0 +1,259 @@
+#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+
+__all__ = [
+ 'MobileNetV2', 'MobileNetV2_x0_25',
+ 'MobileNetV2_x0_5', 'MobileNetV2_x1_0', 'MobileNetV2_x1_5',
+ 'MobileNetV2_x2_0', 'MobileNetV2_scale'
+]
+
+train_parameters = {
+ "input_size": [3, 224, 224],
+ "input_mean": [0.485, 0.456, 0.406],
+ "input_std": [0.229, 0.224, 0.225],
+ "learning_strategy": {
+ "name": "piecewise_decay",
+ "batch_size": 256,
+ "epochs": [30, 60, 90],
+ "steps": [0.1, 0.01, 0.001, 0.0001]
+ }
+}
+
+
+class MobileNetV2():
+ def __init__(self, scale=1.0, change_depth=False):
+ self.params = train_parameters
+ self.scale = scale
+ self.change_depth = change_depth
+
+ def net(self, input, class_dim=1000):
+ scale = self.scale
+ change_depth = self.change_depth
+ # If change_depth is True, the network is roughly 1.4x deeper than the default depths.
+ bottleneck_params_list = [
+ (1, 16, 1, 1),
+ (6, 24, 2, 2),
+ (6, 32, 3, 2),
+ (6, 64, 4, 2),
+ (6, 96, 3, 1),
+ (6, 160, 3, 2),
+ (6, 320, 1, 1),
+ ] if not change_depth else [
+ (1, 16, 1, 1),
+ (6, 24, 2, 2),
+ (6, 32, 5, 2),
+ (6, 64, 7, 2),
+ (6, 96, 5, 1),
+ (6, 160, 3, 2),
+ (6, 320, 1, 1),
+ ]
+
+ #conv1
+ input = self.conv_bn_layer(
+ input,
+ num_filters=int(32 * scale),
+ filter_size=3,
+ stride=2,
+ padding=1,
+ if_act=True,
+ name='conv1_1')
+
+ # bottleneck sequences
+ i = 1
+ in_c = int(32 * scale)
+ for layer_setting in bottleneck_params_list:
+ t, c, n, s = layer_setting
+ i += 1
+ input = self.invresi_blocks(
+ input=input,
+ in_c=in_c,
+ t=t,
+ c=int(c * scale),
+ n=n,
+ s=s,
+ name='conv' + str(i))
+ in_c = int(c * scale)
+ #last_conv
+ input = self.conv_bn_layer(
+ input=input,
+ num_filters=int(1280 * scale) if scale > 1.0 else 1280,
+ filter_size=1,
+ stride=1,
+ padding=0,
+ if_act=True,
+ name='conv9')
+
+ input = fluid.layers.pool2d(
+ input=input,
+ pool_size=7,
+ pool_stride=1,
+ pool_type='avg',
+ global_pooling=True)
+
+ output = fluid.layers.fc(input=input,
+ size=class_dim,
+ act='softmax',
+ param_attr=ParamAttr(name='fc10_weights'),
+ bias_attr=ParamAttr(name='fc10_offset'))
+ return output
+
+ def conv_bn_layer(self,
+ input,
+ filter_size,
+ num_filters,
+ stride,
+ padding,
+ channels=None,
+ num_groups=1,
+ if_act=True,
+ name=None,
+ use_cudnn=True):
+ conv = fluid.layers.conv2d(
+ input=input,
+ num_filters=num_filters,
+ filter_size=filter_size,
+ stride=stride,
+ padding=padding,
+ groups=num_groups,
+ act=None,
+ use_cudnn=use_cudnn,
+ param_attr=ParamAttr(name=name + '_weights'),
+ bias_attr=False)
+ bn_name = name + '_bn'
+ bn = fluid.layers.batch_norm(
+ input=conv,
+ param_attr=ParamAttr(name=bn_name + "_scale"),
+ bias_attr=ParamAttr(name=bn_name + "_offset"),
+ moving_mean_name=bn_name + '_mean',
+ moving_variance_name=bn_name + '_variance')
+ if if_act:
+ return fluid.layers.relu6(bn)
+ else:
+ return bn
+
+ def shortcut(self, input, data_residual):
+ return fluid.layers.elementwise_add(input, data_residual)
+
+ def inverted_residual_unit(self,
+ input,
+ num_in_filter,
+ num_filters,
+ ifshortcut,
+ stride,
+ filter_size,
+ padding,
+ expansion_factor,
+ name=None):
+ num_expfilter = int(round(num_in_filter * expansion_factor))
+
+ channel_expand = self.conv_bn_layer(
+ input=input,
+ num_filters=num_expfilter,
+ filter_size=1,
+ stride=1,
+ padding=0,
+ num_groups=1,
+ if_act=True,
+ name=name + '_expand')
+
+ bottleneck_conv = self.conv_bn_layer(
+ input=channel_expand,
+ num_filters=num_expfilter,
+ filter_size=filter_size,
+ stride=stride,
+ padding=padding,
+ num_groups=num_expfilter,
+ if_act=True,
+ name=name + '_dwise',
+ use_cudnn=False)
+
+ linear_out = self.conv_bn_layer(
+ input=bottleneck_conv,
+ num_filters=num_filters,
+ filter_size=1,
+ stride=1,
+ padding=0,
+ num_groups=1,
+ if_act=False,
+ name=name + '_linear')
+ if ifshortcut:
+ out = self.shortcut(input=input, data_residual=linear_out)
+ return out
+ else:
+ return linear_out
+
+ def invresi_blocks(self, input, in_c, t, c, n, s, name=None):
+ first_block = self.inverted_residual_unit(
+ input=input,
+ num_in_filter=in_c,
+ num_filters=c,
+ ifshortcut=False,
+ stride=s,
+ filter_size=3,
+ padding=1,
+ expansion_factor=t,
+ name=name + '_1')
+
+ last_residual_block = first_block
+ last_c = c
+
+ for i in range(1, n):
+ last_residual_block = self.inverted_residual_unit(
+ input=last_residual_block,
+ num_in_filter=last_c,
+ num_filters=c,
+ ifshortcut=True,
+ stride=1,
+ filter_size=3,
+ padding=1,
+ expansion_factor=t,
+ name=name + '_' + str(i + 1))
+ return last_residual_block
+
+
+def MobileNetV2_x0_25():
+ model = MobileNetV2(scale=0.25)
+ return model
+
+
+def MobileNetV2_x0_5():
+ model = MobileNetV2(scale=0.5)
+ return model
+
+
+def MobileNetV2_x1_0():
+ model = MobileNetV2(scale=1.0)
+ return model
+
+
+def MobileNetV2_x1_5():
+ model = MobileNetV2(scale=1.5)
+ return model
+
+
+def MobileNetV2_x2_0():
+ model = MobileNetV2(scale=2.0)
+ return model
+
+
+def MobileNetV2_scale():
+ model = MobileNetV2(scale=1.2, change_depth=True)
+ return model
diff --git a/paddleslim/models/resnet.py b/paddleslim/models/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ceaef41ecc87d7388ae05d7fcb199de1841ebc2
--- /dev/null
+++ b/paddleslim/models/resnet.py
@@ -0,0 +1,229 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import paddle
+import paddle.fluid as fluid
+import math
+from paddle.fluid.param_attr import ParamAttr
+
+__all__ = ["ResNet", "ResNet34", "ResNet50", "ResNet101", "ResNet152"]
+
+train_parameters = {
+ "input_size": [3, 224, 224],
+ "input_mean": [0.485, 0.456, 0.406],
+ "input_std": [0.229, 0.224, 0.225],
+ "learning_strategy": {
+ "name": "piecewise_decay",
+ "batch_size": 256,
+ "epochs": [10, 16, 30],
+ "steps": [0.1, 0.01, 0.001, 0.0001]
+ }
+}
+
+
+class ResNet():
+ def __init__(self, layers=50, prefix_name=''):
+ self.params = train_parameters
+ self.layers = layers
+ self.prefix_name = prefix_name
+
+ def net(self, input, class_dim=1000, conv1_name='conv1', fc_name=None):
+ layers = self.layers
+ prefix_name = self.prefix_name if self.prefix_name == '' else self.prefix_name + '_'
+ supported_layers = [34, 50, 101, 152]
+ assert layers in supported_layers, \
+ "supported layers are {} but input layer is {}".format(supported_layers, layers)
+
+ if layers == 34 or layers == 50:
+ depth = [3, 4, 6, 3]
+ elif layers == 101:
+ depth = [3, 4, 23, 3]
+ elif layers == 152:
+ depth = [3, 8, 36, 3]
+ num_filters = [64, 128, 256, 512]
+
+ # TODO(wanghaoshuang@baidu.com):
+ # fix name("conv1") conflict between student and teacher in distillation.
+ conv = self.conv_bn_layer(
+ input=input,
+ num_filters=64,
+ filter_size=7,
+ stride=2,
+ act='relu',
+ name=prefix_name + conv1_name)
+ conv = fluid.layers.pool2d(
+ input=conv,
+ pool_size=3,
+ pool_stride=2,
+ pool_padding=1,
+ pool_type='max')
+
+ if layers >= 50:
+ for block in range(len(depth)):
+ for i in range(depth[block]):
+ if layers in [101, 152] and block == 2:
+ if i == 0:
+ conv_name = "res" + str(block + 2) + "a"
+ else:
+ conv_name = "res" + str(block + 2) + "b" + str(i)
+ else:
+ conv_name = "res" + str(block + 2) + chr(97 + i)
+ conv_name = prefix_name + conv_name
+ conv = self.bottleneck_block(
+ input=conv,
+ num_filters=num_filters[block],
+ stride=2 if i == 0 and block != 0 else 1,
+ name=conv_name)
+
+ pool = fluid.layers.pool2d(
+ input=conv, pool_size=7, pool_type='avg', global_pooling=True)
+ stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
+ fc_name = fc_name if fc_name is None else prefix_name + fc_name
+ out = fluid.layers.fc(input=pool,
+ size=class_dim,
+ act='softmax',
+ name=fc_name,
+ param_attr=fluid.param_attr.ParamAttr(
+ initializer=fluid.initializer.Uniform(
+ -stdv, stdv)))
+ else:
+ for block in range(len(depth)):
+ for i in range(depth[block]):
+ conv_name = "res" + str(block + 2) + chr(97 + i)
+ conv_name = prefix_name + conv_name
+ conv = self.basic_block(
+ input=conv,
+ num_filters=num_filters[block],
+ stride=2 if i == 0 and block != 0 else 1,
+ is_first=block == i == 0,
+ name=conv_name)
+
+ pool = fluid.layers.pool2d(
+ input=conv, pool_type='avg', global_pooling=True)
+ stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
+ fc_name = fc_name if fc_name is None else prefix_name + fc_name
+ out = fluid.layers.fc(
+ input=pool,
+ size=class_dim,
+ act='softmax',
+ name=fc_name,
+ param_attr=fluid.param_attr.ParamAttr(
+ initializer=fluid.initializer.Uniform(-stdv, stdv)))
+
+ return out
+
+ def conv_bn_layer(self,
+ input,
+ num_filters,
+ filter_size,
+ stride=1,
+ groups=1,
+ act=None,
+ name=None):
+ conv = fluid.layers.conv2d(
+ input=input,
+ num_filters=num_filters,
+ filter_size=filter_size,
+ stride=stride,
+ padding=(filter_size - 1) // 2,
+ groups=groups,
+ act=None,
+ param_attr=ParamAttr(name=name + "_weights"),
+ bias_attr=False,
+ name=name + '.conv2d.output.1')
+ if self.prefix_name == '':
+ if name == "conv1":
+ bn_name = "bn_" + name
+ else:
+ bn_name = "bn" + name[3:]
+ else:
+ if name.split("_")[1] == "conv1":
+ bn_name = name.split("_", 1)[0] + "_bn_" + name.split("_",
+ 1)[1]
+ else:
+ bn_name = name.split("_", 1)[0] + "_bn" + name.split("_",
+ 1)[1][3:]
+ return fluid.layers.batch_norm(
+ input=conv,
+ act=act,
+ name=bn_name + '.output.1',
+ param_attr=ParamAttr(name=bn_name + '_scale'),
+ bias_attr=ParamAttr(bn_name + '_offset'),
+ moving_mean_name=bn_name + '_mean',
+ moving_variance_name=bn_name + '_variance', )
+
+ def shortcut(self, input, ch_out, stride, is_first, name):
+ ch_in = input.shape[1]
+ if ch_in != ch_out or stride != 1 or is_first:
+ return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
+ else:
+ return input
+
+ def bottleneck_block(self, input, num_filters, stride, name):
+ conv0 = self.conv_bn_layer(
+ input=input,
+ num_filters=num_filters,
+ filter_size=1,
+ act='relu',
+ name=name + "_branch2a")
+ conv1 = self.conv_bn_layer(
+ input=conv0,
+ num_filters=num_filters,
+ filter_size=3,
+ stride=stride,
+ act='relu',
+ name=name + "_branch2b")
+ conv2 = self.conv_bn_layer(
+ input=conv1,
+ num_filters=num_filters * 4,
+ filter_size=1,
+ act=None,
+ name=name + "_branch2c")
+
+ short = self.shortcut(
+ input,
+ num_filters * 4,
+ stride,
+ is_first=False,
+ name=name + "_branch1")
+
+ return fluid.layers.elementwise_add(
+ x=short, y=conv2, act='relu', name=name + ".add.output.5")
+
+ def basic_block(self, input, num_filters, stride, is_first, name):
+ conv0 = self.conv_bn_layer(
+ input=input,
+ num_filters=num_filters,
+ filter_size=3,
+ act='relu',
+ stride=stride,
+ name=name + "_branch2a")
+ conv1 = self.conv_bn_layer(
+ input=conv0,
+ num_filters=num_filters,
+ filter_size=3,
+ act=None,
+ name=name + "_branch2b")
+ short = self.shortcut(
+ input, num_filters, stride, is_first, name=name + "_branch1")
+ return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
+
+
+def ResNet34(prefix_name=''):
+ model = ResNet(layers=34, prefix_name=prefix_name)
+ return model
+
+
+def ResNet50(prefix_name=''):
+ model = ResNet(layers=50, prefix_name=prefix_name)
+ return model
+
+
+def ResNet101():
+ model = ResNet(layers=101)
+ return model
+
+
+def ResNet152():
+ model = ResNet(layers=152)
+ return model
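+
+
+# prefix_name exists mainly for distillation, where teacher and student
+# copies of ResNet live in one program and must not share parameter names.
+# Illustrative sketch (`image` built by the caller with fluid.data):
+#
+# teacher = ResNet50(prefix_name='res50')
+# out = teacher.net(image, class_dim=1000)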
diff --git a/paddleslim/models/slim_mobilenet.py b/paddleslim/models/slim_mobilenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2e42bfb74f49419610c600203c8fc070b01518b
--- /dev/null
+++ b/paddleslim/models/slim_mobilenet.py
@@ -0,0 +1,322 @@
+#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+
+__all__ = [
+ 'SlimMobileNet_v1', 'SlimMobileNet_v2', 'SlimMobileNet_v3',
+ 'SlimMobileNet_v4', 'SlimMobileNet_v5'
+]
+
+
+class SlimMobileNet():
+ def __init__(self, scale=1.0, model_name='large', token=[]):
+ assert len(token) >= 45
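+ # token layout: 20 kernel sizes, then 20 expansion ratios, then 5
+ # per-stage depths.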
+ self.kernel_size_lis = token[:20]
+ self.exp_lis = token[20:40]
+ self.depth_lis = token[40:45]
+
+ self.scale = scale
+ self.inplanes = 16
+ if model_name == "large":
+ self.cfg_channel = [16, 24, 40, 80, 112, 160]
+ self.cfg_stride = [1, 2, 2, 2, 1, 2]
+ self.cfg_se = [False, False, True, False, True, True]
+ self.cfg_act = [
+ 'relu', 'relu', 'relu', 'hard_swish', 'hard_swish',
+ 'hard_swish'
+ ]
+ self.cls_ch_squeeze = 960
+ self.cls_ch_expand = 1280
+ else:
+ raise NotImplementedError("mode[" + model_name +
+ "_model] is not implemented!")
+
+ def net(self, input, class_dim=1000):
+ scale = self.scale
+ inplanes = self.inplanes
+
+ kernel_size_lis = self.kernel_size_lis
+ exp_lis = self.exp_lis
+ depth_lis = self.depth_lis
+ cfg_channel = self.cfg_channel
+ cfg_stride = self.cfg_stride
+ cfg_se = self.cfg_se
+ cfg_act = self.cfg_act
+
+ cls_ch_squeeze = self.cls_ch_squeeze
+ cls_ch_expand = self.cls_ch_expand
+ #conv1
+ conv = self.conv_bn_layer(
+ input,
+ filter_size=3,
+ num_filters=self.make_divisible(inplanes * scale),
+ stride=2,
+ padding=1,
+ num_groups=1,
+ if_act=True,
+ act='hard_swish',
+ name='conv1')
+ inplanes = self.make_divisible(inplanes * scale)
+
+ #conv2
+ num_mid_filter = self.make_divisible(scale * inplanes)
+ _num_out_filter = cfg_channel[0]
+ num_out_filter = self.make_divisible(scale * _num_out_filter)
+ conv = self.residual_unit(
+ input=conv,
+ num_in_filter=inplanes,
+ num_mid_filter=num_mid_filter,
+ num_out_filter=num_out_filter,
+ act=cfg_act[0],
+ stride=cfg_stride[0],
+ filter_size=3,
+ use_se=cfg_se[0],
+ name='conv2',
+ short=True)
+ inplanes = self.make_divisible(scale * cfg_channel[0])
+
+ i = 3
+ for depth_id in range(len(depth_lis)):
+ for repeat_time in range(depth_lis[depth_id]):
+ num_mid_filter = self.make_divisible(
+ scale * _num_out_filter *
+ exp_lis[depth_id * 4 + repeat_time])
+ _num_out_filter = cfg_channel[depth_id + 1]
+ num_out_filter = self.make_divisible(scale * _num_out_filter)
+ stride = cfg_stride[depth_id + 1] if repeat_time == 0 else 1
+ conv = self.residual_unit(
+ input=conv,
+ num_in_filter=inplanes,
+ num_mid_filter=num_mid_filter,
+ num_out_filter=num_out_filter,
+ act=cfg_act[depth_id + 1],
+ stride=stride,
+ filter_size=kernel_size_lis[depth_id * 4 + repeat_time],
+ use_se=cfg_se[depth_id + 1],
+ name='conv' + str(i))
+
+ inplanes = self.make_divisible(scale *
+ cfg_channel[depth_id + 1])
+ i += 1
+
+ conv = self.conv_bn_layer(
+ input=conv,
+ filter_size=1,
+ num_filters=self.make_divisible(scale * cls_ch_squeeze),
+ stride=1,
+ padding=0,
+ num_groups=1,
+ if_act=True,
+ act='hard_swish',
+ name='conv_last')
+ conv = fluid.layers.pool2d(
+ input=conv, pool_type='avg', global_pooling=True, use_cudnn=False)
+ conv = fluid.layers.conv2d(
+ input=conv,
+ num_filters=cls_ch_expand,
+ filter_size=1,
+ stride=1,
+ padding=0,
+ act=None,
+ param_attr=ParamAttr(name='last_1x1_conv_weights'),
+ bias_attr=False)
+ conv = fluid.layers.hard_swish(conv)
+ drop = fluid.layers.dropout(x=conv, dropout_prob=0.2)
+ out = fluid.layers.fc(input=drop,
+ size=class_dim,
+ param_attr=ParamAttr(name='fc_weights'),
+ bias_attr=ParamAttr(name='fc_offset'))
+ return out
+
+ def conv_bn_layer(self,
+ input,
+ filter_size,
+ num_filters,
+ stride,
+ padding,
+ num_groups=1,
+ if_act=True,
+ act=None,
+ name=None,
+ use_cudnn=True,
+ res_last_bn_init=False):
+ conv = fluid.layers.conv2d(
+ input=input,
+ num_filters=num_filters,
+ filter_size=filter_size,
+ stride=stride,
+ padding=padding,
+ groups=num_groups,
+ act=None,
+ use_cudnn=use_cudnn,
+ param_attr=ParamAttr(name=name + '_weights'),
+ bias_attr=False)
+ bn_name = name + '_bn'
+ bn = fluid.layers.batch_norm(
+ input=conv,
+ param_attr=ParamAttr(
+ name=bn_name + "_scale",
+ regularizer=fluid.regularizer.L2DecayRegularizer(
+ regularization_coeff=0.0)),
+ bias_attr=ParamAttr(
+ name=bn_name + "_offset",
+ regularizer=fluid.regularizer.L2DecayRegularizer(
+ regularization_coeff=0.0)),
+ moving_mean_name=bn_name + '_mean',
+ moving_variance_name=bn_name + '_variance')
+ if if_act:
+ if act == 'relu':
+ bn = fluid.layers.relu(bn)
+ elif act == 'hard_swish':
+ bn = fluid.layers.hard_swish(bn)
+ return bn
+
+ def make_divisible(self, v, divisor=8, min_value=None):
+ if min_value is None:
+ min_value = divisor
+ new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+ if new_v < 0.9 * v:
+ new_v += divisor
+ return new_v
+
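+ # Squeeze-and-Excitation: global average pool, 1x1 reduce (relu),
+ # 1x1 expand (hard sigmoid), then channel-wise rescale of the input.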
+ def se_block(self, input, num_out_filter, ratio=4, name=None):
+ num_mid_filter = num_out_filter // ratio
+ pool = fluid.layers.pool2d(
+ input=input, pool_type='avg', global_pooling=True, use_cudnn=False)
+ conv1 = fluid.layers.conv2d(
+ input=pool,
+ filter_size=1,
+ num_filters=num_mid_filter,
+ act='relu',
+ param_attr=ParamAttr(name=name + '_1_weights'),
+ bias_attr=ParamAttr(name=name + '_1_offset'))
+ conv2 = fluid.layers.conv2d(
+ input=conv1,
+ filter_size=1,
+ num_filters=num_out_filter,
+ act='hard_sigmoid',
+ param_attr=ParamAttr(name=name + '_2_weights'),
+ bias_attr=ParamAttr(name=name + '_2_offset'))
+ scale = fluid.layers.elementwise_mul(x=input, y=conv2, axis=0)
+ return scale
+
+ def residual_unit(self,
+ input,
+ num_in_filter,
+ num_mid_filter,
+ num_out_filter,
+ stride,
+ filter_size,
+ act=None,
+ use_se=False,
+ name=None,
+ short=False):
+
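+ # short=True skips the pointwise expansion conv (used for the first
+ # block, where the input already has the expanded width).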
+ if not short:
+ conv0 = self.conv_bn_layer(
+ input=input,
+ filter_size=1,
+ num_filters=num_mid_filter,
+ stride=1,
+ padding=0,
+ if_act=True,
+ act=act,
+ name=name + '_expand')
+ else:
+ conv0 = input
+
+ conv1 = self.conv_bn_layer(
+ input=conv0,
+ filter_size=filter_size,
+ num_filters=num_mid_filter,
+ stride=stride,
+ padding=int((filter_size - 1) // 2),
+ if_act=True,
+ act=act,
+ num_groups=num_mid_filter,
+ use_cudnn=False,
+ name=name + '_depthwise')
+ if use_se:
+ conv1 = self.se_block(
+ input=conv1, num_out_filter=num_mid_filter, name=name + '_se')
+
+ conv2 = self.conv_bn_layer(
+ input=conv1,
+ filter_size=1,
+ num_filters=num_out_filter,
+ stride=1,
+ padding=0,
+ if_act=False,
+ name=name + '_linear',
+ res_last_bn_init=True)
+ if num_in_filter != num_out_filter or stride != 1:
+ return conv2
+ else:
+ return fluid.layers.elementwise_add(x=input, y=conv2, act=None)
+
+
+def SlimMobileNet_v1(token):
+ token = [
+ 5, 3, 3, 7, 3, 3, 5, 7, 3, 3, 3, 3, 3, 3, 7, 3, 5, 3, 3, 3, 3, 3, 3, 6,
+ 3, 3, 3, 3, 4, 4, 4, 6, 4, 3, 4, 3, 6, 4, 3, 3, 2, 2, 2, 2, 4
+ ]
+ model = SlimMobileNet(model_name='large', scale=1.0, token=token)
+ return model
+
+
+def SlimMobileNet_v2(token):
+ token = [
+ 5, 3, 5, 7, 3, 3, 7, 3, 5, 3, 3, 7, 3, 3, 3, 5, 5, 5, 3, 3, 3, 3, 4, 6,
+ 3, 3, 6, 3, 4, 4, 3, 4, 4, 4, 3, 6, 6, 4, 3, 3, 2, 2, 3, 2, 4
+ ]
+ model = SlimMobileNet(model_name='large', scale=1.0, token=token)
+ return model
+
+
+def SlimMobileNet_v3(token):
+ token = [
+ 3, 3, 3, 3, 5, 3, 7, 7, 7, 3, 3, 7, 5, 3, 5, 7, 5, 3, 3, 3, 3, 3, 3, 3,
+ 3, 4, 3, 4, 3, 6, 4, 4, 4, 4, 6, 3, 6, 4, 6, 3, 2, 2, 3, 2, 4
+ ]
+ model = SlimMobileNet(model_name='large', scale=1.0, token=token)
+ return model
+
+
+def SlimMobileNet_v4(token):
+ token = [
+ 3, 3, 3, 3, 5, 3, 3, 5, 7, 3, 5, 5, 5, 3, 3, 7, 3, 5, 3, 3, 3, 3, 4, 6,
+ 3, 4, 4, 6, 4, 6, 4, 6, 4, 6, 4, 4, 6, 6, 6, 4, 2, 3, 3, 3, 4
+ ]
+ model = SlimMobileNet(model_name='large', scale=1.0, token=token)
+ return model
+
+
+def SlimMobileNet_v5(token):
+ token = [
+ 7, 7, 3, 5, 7, 3, 5, 3, 7, 5, 3, 3, 5, 3, 7, 5, 7, 7, 5, 3, 3, 3, 6, 3,
+ 4, 6, 3, 6, 6, 3, 6, 4, 6, 6, 4, 3, 6, 6, 6, 6, 4, 4, 4, 4, 4
+ ]
+ model = SlimMobileNet(model_name='large', scale=1.0, token=token)
+ return model
+
+
+if __name__ == "__main__":
+ pass
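+ # Usage sketch (shapes and class_dim are assumptions; the preset
+ # builders above ignore their token argument and use fixed tokens):
+ # image = fluid.data(
+ #     name='image', shape=[-1, 3, 224, 224], dtype='float32')
+ # out = SlimMobileNet_v1(token=None).net(image, class_dim=1000)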
diff --git a/paddleslim/models/slimfacenet.py b/paddleslim/models/slimfacenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..5276a515c2d08e78360eae1ccab01471fb46f5ed
--- /dev/null
+++ b/paddleslim/models/slimfacenet.py
@@ -0,0 +1,373 @@
+# ================================================================
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import datetime
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+
+
+class SlimFaceNet():
+ def __init__(self, class_dim, scale=0.6, arch=None):
+
+ assert arch is not None
+ self.arch = arch
+ self.class_dim = class_dim
+ kernels = [3]
+ expansions = [2, 4, 6]
+ SE = [0, 1]
+ self.table = []
+ for k in kernels:
+ for e in expansions:
+ for se in SE:
+ self.table.append((k, e, se))
+
+ if scale == 1.0:
+ # 100% - channel
+ self.Slimfacenet_bottleneck_setting = [
+ # t, c , n ,s
+ [2, 64, 5, 2],
+ [4, 128, 1, 2],
+ [2, 128, 6, 1],
+ [4, 128, 1, 2],
+ [2, 128, 2, 1]
+ ]
+ elif scale == 0.9:
+ # 90% - channel
+ self.Slimfacenet_bottleneck_setting = [
+ # t, c , n ,s
+ [2, 56, 5, 2],
+ [4, 116, 1, 2],
+ [2, 116, 6, 1],
+ [4, 116, 1, 2],
+ [2, 116, 2, 1]
+ ]
+ elif scale == 0.75:
+ # 75% - channel
+ self.Slimfacenet_bottleneck_setting = [
+ # t, c , n ,s
+ [2, 48, 5, 2],
+ [4, 96, 1, 2],
+ [2, 96, 6, 1],
+ [4, 96, 1, 2],
+ [2, 96, 2, 1]
+ ]
+ elif scale == 0.6:
+ # 60% - channel
+ self.Slimfacenet_bottleneck_setting = [
+ # t, c , n ,s
+ [2, 40, 5, 2],
+ [4, 76, 1, 2],
+ [2, 76, 6, 1],
+ [4, 76, 1, 2],
+ [2, 76, 2, 1]
+ ]
+ else:
+ raise ValueError('Unsupported scale: {}'.format(scale))
+ self.extract_feature = True
+
+ def set_extract_feature_flag(self, flag):
+ self.extract_feature = flag
+
+ def net(self, input, label=None):
+ x = self.conv_bn_layer(
+ input,
+ filter_size=3,
+ num_filters=64,
+ stride=2,
+ padding=1,
+ num_groups=1,
+ if_act=True,
+ name='conv3x3')
+ x = self.conv_bn_layer(
+ x,
+ filter_size=3,
+ num_filters=64,
+ stride=1,
+ padding=1,
+ num_groups=64,
+ if_act=True,
+ name='dw_conv3x3')
+
+ in_c = 64
+ cnt = 0
+ for _exp, out_c, times, _stride in self.Slimfacenet_bottleneck_setting:
+ for i in range(times):
+ stride = _stride if i == 0 else 1
+ filter_size, exp, se = self.table[self.arch[cnt]]
+ se = False if se == 0 else True
+ x = self.residual_unit(
+ x,
+ num_in_filter=in_c,
+ num_out_filter=out_c,
+ stride=stride,
+ filter_size=filter_size,
+ expansion_factor=exp,
+ use_se=se,
+ name='residual_unit' + str(cnt + 1))
+ cnt += 1
+ in_c = out_c
+
+ out_c = 512
+ x = self.conv_bn_layer(
+ x,
+ filter_size=1,
+ num_filters=out_c,
+ stride=1,
+ padding=0,
+ num_groups=1,
+ if_act=True,
+ name='conv1x1')
+ x = self.conv_bn_layer(
+ x,
+ filter_size=(7, 6),
+ num_filters=out_c,
+ stride=1,
+ padding=0,
+ num_groups=out_c,
+ if_act=False,
+ name='global_dw_conv7x7')
+ x = fluid.layers.conv2d(
+ x,
+ num_filters=128,
+ filter_size=1,
+ stride=1,
+ padding=0,
+ groups=1,
+ act=None,
+ use_cudnn=True,
+ param_attr=ParamAttr(
+ name='linear_conv1x1_weights',
+ initializer=MSRA(),
+ regularizer=fluid.regularizer.L2Decay(4e-4)),
+ bias_attr=False)
+ bn_name = 'linear_conv1x1_bn'
+ x = fluid.layers.batch_norm(
+ x,
+ param_attr=ParamAttr(name=bn_name + "_scale"),
+ bias_attr=ParamAttr(name=bn_name + "_offset"),
+ moving_mean_name=bn_name + '_mean',
+ moving_variance_name=bn_name + '_variance')
+
+ x = fluid.layers.reshape(x, shape=[x.shape[0], x.shape[1]])
+
+ if self.extract_feature:
+ return x
+
+ out = self.arc_margin_product(
+ x, label, self.class_dim, s=32.0, m=0.50, mode=2)
+ softmax = fluid.layers.softmax(input=out)
+ cost = fluid.layers.cross_entropy(input=softmax, label=label)
+ loss = fluid.layers.mean(x=cost)
+ acc = fluid.layers.accuracy(input=out, label=label, k=1)
+ return loss, acc
+
+ def residual_unit(self,
+ input,
+ num_in_filter,
+ num_out_filter,
+ stride,
+ filter_size,
+ expansion_factor,
+ use_se=False,
+ name=None):
+
+ num_expfilter = int(round(num_in_filter * expansion_factor))
+ input_data = input
+
+ expand_conv = self.conv_bn_layer(
+ input=input,
+ filter_size=1,
+ num_filters=num_expfilter,
+ stride=1,
+ padding=0,
+ if_act=True,
+ name=name + '_expand')
+
+ depthwise_conv = self.conv_bn_layer(
+ input=expand_conv,
+ filter_size=filter_size,
+ num_filters=num_expfilter,
+ stride=stride,
+ padding=int((filter_size - 1) // 2),
+ if_act=True,
+ num_groups=num_expfilter,
+ use_cudnn=True,
+ name=name + '_depthwise')
+
+ if use_se:
+ depthwise_conv = self.se_block(
+ input=depthwise_conv,
+ num_out_filter=num_expfilter,
+ name=name + '_se')
+
+ linear_conv = self.conv_bn_layer(
+ input=depthwise_conv,
+ filter_size=1,
+ num_filters=num_out_filter,
+ stride=1,
+ padding=0,
+ if_act=False,
+ name=name + '_linear')
+ if num_in_filter != num_out_filter or stride != 1:
+ return linear_conv
+ else:
+ return fluid.layers.elementwise_add(
+ x=input_data, y=linear_conv, act=None)
+
+ def se_block(self, input, num_out_filter, ratio=4, name=None):
+ num_mid_filter = int(num_out_filter // ratio)
+ pool = fluid.layers.pool2d(
+ input=input, pool_type='avg', global_pooling=True, use_cudnn=False)
+ conv1 = fluid.layers.conv2d(
+ input=pool,
+ filter_size=1,
+ num_filters=num_mid_filter,
+ act=None,
+ param_attr=ParamAttr(name=name + '_1_weights'),
+ bias_attr=ParamAttr(name=name + '_1_offset'))
+ conv1 = fluid.layers.prelu(
+ conv1,
+ mode='channel',
+ param_attr=ParamAttr(
+ name=name + '_prelu',
+ regularizer=fluid.regularizer.L2Decay(0.0)))
+ conv2 = fluid.layers.conv2d(
+ input=conv1,
+ filter_size=1,
+ num_filters=num_out_filter,
+ act='hard_sigmoid',
+ param_attr=ParamAttr(name=name + '_2_weights'),
+ bias_attr=ParamAttr(name=name + '_2_offset'))
+ scale = fluid.layers.elementwise_mul(x=input, y=conv2, axis=0)
+ return scale
+
+ def conv_bn_layer(self,
+ input,
+ filter_size,
+ num_filters,
+ stride,
+ padding,
+ num_groups=1,
+ if_act=True,
+ name=None,
+ use_cudnn=True):
+ conv = fluid.layers.conv2d(
+ input=input,
+ num_filters=num_filters,
+ filter_size=filter_size,
+ stride=stride,
+ padding=padding,
+ groups=num_groups,
+ act=None,
+ use_cudnn=use_cudnn,
+ param_attr=ParamAttr(
+ name=name + '_weights', initializer=MSRA()),
+ bias_attr=False)
+ bn_name = name + '_bn'
+ bn = fluid.layers.batch_norm(
+ input=conv,
+ param_attr=ParamAttr(name=bn_name + "_scale"),
+ bias_attr=ParamAttr(name=bn_name + "_offset"),
+ moving_mean_name=bn_name + '_mean',
+ moving_variance_name=bn_name + '_variance')
+ if if_act:
+ return fluid.layers.prelu(
+ bn,
+ mode='channel',
+ param_attr=ParamAttr(
+ name=name + '_prelu',
+ regularizer=fluid.regularizer.L2Decay(0.0)))
+ else:
+ return bn
+
+ def arc_margin_product(self, input, label, out_dim, s=32.0, m=0.50,
+ mode=2):
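+ # ArcFace-style margin: with L2-normalized features and weights the
+ # logit equals cos(theta); the target-class logit is replaced by
+ # cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m) and scaled by s.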
+ input_norm = fluid.layers.sqrt(
+ fluid.layers.reduce_sum(
+ fluid.layers.square(input), dim=1))
+ input = fluid.layers.elementwise_div(input, input_norm, axis=0)
+
+ weight = fluid.layers.create_parameter(
+ shape=[out_dim, input.shape[1]],
+ dtype='float32',
+ name='weight_norm',
+ attr=fluid.param_attr.ParamAttr(
+ initializer=fluid.initializer.Xavier(),
+ regularizer=fluid.regularizer.L2Decay(4e-4)))
+
+ weight_norm = fluid.layers.sqrt(
+ fluid.layers.reduce_sum(
+ fluid.layers.square(weight), dim=1))
+ weight = fluid.layers.elementwise_div(weight, weight_norm, axis=0)
+ weight = fluid.layers.transpose(weight, perm=[1, 0])
+ cosine = fluid.layers.mul(input, weight)
+ sine = fluid.layers.sqrt(1.0 - fluid.layers.square(cosine))
+
+ cos_m = math.cos(m)
+ sin_m = math.sin(m)
+ phi = cosine * cos_m - sine * sin_m
+
+ th = math.cos(math.pi - m)
+ mm = math.sin(math.pi - m) * m
+
+ if mode == 1:
+ phi = self.paddle_where_more_than(cosine, 0, phi, cosine)
+ elif mode == 2:
+ phi = self.paddle_where_more_than(cosine, th, phi, cosine - mm)
+ else:
+ pass
+
+ one_hot = fluid.one_hot(input=label, depth=out_dim)
+ output = fluid.layers.elementwise_mul(
+ one_hot, phi) + fluid.layers.elementwise_mul(
+ (1.0 - one_hot), cosine)
+ output = output * s
+ return output
+
+ def paddle_where_more_than(self, target, limit, x, y):
+ mask = fluid.layers.cast(x=(target > limit), dtype='float32')
+ output = fluid.layers.elementwise_mul(
+ mask, x) + fluid.layers.elementwise_mul((1.0 - mask), y)
+ return output
+
+
+def SlimFaceNet_A_x0_60(class_dim=None, scale=0.6, arch=None):
+ scale = 0.6
+ arch = [0, 1, 5, 1, 0, 2, 1, 2, 0, 1, 2, 1, 1, 0, 1]
+ return SlimFaceNet(class_dim=class_dim, scale=scale, arch=arch)
+
+
+def SlimFaceNet_B_x0_75(class_dim=None, scale=0.6, arch=None):
+ scale = 0.75
+ arch = [1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 3, 2, 2, 3]
+ return SlimFaceNet(class_dim=class_dim, scale=scale, arch=arch)
+
+
+def SlimFaceNet_C_x0_75(class_dim=None, scale=0.6, arch=None):
+ scale = 0.75
+ arch = [1, 3, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 5, 5, 5]
+ return SlimFaceNet(class_dim=class_dim, scale=scale, arch=arch)
+
+
+if __name__ == "__main__":
+ x = fluid.data(name='x', shape=[-1, 3, 112, 112], dtype='float32')
+ print(x.shape)
+ model = SlimFaceNet(
+ 10000, arch=[1, 3, 3, 1, 1, 0, 0, 1, 0, 1, 1, 0, 5, 5, 3])
+ y = model.net(x)
diff --git a/paddleslim/models/util.py b/paddleslim/models/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..962db792e1c5214e96e6ffd73cdec0140b39b430
--- /dev/null
+++ b/paddleslim/models/util.py
@@ -0,0 +1,32 @@
+from __future__ import absolute_import
+import paddle.fluid as fluid
+from ..models import classification_models
+
+__all__ = ["image_classification"]
+
+model_list = classification_models.model_list
+
+
+def image_classification(model, image_shape, class_num, use_gpu=False):
+ assert model in model_list
+ train_program = fluid.Program()
+ startup_program = fluid.Program()
+ with fluid.program_guard(train_program, startup_program):
+ image = fluid.layers.data(
+ name='image', shape=image_shape, dtype='float32')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+ model = classification_models.__dict__[model]()
+ out = model.net(input=image, class_dim=class_num)
+ cost = fluid.layers.cross_entropy(input=out, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+ acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+ acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+ val_program = fluid.default_main_program().clone(for_test=True)
+
+ opt = fluid.optimizer.Momentum(0.1, 0.9)
+ opt.minimize(avg_cost)
+ place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+ return exe, train_program, val_program, (image, label), (
+ acc_top1.name, acc_top5.name, avg_cost.name, out.name)
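+
+
+# Usage sketch (the model name and shapes are assumptions; any entry of
+# model_list works):
+# exe, train_prog, val_prog, (image, label), fetches = image_classification(
+#     'MobileNet', image_shape=[3, 224, 224], class_num=1000, use_gpu=False)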
diff --git a/paddleslim/nas/__init__.py b/paddleslim/nas/__init__.py
index c86051a8676762a76ce85b641ac90917359366b3..d438e54c572efb25fdf9f33db0d4c4d10a2487e7 100644
--- a/paddleslim/nas/__init__.py
+++ b/paddleslim/nas/__init__.py
@@ -11,8 +11,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
+from __future__ import absolute_import
+from ..nas import search_space
from .search_space import *
-from .sa_nas import SANAS
+from ..nas import sa_nas
+from .sa_nas import *
+from .rl_nas import *
+from ..nas import darts
+from .darts import *
-__all__ = ['SANAS']
+__all__ = []
+__all__ += sa_nas.__all__
+__all__ += search_space.__all__
+__all__ += rl_nas.__all__
+__all__ += darts.__all__
diff --git a/paddleslim/nas/darts/__init__.py b/paddleslim/nas/darts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..71bc85583f7aa1e5440ff1ee65a580ecbfcfd5a2
--- /dev/null
+++ b/paddleslim/nas/darts/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from ..darts import train_search
+from .train_search import *
+from ..darts import search_space
+from .search_space import *
+
+__all__ = []
+__all__ += train_search.__all__
+__all__ += search_space.__all__
diff --git a/paddleslim/nas/darts/architect.py b/paddleslim/nas/darts/architect.py
new file mode 100644
index 0000000000000000000000000000000000000000..ada39ba5cb5084c4fa075ade1179c5914801f0c7
--- /dev/null
+++ b/paddleslim/nas/darts/architect.py
@@ -0,0 +1,179 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.base import to_variable
+
+
+class Architect(object):
+ def __init__(self, model, eta, arch_learning_rate, unrolled, parallel):
+ self.network_momentum = 0.9
+ self.network_weight_decay = 3e-4
+ self.eta = eta
+ self.model = model
+ self.optimizer = fluid.optimizer.Adam(
+ arch_learning_rate,
+ 0.5,
+ 0.999,
+ regularization=fluid.regularizer.L2Decay(1e-3),
+ parameter_list=self.model.arch_parameters())
+ self.unrolled = unrolled
+ self.parallel = parallel
+ if self.unrolled:
+ self.unrolled_model = self.model.new()
+ self.unrolled_model_params = [
+ p for p in self.unrolled_model.parameters()
+ if p.name not in [
+ a.name for a in self.unrolled_model.arch_parameters()
+ ] and p.trainable
+ ]
+ self.unrolled_optimizer = fluid.optimizer.MomentumOptimizer(
+ self.eta,
+ self.network_momentum,
+ regularization=fluid.regularizer.L2DecayRegularizer(
+ self.network_weight_decay),
+ parameter_list=self.unrolled_model_params)
+
+ if self.parallel:
+ strategy = fluid.dygraph.parallel.prepare_context()
+ self.parallel_model = fluid.dygraph.parallel.DataParallel(
+ self.model, strategy)
+ if self.unrolled:
+ self.parallel_unrolled_model = fluid.dygraph.parallel.DataParallel(
+ self.unrolled_model, strategy)
+
+ def get_model(self):
+ return self.parallel_model if self.parallel else self.model
+
+ def step(self, input_train, target_train, input_valid, target_valid):
+ if self.unrolled:
+ params_grads = self._backward_step_unrolled(
+ input_train, target_train, input_valid, target_valid)
+ self.optimizer.apply_gradients(params_grads)
+ else:
+ loss = self._backward_step(input_valid, target_valid)
+ self.optimizer.minimize(loss)
+ self.optimizer.clear_gradients()
+
+ def _backward_step(self, input_valid, target_valid):
+ loss = self.model._loss(input_valid, target_valid)
+ if self.parallel:
+ loss = self.parallel_model.scale_loss(loss)
+ loss.backward()
+ self.parallel_model.apply_collective_grads()
+ else:
+ loss.backward()
+ return loss
+
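+ # Second-order DARTS step: evaluate the validation loss at the
+ # one-step-unrolled weights w' = w - eta * grad_w(L_train), then
+ # correct grad_alpha(L_val) with a Hessian-vector term approximated
+ # by finite differences in _hessian_vector_product.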
+ def _backward_step_unrolled(self, input_train, target_train, input_valid,
+ target_valid):
+ self._compute_unrolled_model(input_train, target_train)
+ unrolled_loss = self.unrolled_model._loss(input_valid, target_valid)
+
+ if self.parallel:
+ unrolled_loss = self.parallel_unrolled_model.scale_loss(
+ unrolled_loss)
+ unrolled_loss.backward()
+ self.parallel_unrolled_model.apply_collective_grads()
+ else:
+ unrolled_loss.backward()
+
+ vector = [
+ to_variable(param._grad_ivar().numpy())
+ for param in self.unrolled_model_params
+ ]
+ arch_params_grads = [
+ (alpha, to_variable(ualpha._grad_ivar().numpy()))
+ for alpha, ualpha in zip(self.model.arch_parameters(),
+ self.unrolled_model.arch_parameters())
+ ]
+ self.unrolled_model.clear_gradients()
+
+ implicit_grads = self._hessian_vector_product(vector, input_train,
+ target_train)
+ for (p, g), ig in zip(arch_params_grads, implicit_grads):
+ new_g = g - (ig * self.unrolled_optimizer.current_step_lr())
+ fluid.layers.assign(new_g.detach(), g)
+ return arch_params_grads
+
+ def _compute_unrolled_model(self, input, target):
+ for x, y in zip(self.unrolled_model.parameters(),
+ self.model.parameters()):
+ fluid.layers.assign(y.detach(), x)
+
+ loss = self.unrolled_model._loss(input, target)
+ if self.parallel:
+ loss = self.parallel_unrolled_model.scale_loss(loss)
+ loss.backward()
+ self.parallel_unrolled_model.apply_collective_grads()
+ else:
+ loss.backward()
+
+ self.unrolled_optimizer.minimize(loss)
+ self.unrolled_model.clear_gradients()
+
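+ # Approximates grad_alpha(grad_w(L_train) . v) by central differences:
+ # (grad_alpha L(w + R*v) - grad_alpha L(w - R*v)) / (2R), with
+ # R = r / ||v||_2 so the perturbation is scale-invariant.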
+ def _hessian_vector_product(self, vector, input, target, r=1e-2):
+ R = r * fluid.layers.rsqrt(
+ fluid.layers.sum([
+ fluid.layers.reduce_sum(fluid.layers.square(v)) for v in vector
+ ]))
+
+ model_params = [
+ p for p in self.model.parameters()
+ if p.name not in [a.name for a in self.model.arch_parameters()] and
+ p.trainable
+ ]
+ for param, grad in zip(model_params, vector):
+ param_p = param + grad * R
+ fluid.layers.assign(param_p.detach(), param)
+ loss = self.model._loss(input, target)
+ if self.parallel:
+ loss = self.parallel_model.scale_loss(loss)
+ loss.backward()
+ self.parallel_model.apply_collective_grads()
+ else:
+ loss.backward()
+
+ grads_p = [
+ to_variable(param._grad_ivar().numpy())
+ for param in self.model.arch_parameters()
+ ]
+
+ for param, grad in zip(model_params, vector):
+ param_n = param - grad * R * 2
+ fluid.layers.assign(param_n.detach(), param)
+ self.model.clear_gradients()
+
+ loss = self.model._loss(input, target)
+ if self.parallel:
+ loss = self.parallel_model.scale_loss(loss)
+ loss.backward()
+ self.parallel_model.apply_collective_grads()
+ else:
+ loss.backward()
+
+ grads_n = [
+ to_variable(param._grad_ivar().numpy())
+ for param in self.model.arch_parameters()
+ ]
+ for param, grad in zip(model_params, vector):
+ param_o = param + grad * R
+ fluid.layers.assign(param_o.detach(), param)
+ self.model.clear_gradients()
+ arch_grad = [(p - n) / (2 * R) for p, n in zip(grads_p, grads_n)]
+ return arch_grad
diff --git a/paddleslim/nas/darts/architect_for_bert.py b/paddleslim/nas/darts/architect_for_bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1f6df26175539ccf4660fd440963395a850fc7b
--- /dev/null
+++ b/paddleslim/nas/darts/architect_for_bert.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.base import to_variable
+
+
+class Architect(object):
+ def __init__(self, model, eta, arch_learning_rate, place, unrolled):
+ self.network_momentum = 0.9
+ self.network_weight_decay = 1e-3
+ self.eta = eta
+ self.model = model
+ self.optimizer = fluid.optimizer.Adam(
+ arch_learning_rate,
+ 0.5,
+ 0.999,
+ regularization=fluid.regularizer.L2Decay(1e-3),
+ parameter_list=self.model.arch_parameters())
+ self.place = place
+ self.unrolled = unrolled
+ if self.unrolled:
+ self.unrolled_model = self.model.new()
+ self.unrolled_model_params = [
+ p for p in self.unrolled_model.parameters()
+ if p.name not in [
+ a.name for a in self.unrolled_model.arch_parameters()
+ ] and p.trainable
+ ]
+ self.unrolled_optimizer = fluid.optimizer.MomentumOptimizer(
+ self.eta,
+ self.network_momentum,
+ regularization=fluid.regularizer.L2DecayRegularizer(
+ self.network_weight_decay),
+ parameter_list=self.unrolled_model_params)
+
+ def step(self, train_data, valid_data, epoch):
+ if self.unrolled:
+ params_grads = self._backward_step_unrolled(train_data, valid_data)
+ self.optimizer.apply_gradients(params_grads)
+ else:
+ loss = self._backward_step(valid_data, epoch)
+ self.optimizer.minimize(loss)
+ self.optimizer.clear_gradients()
+
+ def _backward_step(self, valid_data, epoch):
+ loss = self.model.loss(valid_data, epoch)
+ loss[0].backward()
+ return loss[0]
+
+ def _backward_step_unrolled(self, train_data, valid_data):
+ self._compute_unrolled_model(train_data)
+ unrolled_loss = self.unrolled_model.loss(valid_data)
+
+ unrolled_loss.backward()
+ vector = [
+ to_variable(param._grad_ivar().numpy())
+ for param in self.unrolled_model_params
+ ]
+ arch_params_grads = [
+ (alpha, to_variable(ualpha._grad_ivar().numpy()))
+ for alpha, ualpha in zip(self.model.arch_parameters(),
+ self.unrolled_model.arch_parameters())
+ ]
+ self.unrolled_model.clear_gradients()
+
+ implicit_grads = self._hessian_vector_product(vector, train_data)
+ for (p, g), ig in zip(arch_params_grads, implicit_grads):
+ new_g = g - (ig * self.unrolled_optimizer.current_step_lr())
+ g.value().get_tensor().set(new_g.numpy(), self.place)
+ return arch_params_grads
+
+ def _compute_unrolled_model(self, data):
+ for x, y in zip(self.unrolled_model.parameters(),
+ self.model.parameters()):
+ x.value().get_tensor().set(y.numpy(), self.place)
+ loss = self.unrolled_model._loss(data)
+ loss.backward()
+ self.unrolled_optimizer.minimize(loss)
+ self.unrolled_model.clear_gradients()
+
+ def _hessian_vector_product(self, vector, data, r=1e-2):
+ R = r * fluid.layers.rsqrt(
+ fluid.layers.sum([
+ fluid.layers.reduce_sum(fluid.layers.square(v)) for v in vector
+ ]))
+
+ model_params = [
+ p for p in self.model.parameters()
+ if p.name not in [a.name for a in self.model.arch_parameters()] and
+ p.trainable
+ ]
+ for param, grad in zip(model_params, vector):
+ param_p = param + grad * R
+ param.value().get_tensor().set(param_p.numpy(), self.place)
+ loss = self.model.loss(data)
+ loss.backward()
+ grads_p = [
+ to_variable(param._grad_ivar().numpy())
+ for param in self.model.arch_parameters()
+ ]
+
+ for param, grad in zip(model_params, vector):
+ param_n = param - grad * R * 2
+ param.value().get_tensor().set(param_n.numpy(), self.place)
+ self.model.clear_gradients()
+
+ loss = self.model.loss(data)
+ loss.backward()
+ grads_n = [
+ to_variable(param._grad_ivar().numpy())
+ for param in self.model.arch_parameters()
+ ]
+ for param, grad in zip(model_params, vector):
+ param_o = param + grad * R
+ param.value().get_tensor().set(param_o.numpy(), self.place)
+ self.model.clear_gradients()
+ arch_grad = [(p - n) / (2 * R) for p, n in zip(grads_p, grads_n)]
+ return arch_grad
diff --git a/paddleslim/nas/darts/get_genotype.py b/paddleslim/nas/darts/get_genotype.py
new file mode 100644
index 0000000000000000000000000000000000000000..87a1da14a6ccf84c2adaeb73f603f103774dbf1f
--- /dev/null
+++ b/paddleslim/nas/darts/get_genotype.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle.fluid as fluid
+from collections import namedtuple
+
+Genotype = namedtuple('Genotype', 'normal normal_concat reduce reduce_concat')
+
+
+def get_genotype(model):
+ def _parse(weights, weights2=None):
+ gene = []
+ n = 2
+ start = 0
+ for i in range(model._steps):
+ end = start + n
+ W = weights[start:end].copy()
+ if model._method == "PC-DARTS":
+ W2 = weights2[start:end].copy()
+ for j in range(n):
+ W[j, :] = W[j, :] * W2[j]
+ # keep the two incoming edges whose strongest non-'none' op weight
+ # is largest
+ none_idx = model._primitives.index('none')
+ edges = sorted(
+ range(i + 2),
+ key=lambda x: -max(W[x][k] for k in range(len(W[x])) if k != none_idx))[:2]
+ for j in edges:
+ k_best = None
+ for k in range(len(W[j])):
+ if k != none_idx:
+ if k_best is None or W[j][k] > W[j][k_best]:
+ k_best = k
+ gene.append((model._primitives[k_best], j))
+ start = end
+ n += 1
+ return gene
+
+ weightsr2 = None
+ weightsn2 = None
+ if model._method == "PC-DARTS":
+ n = 3
+ start = 2
+ weightsr2 = fluid.layers.softmax(model.betas_reduce[0:2])
+ weightsn2 = fluid.layers.softmax(model.betas_normal[0:2])
+ for i in range(model._steps - 1):
+ end = start + n
+ tw2 = fluid.layers.softmax(model.betas_reduce[start:end])
+ tn2 = fluid.layers.softmax(model.betas_normal[start:end])
+ start = end
+ n += 1
+ weightsr2 = fluid.layers.concat([weightsr2, tw2])
+ weightsn2 = fluid.layers.concat([weightsn2, tn2])
+ weightsr2 = weightsr2.numpy()
+ weightsn2 = weightsn2.numpy()
+
+ gene_normal = _parse(
+ fluid.layers.softmax(model.alphas_normal).numpy(), weightsn2)
+ gene_reduce = _parse(
+ fluid.layers.softmax(model.alphas_reduce).numpy(), weightsr2)
+
+ concat = range(2 + model._steps - model._multiplier, model._steps + 2)
+ genotype = Genotype(
+ normal=gene_normal,
+ normal_concat=concat,
+ reduce=gene_reduce,
+ reduce_concat=concat)
+ return genotype
diff --git a/paddleslim/nas/darts/search_space/__init__.py b/paddleslim/nas/darts/search_space/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..49be316d98c9c44c8d9f07c835730d43a9613925
--- /dev/null
+++ b/paddleslim/nas/darts/search_space/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from ..search_space import conv_bert
+from .conv_bert import *
+
+__all__ = []
+__all__ += conv_bert.__all__
diff --git a/paddleslim/nas/darts/search_space/conv_bert/__init__.py b/paddleslim/nas/darts/search_space/conv_bert/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..16c5ae86f778d388ad0ffab2711440baa2f6030e
--- /dev/null
+++ b/paddleslim/nas/darts/search_space/conv_bert/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from ..conv_bert import cls
+from .cls import *
+
+__all__ = []
+__all__ += cls.__all__
diff --git a/paddleslim/nas/darts/search_space/conv_bert/cls.py b/paddleslim/nas/darts/search_space/conv_bert/cls.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c46c443117399993d22200f8832b1bd1f66dcf2
--- /dev/null
+++ b/paddleslim/nas/darts/search_space/conv_bert/cls.py
@@ -0,0 +1,164 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT fine-tuning in Paddle Dygraph Mode."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import six
+import sys
+if six.PY2:
+ reload(sys)
+ sys.setdefaultencoding('utf8')
+import ast
+import time
+import argparse
+import numpy as np
+import multiprocessing
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import to_variable, Layer, Linear
+from .reader.cls import *
+from .model.bert import BertModelLayer
+from .optimization import Optimizer
+from .utils.init import init_from_static_model
+from paddleslim.teachers.bert import BERTClassifier
+
+__all__ = ["AdaBERTClassifier"]
+
+
+class AdaBERTClassifier(Layer):
+ def __init__(self,
+ num_labels,
+ n_layer=8,
+ emb_size=768,
+ hidden_size=768,
+ gamma=0.8,
+ beta=4,
+ task_name='mnli',
+ conv_type="conv_bn",
+ search_layer=False,
+ teacher_model=None,
+ data_dir=None,
+ use_fixed_gumbel=False,
+ gumbel_alphas=None,
+ fix_emb=False,
+ t=5.0):
+ super(AdaBERTClassifier, self).__init__()
+ self._n_layer = n_layer
+ self._num_labels = num_labels
+ self._emb_size = emb_size
+ self._hidden_size = hidden_size
+ self._gamma = gamma
+ self._beta = beta
+ self._conv_type = conv_type
+ self._search_layer = search_layer
+ self._teacher_model = teacher_model
+ self._data_dir = data_dir
+ self.use_fixed_gumbel = use_fixed_gumbel
+
+ self.T = t
+ print(
+ "----------------------load teacher model and test----------------------------------------"
+ )
+ self.teacher = BERTClassifier(
+ num_labels, task_name=task_name, model_path=self._teacher_model)
+ # global eval() setting; it is overwritten once training starts
+ # (about 1% acc loss otherwise)
+ self.teacher.eval()
+ self.teacher.test(self._data_dir)
+ print(
+ "----------------------finish load teacher model and test----------------------------------------"
+ )
+ self.student = BertModelLayer(
+ num_labels=num_labels,
+ n_layer=self._n_layer,
+ emb_size=self._emb_size,
+ hidden_size=self._hidden_size,
+ conv_type=self._conv_type,
+ search_layer=self._search_layer,
+ use_fixed_gumbel=self.use_fixed_gumbel,
+ gumbel_alphas=gumbel_alphas)
+
+ for s_emb, t_emb in zip(self.student.emb_names(),
+ self.teacher.emb_names()):
+ t_emb.stop_gradient = True
+ if fix_emb:
+ s_emb.stop_gradient = True
+ print(
+ "Assigning embedding[{}] from teacher to embedding[{}] in student.".
+ format(t_emb.name, s_emb.name))
+ fluid.layers.assign(input=t_emb, output=s_emb)
+ print(
+ "Assigned embedding[{}] from teacher to embedding[{}] in student.".
+ format(t_emb.name, s_emb.name))
+
+ def forward(self, data_ids, epoch):
+ return self.student(data_ids, epoch)
+
+ def arch_parameters(self):
+ return self.student.arch_parameters()
+
+ def loss(self, data_ids, epoch):
+ labels = data_ids[4]
+
+ s_logits = self.student(data_ids, epoch)
+
+ t_enc_outputs, t_logits, t_losses, t_accs, _ = self.teacher(data_ids)
+
+ # Distillation loss: match each student logit to a teacher layer j and
+ # weight the per-layer losses by a softmax over the negated teacher
+ # losses, so layers the teacher predicts well contribute more.
+ kd_weights = []
+ for i in range(len(s_logits)):
+ j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits))))
+ kd_weights.append(t_losses[j].numpy())
+
+ kd_weights = np.array(kd_weights)
+ kd_weights = np.squeeze(kd_weights)
+ kd_weights = to_variable(kd_weights)
+ kd_weights = fluid.layers.softmax(-kd_weights)
+
+ kd_losses = []
+ for i in range(len(s_logits)):
+ j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits))))
+ t_logit = t_logits[j]
+ s_logit = s_logits[i]
+ t_logit.stop_gradient = True
+ t_probs = fluid.layers.softmax(t_logit) # P_j^T
+ s_probs = fluid.layers.softmax(s_logit / self.T) #P_j^S
+ #kd_loss = -t_probs * fluid.layers.log(s_probs)
+ kd_loss = fluid.layers.cross_entropy(
+ input=s_probs, label=t_probs, soft_label=True)
+ kd_loss = fluid.layers.reduce_mean(kd_loss)
+ kd_loss = fluid.layers.scale(kd_loss, scale=kd_weights[i])
+ kd_losses.append(kd_loss)
+ kd_loss = fluid.layers.sum(kd_losses)
+
+ losses = []
+ for logit in s_logits:
+ ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
+ logits=logit, label=labels, return_softmax=True)
+ loss = fluid.layers.mean(x=ce_loss)
+ losses.append(loss)
+
+ num_seqs = fluid.layers.create_tensor(dtype='int64')
+ accuracy = fluid.layers.accuracy(
+ input=probs, label=labels, total=num_seqs)
+ ce_loss = fluid.layers.sum(losses)
+
+ total_loss = (1 - self._gamma) * ce_loss + self._gamma * kd_loss
+
+ return total_loss, accuracy, ce_loss, kd_loss, s_logits
diff --git a/paddleslim/nas/darts/search_space/conv_bert/model/__init__.py b/paddleslim/nas/darts/search_space/conv_bert/model/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/paddleslim/nas/darts/search_space/conv_bert/model/bert.py b/paddleslim/nas/darts/search_space/conv_bert/model/bert.py
new file mode 100755
index 0000000000000000000000000000000000000000..38de45e6d88058800b4880d7d2354f5a2b5605f9
--- /dev/null
+++ b/paddleslim/nas/darts/search_space/conv_bert/model/bert.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"dygraph transformer layers"
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import six
+import json
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, to_variable, Layer, guard
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm
+from paddle.fluid import ParamAttr
+from paddle.fluid.initializer import MSRA
+from .transformer_encoder import EncoderLayer
+
+
+class BertModelLayer(Layer):
+ def __init__(self,
+ num_labels,
+ emb_size=128,
+ hidden_size=768,
+ n_layer=12,
+ voc_size=30522,
+ max_position_seq_len=512,
+ sent_types=2,
+ return_pooled_out=True,
+ initializer_range=1.0,
+ conv_type="conv_bn",
+ search_layer=False,
+ use_fp16=False,
+ use_fixed_gumbel=False,
+ gumbel_alphas=None):
+ super(BertModelLayer, self).__init__()
+
+ self._emb_size = emb_size
+ self._hidden_size = hidden_size
+ self._n_layer = n_layer
+ self._voc_size = voc_size
+ self._max_position_seq_len = max_position_seq_len
+ self._sent_types = sent_types
+ self.return_pooled_out = return_pooled_out
+
+ self.use_fixed_gumbel = use_fixed_gumbel
+
+ self._word_emb_name = "s_word_embedding"
+ self._pos_emb_name = "s_pos_embedding"
+ self._sent_emb_name = "s_sent_embedding"
+ self._dtype = "float16" if use_fp16 else "float32"
+
+ self._conv_type = conv_type
+ self._search_layer = search_layer
+ self._param_initializer = fluid.initializer.TruncatedNormal(
+ scale=initializer_range)
+
+ self._src_emb = Embedding(
+ size=[self._voc_size, self._emb_size],
+ param_attr=fluid.ParamAttr(
+ name=self._word_emb_name, initializer=self._param_initializer),
+ dtype=self._dtype)
+
+ self._pos_emb = Embedding(
+ size=[self._max_position_seq_len, self._emb_size],
+ param_attr=fluid.ParamAttr(
+ name=self._pos_emb_name, initializer=self._param_initializer),
+ dtype=self._dtype)
+
+ self._sent_emb = Embedding(
+ size=[self._sent_types, self._emb_size],
+ param_attr=fluid.ParamAttr(
+ name=self._sent_emb_name, initializer=self._param_initializer),
+ dtype=self._dtype)
+
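+ # Factorized embedding: token embeddings live in the smaller emb_size
+ # space and are linearly projected up to hidden_size before entering
+ # the searched encoder (cf. ALBERT's embedding factorization).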
+ self._emb_fac = Linear(
+ input_dim=self._emb_size,
+ output_dim=self._hidden_size,
+ param_attr=fluid.ParamAttr(name="s_emb_factorization"))
+
+ self._encoder = EncoderLayer(
+ num_labels=num_labels,
+ n_layer=self._n_layer,
+ hidden_size=self._hidden_size,
+ search_layer=self._search_layer,
+ use_fixed_gumbel=self.use_fixed_gumbel,
+ gumbel_alphas=gumbel_alphas)
+
+ def emb_names(self):
+ return self._src_emb.parameters() + self._pos_emb.parameters(
+ ) + self._sent_emb.parameters()
+
+ def max_flops(self):
+ return self._encoder.max_flops
+
+ def max_model_size(self):
+ return self._encoder.max_model_size
+
+ def arch_parameters(self):
+ return [self._encoder.alphas] #, self._encoder.k]
+
+ def forward(self, data_ids, epoch):
+ """
+ forward
+ """
+ ids0 = data_ids[5]
+ ids1 = data_ids[6]
+
+ src_emb_0 = self._src_emb(ids0)
+ src_emb_1 = self._src_emb(ids1)
+ emb_out_0 = self._emb_fac(src_emb_0)
+ emb_out_1 = self._emb_fac(src_emb_1)
+ # (bs, seq_len, hidden_size)
+
+ enc_outputs = self._encoder(emb_out_0, emb_out_1, epoch)
+
+ return enc_outputs
diff --git a/paddleslim/nas/darts/search_space/conv_bert/model/cls.py b/paddleslim/nas/darts/search_space/conv_bert/model/cls.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf809ae23286778f5988a01c1de0072af1366005
--- /dev/null
+++ b/paddleslim/nas/darts/search_space/conv_bert/model/cls.py
@@ -0,0 +1,107 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"dygraph transformer layers"
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+import json
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import Linear, Layer
+
+from .bert import BertModelLayer
+from paddleslim.teachers.bert import BERTClassifier
+
+
+class ClsModelLayer(Layer):
+ """
+ classify model
+ """
+
+ def __init__(self,
+ config,
+ num_labels,
+ n_layers=12,
+ is_training=True,
+ return_pooled_out=True,
+ loss_scaling=1.0,
+ use_fp16=False):
+ super(ClsModelLayer, self).__init__()
+ self.config = config
+ self.is_training = is_training
+ self.use_fp16 = use_fp16
+ self.loss_scaling = loss_scaling
+ self.n_layers = n_layers
+
+ self.bert_layer = BertModelLayer(
+ config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
+
+ self.cls_fc = list()
+ for i in range(self.n_layers):
+ fc = Linear(
+ input_dim=self.config["hidden_size"],
+ output_dim=num_labels,
+ param_attr=fluid.ParamAttr(
+ name="cls_out_%d_w" % i,
+ initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
+ bias_attr=fluid.ParamAttr(
+ name="cls_out_%d_b" % i,
+ initializer=fluid.initializer.Constant(0.)))
+ fc = self.add_sublayer("cls_fc_%d" % i, fc)
+ self.cls_fc.append(fc)
+
+ def forward(self, data_ids):
+ """
+ forward
+ """
+ src_ids = data_ids[0]
+ position_ids = data_ids[1]
+ sentence_ids = data_ids[2]
+ input_mask = data_ids[3]
+ labels = data_ids[4]
+
+ enc_outputs, next_sent_feats = self.bert_layer(
+ src_ids, position_ids, sentence_ids, input_mask)
+ logits = []
+ losses = []
+ accuracys = []
+ for next_sent_feat, fc in zip(next_sent_feats, self.cls_fc):
+
+ cls_feat = fluid.layers.dropout(
+ x=next_sent_feat,
+ dropout_prob=0.1,
+ dropout_implementation="upscale_in_train")
+ logit = fc(cls_feat)
+ logits.append(logit)
+
+ ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
+ logits=logit, label=labels, return_softmax=True)
+ loss = fluid.layers.mean(x=ce_loss)
+ losses.append(loss)
+
+ if self.use_fp16 and self.loss_scaling > 1.0:
+ loss *= self.loss_scaling
+
+ num_seqs = fluid.layers.create_tensor(dtype='int64')
+ accuracy = fluid.layers.accuracy(
+ input=probs, label=labels, total=num_seqs)
+ accuracys.append(accuracy)
+ total_loss = fluid.layers.sum(losses)
+
+ return total_loss, logits, losses, accuracys, num_seqs
diff --git a/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py b/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py
new file mode 100755
index 0000000000000000000000000000000000000000..bad9a1e266e04022543269b32f3b45e709d97451
--- /dev/null
+++ b/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py
@@ -0,0 +1,327 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"dygraph transformer layers"
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+try:
+ from collections.abc import Iterable
+except ImportError: # Python 2 fallback
+ from collections import Iterable
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, Conv2D, BatchNorm, Pool2D, to_variable
+from paddle.fluid.dygraph import to_variable
+from paddle.fluid.initializer import NormalInitializer
+from paddle.fluid import ParamAttr
+from paddle.fluid.initializer import MSRA, ConstantInitializer
+
+ConvBN_PRIMITIVES = [
+ 'std_conv_bn_3', 'std_conv_bn_5', 'std_conv_bn_7', 'dil_conv_bn_3',
+ 'dil_conv_bn_5', 'dil_conv_bn_7', 'avg_pool_3', 'max_pool_3',
+ 'skip_connect', 'none'
+]
+
+
+OPS = {
+ 'std_conv_bn_3': lambda n_channel, name: ReluConvBN(n_channel, n_channel, filter_size=[3, 1], dilation=1, name=name),
+ 'std_conv_bn_5': lambda n_channel, name: ReluConvBN(n_channel, n_channel, filter_size=[5, 1], dilation=1, name=name),
+ 'std_conv_bn_7': lambda n_channel, name: ReluConvBN(n_channel, n_channel, filter_size=[7, 1], dilation=1, name=name),
+ 'dil_conv_bn_3': lambda n_channel, name: ReluConvBN(n_channel, n_channel, filter_size=[3, 1], dilation=2, name=name),
+ 'dil_conv_bn_5': lambda n_channel, name: ReluConvBN(n_channel, n_channel, filter_size=[5, 1], dilation=2, name=name),
+ 'dil_conv_bn_7': lambda n_channel, name: ReluConvBN(n_channel, n_channel, filter_size=[7, 1], dilation=2, name=name),
+
+ 'avg_pool_3': lambda n_channel, name: Pool2D(pool_size=(3,1), pool_padding=(1, 0), pool_type='avg'),
+ 'max_pool_3': lambda n_channel, name: Pool2D(pool_size=(3,1), pool_padding=(1, 0), pool_type='max'),
+ 'none': lambda n_channel, name: Zero(),
+ 'skip_connect': lambda n_channel, name: Identity(),
+}
+
+
+class MixedOp(fluid.dygraph.Layer):
+ def __init__(self, n_channel, name=None):
+ super(MixedOp, self).__init__()
+ PRIMITIVES = ConvBN_PRIMITIVES
+ ops = []
+ for primitive in PRIMITIVES:
+ op = OPS[primitive](n_channel, name
+ if name is None else name + "/" + primitive)
+ if 'pool' in primitive:
+ gama = ParamAttr(
+ initializer=fluid.initializer.Constant(value=1),
+ trainable=False)
+ beta = ParamAttr(
+ initializer=fluid.initializer.Constant(value=0),
+ trainable=False)
+ BN = BatchNorm(n_channel, param_attr=gama, bias_attr=beta)
+ op = fluid.dygraph.Sequential(op, BN)
+ ops.append(op)
+
+ self._ops = fluid.dygraph.LayerList(ops)
+
+ def forward(self, x, weights):
+ # out = fluid.layers.sums(
+ # [weights[i] * op(x) for i, op in enumerate(self._ops)])
+ # return out
+
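+ # After hard Gumbel-softmax `weights` is one-hot, so only the selected
+ # candidate op is evaluated; the commented code above is the dense
+ # mixture used by vanilla DARTS.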
+ for i in range(len(weights.numpy())):
+ if weights[i].numpy() != 0:
+ return self._ops[i](x) * weights[i]
+
+
+def gumbel_softmax(logits, epoch, temperature=1.0, hard=True, eps=1e-10):
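+ # Anneal the temperature by 0.98 per epoch so sampling sharpens over
+ # training.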
+ temperature = temperature * (0.98**epoch)
+ U = np.random.gumbel(0, 1, logits.shape).astype("float32")
+
+ logits = logits + to_variable(U)
+ logits = logits / temperature
+ logits = fluid.layers.softmax(logits)
+
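+ # Straight-through estimator: the forward pass uses the one-hot argmax
+ # while the backward pass flows through the soft distribution.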
+ if hard:
+ maxes = fluid.layers.reduce_max(logits, dim=1, keep_dim=True)
+ hard = fluid.layers.cast((logits == maxes), logits.dtype)
+ out = hard - logits.detach() + logits
+ else:
+ out = logits
+
+ return out
+
+
+class Zero(fluid.dygraph.Layer):
+ def __init__(self):
+ super(Zero, self).__init__()
+
+ def forward(self, x):
+ x = fluid.layers.zeros_like(x)
+ return x
+
+
+class Identity(fluid.dygraph.Layer):
+ def __init__(self):
+ super(Identity, self).__init__()
+
+ def forward(self, x):
+ return x
+
+
+class ReluConvBN(fluid.dygraph.Layer):
+ def __init__(self,
+ in_c=768,
+ out_c=768,
+ filter_size=[3, 1],
+ dilation=1,
+ stride=1,
+ affine=False,
+ use_cudnn=True,
+ name=None):
+ super(ReluConvBN, self).__init__()
+ conv_param = fluid.ParamAttr(
+ name=name if name is None else (name + "_conv.weights"),
+ initializer=fluid.initializer.MSRA())
+
+ self.conv = Conv2D(
+ in_c,
+ out_c,
+ filter_size,
+ dilation=[dilation, 1],
+ stride=stride,
+ padding=[(filter_size[0] - 1) * dilation // 2, 0],
+ param_attr=conv_param,
+ act=None,
+ bias_attr=False,
+ use_cudnn=use_cudnn)
+
+ gama = ParamAttr(
+ initializer=fluid.initializer.Constant(value=1), trainable=affine)
+ beta = ParamAttr(
+ initializer=fluid.initializer.Constant(value=0), trainable=affine)
+
+ self.bn = BatchNorm(out_c, param_attr=gama, bias_attr=beta)
+
+ def forward(self, inputs):
+ inputs = fluid.layers.relu(inputs)
+ conv = self.conv(inputs)
+ bn = self.bn(conv)
+ return bn
+
+
+class Cell(fluid.dygraph.Layer):
+ def __init__(self, steps, n_channel, name=None):
+ super(Cell, self).__init__()
+ self._steps = steps
+ self.preprocess0 = ReluConvBN(in_c=n_channel, out_c=n_channel)
+ self.preprocess1 = ReluConvBN(in_c=n_channel, out_c=n_channel)
+
+ ops = []
+ for i in range(self._steps):
+ for j in range(2 + i):
+ op = MixedOp(
+ n_channel,
+ name=name
+ if name is None else "%s/step%d_edge%d" % (name, i, j))
+ ops.append(op)
+ self._ops = fluid.dygraph.LayerList(ops)
+
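+ # DARTS cell: each intermediate state is the sum of all mixed ops
+ # applied to the earlier states; the cell output here sums the
+ # intermediate states instead of concatenating them along channels.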
+ def forward(self, s0, s1, weights):
+ s0 = self.preprocess0(s0)
+ s1 = self.preprocess1(s1)
+
+ states = [s0, s1]
+ offset = 0
+ for i in range(self._steps):
+ s = fluid.layers.sums([
+ self._ops[offset + j](h, weights[offset + j])
+ for j, h in enumerate(states)
+ ])
+ offset += len(states)
+ states.append(s)
+ out = fluid.layers.sums(states[-self._steps:])
+ #out = fluid.layers.concat(input=states[-self._steps:], axis=1)
+ return out
+
+
+class EncoderLayer(Layer):
+ """
+ encoder
+ """
+
+ def __init__(self,
+ num_labels,
+ n_layer,
+ hidden_size=768,
+ name="encoder",
+ search_layer=True,
+ use_fixed_gumbel=False,
+ gumbel_alphas=None):
+ super(EncoderLayer, self).__init__()
+ self._n_layer = n_layer
+ self._hidden_size = hidden_size
+ self._n_channel = 128
+ self._steps = 3
+ self._n_ops = len(ConvBN_PRIMITIVES)
+ self.use_fixed_gumbel = use_fixed_gumbel
+
+ self.stem0 = fluid.dygraph.Sequential(
+ Conv2D(
+ num_channels=1,
+ num_filters=self._n_channel,
+ filter_size=[3, self._hidden_size],
+ padding=[1, 0],
+ param_attr=fluid.ParamAttr(initializer=MSRA()),
+ bias_attr=False),
+ BatchNorm(
+ num_channels=self._n_channel,
+ param_attr=fluid.ParamAttr(
+ initializer=fluid.initializer.Constant(value=1)),
+ bias_attr=fluid.ParamAttr(
+ initializer=fluid.initializer.Constant(value=0))))
+
+ self.stem1 = fluid.dygraph.Sequential(
+ Conv2D(
+ num_channels=1,
+ num_filters=self._n_channel,
+ filter_size=[3, self._hidden_size],
+ padding=[1, 0],
+ param_attr=fluid.ParamAttr(initializer=MSRA()),
+ bias_attr=False),
+ BatchNorm(
+ num_channels=self._n_channel,
+ param_attr=fluid.ParamAttr(
+ initializer=fluid.initializer.Constant(value=1)),
+ bias_attr=fluid.ParamAttr(
+ initializer=fluid.initializer.Constant(value=0))))
+
+ cells = []
+ for i in range(n_layer):
+ cell = Cell(
+ steps=self._steps,
+ n_channel=self._n_channel,
+ name="%s/layer_%d" % (name, i))
+ cells.append(cell)
+
+ self._cells = fluid.dygraph.LayerList(cells)
+
+ k = sum(1 for i in range(self._steps) for n in range(2 + i))
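+ # number of edges in the cell DAG: for steps = 3 this is 2 + 3 + 4 = 9,
+ # one row of architecture logits per edge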
+ num_ops = self._n_ops
+ self.alphas = fluid.layers.create_parameter(
+ shape=[k, num_ops],
+ dtype="float32",
+ default_initializer=NormalInitializer(
+ loc=0.0, scale=1e-3))
+
+ self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
+ self.bns = []
+ self.outs = []
+ for i in range(self._n_layer):
+ bn = BatchNorm(
+ num_channels=self._n_channel,
+ param_attr=fluid.ParamAttr(
+ initializer=fluid.initializer.Constant(value=1),
+ trainable=False),
+ bias_attr=fluid.ParamAttr(
+ initializer=fluid.initializer.Constant(value=0),
+ trainable=False))
+ out = Linear(
+ self._n_channel,
+ num_labels,
+ param_attr=ParamAttr(initializer=MSRA()),
+ bias_attr=ParamAttr(initializer=MSRA()))
+ self.bns.append(bn)
+ self.outs.append(out)
+ self._bns = fluid.dygraph.LayerList(self.bns)
+ self._outs = fluid.dygraph.LayerList(self.outs)
+
+ #self.gumbel_alphas = gumbel_softmax(self.alphas, 0).detach()
+
+ mrpc_arch = [
+ [0, 0, 1, 0, 0, 0, 0, 0, 0, 0], # std_conv7 0 # node 0
+ [0, 0, 0, 0, 1, 0, 0, 0, 0, 0], # dil_conv5 1
+ [0, 0, 1, 0, 0, 0, 0, 0, 0, 0], # std_conv7 0 # node 1
+ [0, 0, 0, 0, 1, 0, 0, 0, 0, 0], # dil_conv5 1
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 1], # zero 2
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 1], # zero 0 # node2
+ [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], # std_conv3 1
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 1], # zero 2
+ [0, 0, 0, 1, 0, 0, 0, 0, 0, 0] # dil_conv3 3
+ ]
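+ # Each row is a one-hot choice over the 10 ConvBN_PRIMITIVES for one of
+ # the k = 9 cell edges (2 + 3 + 4 across the three nodes): a fixed
+ # architecture, by its name apparently obtained from a search on MRPC.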
+ self.gumbel_alphas = to_variable(
+ np.array(mrpc_arch).astype(np.float32))
+ self.gumbel_alphas.stop_gradient = True
+ print("gumbel_alphas: \n", self.gumbel_alphas.numpy())
+
+ def forward(self, enc_input_0, enc_input_1, epoch, flops=[],
+ model_size=[]):
+ alphas = self.gumbel_alphas if self.use_fixed_gumbel else gumbel_softmax(
+ self.alphas, epoch)
+
+ s0 = fluid.layers.unsqueeze(enc_input_0, [1])
+ s1 = fluid.layers.unsqueeze(enc_input_1, [1])
+ s0 = self.stem0(s0)
+ s1 = self.stem1(s1)
+
+ enc_outputs = []
+ for i in range(self._n_layer):
+ s0, s1 = s1, self._cells[i](s0, s1, alphas)
+ # (bs, n_channel, seq_len, 1)
+ tmp = self._bns[i](s1)
+ tmp = self.pool2d_avg(tmp)
+ tmp = fluid.layers.reshape(tmp, shape=[-1, 0])
+ tmp = self._outs[i](tmp)
+ enc_outputs.append(tmp)
+
+ return enc_outputs
diff --git a/paddleslim/nas/darts/search_space/conv_bert/optimization.py b/paddleslim/nas/darts/search_space/conv_bert/optimization.py
new file mode 100755
index 0000000000000000000000000000000000000000..bf004ae030b6235910e13bb01f538a117a21043a
--- /dev/null
+++ b/paddleslim/nas/darts/search_space/conv_bert/optimization.py
@@ -0,0 +1,170 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Optimization and learning rate scheduling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle.fluid as fluid
+
+from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
+
+
+class ConstantLR(LearningRateDecay):
+ def __init__(self, learning_rate, begin=0, step=1, dtype='float32'):
+ super(ConstantLR, self).__init__(begin, step, dtype)
+ self.learning_rate = learning_rate
+
+ def step(self):
+ return self.learning_rate
+
+
+class LinearDecay(LearningRateDecay):
+ def __init__(self,
+ learning_rate,
+ warmup_steps,
+ decay_steps,
+ end_learning_rate=0.0001,
+ power=1.0,
+ cycle=False,
+ begin=0,
+ step=1,
+ dtype='float32'):
+ super(LinearDecay, self).__init__(begin, step, dtype)
+ self.learning_rate = learning_rate
+ self.warmup_steps = warmup_steps
+ self.decay_steps = decay_steps
+ self.end_learning_rate = end_learning_rate
+ self.power = power
+ self.cycle = cycle
+
+ def step(self):
+ if self.step_num < self.warmup_steps:
+ decayed_lr = self.learning_rate * (self.step_num /
+ self.warmup_steps)
+ decayed_lr = self.create_lr_var(decayed_lr)
+ else:
+ tmp_step_num = self.step_num
+ tmp_decay_steps = self.decay_steps
+ if self.cycle:
+ div_res = fluid.layers.ceil(
+ self.create_lr_var(tmp_step_num / float(self.decay_steps)))
+ if tmp_step_num == 0:
+ div_res = self.create_lr_var(1.0)
+ tmp_decay_steps = self.decay_steps * div_res
+ else:
+ tmp_step_num = self.create_lr_var(
+ tmp_step_num
+ if tmp_step_num < self.decay_steps else self.decay_steps)
+ decayed_lr = (self.learning_rate - self.end_learning_rate) * \
+ ((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate
+
+ return decayed_lr
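+
+
+# A quick sanity check of the schedule (illustrative numbers, end lr = 0.0):
+#   lr = LinearDecay(learning_rate=1e-4, warmup_steps=100, decay_steps=1000,
+#                    end_learning_rate=0.0)
+#   step 50:  1e-4 * 50 / 100                    = 5.0e-5  (linear warmup)
+#   step 550: (1e-4 - 0.0) * (1 - 550/1000)**1.0 = 4.5e-5  (polynomial decay)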
+
+
+class Optimizer(object):
+ def __init__(self,
+ warmup_steps,
+ num_train_steps,
+ learning_rate,
+ model_cls,
+ weight_decay,
+ scheduler='linear_warmup_decay',
+ loss_scaling=1.0,
+ parameter_list=None):
+ self.warmup_steps = warmup_steps
+ self.num_train_steps = num_train_steps
+ self.learning_rate = learning_rate
+ self.model_cls = model_cls
+ self.weight_decay = weight_decay
+ self.scheduler = scheduler
+ self.loss_scaling = loss_scaling
+ self.parameter_list = parameter_list
+
+ self.scheduled_lr = 0.0
+ self.optimizer = self.lr_schedule()
+
+ def lr_schedule(self):
+ if self.warmup_steps > 0:
+ if self.scheduler == 'noam_decay':
+ self.scheduled_lr = fluid.dygraph.NoamDecay(1 / (
+ self.warmup_steps * (self.learning_rate**2)),
+ self.warmup_steps)
+ elif self.scheduler == 'linear_warmup_decay':
+ self.scheduled_lr = LinearDecay(self.learning_rate,
+ self.warmup_steps,
+ self.num_train_steps, 0.0)
+ else:
+ raise ValueError("Unkown learning rate scheduler, should be "
+ "'noam_decay' or 'linear_warmup_decay'")
+ optimizer = fluid.optimizer.Adam(
+ learning_rate=self.scheduled_lr,
+ parameter_list=self.parameter_list)
+ else:
+ self.scheduled_lr = ConstantLR(self.learning_rate)
+ optimizer = fluid.optimizer.Adam(
+ learning_rate=self.scheduled_lr,
+ parameter_list=self.parameter_list)
+
+ return optimizer
+
+ def exclude_from_weight_decay(self, name):
+ if name.find("layer_norm") > -1:
+ return True
+ bias_suffix = ["_bias", "_b", ".b_0"]
+ for suffix in bias_suffix:
+ if name.endswith(suffix):
+ return True
+ return False
+
+ def optimization(self, loss, use_data_parallel=False, model=None):
+ param_list = dict()
+
+ clip_norm_thres = 1.0
+ #grad_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm_thres)
+
+ if use_data_parallel:
+ loss = model.scale_loss(loss)
+
+ loss.backward()
+
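+ # Decoupled (AdamW-style) weight decay: parameter values are snapshotted
+ # here, before the Adam step, and wd * lr * w is subtracted after
+ # minimize(), skipping parameters matched by exclude_from_weight_decay.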
+ if self.weight_decay > 0:
+ for param in self.model_cls.parameters():
+ param_list[param.name] = param * 1.0
+ param_list[param.name].stop_gradient = True
+
+ if use_data_parallel:
+ assert model is not None
+ model.apply_collective_grads()
+
+ #_, param_grads = self.optimizer.minimize(loss, grad_clip=grad_clip)
+ _, param_grads = self.optimizer.minimize(loss)
+
+ if self.weight_decay > 0:
+ for param, grad in param_grads:
+ if self.exclude_from_weight_decay(param.name):
+ continue
+ if isinstance(self.scheduled_lr.step(), float):
+ updated_param = param.numpy() - param_list[
+ param.name].numpy(
+ ) * self.weight_decay * self.scheduled_lr.step()
+ else:
+ updated_param = param.numpy(
+ ) - param_list[param.name].numpy(
+ ) * self.weight_decay * self.scheduled_lr.step().numpy()
+ updated_param_var = fluid.dygraph.to_variable(updated_param)
+ param = updated_param_var
+ #param = fluid.layers.reshape(x=updated_param_var, shape=list(updated_param_var.shape))
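+ # NOTE: rebinding the local name `param` does not write the decayed
+ # value back into the model; an in-place update (e.g.
+ # `param.set_value(updated_param)`, where the installed Paddle
+ # version provides it) would be needed for the decay to take effect.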
diff --git a/paddleslim/nas/darts/search_space/conv_bert/reader/__init__.py b/paddleslim/nas/darts/search_space/conv_bert/reader/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/paddleslim/nas/darts/search_space/conv_bert/reader/batching.py b/paddleslim/nas/darts/search_space/conv_bert/reader/batching.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a214700a9e2db27900602c235c32e435e7b85fb
--- /dev/null
+++ b/paddleslim/nas/darts/search_space/conv_bert/reader/batching.py
@@ -0,0 +1,189 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Mask, padding and batching."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
+ """
+ Add mask for batch_tokens, return out, mask_label, mask_pos;
+ Note: mask_pos responding the batch_tokens after padded;
+ """
+ max_len = max([len(sent) for sent in batch_tokens])
+ mask_label = []
+ mask_pos = []
+ prob_mask = np.random.rand(total_token_num)
+ # Note: the first token is [CLS], so replacement ids are drawn with low=1
+ replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
+ pre_sent_len = 0
+ prob_index = 0
+ for sent_index, sent in enumerate(batch_tokens):
+ mask_flag = False
+ prob_index += pre_sent_len
+ for token_index, token in enumerate(sent):
+ prob = prob_mask[prob_index + token_index]
+ if prob > 0.15:
+ continue
+ elif 0.03 < prob <= 0.15:
+ # mask
+ if token != SEP and token != CLS:
+ mask_label.append(sent[token_index])
+ sent[token_index] = MASK
+ mask_flag = True
+ mask_pos.append(sent_index * max_len + token_index)
+ elif 0.015 < prob <= 0.03:
+ # random replace
+ if token != SEP and token != CLS:
+ mask_label.append(sent[token_index])
+ sent[token_index] = replace_ids[prob_index + token_index]
+ mask_flag = True
+ mask_pos.append(sent_index * max_len + token_index)
+ else:
+ # keep the original token
+ if token != SEP and token != CLS:
+ mask_label.append(sent[token_index])
+ mask_pos.append(sent_index * max_len + token_index)
+ pre_sent_len = len(sent)
+
+ # ensure that at least one word in the sentence is masked
+ while not mask_flag:
+ token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
+ if sent[token_index] != SEP and sent[token_index] != CLS:
+ mask_label.append(sent[token_index])
+ sent[token_index] = MASK
+ mask_flag = True
+ mask_pos.append(sent_index * max_len + token_index)
+ mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
+ mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
+ return batch_tokens, mask_label, mask_pos
+
+
+def prepare_batch_data(insts,
+ total_token_num,
+ voc_size=0,
+ pad_id=None,
+ cls_id=None,
+ sep_id=None,
+ mask_id=None,
+ return_input_mask=True,
+ return_max_len=True,
+ return_num_token=False):
+ """
+ 1. generate a padded Tensor of token ids
+ 2. generate a Tensor of position ids
+ 3. generate the input mask used to ignore attention on paddings,
+ shape: batch_size * max_len * 1
+
+ batch_src_ids = [inst[0] for inst in insts]
+ batch_sent_ids = [inst[1] for inst in insts]
+ batch_pos_ids = [inst[2] for inst in insts]
+ labels_list = []
+ # compatible with squad, whose example includes start/end positions,
+ # or unique id
+
+ for i in range(3, len(insts[0]), 1):
+ labels = [inst[i] for inst in insts]
+ labels = np.array(labels).astype("int64").reshape([-1, 1])
+ labels_list.append(labels)
+
+ # First step: do mask without padding
+ if mask_id >= 0:
+ out, mask_label, mask_pos = mask(
+ batch_src_ids,
+ total_token_num,
+ vocab_size=voc_size,
+ CLS=cls_id,
+ SEP=sep_id,
+ MASK=mask_id)
+ else:
+ out = batch_src_ids
+ # Second step: padding
+ src_id, self_input_mask = pad_batch_data(
+ out, pad_idx=pad_id, return_input_mask=True)
+ pos_id = pad_batch_data(
+ batch_pos_ids,
+ pad_idx=pad_id,
+ return_pos=False,
+ return_input_mask=False)
+ sent_id = pad_batch_data(
+ batch_sent_ids,
+ pad_idx=pad_id,
+ return_pos=False,
+ return_input_mask=False)
+
+ if mask_id >= 0:
+ return_list = [
+ src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos
+ ] + labels_list
+ else:
+ return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list
+
+ return return_list if len(return_list) > 1 else return_list[0]
+
+
+def pad_batch_data(insts,
+ pad_idx=0,
+ return_pos=False,
+ return_input_mask=False,
+ return_max_len=False,
+ return_num_token=False):
+ """
+ Pad the instances to the max sequence length in batch, and generate the
+ corresponding position data and input mask.
+ """
+ return_list = []
+ max_len = max(len(inst) for inst in insts)
+ # Any token included in dict can be used to pad, since the paddings' loss
+ # will be masked out by weights and make no effect on parameter gradients.
+
+ inst_data = np.array([
+ list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
+ ])
+ return_list += [inst_data.astype("int64").reshape([-1, max_len])]
+
+ # position data
+ if return_pos:
+ inst_pos = np.array([
+ list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
+ for inst in insts
+ ])
+
+ return_list += [inst_pos.astype("int64").reshape([-1, max_len])]
+
+ if return_input_mask:
+ # This is used to avoid attention on paddings.
+ input_mask_data = np.array([[1] * len(inst) + [0] *
+ (max_len - len(inst)) for inst in insts])
+ input_mask_data = np.expand_dims(input_mask_data, axis=-1)
+ return_list += [input_mask_data.astype("float32")]
+
+ if return_max_len:
+ return_list += [max_len]
+
+ if return_num_token:
+ num_token = 0
+ for inst in insts:
+ num_token += len(inst)
+ return_list += [num_token]
+
+ return return_list if len(return_list) > 1 else return_list[0]
+
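+# Example (illustrative): padding a two-sentence batch, max_len = 3:
+#   ids, mask = pad_batch_data([[5, 6, 7], [8, 9]], pad_idx=0,
+#                              return_input_mask=True)
+#   ids:  [[5, 6, 7], [8, 9, 0]]              int64, shape (2, 3)
+#   mask: [[[1], [1], [1]], [[1], [1], [0]]]  float32, shape (2, 3, 1)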
+
+if __name__ == "__main__":
+ pass
diff --git a/paddleslim/nas/darts/search_space/conv_bert/reader/cls.py b/paddleslim/nas/darts/search_space/conv_bert/reader/cls.py
new file mode 100644
index 0000000000000000000000000000000000000000..79b487719569f64ece8a15ef959a9c36a691c824
--- /dev/null
+++ b/paddleslim/nas/darts/search_space/conv_bert/reader/cls.py
@@ -0,0 +1,552 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import io
+import os
+import types
+import csv
+import numpy as np
+from . import tokenization
+from .batching import prepare_batch_data
+
+
+class DataProcessor(object):
+ """Base class for data converters for sequence classification data sets."""
+
+ def __init__(self,
+ data_dir,
+ vocab_path,
+ max_seq_len,
+ do_lower_case,
+ in_tokens,
+ random_seed=None):
+ self.data_dir = data_dir
+ self.max_seq_len = max_seq_len
+ self.tokenizer = tokenization.FullTokenizer(
+ vocab_file=vocab_path, do_lower_case=do_lower_case)
+ self.vocab = self.tokenizer.vocab
+ self.in_tokens = in_tokens
+
+ np.random.seed(random_seed)
+
+ self.current_train_example = -1
+ self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
+ self.current_train_epoch = -1
+
+ def get_train_examples(self, data_dir):
+ """Gets a collection of `InputExample`s for the train set."""
+ raise NotImplementedError()
+
+ def get_dev_examples(self, data_dir):
+ """Gets a collection of `InputExample`s for the dev set."""
+ raise NotImplementedError()
+
+ def get_test_examples(self, data_dir):
+ """Gets a collection of `InputExample`s for prediction."""
+ raise NotImplementedError()
+
+ def get_labels(self):
+ """Gets the list of labels for this data set."""
+ raise NotImplementedError()
+
+ def convert_example(self, index, example, labels, max_seq_len, tokenizer):
+ """Converts a single `InputExample` into a single `InputFeatures`."""
+ feature = convert_single_example(index, example, labels, max_seq_len,
+ tokenizer)
+ return feature
+
+ def generate_instance(self, feature):
+ """
+ generate instance with given feature
+
+ Args:
+ feature: InputFeatures(object). A single set of features of data.
+ """
+ input_pos = list(range(len(feature.input_ids)))
+ return [
+ feature.input_ids, feature.segment_ids, input_pos, feature.label_id
+ ]
+
+ def generate_batch_data(self,
+ batch_data,
+ total_token_num,
+ voc_size=-1,
+ mask_id=-1,
+ return_input_mask=True,
+ return_max_len=False,
+ return_num_token=False):
+ return prepare_batch_data(
+ batch_data,
+ total_token_num,
+ voc_size=voc_size,
+ pad_id=self.vocab["[PAD]"],
+ cls_id=self.vocab["[CLS]"],
+ sep_id=self.vocab["[SEP]"],
+ mask_id=mask_id,
+ return_input_mask=return_input_mask,
+ return_max_len=return_max_len,
+ return_num_token=return_num_token)
+
+ @classmethod
+ def _read_tsv(cls, input_file, quotechar=None):
+ """Reads a tab separated value file."""
+ with io.open(input_file, "r", encoding="utf8") as f:
+ reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+ lines = []
+ for line in reader:
+ lines.append(line)
+ return lines
+
+ def get_num_examples(self, phase):
+ """Get number of examples for train, dev or test."""
+ if phase not in ['train', 'dev', 'test']:
+ raise ValueError(
+ "Unknown phase, which should be in ['train', 'dev', 'test'].")
+ return self.num_examples[phase]
+
+ def get_train_progress(self):
+ """Gets progress for training phase."""
+ return self.current_train_example, self.current_train_epoch
+
+ def data_generator(self,
+ batch_size,
+ phase='train',
+ epoch=1,
+ dev_count=1,
+ shuffle=True,
+ shuffle_seed=None):
+ """
+ Generate data for train, dev or test.
+
+ Args:
+ batch_size: int. The batch size of generated data.
+ phase: string. The phase for which to generate data.
+ epoch: int. Total number of epochs for which to generate data.
+ dev_count: int. Number of devices; batches are buffered per device.
+ shuffle: bool. Whether to shuffle examples.
+ shuffle_seed: int. Optional seed used when shuffling.
+ """
+ if phase == 'train':
+ examples = self.get_train_examples(self.data_dir)
+ self.num_examples['train'] = len(examples)
+ elif phase == 'dev':
+ examples = self.get_dev_examples(self.data_dir)
+ self.num_examples['dev'] = len(examples)
+ elif phase == 'test':
+ examples = self.get_test_examples(self.data_dir)
+ self.num_examples['test'] = len(examples)
+ else:
+ raise ValueError(
+ "Unknown phase, which should be in ['train', 'dev', 'test'].")
+
+ def instance_reader():
+ for epoch_index in range(epoch):
+ if shuffle:
+ if shuffle_seed is not None:
+ np.random.seed(shuffle_seed)
+ np.random.shuffle(examples)
+ if phase == 'train':
+ self.current_train_epoch = epoch_index
+ for (index, example) in enumerate(examples):
+ if phase == 'train':
+ self.current_train_example = index + 1
+ feature = self.convert_example(
+ index, example,
+ self.get_labels(), self.max_seq_len, self.tokenizer)
+
+ instance = self.generate_instance(feature)
+ yield instance
+
+ def batch_reader(reader, batch_size, in_tokens):
+ batch, total_token_num, max_len = [], 0, 0
+ for instance in reader():
+ token_ids, sent_ids, pos_ids, label = instance[:4]
+ max_len = max(max_len, len(token_ids))
+ if in_tokens:
+ to_append = (len(batch) + 1) * max_len <= batch_size
+ else:
+ to_append = len(batch) < batch_size
+ if to_append:
+ batch.append(instance)
+ total_token_num += len(token_ids)
+ else:
+ yield batch, total_token_num
+ batch, total_token_num, max_len = [instance], len(
+ token_ids), len(token_ids)
+
+ if len(batch) > 0:
+ yield batch, total_token_num
+
+ def wrapper():
+ all_dev_batches = []
+ for batch_data, total_token_num in batch_reader(
+ instance_reader, batch_size, self.in_tokens):
+ batch_data = self.generate_batch_data(
+ batch_data,
+ total_token_num,
+ voc_size=-1,
+ mask_id=-1,
+ return_input_mask=True,
+ return_max_len=False,
+ return_num_token=False)
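+ # Buffer dev_count batches so that, under multi-device training,
+ # each device receives one batch per pass.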
+ if len(all_dev_batches) < dev_count:
+ all_dev_batches.append(batch_data)
+
+ if len(all_dev_batches) == dev_count:
+ for batch in all_dev_batches:
+ yield batch
+ all_dev_batches = []
+
+ return wrapper
+
+
+class InputExample(object):
+ """A single training/test example for simple sequence classification."""
+
+ def __init__(self, guid, text_a, text_b=None, label=None):
+ """Constructs a InputExample.
+
+ Args:
+ guid: Unique id for the example.
+ text_a: string. The untokenized text of the first sequence. For single
+ sequence tasks, only this sequence must be specified.
+ text_b: (Optional) string. The untokenized text of the second sequence.
+ Only must be specified for sequence pair tasks.
+ label: (Optional) string. The label of the example. This should be
+ specified for train and dev examples, but not for test examples.
+ """
+ self.guid = guid
+ self.text_a = text_a
+ self.text_b = text_b
+ self.label = label
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+ """Truncates a sequence pair in place to the maximum length."""
+
+ # This is a simple heuristic which will always truncate the longer sequence
+ # one token at a time. This makes more sense than truncating an equal percent
+ # of tokens from each, since if one sequence is very short then each token
+ # that's truncated likely contains more information than a longer sequence.
+ while True:
+ total_length = len(tokens_a) + len(tokens_b)
+ if total_length <= max_length:
+ break
+ if len(tokens_a) > len(tokens_b):
+ tokens_a.pop()
+ else:
+ tokens_b.pop()
+
+
+class InputFeatures(object):
+ """A single set of features of data."""
+
+ def __init__(self, input_ids, input_mask, segment_ids, label_id):
+ self.input_ids = input_ids
+ self.input_mask = input_mask
+ self.segment_ids = segment_ids
+ self.label_id = label_id
+
+
+class XnliProcessor(DataProcessor):
+ """Processor for the XNLI data set."""
+
+ def get_train_examples(self, data_dir):
+ """See base class."""
+ self.language = "zh"
+ lines = self._read_tsv(
+ os.path.join(data_dir, "multinli", "multinli.train.%s.tsv" %
+ self.language))
+ examples = []
+ for (i, line) in enumerate(lines):
+ if i == 0:
+ continue
+ guid = "train-%d" % (i)
+ text_a = tokenization.convert_to_unicode(line[0])
+ text_b = tokenization.convert_to_unicode(line[1])
+ label = tokenization.convert_to_unicode(line[2])
+ if label == tokenization.convert_to_unicode("contradictory"):
+ label = tokenization.convert_to_unicode("contradiction")
+ examples.append(
+ InputExample(
+ guid=guid, text_a=text_a, text_b=text_b, label=label))
+ return examples
+
+ def get_dev_examples(self, data_dir):
+ """See base class."""
+ self.language = "zh"
+ lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
+ examples = []
+ for (i, line) in enumerate(lines):
+ if i == 0:
+ continue
+ guid = "dev-%d" % (i)
+ language = tokenization.convert_to_unicode(line[0])
+ if language != tokenization.convert_to_unicode(self.language):
+ continue
+ text_a = tokenization.convert_to_unicode(line[6])
+ text_b = tokenization.convert_to_unicode(line[7])
+ label = tokenization.convert_to_unicode(line[1])
+ examples.append(
+ InputExample(
+ guid=guid, text_a=text_a, text_b=text_b, label=label))
+ return examples
+
+ def get_test_examples(self, data_dir):
+ """See base class."""
+ self.language = "zh"
+ lines = self._read_tsv(os.path.join(data_dir, "xnli.test.tsv"))
+ examples = []
+ for (i, line) in enumerate(lines):
+ if i == 0:
+ continue
+ guid = "test-%d" % (i)
+ language = tokenization.convert_to_unicode(line[0])
+ if language != tokenization.convert_to_unicode(self.language):
+ continue
+ text_a = tokenization.convert_to_unicode(line[6])
+ text_b = tokenization.convert_to_unicode(line[7])
+ label = tokenization.convert_to_unicode(line[1])
+ examples.append(
+ InputExample(
+ guid=guid, text_a=text_a, text_b=text_b, label=label))
+ return examples
+
+ def get_labels(self):
+ """See base class."""
+ return ["contradiction", "entailment", "neutral"]
+
+
+class MnliProcessor(DataProcessor):
+ """Processor for the MultiNLI data set (GLUE version)."""
+
+ def get_train_examples(self, data_dir):
+ """See base class."""
+ return self._create_examples(
+ self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+ def get_dev_examples(self, data_dir):
+ """See base class."""
+ return self._create_examples(
+ self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
+ "dev_matched")
+
+ def get_test_examples(self, data_dir):
+ """See base class."""
+ return self._create_examples(
+ self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test")
+
+ def get_labels(self):
+ """See base class."""
+ return ["contradiction", "entailment", "neutral"]
+
+ def _create_examples(self, lines, set_type):
+ """Creates examples for the training and dev sets."""
+ examples = []
+ for (i, line) in enumerate(lines):
+ if i == 0:
+ continue
+ guid = "%s-%s" % (set_type,
+ tokenization.convert_to_unicode(line[0]))
+ text_a = tokenization.convert_to_unicode(line[8])
+ text_b = tokenization.convert_to_unicode(line[9])
+ if set_type == "test":
+ label = "contradiction"
+ else:
+ label = tokenization.convert_to_unicode(line[-1])
+ examples.append(
+ InputExample(
+ guid=guid, text_a=text_a, text_b=text_b, label=label))
+ return examples
+
+
+class MrpcProcessor(DataProcessor):
+ """Processor for the MRPC data set (GLUE version)."""
+
+ def get_train_examples(self, data_dir):
+ """See base class."""
+ return self._create_examples(
+ self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+ def get_dev_examples(self, data_dir):
+ """See base class."""
+ return self._create_examples(
+ self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+ def get_test_examples(self, data_dir):
+ """See base class."""
+ return self._create_examples(
+ self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
+
+ def get_labels(self):
+ """See base class."""
+ return ["0", "1"]
+
+ def _create_examples(self, lines, set_type):
+ """Creates examples for the training and dev sets."""
+ examples = []
+ for (i, line) in enumerate(lines):
+ if i == 0:
+ continue
+ guid = "%s-%s" % (set_type, i)
+ text_a = tokenization.convert_to_unicode(line[3])
+ text_b = tokenization.convert_to_unicode(line[4])
+ if set_type == "test":
+ label = "0"
+ else:
+ label = tokenization.convert_to_unicode(line[0])
+ examples.append(
+ InputExample(
+ guid=guid, text_a=text_a, text_b=text_b, label=label))
+ return examples
+
+
+class ColaProcessor(DataProcessor):
+ """Processor for the CoLA data set (GLUE version)."""
+
+ def get_train_examples(self, data_dir):
+ """See base class."""
+ return self._create_examples(
+ self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+ def get_dev_examples(self, data_dir):
+ """See base class."""
+ return self._create_examples(
+ self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+ def get_test_examples(self, data_dir):
+ """See base class."""
+ return self._create_examples(
+ self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
+
+ def get_labels(self):
+ """See base class."""
+ return ["0", "1"]
+
+ def _create_examples(self, lines, set_type):
+ """Creates examples for the training and dev sets."""
+ examples = []
+ for (i, line) in enumerate(lines):
+ # Only the test set has a header
+ if set_type == "test" and i == 0:
+ continue
+ guid = "%s-%s" % (set_type, i)
+ if set_type == "test":
+ text_a = tokenization.convert_to_unicode(line[1])
+ label = "0"
+ else:
+ text_a = tokenization.convert_to_unicode(line[3])
+ label = tokenization.convert_to_unicode(line[1])
+ examples.append(
+ InputExample(
+ guid=guid, text_a=text_a, text_b=None, label=label))
+ return examples
+
+
+def convert_single_example_to_unicode(guid, single_example):
+ text_a = tokenization.convert_to_unicode(single_example[0])
+ text_b = tokenization.convert_to_unicode(single_example[1])
+ label = tokenization.convert_to_unicode(single_example[2])
+ return InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
+
+
+def convert_single_example(ex_index, example, label_list, max_seq_length,
+ tokenizer):
+ """Converts a single `InputExample` into a single `InputFeatures`."""
+ label_map = {}
+ for (i, label) in enumerate(label_list):
+ label_map[label] = i
+
+ tokens_a = tokenizer.tokenize(example.text_a)
+ tokens_b = None
+ if example.text_b:
+ tokens_b = tokenizer.tokenize(example.text_b)
+
+ if tokens_b:
+ # Modifies `tokens_a` and `tokens_b` in place so that the total
+ # length is less than the specified length.
+ # Account for [CLS], [SEP], [SEP] with "- 3"
+ _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+ else:
+ # Account for [CLS] and [SEP] with "- 2"
+ if len(tokens_a) > max_seq_length - 2:
+ tokens_a = tokens_a[0:(max_seq_length - 2)]
+
+ # The convention in BERT is:
+ # (a) For sequence pairs:
+ # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+ # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
+ # (b) For single sequences:
+ # tokens: [CLS] the dog is hairy . [SEP]
+ # type_ids: 0 0 0 0 0 0 0
+ #
+ # Where "type_ids" are used to indicate whether this is the first
+ # sequence or the second sequence. The embedding vectors for `type=0` and
+ # `type=1` were learned during pre-training and are added to the wordpiece
+ # embedding vector (and position vector). This is not *strictly* necessary
+ # since the [SEP] token unambiguously separates the sequences, but it makes
+ # it easier for the model to learn the concept of sequences.
+ #
+ # For classification tasks, the first vector (corresponding to [CLS]) is
+ # used as as the "sentence vector". Note that this only makes sense because
+ # the entire model is fine-tuned.
+ tokens = []
+ segment_ids = []
+ tokens.append("[CLS]")
+ segment_ids.append(0)
+ for token in tokens_a:
+ tokens.append(token)
+ segment_ids.append(0)
+ tokens.append("[SEP]")
+ segment_ids.append(0)
+
+ if tokens_b:
+ for token in tokens_b:
+ tokens.append(token)
+ segment_ids.append(1)
+ tokens.append("[SEP]")
+ segment_ids.append(1)
+
+ input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+ # The mask has 1 for real tokens and 0 for padding tokens. Only real
+ # tokens are attended to.
+ input_mask = [1] * len(input_ids)
+
+ label_id = label_map[example.label]
+
+ feature = InputFeatures(
+ input_ids=input_ids,
+ input_mask=input_mask,
+ segment_ids=segment_ids,
+ label_id=label_id)
+ return feature
+
+
+def convert_examples_to_features(examples, label_list, max_seq_length,
+ tokenizer):
+ """Convert a set of `InputExample`s to a list of `InputFeatures`."""
+
+ features = []
+ for (ex_index, example) in enumerate(examples):
+ if ex_index % 10000 == 0:
+ print("Writing example %d of %d" % (ex_index, len(examples)))
+
+ feature = convert_single_example(ex_index, example, label_list,
+ max_seq_length, tokenizer)
+
+ features.append(feature)
+ return features
+
+
+if __name__ == '__main__':
+ pass
diff --git a/paddleslim/nas/darts/search_space/conv_bert/reader/pretraining.py b/paddleslim/nas/darts/search_space/conv_bert/reader/pretraining.py
new file mode 100644
index 0000000000000000000000000000000000000000..c21a43d33caedd9a01c02dacbedd01a16e1eec9f
--- /dev/null
+++ b/paddleslim/nas/darts/search_space/conv_bert/reader/pretraining.py
@@ -0,0 +1,289 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from __future__ import division
+
+import os
+import numpy as np
+import types
+import gzip
+import logging
+import re
+import six
+import collections
+from . import tokenization
+
+import paddle
+import paddle.fluid as fluid
+
+from .batching import prepare_batch_data
+
+
+class DataReader(object):
+ def __init__(self,
+ data_dir,
+ vocab_path,
+ batch_size=4096,
+ in_tokens=True,
+ max_seq_len=512,
+ shuffle_files=True,
+ epoch=100,
+ voc_size=0,
+ is_test=False,
+ generate_neg_sample=False):
+
+ self.vocab = self.load_vocab(vocab_path)
+ self.data_dir = data_dir
+ self.batch_size = batch_size
+ self.in_tokens = in_tokens
+ self.shuffle_files = shuffle_files
+ self.epoch = epoch
+ self.current_epoch = 0
+ self.current_file_index = 0
+ self.total_file = 0
+ self.current_file = None
+ self.voc_size = voc_size
+ self.max_seq_len = max_seq_len
+ self.pad_id = self.vocab["[PAD]"]
+ self.cls_id = self.vocab["[CLS]"]
+ self.sep_id = self.vocab["[SEP]"]
+ self.mask_id = self.vocab["[MASK]"]
+ self.is_test = is_test
+ self.generate_neg_sample = generate_neg_sample
+ if self.in_tokens:
+ assert self.batch_size >= self.max_seq_len, "The number of " \
+ "tokens in batch should not be smaller than max seq length."
+
+ if self.is_test:
+ self.epoch = 1
+ self.shuffle_files = False
+
+ def get_progress(self):
+ """return current progress of traning data
+ """
+ return self.current_epoch, self.current_file_index, self.total_file, self.current_file
+
+ def parse_line(self, line, max_seq_len=512):
+ """ parse one line to token_ids, sentence_ids, pos_ids, label
+ """
+ line = line.strip().decode().split(";")
+ assert len(line) == 4, "One sample must have 4 fields!"
+ (token_ids, sent_ids, pos_ids, label) = line
+ token_ids = [int(token) for token in token_ids.split(" ")]
+ sent_ids = [int(token) for token in sent_ids.split(" ")]
+ pos_ids = [int(token) for token in pos_ids.split(" ")]
+ assert len(token_ids) == len(sent_ids) == len(
+ pos_ids
+ ), "[Must be true] len(token_ids) == len(sent_ids) == len(pos_ids)"
+ label = int(label)
+ if len(token_ids) > max_seq_len:
+ return None
+ return [token_ids, sent_ids, pos_ids, label]
+
+ def read_file(self, file):
+ assert file.endswith('.gz'), "[ERROR] %s is not a gzip file" % file
+ file_path = self.data_dir + "/" + file
+ with gzip.open(file_path, "rb") as f:
+ for line in f:
+ parsed_line = self.parse_line(
+ line, max_seq_len=self.max_seq_len)
+ if parsed_line is None:
+ continue
+ yield parsed_line
+
+ def convert_to_unicode(self, text):
+ """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+ if six.PY3:
+ if isinstance(text, str):
+ return text
+ elif isinstance(text, bytes):
+ return text.decode("utf-8", "ignore")
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
+ elif six.PY2:
+ if isinstance(text, str):
+ return text.decode("utf-8", "ignore")
+ elif isinstance(text, unicode):
+ return text
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
+ else:
+ raise ValueError("Not running on Python2 or Python 3?")
+
+ def load_vocab(self, vocab_file):
+ """Loads a vocabulary file into a dictionary."""
+ vocab = collections.OrderedDict()
+ fin = open(vocab_file)
+ for num, line in enumerate(fin):
+ items = self.convert_to_unicode(line.strip()).split("\t")
+ if len(items) > 2:
+ break
+ token = items[0]
+ index = items[1] if len(items) == 2 else num
+ token = token.strip()
+ vocab[token] = int(index)
+ return vocab
+
+ def random_pair_neg_samples(self, pos_samples):
+ """ randomly generate negtive samples using pos_samples
+
+ Args:
+ pos_samples: list of positive samples
+
+ Returns:
+ neg_samples: list of negtive samples
+ """
+ np.random.shuffle(pos_samples)
+ num_sample = len(pos_samples)
+ neg_samples = []
+ miss_num = 0
+
+ for i in range(num_sample):
+ pair_index = (i + 1) % num_sample
+ origin_src_ids = pos_samples[i][0]
+ origin_sep_index = origin_src_ids.index(2)
+ pair_src_ids = pos_samples[pair_index][0]
+ pair_sep_index = pair_src_ids.index(2)
+
+ src_ids = origin_src_ids[:origin_sep_index + 1] + pair_src_ids[
+ pair_sep_index + 1:]
+ if len(src_ids) >= self.max_seq_len:
+ miss_num += 1
+ continue
+ sent_ids = [0] * len(origin_src_ids[:origin_sep_index + 1]) + [
+ 1
+ ] * len(pair_src_ids[pair_sep_index + 1:])
+ pos_ids = list(range(len(src_ids)))
+ neg_sample = [src_ids, sent_ids, pos_ids, 0]
+ assert len(src_ids) == len(sent_ids) == len(
+ pos_ids
+ ), "[ERROR] len(src_ids) == len(sent_ids) == len(pos_ids) must be True"
+ neg_samples.append(neg_sample)
+ return neg_samples, miss_num
+
+ def mixin_negative_samples(self, pos_sample_generator, buffer=1000):
+ """ 1. generate negative samples by randomly pairing sentence_1 and sentence_2 of positive samples
+ 2. combine negative samples and positive samples
+
+ Args:
+ pos_sample_generator: a generator producing a parsed positive sample, which is a list: [token_ids, sent_ids, pos_ids, 1]
+
+ Returns:
+ sample: one sample drawn from the shuffled positive and negative samples
+ """
+ pos_samples = []
+ num_total_miss = 0
+ pos_sample_num = 0
+ try:
+ while True:
+ while len(pos_samples) < buffer:
+ pos_sample = next(pos_sample_generator)
+ label = pos_sample[3]
+ assert label == 1, "positive sample's label must be 1"
+ pos_samples.append(pos_sample)
+ pos_sample_num += 1
+
+ neg_samples, miss_num = self.random_pair_neg_samples(
+ pos_samples)
+ num_total_miss += miss_num
+ samples = pos_samples + neg_samples
+ pos_samples = []
+ np.random.shuffle(samples)
+ for sample in samples:
+ yield sample
+ except StopIteration:
+ print("stopiteration: reach end of file")
+ if len(pos_samples) == 1:
+ yield pos_samples[0]
+ elif len(pos_samples) == 0:
+ yield None
+ else:
+ neg_samples, miss_num = self.random_pair_neg_samples(
+ pos_samples)
+ num_total_miss += miss_num
+ samples = pos_samples + neg_samples
+ pos_samples = []
+ np.random.shuffle(samples)
+ for sample in samples:
+ yield sample
+ print("miss_num:%d\tideal_total_sample_num:%d\tmiss_rate:%f" %
+ (num_total_miss, pos_sample_num * 2,
+ num_total_miss / (pos_sample_num * 2)))
+
+ def data_generator(self):
+ """
+ data_generator
+ """
+ files = os.listdir(self.data_dir)
+ self.total_file = len(files)
+ assert self.total_file > 0, "[Error] data_dir is empty"
+
+ def wrapper():
+ def reader():
+ for epoch in range(self.epoch):
+ self.current_epoch = epoch + 1
+ if self.shuffle_files:
+ np.random.shuffle(files)
+ for index, file in enumerate(files):
+ self.current_file_index = index + 1
+ self.current_file = file
+ sample_generator = self.read_file(file)
+ if not self.is_test and self.generate_neg_sample:
+ sample_generator = self.mixin_negative_samples(
+ sample_generator)
+ for sample in sample_generator:
+ if sample is None:
+ continue
+ yield sample
+
+ def batch_reader(reader, batch_size, in_tokens):
+ batch, total_token_num, max_len = [], 0, 0
+ for parsed_line in reader():
+ token_ids, sent_ids, pos_ids, label = parsed_line
+ max_len = max(max_len, len(token_ids))
+ if in_tokens:
+ to_append = (len(batch) + 1) * max_len <= batch_size
+ else:
+ to_append = len(batch) < batch_size
+ if to_append:
+ batch.append(parsed_line)
+ total_token_num += len(token_ids)
+ else:
+ yield batch, total_token_num
+ batch, total_token_num, max_len = [parsed_line], len(
+ token_ids), len(token_ids)
+
+ if len(batch) > 0:
+ yield batch, total_token_num
+
+ for batch_data, total_token_num in batch_reader(
+ reader, self.batch_size, self.in_tokens):
+ yield prepare_batch_data(
+ batch_data,
+ total_token_num,
+ voc_size=self.voc_size,
+ pad_id=self.pad_id,
+ cls_id=self.cls_id,
+ sep_id=self.sep_id,
+ mask_id=self.mask_id,
+ return_input_mask=True,
+ return_max_len=False,
+ return_num_token=False)
+
+ return wrapper
+
+
+if __name__ == "__main__":
+ pass
diff --git a/paddleslim/nas/darts/search_space/conv_bert/reader/squad.py b/paddleslim/nas/darts/search_space/conv_bert/reader/squad.py
new file mode 100644
index 0000000000000000000000000000000000000000..651c46f966d228e626cdd25e1fb73809801716d0
--- /dev/null
+++ b/paddleslim/nas/darts/search_space/conv_bert/reader/squad.py
@@ -0,0 +1,935 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Run BERT on SQuAD 1.1 and SQuAD 2.0."""
+
+import six
+import math
+import json
+import random
+import collections
+from . import tokenization
+from .batching import prepare_batch_data
+
+
+class SquadExample(object):
+ """A single training/test example for simple sequence classification.
+
+ For examples without an answer, the start and end position are -1.
+ """
+
+ def __init__(self,
+ qas_id,
+ question_text,
+ doc_tokens,
+ orig_answer_text=None,
+ start_position=None,
+ end_position=None,
+ is_impossible=False):
+ self.qas_id = qas_id
+ self.question_text = question_text
+ self.doc_tokens = doc_tokens
+ self.orig_answer_text = orig_answer_text
+ self.start_position = start_position
+ self.end_position = end_position
+ self.is_impossible = is_impossible
+
+ def __str__(self):
+ return self.__repr__()
+
+ def __repr__(self):
+ s = ""
+ s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
+ s += ", question_text: %s" % (
+ tokenization.printable_text(self.question_text))
+ s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
+ if self.start_position:
+ s += ", start_position: %d" % (self.start_position)
+ if self.end_position:
+ s += ", end_position: %d" % (self.end_position)
+ if self.is_impossible:
+ s += ", is_impossible: %r" % (self.is_impossible)
+ return s
+
+
+class InputFeatures(object):
+ """A single set of features of data."""
+
+ def __init__(self,
+ unique_id,
+ example_index,
+ doc_span_index,
+ tokens,
+ token_to_orig_map,
+ token_is_max_context,
+ input_ids,
+ input_mask,
+ segment_ids,
+ start_position=None,
+ end_position=None,
+ is_impossible=None):
+ self.unique_id = unique_id
+ self.example_index = example_index
+ self.doc_span_index = doc_span_index
+ self.tokens = tokens
+ self.token_to_orig_map = token_to_orig_map
+ self.token_is_max_context = token_is_max_context
+ self.input_ids = input_ids
+ self.input_mask = input_mask
+ self.segment_ids = segment_ids
+ self.start_position = start_position
+ self.end_position = end_position
+ self.is_impossible = is_impossible
+
+
+def read_squad_examples(input_file, is_training,
+ version_2_with_negative=False):
+ """Read a SQuAD json file into a list of SquadExample."""
+ with open(input_file, "r") as reader:
+ input_data = json.load(reader)["data"]
+
+ def is_whitespace(c):
+ if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
+ return True
+ return False
+
+ examples = []
+ for entry in input_data:
+ for paragraph in entry["paragraphs"]:
+ paragraph_text = paragraph["context"]
+ doc_tokens = []
+ char_to_word_offset = []
+ prev_is_whitespace = True
+ for c in paragraph_text:
+ if is_whitespace(c):
+ prev_is_whitespace = True
+ else:
+ if prev_is_whitespace:
+ doc_tokens.append(c)
+ else:
+ doc_tokens[-1] += c
+ prev_is_whitespace = False
+ char_to_word_offset.append(len(doc_tokens) - 1)
+
+ for qa in paragraph["qas"]:
+ qas_id = qa["id"]
+ question_text = qa["question"]
+ start_position = None
+ end_position = None
+ orig_answer_text = None
+ is_impossible = False
+ if is_training:
+
+ if version_2_with_negative:
+ is_impossible = qa["is_impossible"]
+ if (len(qa["answers"]) != 1) and (not is_impossible):
+ raise ValueError(
+ "For training, each question should have exactly 1 answer."
+ )
+ if not is_impossible:
+ answer = qa["answers"][0]
+ orig_answer_text = answer["text"]
+ answer_offset = answer["answer_start"]
+ answer_length = len(orig_answer_text)
+ start_position = char_to_word_offset[answer_offset]
+ end_position = char_to_word_offset[answer_offset +
+ answer_length - 1]
+ # Only add answers where the text can be exactly recovered from the
+ # document. If this CAN'T happen it's likely due to weird Unicode
+ # stuff so we will just skip the example.
+ #
+ # Note that this means for training mode, every example is NOT
+ # guaranteed to be preserved.
+ actual_text = " ".join(doc_tokens[start_position:(
+ end_position + 1)])
+ cleaned_answer_text = " ".join(
+ tokenization.whitespace_tokenize(orig_answer_text))
+ if actual_text.find(cleaned_answer_text) == -1:
+ print("Could not find answer: '%s' vs. '%s'",
+ actual_text, cleaned_answer_text)
+ continue
+ else:
+ start_position = -1
+ end_position = -1
+ orig_answer_text = ""
+
+ example = SquadExample(
+ qas_id=qas_id,
+ question_text=question_text,
+ doc_tokens=doc_tokens,
+ orig_answer_text=orig_answer_text,
+ start_position=start_position,
+ end_position=end_position,
+ is_impossible=is_impossible)
+ examples.append(example)
+
+ return examples
+
+
+def convert_examples_to_features(
+ examples,
+ tokenizer,
+ max_seq_length,
+ doc_stride,
+ max_query_length,
+ is_training,
+ #output_fn
+):
+ """Loads a data file into a list of `InputBatch`s."""
+
+ unique_id = 1000000000
+
+ for (example_index, example) in enumerate(examples):
+ query_tokens = tokenizer.tokenize(example.question_text)
+
+ if len(query_tokens) > max_query_length:
+ query_tokens = query_tokens[0:max_query_length]
+
+ tok_to_orig_index = []
+ orig_to_tok_index = []
+ all_doc_tokens = []
+ for (i, token) in enumerate(example.doc_tokens):
+ orig_to_tok_index.append(len(all_doc_tokens))
+ sub_tokens = tokenizer.tokenize(token)
+ for sub_token in sub_tokens:
+ tok_to_orig_index.append(i)
+ all_doc_tokens.append(sub_token)
+
+ tok_start_position = None
+ tok_end_position = None
+ if is_training and example.is_impossible:
+ tok_start_position = -1
+ tok_end_position = -1
+ if is_training and not example.is_impossible:
+ tok_start_position = orig_to_tok_index[example.start_position]
+ if example.end_position < len(example.doc_tokens) - 1:
+ tok_end_position = orig_to_tok_index[example.end_position +
+ 1] - 1
+ else:
+ tok_end_position = len(all_doc_tokens) - 1
+ (tok_start_position, tok_end_position) = _improve_answer_span(
+ all_doc_tokens, tok_start_position, tok_end_position,
+ tokenizer, example.orig_answer_text)
+
+ # The -3 accounts for [CLS], [SEP] and [SEP]
+ max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
+
+ # We can have documents that are longer than the maximum sequence length.
+ # To deal with this we do a sliding window approach, where we take chunks
+ # of the up to our max length with a stride of `doc_stride`.
+ _DocSpan = collections.namedtuple( # pylint: disable=invalid-name
+ "DocSpan", ["start", "length"])
+ doc_spans = []
+ start_offset = 0
+ while start_offset < len(all_doc_tokens):
+ length = len(all_doc_tokens) - start_offset
+ if length > max_tokens_for_doc:
+ length = max_tokens_for_doc
+ doc_spans.append(_DocSpan(start=start_offset, length=length))
+ if start_offset + length == len(all_doc_tokens):
+ break
+ start_offset += min(length, doc_stride)
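+ # e.g. with 1000 doc tokens, max_tokens_for_doc = 384 and
+ # doc_stride = 128, spans start at 0, 128, 256, ... and the final
+ # span always ends exactly at the last document token.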
+
+ for (doc_span_index, doc_span) in enumerate(doc_spans):
+ tokens = []
+ token_to_orig_map = {}
+ token_is_max_context = {}
+ segment_ids = []
+ tokens.append("[CLS]")
+ segment_ids.append(0)
+ for token in query_tokens:
+ tokens.append(token)
+ segment_ids.append(0)
+ tokens.append("[SEP]")
+ segment_ids.append(0)
+
+ for i in range(doc_span.length):
+ split_token_index = doc_span.start + i
+ token_to_orig_map[len(tokens)] = tok_to_orig_index[
+ split_token_index]
+
+ is_max_context = _check_is_max_context(
+ doc_spans, doc_span_index, split_token_index)
+ token_is_max_context[len(tokens)] = is_max_context
+ tokens.append(all_doc_tokens[split_token_index])
+ segment_ids.append(1)
+ tokens.append("[SEP]")
+ segment_ids.append(1)
+
+ input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+ # The mask has 1 for real tokens and 0 for padding tokens. Only real
+ # tokens are attended to.
+ input_mask = [1] * len(input_ids)
+
+ # Zero-padding up to max_seq_length is deferred to prepare_batch_data;
+ # the original per-example padding is kept below for reference.
+ #while len(input_ids) < max_seq_length:
+ # input_ids.append(0)
+ # input_mask.append(0)
+ # segment_ids.append(0)
+
+ #assert len(input_ids) == max_seq_length
+ #assert len(input_mask) == max_seq_length
+ #assert len(segment_ids) == max_seq_length
+
+ start_position = None
+ end_position = None
+ if is_training and not example.is_impossible:
+ # For training, if our document chunk does not contain an annotation
+ # we throw it out, since there is nothing to predict.
+ doc_start = doc_span.start
+ doc_end = doc_span.start + doc_span.length - 1
+ out_of_span = False
+ if not (tok_start_position >= doc_start and
+ tok_end_position <= doc_end):
+ out_of_span = True
+ if out_of_span:
+ start_position = 0
+ end_position = 0
+ else:
+ doc_offset = len(query_tokens) + 2
+ start_position = tok_start_position - doc_start + doc_offset
+ end_position = tok_end_position - doc_start + doc_offset
+
+ if is_training and example.is_impossible:
+ start_position = 0
+ end_position = 0
+ """
+ if example_index < 3:
+ print("*** Example ***")
+ print("unique_id: %s" % (unique_id))
+ print("example_index: %s" % (example_index))
+ print("doc_span_index: %s" % (doc_span_index))
+ print("tokens: %s" % " ".join(
+ [tokenization.printable_text(x) for x in tokens]))
+ print("token_to_orig_map: %s" % " ".join([
+ "%d:%d" % (x, y)
+ for (x, y) in six.iteritems(token_to_orig_map)
+ ]))
+ print("token_is_max_context: %s" % " ".join([
+ "%d:%s" % (x, y)
+ for (x, y) in six.iteritems(token_is_max_context)
+ ]))
+ print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+ print("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+ print("segment_ids: %s" %
+ " ".join([str(x) for x in segment_ids]))
+ if is_training and example.is_impossible:
+ print("impossible example")
+ if is_training and not example.is_impossible:
+ answer_text = " ".join(tokens[start_position:(end_position +
+ 1)])
+ print("start_position: %d" % (start_position))
+ print("end_position: %d" % (end_position))
+ print("answer: %s" %
+ (tokenization.printable_text(answer_text)))
+ """
+
+ feature = InputFeatures(
+ unique_id=unique_id,
+ example_index=example_index,
+ doc_span_index=doc_span_index,
+ tokens=tokens,
+ token_to_orig_map=token_to_orig_map,
+ token_is_max_context=token_is_max_context,
+ input_ids=input_ids,
+ input_mask=input_mask,
+ segment_ids=segment_ids,
+ start_position=start_position,
+ end_position=end_position,
+ is_impossible=example.is_impossible)
+
+ unique_id += 1
+
+ yield feature
+
+
+def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
+ orig_answer_text):
+ """Returns tokenized answer spans that better match the annotated answer."""
+
+ # The SQuAD annotations are character based. We first project them to
+ # whitespace-tokenized words. But then after WordPiece tokenization, we can
+ # often find a "better match". For example:
+ #
+ # Question: What year was John Smith born?
+ # Context: The leader was John Smith (1895-1943).
+ # Answer: 1895
+ #
+ # The original whitespace-tokenized answer will be "(1895-1943).". However
+ # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
+ # the exact answer, 1895.
+ #
+ # However, this is not always possible. Consider the following:
+ #
+ # Question: What country is the top exporter of electronics?
+ # Context: The Japanese electronics industry is the largest in the world.
+ # Answer: Japan
+ #
+ # In this case, the annotator chose "Japan" as a character sub-span of
+ # the word "Japanese". Since our WordPiece tokenizer does not split
+ # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
+ # in SQuAD, but does happen.
+ tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
+
+ for new_start in range(input_start, input_end + 1):
+ for new_end in range(input_end, new_start - 1, -1):
+ text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
+ if text_span == tok_answer_text:
+ return (new_start, new_end)
+
+ return (input_start, input_end)
+
+
+def _check_is_max_context(doc_spans, cur_span_index, position):
+ """Check if this is the 'max context' doc span for the token."""
+
+ # Because of the sliding window approach taken to scoring documents, a single
+ # token can appear in multiple documents. E.g.
+ # Doc: the man went to the store and bought a gallon of milk
+ # Span A: the man went to the
+ # Span B: to the store and bought
+ # Span C: and bought a gallon of
+ # ...
+ #
+ # Now the word 'bought' will have two scores from spans B and C. We only
+ # want to consider the score with "maximum context", which we define as
+ # the *minimum* of its left and right context (the *sum* of left and
+ # right context will always be the same, of course).
+ #
+ # In the example the maximum context for 'bought' would be span C since
+ # it has 1 left context and 3 right context, while span B has 4 left context
+ # and 0 right context.
+ best_score = None
+ best_span_index = None
+ for (span_index, doc_span) in enumerate(doc_spans):
+ end = doc_span.start + doc_span.length - 1
+ if position < doc_span.start:
+ continue
+ if position > end:
+ continue
+ num_left_context = position - doc_span.start
+ num_right_context = end - position
+ score = min(num_left_context,
+ num_right_context) + 0.01 * doc_span.length
+ if best_score is None or score > best_score:
+ best_score = score
+ best_span_index = span_index
+
+ return cur_span_index == best_span_index
+
+
+class DataProcessor(object):
+ def __init__(self, vocab_path, do_lower_case, max_seq_length, in_tokens,
+ doc_stride, max_query_length):
+ self._tokenizer = tokenization.FullTokenizer(
+ vocab_file=vocab_path, do_lower_case=do_lower_case)
+ self._max_seq_length = max_seq_length
+ self._doc_stride = doc_stride
+ self._max_query_length = max_query_length
+ self._in_tokens = in_tokens
+
+ self.vocab = self._tokenizer.vocab
+ self.vocab_size = len(self.vocab)
+ self.pad_id = self.vocab["[PAD]"]
+ self.cls_id = self.vocab["[CLS]"]
+ self.sep_id = self.vocab["[SEP]"]
+ self.mask_id = self.vocab["[MASK]"]
+
+ self.current_train_example = -1
+ self.num_train_examples = -1
+ self.current_train_epoch = -1
+
+ self.train_examples = None
+ self.predict_examples = None
+ self.num_examples = {'train': -1, 'predict': -1}
+
+ def get_train_progress(self):
+ """Gets progress for training phase."""
+ return self.current_train_example, self.current_train_epoch
+
+ def get_examples(self,
+ data_path,
+ is_training,
+ version_2_with_negative=False):
+ examples = read_squad_examples(
+ input_file=data_path,
+ is_training=is_training,
+ version_2_with_negative=version_2_with_negative)
+ return examples
+
+ def get_num_examples(self, phase):
+ if phase not in ['train', 'predict']:
+ raise ValueError(
+ "Unknown phase, which should be in ['train', 'predict'].")
+ return self.num_examples[phase]
+
+ def get_features(self, examples, is_training):
+ features = convert_examples_to_features(
+ examples=examples,
+ tokenizer=self._tokenizer,
+ max_seq_length=self._max_seq_length,
+ doc_stride=self._doc_stride,
+ max_query_length=self._max_query_length,
+ is_training=is_training)
+ return features
+
+ def data_generator(self,
+ data_path,
+ batch_size,
+ phase='train',
+ shuffle=False,
+ dev_count=1,
+ version_2_with_negative=False,
+ epoch=1):
+ if phase == 'train':
+ self.train_examples = self.get_examples(
+ data_path,
+ is_training=True,
+ version_2_with_negative=version_2_with_negative)
+ examples = self.train_examples
+ self.num_examples['train'] = len(self.train_examples)
+ elif phase == 'predict':
+ self.predict_examples = self.get_examples(
+ data_path,
+ is_training=False,
+ version_2_with_negative=version_2_with_negative)
+ examples = self.predict_examples
+ self.num_examples['predict'] = len(self.predict_examples)
+ else:
+ raise ValueError(
+ "Unknown phase, which should be in ['train', 'predict'].")
+
+ def batch_reader(features, batch_size, in_tokens):
+ batch, total_token_num, max_len = [], 0, 0
+ for (index, feature) in enumerate(features):
+ if phase == 'train':
+ self.current_train_example = index + 1
+ seq_len = len(feature.input_ids)
+ labels = [feature.unique_id
+ ] if feature.start_position is None else [
+ feature.start_position, feature.end_position
+ ]
+ example = [
+ feature.input_ids, feature.segment_ids, range(seq_len)
+ ] + labels
+ max_len = max(max_len, seq_len)
+
+ if in_tokens:
+ to_append = (len(batch) + 1) * max_len <= batch_size
+ else:
+ to_append = len(batch) < batch_size
+
+ if to_append:
+ batch.append(example)
+ total_token_num += seq_len
+ else:
+ yield batch, total_token_num
+ batch, total_token_num, max_len = [example
+ ], seq_len, seq_len
+ if len(batch) > 0:
+ yield batch, total_token_num
+
+ def wrapper():
+ for epoch_index in range(epoch):
+ if shuffle:
+ random.shuffle(examples)
+ if phase == 'train':
+ self.current_train_epoch = epoch_index
+ features = self.get_features(examples, is_training=True)
+ else:
+ features = self.get_features(examples, is_training=False)
+
+ all_dev_batches = []
+ for batch_data, total_token_num in batch_reader(
+ features, batch_size, self._in_tokens):
+ batch_data = prepare_batch_data(
+ batch_data,
+ total_token_num,
+ voc_size=-1,
+ pad_id=self.pad_id,
+ cls_id=self.cls_id,
+ sep_id=self.sep_id,
+ mask_id=-1,
+ return_input_mask=True,
+ return_max_len=False,
+ return_num_token=False)
+ if len(all_dev_batches) < dev_count:
+ all_dev_batches.append(batch_data)
+
+ if len(all_dev_batches) == dev_count:
+ for batch in all_dev_batches:
+ yield batch
+ all_dev_batches = []
+
+ return wrapper
+
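+# A minimal usage sketch of DataProcessor (the vocab/data paths and sizes
+# here are hypothetical):
+#
+#   processor = DataProcessor("vocab.txt", do_lower_case=True,
+#                             max_seq_length=384, in_tokens=False,
+#                             doc_stride=128, max_query_length=64)
+#   train_gen = processor.data_generator("train-v1.1.json", batch_size=32,
+#                                        phase='train', shuffle=True)
+#   for batch in train_gen():
+#       pass  # each `batch` comes padded from prepare_batch_data above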
+
+def write_predictions(all_examples, all_features, all_results, n_best_size,
+ max_answer_length, do_lower_case, output_prediction_file,
+ output_nbest_file, output_null_log_odds_file,
+ version_2_with_negative, null_score_diff_threshold,
+ verbose):
+ """Write final predictions to the json file and log-odds of null if needed."""
+ print("Writing predictions to: %s" % (output_prediction_file))
+ print("Writing nbest to: %s" % (output_nbest_file))
+
+ example_index_to_features = collections.defaultdict(list)
+ for feature in all_features:
+ example_index_to_features[feature.example_index].append(feature)
+
+ unique_id_to_result = {}
+ for result in all_results:
+ unique_id_to_result[result.unique_id] = result
+
+ _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
+ "PrelimPrediction", [
+ "feature_index", "start_index", "end_index", "start_logit",
+ "end_logit"
+ ])
+
+ all_predictions = collections.OrderedDict()
+ all_nbest_json = collections.OrderedDict()
+ scores_diff_json = collections.OrderedDict()
+
+ for (example_index, example) in enumerate(all_examples):
+ features = example_index_to_features[example_index]
+
+ prelim_predictions = []
+ # keep track of the minimum score of null start+end of position 0
+ score_null = 1000000 # large and positive
+ min_null_feature_index = 0 # the paragraph slice with min null score
+ null_start_logit = 0 # the start logit at the slice with min null score
+ null_end_logit = 0 # the end logit at the slice with min null score
+ for (feature_index, feature) in enumerate(features):
+ result = unique_id_to_result[feature.unique_id]
+ start_indexes = _get_best_indexes(result.start_logits, n_best_size)
+ end_indexes = _get_best_indexes(result.end_logits, n_best_size)
+ # if we could have irrelevant answers, get the min score of irrelevant
+ if version_2_with_negative:
+ feature_null_score = result.start_logits[
+ 0] + result.end_logits[0]
+ if feature_null_score < score_null:
+ score_null = feature_null_score
+ min_null_feature_index = feature_index
+ null_start_logit = result.start_logits[0]
+ null_end_logit = result.end_logits[0]
+ for start_index in start_indexes:
+ for end_index in end_indexes:
+ # We could hypothetically create invalid predictions, e.g., predict
+ # that the start of the span is in the question. We throw out all
+ # invalid predictions.
+ if start_index >= len(feature.tokens):
+ continue
+ if end_index >= len(feature.tokens):
+ continue
+ if start_index not in feature.token_to_orig_map:
+ continue
+ if end_index not in feature.token_to_orig_map:
+ continue
+ if not feature.token_is_max_context.get(start_index,
+ False):
+ continue
+ if end_index < start_index:
+ continue
+ length = end_index - start_index + 1
+ if length > max_answer_length:
+ continue
+ prelim_predictions.append(
+ _PrelimPrediction(
+ feature_index=feature_index,
+ start_index=start_index,
+ end_index=end_index,
+ start_logit=result.start_logits[start_index],
+ end_logit=result.end_logits[end_index]))
+
+ if version_2_with_negative:
+ prelim_predictions.append(
+ _PrelimPrediction(
+ feature_index=min_null_feature_index,
+ start_index=0,
+ end_index=0,
+ start_logit=null_start_logit,
+ end_logit=null_end_logit))
+ prelim_predictions = sorted(
+ prelim_predictions,
+ key=lambda x: (x.start_logit + x.end_logit),
+ reverse=True)
+
+ _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
+ "NbestPrediction", ["text", "start_logit", "end_logit"])
+
+ seen_predictions = {}
+ nbest = []
+ for pred in prelim_predictions:
+ if len(nbest) >= n_best_size:
+ break
+ feature = features[pred.feature_index]
+ if pred.start_index > 0: # this is a non-null prediction
+ tok_tokens = feature.tokens[pred.start_index:(pred.end_index +
+ 1)]
+ orig_doc_start = feature.token_to_orig_map[pred.start_index]
+ orig_doc_end = feature.token_to_orig_map[pred.end_index]
+ orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end +
+ 1)]
+ tok_text = " ".join(tok_tokens)
+
+ # De-tokenize WordPieces that have been split off.
+ tok_text = tok_text.replace(" ##", "")
+ tok_text = tok_text.replace("##", "")
+
+ # Clean whitespace
+ tok_text = tok_text.strip()
+ tok_text = " ".join(tok_text.split())
+ orig_text = " ".join(orig_tokens)
+
+ final_text = get_final_text(tok_text, orig_text, do_lower_case,
+ verbose)
+ if final_text in seen_predictions:
+ continue
+
+ seen_predictions[final_text] = True
+ else:
+ final_text = ""
+ seen_predictions[final_text] = True
+
+ nbest.append(
+ _NbestPrediction(
+ text=final_text,
+ start_logit=pred.start_logit,
+ end_logit=pred.end_logit))
+
+ # if we didn't include the empty option in the n-best, include it
+ if version_2_with_negative:
+ if "" not in seen_predictions:
+ nbest.append(
+ _NbestPrediction(
+ text="",
+ start_logit=null_start_logit,
+ end_logit=null_end_logit))
+ # In very rare edge cases we could have no valid predictions. So we
+ # just create a nonce prediction in this case to avoid failure.
+ if not nbest:
+ nbest.append(
+ _NbestPrediction(
+ text="empty", start_logit=0.0, end_logit=0.0))
+
+ assert len(nbest) >= 1
+
+ total_scores = []
+ best_non_null_entry = None
+ for entry in nbest:
+ total_scores.append(entry.start_logit + entry.end_logit)
+ if not best_non_null_entry:
+ if entry.text:
+ best_non_null_entry = entry
+ if best_non_null_entry is None:
+ print("Warning: no non-null prediction was found for this example.")
+
+ probs = _compute_softmax(total_scores)
+
+ nbest_json = []
+ for (i, entry) in enumerate(nbest):
+ output = collections.OrderedDict()
+ output["text"] = entry.text
+ output["probability"] = probs[i]
+ output["start_logit"] = entry.start_logit
+ output["end_logit"] = entry.end_logit
+ nbest_json.append(output)
+
+ assert len(nbest_json) >= 1
+
+ if not version_2_with_negative:
+ all_predictions[example.qas_id] = nbest_json[0]["text"]
+ else:
+ # predict "" iff the null score - the score of best non-null > threshold
+ score_diff = score_null - best_non_null_entry.start_logit - (
+ best_non_null_entry.end_logit)
+ scores_diff_json[example.qas_id] = score_diff
+ if score_diff > null_score_diff_threshold:
+ all_predictions[example.qas_id] = ""
+ else:
+ all_predictions[example.qas_id] = best_non_null_entry.text
+
+ all_nbest_json[example.qas_id] = nbest_json
+
+ with open(output_prediction_file, "w") as writer:
+ writer.write(json.dumps(all_predictions, indent=4) + "\n")
+
+ with open(output_nbest_file, "w") as writer:
+ writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+
+ if version_2_with_negative:
+ with open(output_null_log_odds_file, "w") as writer:
+ writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+
+def get_final_text(pred_text, orig_text, do_lower_case, verbose):
+ """Project the tokenized prediction back to the original text."""
+
+ # When we created the data, we kept track of the alignment between original
+ # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
+ # now `orig_text` contains the span of our original text corresponding to the
+ # span that we predicted.
+ #
+ # However, `orig_text` may contain extra characters that we don't want in
+ # our prediction.
+ #
+ # For example, let's say:
+ # pred_text = steve smith
+ # orig_text = Steve Smith's
+ #
+ # We don't want to return `orig_text` because it contains the extra "'s".
+ #
+ # We don't want to return `pred_text` because it's already been normalized
+ # (the SQuAD eval script also does punctuation stripping/lower casing but
+ # our tokenizer does additional normalization like stripping accent
+ # characters).
+ #
+ # What we really want to return is "Steve Smith".
+ #
+ # Therefore, we have to apply a semi-complicated alignment heuristic between
+ # `pred_text` and `orig_text` to get a character-to-character alignment. This
+ # can fail in certain cases, in which case we just return `orig_text`.
+
+ def _strip_spaces(text):
+ ns_chars = []
+ ns_to_s_map = collections.OrderedDict()
+ for (i, c) in enumerate(text):
+ if c == " ":
+ continue
+ ns_to_s_map[len(ns_chars)] = i
+ ns_chars.append(c)
+ ns_text = "".join(ns_chars)
+ return (ns_text, ns_to_s_map)
+
+ # We first tokenize `orig_text`, strip whitespace from the result
+ # and `pred_text`, and check if they are the same length. If they are
+ # NOT the same length, the heuristic has failed. If they are the same
+ # length, we assume the characters are one-to-one aligned.
+ tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
+
+ tok_text = " ".join(tokenizer.tokenize(orig_text))
+
+ start_position = tok_text.find(pred_text)
+ if start_position == -1:
+ if verbose:
+ print("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
+ return orig_text
+ end_position = start_position + len(pred_text) - 1
+
+ (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
+ (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
+
+ if len(orig_ns_text) != len(tok_ns_text):
+ if verbose:
+ print("Length not equal after stripping spaces: '%s' vs '%s'",
+ orig_ns_text, tok_ns_text)
+ return orig_text
+
+ # We then project the characters in `pred_text` back to `orig_text` using
+ # the character-to-character alignment.
+ tok_s_to_ns_map = {}
+ for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
+ tok_s_to_ns_map[tok_index] = i
+
+ orig_start_position = None
+ if start_position in tok_s_to_ns_map:
+ ns_start_position = tok_s_to_ns_map[start_position]
+ if ns_start_position in orig_ns_to_s_map:
+ orig_start_position = orig_ns_to_s_map[ns_start_position]
+
+ if orig_start_position is None:
+ if verbose:
+ print("Couldn't map start position")
+ return orig_text
+
+ orig_end_position = None
+ if end_position in tok_s_to_ns_map:
+ ns_end_position = tok_s_to_ns_map[end_position]
+ if ns_end_position in orig_ns_to_s_map:
+ orig_end_position = orig_ns_to_s_map[ns_end_position]
+
+ if orig_end_position is None:
+ if verbose:
+ print("Couldn't map end position")
+ return orig_text
+
+ output_text = orig_text[orig_start_position:(orig_end_position + 1)]
+ return output_text
+
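+# Following the "Steve Smith" example in the comment above, a sketch of the
+# projection (assuming a standard lowercasing vocab):
+#   get_final_text("steve smith", "Steve Smith's", True, False)
+# tokenizes orig_text to "steve smith ' s", aligns the space-stripped
+# strings character by character, and returns "Steve Smith".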
+
+def _get_best_indexes(logits, n_best_size):
+ """Get the n-best logits from a list."""
+ index_and_score = sorted(
+ enumerate(logits), key=lambda x: x[1], reverse=True)
+
+ best_indexes = []
+ for i in range(len(index_and_score)):
+ if i >= n_best_size:
+ break
+ best_indexes.append(index_and_score[i][0])
+ return best_indexes
+
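+# e.g. _get_best_indexes([0.1, 0.9, 0.5], n_best_size=2) returns [1, 2].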
+
+def _compute_softmax(scores):
+ """Compute softmax probability over raw logits."""
+ if not scores:
+ return []
+
+ max_score = None
+ for score in scores:
+ if max_score is None or score > max_score:
+ max_score = score
+
+ exp_scores = []
+ total_sum = 0.0
+ for score in scores:
+ x = math.exp(score - max_score)
+ exp_scores.append(x)
+ total_sum += x
+
+ probs = []
+ for score in exp_scores:
+ probs.append(score / total_sum)
+ return probs
+
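+# e.g. _compute_softmax([0.0, 0.0]) returns [0.5, 0.5]; subtracting
+# max_score before math.exp keeps large logits from overflowing.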
+
+if __name__ == '__main__':
+ train_file = 'squad/train-v1.1.json'
+ vocab_file = 'uncased_L-12_H-768_A-12/vocab.txt'
+ do_lower_case = True
+ tokenizer = tokenization.FullTokenizer(
+ vocab_file=vocab_file, do_lower_case=do_lower_case)
+ train_examples = read_squad_examples(
+ input_file=train_file, is_training=True)
+ print("begin converting")
+ for (index, feature) in enumerate(
+ convert_examples_to_features(
+ examples=train_examples,
+ tokenizer=tokenizer,
+ max_seq_length=384,
+ doc_stride=128,
+ max_query_length=64,
+ is_training=True,
+ )):
+ if index < 10:
+ print(index, feature.input_ids, feature.input_mask,
+ feature.segment_ids)
diff --git a/paddleslim/nas/darts/search_space/conv_bert/reader/tokenization.py b/paddleslim/nas/darts/search_space/conv_bert/reader/tokenization.py
new file mode 100644
index 0000000000000000000000000000000000000000..08570f30fe9e6a8036a15095e67e6e8dd8686c14
--- /dev/null
+++ b/paddleslim/nas/darts/search_space/conv_bert/reader/tokenization.py
@@ -0,0 +1,371 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import unicodedata
+import six
+import io
+
+
+def convert_to_unicode(text):
+ """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+ if six.PY3:
+ if isinstance(text, str):
+ return text
+ elif isinstance(text, bytes):
+ return text.decode("utf-8", "ignore")
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
+ elif six.PY2:
+ if isinstance(text, str):
+ return text.decode("utf-8", "ignore")
+ elif isinstance(text, unicode):
+ return text
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
+ else:
+ raise ValueError("Not running on Python2 or Python 3?")
+
+
+def printable_text(text):
+ """Returns text encoded in a way suitable for print or `tf.logging`."""
+
+ # These functions want `str` for both Python2 and Python3, but in one case
+ # it's a Unicode string and in the other it's a byte string.
+ if six.PY3:
+ if isinstance(text, str):
+ return text
+ elif isinstance(text, bytes):
+ return text.decode("utf-8", "ignore")
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
+ elif six.PY2:
+ if isinstance(text, str):
+ return text
+ elif isinstance(text, unicode):
+ return text.encode("utf-8")
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
+ else:
+ raise ValueError("Not running on Python2 or Python 3?")
+
+
+def load_vocab(vocab_file):
+ """Loads a vocabulary file into a dictionary."""
+ vocab = collections.OrderedDict()
+ fin = io.open(vocab_file, encoding="utf8")
+ for num, line in enumerate(fin):
+ items = convert_to_unicode(line.strip()).split("\t")
+ if len(items) > 2:
+ break
+ token = items[0]
+ index = items[1] if len(items) == 2 else num
+ token = token.strip()
+ vocab[token] = int(index)
+ return vocab
+
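+# The expected layout is one token per line, optionally "token<TAB>id".
+# As a sketch, a file with the lines "[PAD]", "[UNK]", "the" loads as
+# OrderedDict([("[PAD]", 0), ("[UNK]", 1), ("the", 2)]).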
+
+def convert_by_vocab(vocab, items):
+ """Converts a sequence of [tokens|ids] using the vocab."""
+ output = []
+ for item in items:
+ output.append(vocab[item])
+ return output
+
+
+def convert_tokens_to_ids(vocab, tokens):
+ return convert_by_vocab(vocab, tokens)
+
+
+def convert_ids_to_tokens(inv_vocab, ids):
+ return convert_by_vocab(inv_vocab, ids)
+
+
+def whitespace_tokenize(text):
+ """Runs basic whitespace cleaning and splitting on a peice of text."""
+ text = text.strip()
+ if not text:
+ return []
+ tokens = text.split()
+ return tokens
+
+
+class FullTokenizer(object):
+ """Runs end-to-end tokenziation."""
+
+ def __init__(self, vocab_file, do_lower_case=True):
+ self.vocab = load_vocab(vocab_file)
+ self.inv_vocab = {v: k for k, v in self.vocab.items()}
+ self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+ def tokenize(self, text):
+ split_tokens = []
+ for token in self.basic_tokenizer.tokenize(text):
+ for sub_token in self.wordpiece_tokenizer.tokenize(token):
+ split_tokens.append(sub_token)
+
+ return split_tokens
+
+ def convert_tokens_to_ids(self, tokens):
+ return convert_by_vocab(self.vocab, tokens)
+
+ def convert_ids_to_tokens(self, ids):
+ return convert_by_vocab(self.inv_vocab, ids)
+
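+# A minimal usage sketch (the vocab path is hypothetical and the exact
+# split depends on the vocabulary):
+#
+#   tokenizer = FullTokenizer("vocab.txt", do_lower_case=True)
+#   tokens = tokenizer.tokenize("Unaffable!")  # e.g. ["un", "##aff", "##able", "!"]
+#   ids = tokenizer.convert_tokens_to_ids(tokens)
+#   assert tokenizer.convert_ids_to_tokens(ids) == tokens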
+
+class CharTokenizer(object):
+ """Runs end-to-end tokenziation."""
+
+ def __init__(self, vocab_file, do_lower_case=True):
+ self.vocab = load_vocab(vocab_file)
+ self.inv_vocab = {v: k for k, v in self.vocab.items()}
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+ def tokenize(self, text):
+ split_tokens = []
+ for token in text.lower().split(" "):
+ for sub_token in self.wordpiece_tokenizer.tokenize(token):
+ split_tokens.append(sub_token)
+
+ return split_tokens
+
+ def convert_tokens_to_ids(self, tokens):
+ return convert_by_vocab(self.vocab, tokens)
+
+ def convert_ids_to_tokens(self, ids):
+ return convert_by_vocab(self.inv_vocab, ids)
+
+
+class BasicTokenizer(object):
+ """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+
+ def __init__(self, do_lower_case=True):
+ """Constructs a BasicTokenizer.
+
+ Args:
+ do_lower_case: Whether to lower case the input.
+ """
+ self.do_lower_case = do_lower_case
+
+ def tokenize(self, text):
+ """Tokenizes a piece of text."""
+ text = convert_to_unicode(text)
+ text = self._clean_text(text)
+
+ # This was added on November 1st, 2018 for the multilingual and Chinese
+ # models. This is also applied to the English models now, but it doesn't
+ # matter since the English models were not trained on any Chinese data
+ # and generally don't have any Chinese data in them (there are Chinese
+ # characters in the vocabulary because Wikipedia does have some Chinese
+ # words in the English Wikipedia).
+ text = self._tokenize_chinese_chars(text)
+
+ orig_tokens = whitespace_tokenize(text)
+ split_tokens = []
+ for token in orig_tokens:
+ if self.do_lower_case:
+ token = token.lower()
+ token = self._run_strip_accents(token)
+ split_tokens.extend(self._run_split_on_punc(token))
+
+ output_tokens = whitespace_tokenize(" ".join(split_tokens))
+ return output_tokens
+
+ def _run_strip_accents(self, text):
+ """Strips accents from a piece of text."""
+ text = unicodedata.normalize("NFD", text)
+ output = []
+ for char in text:
+ cat = unicodedata.category(char)
+ if cat == "Mn":
+ continue
+ output.append(char)
+ return "".join(output)
+
+ def _run_split_on_punc(self, text):
+ """Splits punctuation on a piece of text."""
+ chars = list(text)
+ i = 0
+ start_new_word = True
+ output = []
+ while i < len(chars):
+ char = chars[i]
+ if _is_punctuation(char):
+ output.append([char])
+ start_new_word = True
+ else:
+ if start_new_word:
+ output.append([])
+ start_new_word = False
+ output[-1].append(char)
+ i += 1
+
+ return ["".join(x) for x in output]
+
+ def _tokenize_chinese_chars(self, text):
+ """Adds whitespace around any CJK character."""
+ output = []
+ for char in text:
+ cp = ord(char)
+ if self._is_chinese_char(cp):
+ output.append(" ")
+ output.append(char)
+ output.append(" ")
+ else:
+ output.append(char)
+ return "".join(output)
+
+ def _is_chinese_char(self, cp):
+ """Checks whether CP is the codepoint of a CJK character."""
+ # This defines a "chinese character" as anything in the CJK Unicode block:
+ # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+ #
+ # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+ # despite its name. The modern Korean Hangul alphabet is a different block,
+ # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+ # space-separated words, so they are not treated specially and handled
+ # like all of the other languages.
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
+ (cp >= 0x3400 and cp <= 0x4DBF) or #
+ (cp >= 0x20000 and cp <= 0x2A6DF) or #
+ (cp >= 0x2A700 and cp <= 0x2B73F) or #
+ (cp >= 0x2B740 and cp <= 0x2B81F) or #
+ (cp >= 0x2B820 and cp <= 0x2CEAF) or
+ (cp >= 0xF900 and cp <= 0xFAFF) or #
+ (cp >= 0x2F800 and cp <= 0x2FA1F)): #
+ return True
+
+ return False
+
+ def _clean_text(self, text):
+ """Performs invalid character removal and whitespace cleanup on text."""
+ output = []
+ for char in text:
+ cp = ord(char)
+ if cp == 0 or cp == 0xfffd or _is_control(char):
+ continue
+ if _is_whitespace(char):
+ output.append(" ")
+ else:
+ output.append(char)
+ return "".join(output)
+
+
+class WordpieceTokenizer(object):
+ """Runs WordPiece tokenziation."""
+
+ def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
+ self.vocab = vocab
+ self.unk_token = unk_token
+ self.max_input_chars_per_word = max_input_chars_per_word
+
+ def tokenize(self, text):
+ """Tokenizes a piece of text into its word pieces.
+
+ This uses a greedy longest-match-first algorithm to perform tokenization
+ using the given vocabulary.
+
+ For example:
+ input = "unaffable"
+ output = ["un", "##aff", "##able"]
+
+ Args:
+ text: A single token or whitespace separated tokens. This should have
+ already been passed through `BasicTokenizer`.
+
+ Returns:
+ A list of wordpiece tokens.
+ """
+
+ text = convert_to_unicode(text)
+
+ output_tokens = []
+ for token in whitespace_tokenize(text):
+ chars = list(token)
+ if len(chars) > self.max_input_chars_per_word:
+ output_tokens.append(self.unk_token)
+ continue
+
+ is_bad = False
+ start = 0
+ sub_tokens = []
+ while start < len(chars):
+ end = len(chars)
+ cur_substr = None
+ while start < end:
+ substr = "".join(chars[start:end])
+ if start > 0:
+ substr = "##" + substr
+ if substr in self.vocab:
+ cur_substr = substr
+ break
+ end -= 1
+ if cur_substr is None:
+ is_bad = True
+ break
+ sub_tokens.append(cur_substr)
+ start = end
+
+ if is_bad:
+ output_tokens.append(self.unk_token)
+ else:
+ output_tokens.extend(sub_tokens)
+ return output_tokens
+
+
+def _is_whitespace(char):
+ """Checks whether `chars` is a whitespace character."""
+ # \t, \n, and \r are technically contorl characters but we treat them
+ # as whitespace since they are generally considered as such.
+ if char == " " or char == "\t" or char == "\n" or char == "\r":
+ return True
+ cat = unicodedata.category(char)
+ if cat == "Zs":
+ return True
+ return False
+
+
+def _is_control(char):
+ """Checks whether `chars` is a control character."""
+ # These are technically control characters but we count them as whitespace
+ # characters.
+ if char == "\t" or char == "\n" or char == "\r":
+ return False
+ cat = unicodedata.category(char)
+ if cat.startswith("C"):
+ return True
+ return False
+
+
+def _is_punctuation(char):
+ """Checks whether `chars` is a punctuation character."""
+ cp = ord(char)
+ # We treat all non-letter/number ASCII as punctuation.
+ # Characters such as "^", "$", and "`" are not in the Unicode
+ # Punctuation class but we treat them as punctuation anyways, for
+ # consistency.
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
+ (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
+ return True
+ cat = unicodedata.category(char)
+ if cat.startswith("P"):
+ return True
+ return False
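+
+
+# e.g. _is_punctuation("$") is True via the ASCII range check (ord("$") is
+# 36), even though unicodedata.category("$") is "Sc" rather than a "P" class.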
diff --git a/paddleslim/nas/darts/search_space/conv_bert/utils/__init__.py b/paddleslim/nas/darts/search_space/conv_bert/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/paddleslim/nas/darts/search_space/conv_bert/utils/convert_static_to_dygraph.py b/paddleslim/nas/darts/search_space/conv_bert/utils/convert_static_to_dygraph.py
new file mode 100755
index 0000000000000000000000000000000000000000..cbd4f7f74003cbcb1f7f800e7f72e69fbbb3a5f9
--- /dev/null
+++ b/paddleslim/nas/darts/search_space/conv_bert/utils/convert_static_to_dygraph.py
@@ -0,0 +1,228 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import shutil
+import sys
+import os
+
+
+def usage():
+ """
+ usage information
+ """
+ print()
+ print("please use command: ")
+ print(
+ "python convert_static_to_dygraph.py input_params_dir output_params_dir"
+ )
+ print()
+
+
+def convert_static_to_dygraph(static_model_path, dygraph_model_path):
+ """
+ convert paddle static bert model to dygraph model
+ """
+
+ def mkdir(path):
+ if not os.path.isdir(path):
+ if os.path.split(path)[0]:
+ mkdir(os.path.split(path)[0])
+ else:
+ return
+ os.mkdir(path)
+
+ if os.path.exists(dygraph_model_path):
+ shutil.rmtree(dygraph_model_path)
+ mkdir(dygraph_model_path)
+
+ if not os.path.exists(static_model_path):
+ print("paddle static model path doesn't exist.....")
+ return -1
+
+ file_list = []
+ for root, dirs, files in os.walk(static_model_path):
+ file_list.extend(files)
+
+ os.makedirs(os.path.join(dygraph_model_path, "PretrainModelLayer_0"))
+ os.makedirs(
+ os.path.join(dygraph_model_path,
+ "PretrainModelLayer_0/BertModelLayer_0"))
+ os.makedirs(
+ os.path.join(dygraph_model_path,
+ "PretrainModelLayer_0/PrePostProcessLayer_0"))
+ os.makedirs(
+ os.path.join(
+ dygraph_model_path,
+ "PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0"))
+
+ #os.chdir(static_model_path)
+ #convert embedding file
+ embedding_type = ["word", "pos", "sent"]
+ for i in range(3):
+ src_name = embedding_type[i] + "_embedding"
+ trg_name = "Embedding_" + str(i) + "." + src_name
+ shutil.copyfile(
+ os.path.join(static_model_path, src_name),
+ os.path.join(dygraph_model_path,
+ "PretrainModelLayer_0/BertModelLayer_0/" + trg_name))
+
+ #convert pre_encoder file
+ shutil.copyfile(
+ os.path.join(static_model_path, "pre_encoder_layer_norm_scale"),
+ os.path.join(
+ dygraph_model_path,
+ "PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_scale"
+ ))
+ shutil.copyfile(
+ os.path.join(static_model_path, "pre_encoder_layer_norm_bias"),
+ os.path.join(
+ dygraph_model_path,
+ "PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_bias"
+ ))
+
+ #convert mask lm params file
+ shutil.copyfile(
+ os.path.join(static_model_path, "mask_lm_out_fc.b_0"),
+ os.path.join(dygraph_model_path,
+ "PretrainModelLayer_0/Layer_0.mask_lm_out_fc.b_0"))
+ shutil.copyfile(
+ os.path.join(static_model_path, "mask_lm_trans_fc.b_0"),
+ os.path.join(dygraph_model_path,
+ "PretrainModelLayer_0/FC_0.mask_lm_trans_fc.b_0"))
+ shutil.copyfile(
+ os.path.join(static_model_path, "mask_lm_trans_fc.w_0"),
+ os.path.join(dygraph_model_path,
+ "PretrainModelLayer_0/FC_0.mask_lm_trans_fc.w_0"))
+ shutil.copyfile(
+ os.path.join(static_model_path, "mask_lm_trans_layer_norm_bias"),
+ os.path.join(
+ dygraph_model_path,
+ "PretrainModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_bias"
+ ))
+ shutil.copyfile(
+ os.path.join(static_model_path, "mask_lm_trans_layer_norm_scale"),
+ os.path.join(
+ dygraph_model_path,
+ "PretrainModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_scale"
+ ))
+ shutil.copyfile(
+ os.path.join(static_model_path, "next_sent_fc.b_0"),
+ os.path.join(dygraph_model_path,
+ "PretrainModelLayer_0/FC_1.next_sent_fc.b_0"))
+ shutil.copyfile(
+ os.path.join(static_model_path, "next_sent_fc.w_0"),
+ os.path.join(dygraph_model_path,
+ "PretrainModelLayer_0/FC_1.next_sent_fc.w_0"))
+ shutil.copyfile(
+ os.path.join(static_model_path, "pooled_fc.b_0"),
+ os.path.join(
+ dygraph_model_path,
+ "PretrainModelLayer_0/BertModelLayer_0/FC_0.pooled_fc.b_0"))
+ shutil.copyfile(
+ os.path.join(static_model_path, "pooled_fc.w_0"),
+ os.path.join(
+ dygraph_model_path,
+ "PretrainModelLayer_0/BertModelLayer_0/FC_0.pooled_fc.w_0"))
+
+ encoder_num = 0
+ for f in file_list:
+ if not f.startswith("encoder_layer"):
+ continue
+ layer_num = f.split('_')[2]
+ if int(layer_num) > encoder_num:
+ encoder_num = int(layer_num)
+
+ encoder_num += 1
+ for i in range(encoder_num):
+ encoder_dir = "EncoderSubLayer_" + str(i)
+ os.makedirs(
+ os.path.join(dygraph_model_path,
+ "PretrainModelLayer_0/BertModelLayer_0/" +
+ "EncoderLayer_0/", encoder_dir))
+ os.makedirs(
+ os.path.join(dygraph_model_path,
+ "PretrainModelLayer_0/BertModelLayer_0/" +
+ "EncoderLayer_0/", encoder_dir +
+ "/PositionwiseFeedForwardLayer_0"))
+ os.makedirs(
+ os.path.join(
+ dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
+ "EncoderLayer_0/", encoder_dir + "/MultiHeadAttentionLayer_0"))
+ os.makedirs(
+ os.path.join(
+ dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
+ "EncoderLayer_0/", encoder_dir + "/PrePostProcessLayer_1"))
+ os.makedirs(
+ os.path.join(
+ dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
+ "EncoderLayer_0/", encoder_dir + "/PrePostProcessLayer_3"))
+
+ encoder_map_dict = {
+ "ffn_fc_0.b_0":
+ ("PositionwiseFeedForwardLayer_0", "FC_0.ffn_fc_0.b_0"),
+ "ffn_fc_0.w_0":
+ ("PositionwiseFeedForwardLayer_0", "FC_0.ffn_fc_0.w_0"),
+ "ffn_fc_1.b_0":
+ ("PositionwiseFeedForwardLayer_0", "FC_1.ffn_fc_1.b_0"),
+ "ffn_fc_1.w_0":
+ ("PositionwiseFeedForwardLayer_0", "FC_1.ffn_fc_1.w_0"),
+ "multi_head_att_key_fc.b_0":
+ ("MultiHeadAttentionLayer_0", "FC_1.key_fc.b_0"),
+ "multi_head_att_key_fc.w_0":
+ ("MultiHeadAttentionLayer_0", "FC_1.key_fc.w_0"),
+ "multi_head_att_output_fc.b_0":
+ ("MultiHeadAttentionLayer_0", "FC_3.output_fc.b_0"),
+ "multi_head_att_output_fc.w_0":
+ ("MultiHeadAttentionLayer_0", "FC_3.output_fc.w_0"),
+ "multi_head_att_query_fc.b_0":
+ ("MultiHeadAttentionLayer_0", "FC_0.query_fc.b_0"),
+ "multi_head_att_query_fc.w_0":
+ ("MultiHeadAttentionLayer_0", "FC_0.query_fc.w_0"),
+ "multi_head_att_value_fc.b_0":
+ ("MultiHeadAttentionLayer_0", "FC_2.value_fc.b_0"),
+ "multi_head_att_value_fc.w_0":
+ ("MultiHeadAttentionLayer_0", "FC_2.value_fc.w_0"),
+ "post_att_layer_norm_bias":
+ ("PrePostProcessLayer_1", "LayerNorm_0.post_att_layer_norm_bias"),
+ "post_att_layer_norm_scale":
+ ("PrePostProcessLayer_1", "LayerNorm_0.post_att_layer_norm_scale"),
+ "post_ffn_layer_norm_bias":
+ ("PrePostProcessLayer_3", "LayerNorm_0.post_ffn_layer_norm_bias"),
+ "post_ffn_layer_norm_scale":
+ ("PrePostProcessLayer_3", "LayerNorm_0.post_ffn_layer_norm_scale")
+ }
+
+ for f in file_list:
+ if not f.startswith("encoder_layer"):
+ continue
+ layer_num = f.split('_')[2]
+ suffix_name = "_".join(f.split('_')[3:])
+ in_dir = encoder_map_dict[suffix_name][0]
+ rename = encoder_map_dict[suffix_name][1]
+ encoder_layer = "EncoderSubLayer_" + layer_num
+ shutil.copyfile(
+ os.path.join(static_model_path, f),
+ os.path.join(
+ dygraph_model_path,
+ "PretrainModelLayer_0/BertModelLayer_0/EncoderLayer_0/" +
+ encoder_layer + "/" + in_dir + "/" + rename))
+
+
+if __name__ == "__main__":
+
+ if len(sys.argv) < 3:
+ usage()
+ sys.exit(1)
+ static_model_path = sys.argv[1]
+ dygraph_model_path = sys.argv[2]
+ convert_static_to_dygraph(static_model_path, dygraph_model_path)
diff --git a/paddleslim/nas/darts/search_space/conv_bert/utils/fp16.py b/paddleslim/nas/darts/search_space/conv_bert/utils/fp16.py
new file mode 100644
index 0000000000000000000000000000000000000000..e153c2b9a1029897def264278c5dbe72e1f369f5
--- /dev/null
+++ b/paddleslim/nas/darts/search_space/conv_bert/utils/fp16.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import paddle
+import paddle.fluid as fluid
+
+
+def cast_fp16_to_fp32(i, o, prog):
+ prog.global_block().append_op(
+ type="cast",
+ inputs={"X": i},
+ outputs={"Out": o},
+ attrs={
+ "in_dtype": fluid.core.VarDesc.VarType.FP16,
+ "out_dtype": fluid.core.VarDesc.VarType.FP32
+ })
+
+
+def cast_fp32_to_fp16(i, o, prog):
+ prog.global_block().append_op(
+ type="cast",
+ inputs={"X": i},
+ outputs={"Out": o},
+ attrs={
+ "in_dtype": fluid.core.VarDesc.VarType.FP32,
+ "out_dtype": fluid.core.VarDesc.VarType.FP16
+ })
+
+
+def copy_to_master_param(p, block):
+ v = block.vars.get(p.name, None)
+ if v is None:
+ raise ValueError("no param name %s found!" % p.name)
+ new_p = fluid.framework.Parameter(
+ block=block,
+ shape=v.shape,
+ dtype=fluid.core.VarDesc.VarType.FP32,
+ type=v.type,
+ lod_level=v.lod_level,
+ stop_gradient=p.stop_gradient,
+ trainable=p.trainable,
+ optimize_attr=p.optimize_attr,
+ regularizer=p.regularizer,
+ gradient_clip_attr=p.gradient_clip_attr,
+ error_clip=p.error_clip,
+ name=v.name + ".master")
+ return new_p
+
+
+def create_master_params_grads(params_grads, main_prog, startup_prog,
+ loss_scaling):
+ master_params_grads = []
+ tmp_role = main_prog._current_role
+ OpRole = fluid.core.op_proto_and_checker_maker.OpRole
+ main_prog._current_role = OpRole.Backward
+ for p, g in params_grads:
+ # create master parameters
+ master_param = copy_to_master_param(p, main_prog.global_block())
+ startup_master_param = startup_prog.global_block()._clone_variable(
+ master_param)
+ startup_p = startup_prog.global_block().var(p.name)
+ cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog)
+ # cast fp16 gradients to fp32 before apply gradients
+ if g.name.find("layer_norm") > -1:
+ if loss_scaling > 1:
+ scaled_g = g / float(loss_scaling)
+ else:
+ scaled_g = g
+ master_params_grads.append([p, scaled_g])
+ continue
+ master_grad = fluid.layers.cast(g, "float32")
+ if loss_scaling > 1:
+ master_grad = master_grad / float(loss_scaling)
+ master_params_grads.append([master_param, master_grad])
+ main_prog._current_role = tmp_role
+ return master_params_grads
+
+
+def master_param_to_train_param(master_params_grads, params_grads, main_prog):
+ for idx, m_p_g in enumerate(master_params_grads):
+ train_p, _ = params_grads[idx]
+ if train_p.name.find("layer_norm") > -1:
+ continue
+ with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
+ cast_fp32_to_fp16(m_p_g[0], train_p, main_prog)
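+
+
+# A sketch of the intended call order inside a standard fluid training
+# program (the optimizer, loss, and programs are assumed to exist):
+#
+#   params_grads = optimizer.backward(loss)
+#   master_params_grads = create_master_params_grads(
+#       params_grads, train_program, startup_program, loss_scaling=128.0)
+#   optimizer.apply_gradients(master_params_grads)
+#   master_param_to_train_param(master_params_grads, params_grads,
+#                               train_program)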
diff --git a/paddleslim/nas/darts/search_space/conv_bert/utils/init.py b/paddleslim/nas/darts/search_space/conv_bert/utils/init.py
new file mode 100644
index 0000000000000000000000000000000000000000..52f9b38082fd79258c292c9970e3d65ffb9a2d52
--- /dev/null
+++ b/paddleslim/nas/darts/search_space/conv_bert/utils/init.py
@@ -0,0 +1,245 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import six
+import ast
+import copy
+
+import numpy as np
+import paddle.fluid as fluid
+
+
+def cast_fp32_to_fp16(exe, main_program):
+ print("Cast parameters to float16 data format.")
+ for param in main_program.global_block().all_parameters():
+ if not param.name.endswith(".master"):
+ param_t = fluid.global_scope().find_var(param.name).get_tensor()
+ data = np.array(param_t)
+ if param.name.find("layer_norm") == -1:
+ param_t.set(np.float16(data).view(np.uint16), exe.place)
+ master_param_var = fluid.global_scope().find_var(param.name +
+ ".master")
+ if master_param_var is not None:
+ master_param_var.get_tensor().set(data, exe.place)
+
+
+def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
+ assert os.path.exists(
+ init_checkpoint_path), "[%s] cannot be found." % init_checkpoint_path
+
+ def existed_persistables(var):
+ if not fluid.io.is_persistable(var):
+ return False
+ return os.path.exists(os.path.join(init_checkpoint_path, var.name))
+
+ fluid.io.load_vars(
+ exe,
+ init_checkpoint_path,
+ main_program=main_program,
+ predicate=existed_persistables)
+ print("Load model from {}".format(init_checkpoint_path))
+
+ if use_fp16:
+ cast_fp32_to_fp16(exe, main_program)
+
+
+def init_pretraining_params(exe,
+ pretraining_params_path,
+ main_program,
+ use_fp16=False):
+ assert os.path.exists(pretraining_params_path
+ ), "[%s] cann't be found." % pretraining_params_path
+
+ def existed_params(var):
+ if not isinstance(var, fluid.framework.Parameter):
+ return False
+ return os.path.exists(os.path.join(pretraining_params_path, var.name))
+
+ fluid.io.load_vars(
+ exe,
+ pretraining_params_path,
+ main_program=main_program,
+ predicate=existed_params)
+ print("Load pretraining parameters from {}.".format(
+ pretraining_params_path))
+
+ if use_fp16:
+ cast_fp32_to_fp16(exe, main_program)
+
+
+def init_from_static_model(dir_path, cls_model, bert_config):
+ def load_numpy_weight(file_name):
+ if six.PY2:
+ res = np.load(os.path.join(dir_path, file_name), allow_pickle=True)
+ else:
+ res = np.load(
+ os.path.join(dir_path, file_name),
+ allow_pickle=True,
+ encoding='latin1')
+ assert res is not None
+ return res
+
+ # load word embedding
+ _param = load_numpy_weight("word_embedding")
+ cls_model.bert_layer._src_emb.set_dict({"weight": _param})
+ print("INIT word embedding")
+
+ _param = load_numpy_weight("pos_embedding")
+ cls_model.bert_layer._pos_emb.set_dict({"weight": _param})
+ print("INIT pos embedding")
+
+ _param = load_numpy_weight("sent_embedding")
+ cls_model.bert_layer._sent_emb.set_dict({"weight": _param})
+ print("INIT sent embedding")
+
+ _param0 = load_numpy_weight("pooled_fc.w_0")
+ _param1 = load_numpy_weight("pooled_fc.b_0")
+ cls_model.bert_layer.pooled_fc.set_dict({
+ "weight": _param0,
+ "bias": _param1
+ })
+ print("INIT pooled_fc")
+
+ _param0 = load_numpy_weight("pre_encoder_layer_norm_scale")
+ _param1 = load_numpy_weight("pre_encoder_layer_norm_bias")
+ cls_model.bert_layer.pre_process_layer._sub_layers[
+ "layer_norm_0"].set_dict({
+ "weight": _param0,
+ "bias": _param1
+ })
+ print("INIT pre_encoder layer norm")
+
+ for _i in range(bert_config["num_hidden_layers"]):
+ _param_weight = "encoder_layer_%d_multi_head_att_query_fc.w_0" % _i
+ _param_bias = "encoder_layer_%d_multi_head_att_query_fc.b_0" % _i
+
+ _param_weight = load_numpy_weight(_param_weight)
+ _param_bias = load_numpy_weight(_param_bias)
+
+ cls_model.bert_layer._encoder._sub_layers[
+ "esl_%d" % _i]._multihead_attention_layer._q_fc.set_dict({
+ "weight": _param_weight,
+ "bias": _param_bias
+ })
+ print("INIT multi_head_att_query_fc %d" % _i)
+
+ _param_weight = "encoder_layer_%d_multi_head_att_key_fc.w_0" % _i
+ _param_bias = "encoder_layer_%d_multi_head_att_key_fc.b_0" % _i
+
+ _param_weight = load_numpy_weight(_param_weight)
+ _param_bias = load_numpy_weight(_param_bias)
+
+ cls_model.bert_layer._encoder._sub_layers[
+ "esl_%d" % _i]._multihead_attention_layer._k_fc.set_dict({
+ "weight": _param_weight,
+ "bias": _param_bias
+ })
+ print("INIT multi_head_att_key_fc %d" % _i)
+
+ _param_weight = "encoder_layer_%d_multi_head_att_value_fc.w_0" % _i
+ _param_bias = "encoder_layer_%d_multi_head_att_value_fc.b_0" % _i
+
+ _param_weight = load_numpy_weight(_param_weight)
+ _param_bias = load_numpy_weight(_param_bias)
+
+ cls_model.bert_layer._encoder._sub_layers[
+ "esl_%d" % _i]._multihead_attention_layer._v_fc.set_dict({
+ "weight": _param_weight,
+ "bias": _param_bias
+ })
+ print("INIT multi_head_att_value_fc %d" % _i)
+
+ # init output fc
+ _param_weight = "encoder_layer_%d_multi_head_att_output_fc.w_0" % _i
+ _param_bias = "encoder_layer_%d_multi_head_att_output_fc.b_0" % _i
+
+ _param_weight = load_numpy_weight(_param_weight)
+ _param_bias = load_numpy_weight(_param_bias)
+
+ cls_model.bert_layer._encoder._sub_layers[
+ "esl_%d" % _i]._multihead_attention_layer._proj_fc.set_dict({
+ "weight": _param_weight,
+ "bias": _param_bias
+ })
+ print("INIT multi_head_att_output_fc %d" % _i)
+
+ # init layer_norm 1
+ _param_weight = "encoder_layer_%d_post_att_layer_norm_scale" % _i
+ _param_bias = "encoder_layer_%d_post_att_layer_norm_bias" % _i
+
+ _param_weight = load_numpy_weight(_param_weight)
+ _param_bias = load_numpy_weight(_param_bias)
+
+ cls_model.bert_layer._encoder._sub_layers[
+ "esl_%d" % _i]._postprocess_layer.layer_norm_0.set_dict({
+ "weight": _param_weight,
+ "bias": _param_bias
+ })
+ print("INIT layer norm in attention at %d layer" % _i)
+
+ # init layer_norm 2
+ _param_weight = "encoder_layer_%d_post_ffn_layer_norm_scale" % _i
+ _param_bias = "encoder_layer_%d_post_ffn_layer_norm_bias" % _i
+
+ _param_weight = load_numpy_weight(_param_weight)
+ _param_bias = load_numpy_weight(_param_bias)
+
+ cls_model.bert_layer._encoder._sub_layers[
+ "esl_%d" % _i]._postprocess_layer2.layer_norm_0.set_dict({
+ "weight": _param_weight,
+ "bias": _param_bias
+ })
+ print("INIT layer norm in FFN at %d layer" % _i)
+
+ # init FFN 1
+ _param_weight = "encoder_layer_%d_ffn_fc_0.w_0" % _i
+ _param_bias = "encoder_layer_%d_ffn_fc_0.b_0" % _i
+
+ _param_weight = load_numpy_weight(_param_weight)
+ _param_bias = load_numpy_weight(_param_bias)
+
+ cls_model.bert_layer._encoder._sub_layers[
+ "esl_%d" % _i]._positionwise_feed_forward._i2h.set_dict({
+ "weight": _param_weight,
+ "bias": _param_bias
+ })
+ print("INIT FFN-1 at %d layer" % _i)
+
+ # init FFN 2
+ _param_weight = "encoder_layer_%d_ffn_fc_1.w_0" % _i
+ _param_bias = "encoder_layer_%d_ffn_fc_1.b_0" % _i
+
+ _param_weight = load_numpy_weight(_param_weight)
+ _param_bias = load_numpy_weight(_param_bias)
+
+ cls_model.bert_layer._encoder._sub_layers[
+ "esl_%d" % _i]._positionwise_feed_forward._h2o.set_dict({
+ "weight": _param_weight,
+ "bias": _param_bias
+ })
+ print("INIT FFN-2 at %d layer" % _i)
+
+ # init cls fc
+ #_param_weight = "cls_out_w"
+ #_param_bias = "cls_out_b"
+
+ #_param_weight = load_numpy_weight(_param_weight)
+ #_param_bias = load_numpy_weight(_param_bias)
+
+ #cls_model.cls_fc.set_dict({"weight":_param_weight, "bias":_param_bias})
+ #print("INIT CLS FC layer")
+ return True
diff --git a/paddleslim/nas/darts/train_search.py b/paddleslim/nas/darts/train_search.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce37a44fe9b1a35a2adff58fcdaf13321f7d8ec2
--- /dev/null
+++ b/paddleslim/nas/darts/train_search.py
@@ -0,0 +1,260 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+__all__ = ['DARTSearch', 'count_parameters_in_MB']
+
+import os
+import logging
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.base import to_variable
+from ...common import AvgrageMeter, get_logger
+from .architect import Architect
+from .get_genotype import get_genotype
+logger = get_logger(__name__, level=logging.INFO)
+
+
+def count_parameters_in_MB(all_params):
+ """Count the parameters in the target list.
+ Args:
+ all_params(list): List of Variables.
+
+ Returns:
+ float: The total number of trainable, non-auxiliary parameters, in millions.
+ """
+
+ parameters_number = 0
+ for param in all_params:
+ if param.trainable and 'aux' not in param.name:
+ parameters_number += np.prod(param.shape)
+ return parameters_number / 1e6
+
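+# e.g. a model whose only trainable, non-'aux' parameter has shape
+# [1000, 1000] reports 1.0; the unit is millions of parameters.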
+
+class DARTSearch(object):
+ """Used for Differentiable ARchiTecture Search(DARTS)
+
+ Args:
+ model(Paddle DyGraph model): Super Network for Search.
+ train_reader(Python Generator): Generator to provide training data.
+ valid_reader(Python Generator): Generator to provide validation data.
+ place(fluid.CPUPlace()|fluid.CUDAPlace(N)): The device on which the executor runs.
+ learning_rate(float): Model parameter initial learning rate. Default: 0.025.
+ batchsize(int): Minibatch size. Default: 64.
+ num_imgs(int): Number of training samples, used to estimate steps per epoch. Default: 50000.
+ arch_learning_rate(float): Learning rate for arch encoding. Default: 3e-4.
+ unrolled(bool): Use one-step unrolled validation loss. Default: False.
+ num_epochs(int): Epoch number. Default: 50.
+ epochs_no_archopt(int): Number of initial epochs that skip architecture optimization. Default: 0.
+ use_multiprocess(bool): Whether to use multiprocess in dataloader. Default: False.
+ use_data_parallel(bool): Whether to use data parallel mode. Default: False.
+ log_freq(int): Log frequency. Default: 50.
+
+ """
+
+ def __init__(self,
+ model,
+ train_reader,
+ valid_reader,
+ place,
+ learning_rate=0.025,
+ batchsize=64,
+ num_imgs=50000,
+ arch_learning_rate=3e-4,
+ unrolled=False,
+ num_epochs=50,
+ epochs_no_archopt=0,
+ use_multiprocess=False,
+ use_data_parallel=False,
+ save_dir='./',
+ log_freq=50):
+ self.model = model
+ self.train_reader = train_reader
+ self.valid_reader = valid_reader
+ self.place = place
+ self.learning_rate = learning_rate
+ self.batchsize = batchsize
+ self.num_imgs = num_imgs
+ self.arch_learning_rate = arch_learning_rate
+ self.unrolled = unrolled
+ self.epochs_no_archopt = epochs_no_archopt
+ self.num_epochs = num_epochs
+ self.use_multiprocess = use_multiprocess
+ self.use_data_parallel = use_data_parallel
+ self.save_dir = save_dir
+ self.log_freq = log_freq
+
+ def train_one_epoch(self, train_loader, valid_loader, architect, optimizer,
+ epoch):
+ objs = AvgrageMeter()
+ top1 = AvgrageMeter()
+ top5 = AvgrageMeter()
+ self.model.train()
+
+ for step_id, (
+ train_data,
+ valid_data) in enumerate(zip(train_loader(), valid_loader())):
+ train_image, train_label = train_data
+ valid_image, valid_label = valid_data
+ train_image = to_variable(train_image)
+ train_label = to_variable(train_label)
+ train_label.stop_gradient = True
+ valid_image = to_variable(valid_image)
+ valid_label = to_variable(valid_label)
+ valid_label.stop_gradient = True
+ n = train_image.shape[0]
+
+ if epoch >= self.epochs_no_archopt:
+ architect.step(train_image, train_label, valid_image,
+ valid_label)
+
+ logits = self.model(train_image)
+ prec1 = fluid.layers.accuracy(input=logits, label=train_label, k=1)
+ prec5 = fluid.layers.accuracy(input=logits, label=train_label, k=5)
+ loss = fluid.layers.reduce_mean(
+ fluid.layers.softmax_with_cross_entropy(logits, train_label))
+
+ if self.use_data_parallel:
+ loss = self.model.scale_loss(loss)
+ loss.backward()
+ self.model.apply_collective_grads()
+ else:
+ loss.backward()
+
+ optimizer.minimize(loss)
+ self.model.clear_gradients()
+
+ objs.update(loss.numpy(), n)
+ top1.update(prec1.numpy(), n)
+ top5.update(prec5.numpy(), n)
+
+ if step_id % self.log_freq == 0:
+ #logger.info("Train Epoch {}, Step {}, loss {:.6f}; ce: {:.6f}; kd: {:.6f}; e: {:.6f}".format(
+ # epoch, step_id, objs.avg[0], ce_losses.avg[0], kd_losses.avg[0], e_losses.avg[0]))
+ logger.info(
+ "Train Epoch {}, Step {}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}".
+ format(epoch, step_id, objs.avg[0], top1.avg[0], top5.avg[
+ 0]))
+ return top1.avg[0]
+
+ def valid_one_epoch(self, valid_loader, epoch):
+ objs = AvgrageMeter()
+ top1 = AvgrageMeter()
+ top5 = AvgrageMeter()
+ self.model.eval()
+
+ for step_id, (image, label) in enumerate(valid_loader):
+ image = to_variable(image)
+ label = to_variable(label)
+ n = image.shape[0]
+ logits = self.model(image)
+ prec1 = fluid.layers.accuracy(input=logits, label=label, k=1)
+ prec5 = fluid.layers.accuracy(input=logits, label=label, k=5)
+ loss = fluid.layers.reduce_mean(
+ fluid.layers.softmax_with_cross_entropy(logits, label))
+ objs.update(loss.numpy(), n)
+ top1.update(prec1.numpy(), n)
+ top5.update(prec5.numpy(), n)
+
+ if step_id % self.log_freq == 0:
+ logger.info(
+ "Valid Epoch {}, Step {}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}".
+ format(epoch, step_id, objs.avg[0], top1.avg[0], top5.avg[
+ 0]))
+ return top1.avg[0]
+
+ def train(self):
+ """Start search process.
+
+ """
+
+ model_parameters = [
+ p for p in self.model.parameters()
+ if p.name not in [a.name for a in self.model.arch_parameters()]
+ ]
+ logger.info("param size = {:.6f}MB".format(
+ count_parameters_in_MB(model_parameters)))
+
+ device_num = fluid.dygraph.parallel.Env().nranks
+ step_per_epoch = int(self.num_imgs * 0.5 /
+ (self.batchsize * device_num))
+ if self.unrolled:
+ step_per_epoch *= 2
+
+ learning_rate = fluid.dygraph.CosineDecay(
+ self.learning_rate, step_per_epoch, self.num_epochs)
+
+ clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
+ optimizer = fluid.optimizer.MomentumOptimizer(
+ learning_rate,
+ 0.9,
+ regularization=fluid.regularizer.L2DecayRegularizer(3e-4),
+ parameter_list=model_parameters,
+ grad_clip=clip)
+
+ if self.use_data_parallel:
+ self.train_reader = fluid.contrib.reader.distributed_batch_reader(
+ self.train_reader)
+
+ train_loader = fluid.io.DataLoader.from_generator(
+ capacity=64,
+ use_double_buffer=True,
+ iterable=True,
+ return_list=True,
+ use_multiprocess=self.use_multiprocess)
+ valid_loader = fluid.io.DataLoader.from_generator(
+ capacity=64,
+ use_double_buffer=True,
+ iterable=True,
+ return_list=True,
+ use_multiprocess=self.use_multiprocess)
+
+ train_loader.set_batch_generator(self.train_reader, places=self.place)
+ valid_loader.set_batch_generator(self.valid_reader, places=self.place)
+
+ base_model = self.model
+ architect = Architect(
+ model=self.model,
+ eta=learning_rate,
+ arch_learning_rate=self.arch_learning_rate,
+ unrolled=self.unrolled,
+ parallel=self.use_data_parallel)
+
+ self.model = architect.get_model()
+
+ save_parameters = (not self.use_data_parallel) or (
+ self.use_data_parallel and
+ fluid.dygraph.parallel.Env().local_rank == 0)
+
+ for epoch in range(self.num_epochs):
+ logger.info('Epoch {}, lr {:.6f}'.format(
+ epoch, optimizer.current_step_lr()))
+
+ genotype = get_genotype(base_model)
+ logger.info('genotype = %s', genotype)
+
+ train_top1 = self.train_one_epoch(train_loader, valid_loader,
+ architect, optimizer, epoch)
+ logger.info("Epoch {}, train_acc {:.6f}".format(epoch, train_top1))
+
+ if epoch == self.num_epochs - 1:
+ valid_top1 = self.valid_one_epoch(valid_loader, epoch)
+ logger.info("Epoch {}, valid_acc {:.6f}".format(epoch,
+ valid_top1))
+ if save_parameters:
+ fluid.save_dygraph(
+ self.model.state_dict(),
+ os.path.join(self.save_dir, str(epoch), "params"))
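+
+
+# A minimal usage sketch (SuperNetwork and the readers are hypothetical):
+#
+#   place = fluid.CUDAPlace(0)
+#   with fluid.dygraph.guard(place):
+#       model = SuperNetwork()  # any dygraph model exposing arch_parameters()
+#       searcher = DARTSearch(model, train_reader, valid_reader, place,
+#                             num_epochs=50, unrolled=False)
+#       searcher.train()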
diff --git a/paddleslim/nas/early_stop/__init__.py b/paddleslim/nas/early_stop/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..748208aa6bbc7213099583e6d94da9b1ccb9597e
--- /dev/null
+++ b/paddleslim/nas/early_stop/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from .early_stop import EarlyStopBase
+from .median_stop import MedianStop
+
+__all__ = ['EarlyStopBase', 'MedianStop']
diff --git a/paddleslim/nas/early_stop/early_stop.py b/paddleslim/nas/early_stop/early_stop.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d05d8d129b489e883da1581f700555058da5137
--- /dev/null
+++ b/paddleslim/nas/early_stop/early_stop.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['EarlyStopBase']
+
+
+class EarlyStopBase(object):
+ """ Abstract early Stop algorithm.
+ """
+
+ def get_status(self, iter, result):
+ """Get experiment status.
+ """
+ raise NotImplementedError(
+ 'get_status in Early Stop algorithm NOT implemented.')
+
+ def client_end(self):
+ """ Stop a client, this function may useful for the client that result is better and better.
+ """
+ raise NotImplementedError(
+ 'client_end in Early Stop algorithm NOT implemented.')
diff --git a/paddleslim/nas/early_stop/median_stop/__init__.py b/paddleslim/nas/early_stop/median_stop/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..290ea074b64755c8bc825112450a7a7fcc0aa865
--- /dev/null
+++ b/paddleslim/nas/early_stop/median_stop/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from .median_stop import MedianStop
+
+__all__ = ['MedianStop']
diff --git a/paddleslim/nas/early_stop/median_stop/median_stop.py b/paddleslim/nas/early_stop/median_stop/median_stop.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a1cd7cd67846701dac1620a671afdd5dc4ddbae
--- /dev/null
+++ b/paddleslim/nas/early_stop/median_stop/median_stop.py
@@ -0,0 +1,184 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from multiprocessing.managers import BaseManager
+from ..early_stop import EarlyStopBase
+from ....common.log_helper import get_logger
+
+PublicAuthKey = u'AbcXyz3'
+
+__all__ = ['MedianStop']
+
+_logger = get_logger(__name__, level=logging.INFO)
+
+completed_history = dict()
+
+
+def return_completed_history():
+ return completed_history
+
+
+class MedianStop(EarlyStopBase):
+ """
+ Median Stop, reference:
+ Args:
+ strategy: the strategy of search.
+ start_epoch: which epoch to start the early stop algorithm from.
+ mode: bigger is better or smaller is better, choice in ['maxmize', 'minimize']. Default: 'maxmize'.
+ """
+
+ def __init__(self, strategy, start_epoch, mode='maxmize'):
+ self._start_epoch = start_epoch
+ self._running_history = dict()
+ self._strategy = strategy
+ self._mode = mode
+ assert self._mode in [
+ 'maxmize', 'minimize'
+ ], 'mode of MedianStop must be \'maxmize\' or \'minimize\', but received mode is {}'.format(
+ self._mode)
+ self._is_server = self._strategy._is_server
+ self._manager = self._start_manager()
+
+ def _start_manager(self):
+ self._server_ip = self._strategy._server_ip
+ self._server_port = self._strategy._server_port + 1
+
+ if self._is_server:
+ BaseManager.register(
+ 'get_completed_history', callable=return_completed_history)
+ base_manager = BaseManager(
+ address=(self._server_ip, self._server_port),
+ authkey=PublicAuthKey.encode())
+
+ base_manager.start()
+ else:
+ BaseManager.register('get_completed_history')
+ base_manager = BaseManager(
+ address=(self._server_ip, self._server_port),
+ authkey=PublicAuthKey.encode())
+ base_manager.connect()
+ return base_manager
+
+ def _update_data(self, exp_name, result):
+ if exp_name not in self._running_history.keys():
+ self._running_history[exp_name] = []
+ self._running_history[exp_name].append(result)
+
+ def _convert_running2completed(self, exp_name, status):
+ """
+ Convert an experiment record from running to completed.
+
+ Args:
+ exp_name: the name of experiment.
+ status: the status of this experiment.
+ """
+ _logger.debug('the status of this experiment is {}'.format(status))
+ completed_avg_history = dict()
+ if exp_name in self._running_history:
+ if status == "GOOD":
+ count = 0
+ history_sum = 0
+ result = []
+ for res in self._running_history[exp_name]:
+ count += 1
+ history_sum += res
+ result.append(history_sum / count)
+ completed_avg_history[exp_name] = result
+ self._running_history.pop(exp_name)
+
+ if len(completed_avg_history) > 0:
+ while True:
+ try:
+ new_dict = self._manager.get_completed_history()
+ new_dict.update(completed_avg_history)
+ break
+ except Exception as err:
+ _logger.error("update data error: {}".format(err))
+
+ def get_status(self, step, result, epochs):
+ """
+ Get the current experiment status.
+
+ Args:
+ step: step in this client.
+ result: the result of this epoch.
+ epochs: the total number of epochs.
+
+ Returns:
+ the status of this experiment.
+ """
+ exp_name = self._strategy._client_name + str(step)
+ self._update_data(exp_name, result)
+
+ _logger.debug("running history after update data: {}".format(
+ self._running_history))
+
+ curr_step = len(self._running_history[exp_name])
+ status = "GOOD"
+ if curr_step < self._start_epoch:
+ return status
+
+ res_same_step = []
+
+ def list2dict(lists):
+ res_dict = dict()
+ for l in lists:
+ tmp_dict = dict()
+ tmp_dict[l[0]] = l[1]
+ res_dict.update(tmp_dict)
+ return res_dict
+
+ while True:
+ try:
+ completed_avg_history = self._manager.get_completed_history()
+ break
+ except Exception as err:
+ _logger.error("get status error: {}".format(err))
+
+ if len(completed_avg_history.keys()) == 0:
+ for exp in self._running_history.keys():
+ if curr_step <= len(self._running_history[exp]):
+ res_same_step.append(self._running_history[exp][curr_step -
+ 1])
+ else:
+ completed_avg_history_dict = list2dict(completed_avg_history.items(
+ ))
+
+ for exp in completed_avg_history.keys():
+ if curr_step <= len(completed_avg_history_dict[exp]):
+ res_same_step.append(completed_avg_history_dict[exp][
+ curr_step - 1])
+
+ _logger.debug("result of same step in other experiment: {}".format(
+ res_same_step))
+ if res_same_step:
+ res_same_step.sort()
+
+ if self._mode == 'maxmize' and result < res_same_step[(
+ len(res_same_step) - 1) // 2]:
+ status = "BAD"
+
+ if self._mode == 'minimize' and result > res_same_step[len(
+ res_same_step) // 2]:
+ status = "BAD"
+
+ if curr_step == epochs:
+ self._convert_running2completed(exp_name, status)
+
+ return status
+
+ def __del__(self):
+ if self._is_server:
+ self._manager.shutdown()
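+
+
+### Example (a minimal usage sketch): `strategy` is assumed to be a search
+### strategy instance exposing `_is_server`, `_server_ip`, `_server_port` and
+### `_client_name`, as used above; `train_one_epoch` is a hypothetical helper.
+#
+# early_stop = MedianStop(strategy, start_epoch=5, mode='maxmize')
+# for epoch in range(epochs):
+#     result = train_one_epoch(epoch)
+#     if early_stop.get_status(step, result, epochs) == 'BAD':
+#         break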
diff --git a/paddleslim/nas/ofa/__init__.py b/paddleslim/nas/ofa/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..db1394baf6dc59286b678f302b80fe2c5de404c1
--- /dev/null
+++ b/paddleslim/nas/ofa/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .ofa import OFA, RunConfig, DistillConfig
+from .convert_super import supernet
+from .layers import *
diff --git a/paddleslim/nas/ofa/convert_super.py b/paddleslim/nas/ofa/convert_super.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7ff8a1e530cef850415049c1d8a1b42dfcc0345
--- /dev/null
+++ b/paddleslim/nas/ofa/convert_super.py
@@ -0,0 +1,417 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import decorator
+import logging
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import framework
+from paddle.fluid.dygraph.nn import Conv2D, Conv2DTranspose, Linear, BatchNorm, InstanceNorm
+from .layers import *
+from ...common import get_logger
+
+_logger = get_logger(__name__, level=logging.INFO)
+
+__all__ = ['supernet']
+
+WEIGHT_LAYER = ['conv', 'linear']
+
+
+### TODO: add decorator
+class Convert:
+ def __init__(self, context):
+ self.context = context
+
+ def convert(self, model):
+ # search the first and last weight layer, don't change out channel of the last weight layer
+ # don't change in channel of the first weight layer
+ first_weight_layer_idx = -1
+ last_weight_layer_idx = -1
+ weight_layer_count = 0
+ # NOTE: pre_channel store for shortcut module
+ pre_channel = 0
+ cur_channel = None
+ for idx, layer in enumerate(model):
+ cls_name = layer.__class__.__name__.lower()
+ if 'conv' in cls_name or 'linear' in cls_name:
+ weight_layer_count += 1
+ last_weight_layer_idx = idx
+ if first_weight_layer_idx == -1:
+ first_weight_layer_idx = idx
+
+ if getattr(self.context, 'channel', None) != None:
+ assert len(
+ self.context.channel
+ ) == weight_layer_count, "length of channel must be the same as the number of weight layers."
+
+ for idx, layer in enumerate(model):
+ if isinstance(layer, Conv2D):
+ attr_dict = layer.__dict__
+ key = attr_dict['_full_name']
+
+ new_attr_name = [
+ '_stride', '_dilation', '_groups', '_param_attr',
+ '_bias_attr', '_use_cudnn', '_act', '_dtype'
+ ]
+
+ new_attr_dict = dict()
+ new_attr_dict['candidate_config'] = dict()
+ self.kernel_size = getattr(self.context, 'kernel_size', None)
+
+ if self.kernel_size != None:
+ new_attr_dict['transform_kernel'] = True
+
+ # if the kernel_size of conv is 1, don't change it.
+ #if self.kernel_size and int(attr_dict['_filter_size'][0]) != 1:
+ if self.kernel_size and int(attr_dict['_filter_size']) != 1:
+ new_attr_dict['filter_size'] = max(self.kernel_size)
+ new_attr_dict['candidate_config'].update({
+ 'kernel_size': self.kernel_size
+ })
+ else:
+ new_attr_dict['filter_size'] = attr_dict['_filter_size']
+
+ if self.context.expand:
+ ### first super convolution
+ if idx == first_weight_layer_idx:
+ new_attr_dict['num_channels'] = attr_dict[
+ '_num_channels']
+ else:
+ new_attr_dict[
+ 'num_channels'] = self.context.expand * attr_dict[
+ '_num_channels']
+ ### last super convolution
+ if idx == last_weight_layer_idx:
+ new_attr_dict['num_filters'] = attr_dict['_num_filters']
+ else:
+ new_attr_dict[
+ 'num_filters'] = self.context.expand * attr_dict[
+ '_num_filters']
+ new_attr_dict['candidate_config'].update({
+ 'expand_ratio': self.context.expand_ratio
+ })
+ elif self.context.channel:
+ if attr_dict['_groups'] != None and (
+ int(attr_dict['_groups']) ==
+ int(attr_dict['_num_channels'])):
+ ### depthwise conv, if conv is depthwise, use pre channel as cur_channel
+ _logger.warning(
+ "If the convolution is a depthwise conv, the output channel is changed" \
+ " to the same channel as the input; the output channel in search is not used."
+ )
+ cur_channel = pre_channel
+ else:
+ cur_channel = self.context.channel[0]
+ self.context.channel = self.context.channel[1:]
+ if idx == first_weight_layer_idx:
+ new_attr_dict['num_channels'] = attr_dict[
+ '_num_channels']
+ else:
+ new_attr_dict['num_channels'] = max(pre_channel)
+
+ if idx == last_weight_layer_idx:
+ new_attr_dict['num_filters'] = attr_dict['_num_filters']
+ else:
+ new_attr_dict['num_filters'] = max(cur_channel)
+ new_attr_dict['candidate_config'].update({
+ 'channel': cur_channel
+ })
+ pre_channel = cur_channel
+ else:
+ new_attr_dict['num_filters'] = attr_dict['_num_filters']
+ new_attr_dict['num_channels'] = attr_dict['_num_channels']
+
+ for attr in new_attr_name:
+ new_attr_dict[attr[1:]] = attr_dict[attr]
+
+ del layer
+
+ if attr_dict['_groups'] == None or int(attr_dict[
+ '_groups']) == 1:
+ ### standard conv
+ layer = Block(SuperConv2D(**new_attr_dict), key=key)
+ elif int(attr_dict['_groups']) == int(attr_dict[
+ '_num_channels']):
+ # if conv is depthwise conv, groups = in_channel, out_channel = in_channel,
+ # channel in candidate_config = in_channel_list
+ if 'channel' in new_attr_dict['candidate_config']:
+ new_attr_dict['num_channels'] = max(cur_channel)
+ new_attr_dict['num_filters'] = new_attr_dict[
+ 'num_channels']
+ new_attr_dict['candidate_config'][
+ 'channel'] = cur_channel
+ new_attr_dict['groups'] = new_attr_dict['num_channels']
+ layer = Block(
+ SuperDepthwiseConv2D(**new_attr_dict), key=key)
+ else:
+ ### group conv
+ layer = Block(SuperGroupConv2D(**new_attr_dict), key=key)
+ model[idx] = layer
+
+ elif isinstance(layer, BatchNorm) and (
+ getattr(self.context, 'expand', None) != None or
+ getattr(self.context, 'channel', None) != None):
+ # num_features in BatchNorm don't change after last weight operators
+ if idx > last_weight_layer_idx:
+ continue
+
+ attr_dict = layer.__dict__
+ new_attr_name = [
+ '_param_attr', '_bias_attr', '_act', '_dtype', '_in_place',
+ '_data_layout', '_momentum', '_epsilon', '_is_test',
+ '_use_global_stats', '_trainable_statistics'
+ ]
+ new_attr_dict = dict()
+ if self.context.expand:
+ new_attr_dict['num_channels'] = self.context.expand * int(
+ layer._parameters['weight'].shape[0])
+ elif self.context.channel:
+ new_attr_dict['num_channels'] = max(cur_channel)
+
+ for attr in new_attr_name:
+ new_attr_dict[attr[1:]] = attr_dict[attr]
+
+ del layer, attr_dict
+
+ layer = SuperBatchNorm(**new_attr_dict)
+ model[idx] = layer
+
+ ### assume output_size = None, filter_size != None
+ ### NOTE: output_size != None may raise an error; solve it when it happens.
+ elif isinstance(layer, Conv2DTranspose):
+ attr_dict = layer.__dict__
+ key = attr_dict['_full_name']
+
+ new_attr_name = [
+ '_stride', '_dilation', '_groups', '_param_attr',
+ '_bias_attr', '_use_cudnn', '_act', '_dtype', '_output_size'
+ ]
+ assert attr_dict[
+ '_filter_size'] != None, "Conv2DTranspose only supports filter_size != None for now"
+
+ new_attr_dict = dict()
+ new_attr_dict['candidate_config'] = dict()
+ self.kernel_size = getattr(self.context, 'kernel_size', None)
+
+ if self.kernel_size != None:
+ new_attr_dict['transform_kernel'] = True
+
+ # if the kernel_size of conv transpose is 1, don't change it.
+ if self.kernel_size and int(attr_dict['_filter_size'][0]) != 1:
+ new_attr_dict['filter_size'] = max(self.kernel_size)
+ new_attr_dict['candidate_config'].update({
+ 'kernel_size': self.kernel_size
+ })
+ else:
+ new_attr_dict['filter_size'] = attr_dict['_filter_size']
+
+ if self.context.expand:
+ ### first super convolution transpose
+ if idx == first_weight_layer_idx:
+ new_attr_dict['num_channels'] = attr_dict[
+ '_num_channels']
+ else:
+ new_attr_dict[
+ 'num_channels'] = self.context.expand * attr_dict[
+ '_num_channels']
+ ### last super convolution transpose
+ if idx == last_weight_layer_idx:
+ new_attr_dict['num_filters'] = attr_dict['_num_filters']
+ else:
+ new_attr_dict[
+ 'num_filters'] = self.context.expand * attr_dict[
+ '_num_filters']
+ new_attr_dict['candidate_config'].update({
+ 'expand_ratio': self.context.expand_ratio
+ })
+ elif self.context.channel:
+ if attr_dict['_groups'] != None and (
+ int(attr_dict['_groups']) ==
+ int(attr_dict['_num_channels'])):
+ ### depthwise conv_transpose
+ _logger.warning(
+ "If the convolution is a depthwise conv_transpose, the output channel is " \
+ "changed to the same channel as the input; the output channel in search is not used."
+ )
+ cur_channel = pre_channel
+ else:
+ cur_channel = self.context.channel[0]
+ self.context.channel = self.context.channel[1:]
+ if idx == first_weight_layer_idx:
+ new_attr_dict['num_channels'] = attr_dict[
+ '_num_channels']
+ else:
+ new_attr_dict['num_channels'] = max(pre_channel)
+
+ if idx == last_weight_layer_idx:
+ new_attr_dict['num_filters'] = attr_dict['_num_filters']
+ else:
+ new_attr_dict['num_filters'] = max(cur_channel)
+ new_attr_dict['candidate_config'].update({
+ 'channel': cur_channel
+ })
+ pre_channel = cur_channel
+ else:
+ new_attr_dict['num_filters'] = attr_dict['_num_filters']
+ new_attr_dict['num_channels'] = attr_dict['_num_channels']
+
+ for attr in new_attr_name:
+ new_attr_dict[attr[1:]] = attr_dict[attr]
+
+ del layer
+
+ if new_attr_dict['output_size'] == []:
+ new_attr_dict['output_size'] = None
+
+ if attr_dict['_groups'] == None or int(attr_dict[
+ '_groups']) == 1:
+ ### standard conv_transpose
+ layer = Block(
+ SuperConv2DTranspose(**new_attr_dict), key=key)
+ elif int(attr_dict['_groups']) == int(attr_dict[
+ '_num_channels']):
+ # if conv is depthwise conv, groups = in_channel, out_channel = in_channel,
+ # channel in candidate_config = in_channel_list
+ if 'channel' in new_attr_dict['candidate_config']:
+ new_attr_dict['num_channels'] = max(cur_channel)
+ new_attr_dict['num_filters'] = new_attr_dict[
+ 'num_channels']
+ new_attr_dict['candidate_config'][
+ 'channel'] = cur_channel
+ new_attr_dict['groups'] = new_attr_dict['num_channels']
+ layer = Block(
+ SuperDepthwiseConv2DTranspose(**new_attr_dict), key=key)
+ else:
+ ### group conv_transpose
+ layer = Block(
+ SuperGroupConv2DTranspose(**new_attr_dict), key=key)
+ model[idx] = layer
+
+ elif isinstance(layer, Linear) and (
+ getattr(self.context, 'expand', None) != None or
+ getattr(self.context, 'channel', None) != None):
+ attr_dict = layer.__dict__
+ key = attr_dict['_full_name']
+ ### TODO(paddle): add _param_attr and _bias_attr as private variable of Linear
+ #new_attr_name = ['_act', '_dtype', '_param_attr', '_bias_attr']
+ new_attr_name = ['_act', '_dtype']
+ in_nc, out_nc = layer._parameters['weight'].shape
+
+ new_attr_dict = dict()
+ new_attr_dict['candidate_config'] = dict()
+ if self.context.expand:
+ if idx == first_weight_layer_idx:
+ new_attr_dict['input_dim'] = int(in_nc)
+ else:
+ new_attr_dict['input_dim'] = self.context.expand * int(
+ in_nc)
+
+ if idx == last_weight_layer_idx:
+ new_attr_dict['output_dim'] = int(out_nc)
+ else:
+ new_attr_dict['output_dim'] = self.context.expand * int(
+ out_nc)
+ new_attr_dict['candidate_config'].update({
+ 'expand_ratio': self.context.expand_ratio
+ })
+ elif self.context.channel:
+ cur_channel = self.context.channel[0]
+ self.context.channel = self.context.channel[1:]
+ if idx == first_weight_layer_idx:
+ new_attr_dict['input_dim'] = int(in_nc)
+ else:
+ new_attr_dict['input_dim'] = max(pre_channel)
+
+ if idx == last_weight_layer_idx:
+ new_attr_dict['output_dim'] = int(out_nc)
+ else:
+ new_attr_dict['output_dim'] = max(cur_channel)
+ new_attr_dict['candidate_config'].update({
+ 'channel': cur_channel
+ })
+ pre_channel = cur_channel
+ else:
+ new_attr_dict['input_dim'] = int(in_nc)
+ new_attr_dict['output_dim'] = int(out_nc)
+
+ for attr in new_attr_name:
+ new_attr_dict[attr[1:]] = attr_dict[attr]
+
+ del layer, attr_dict
+
+ layer = Block(SuperLinear(**new_attr_dict), key=key)
+ model[idx] = layer
+
+ elif isinstance(layer, InstanceNorm) and (
+ getattr(self.context, 'expand', None) != None or
+ getattr(self.context, 'channel', None) != None):
+ # num_features in InstanceNorm don't change after last weight operators
+ if idx > last_weight_layer_idx:
+ continue
+
+ attr_dict = layer.__dict__
+ new_attr_name = [
+ '_param_attr', '_bias_attr', '_dtype', '_epsilon'
+ ]
+ new_attr_dict = dict()
+ if self.context.expand:
+ new_attr_dict['num_channels'] = self.context.expand * int(
+ layer._parameters['scale'].shape[0])
+ elif self.context.channel:
+ new_attr_dict['num_channels'] = max(cur_channel)
+
+ for attr in new_attr_name:
+ new_attr_dict[attr[1:]] = attr_dict[attr]
+
+ del layer, attr_dict
+
+ layer = SuperInstanceNorm(**new_attr_dict)
+ model[idx] = layer
+
+ return model
+
+
+class supernet:
+ def __init__(self, **kwargs):
+ for key, value in kwargs.items():
+ setattr(self, key, value)
+
+ assert (
+ getattr(self, 'expand_ratio', None) == None or
+ getattr(self, 'channel', None) == None
+ ), "expand_ratio and channel CANNOT be NOT None at the same time."
+
+ self.expand = None
+ if 'expand_ratio' in kwargs.keys():
+ if isinstance(self.expand_ratio, list) or isinstance(
+ self.expand_ratio, tuple):
+ self.expand = max(self.expand_ratio)
+ elif isinstance(self.expand_ratio, int):
+ self.expand = self.expand_ratio
+
+ def __enter__(self):
+ return Convert(self)
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ pass
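+
+
+### Example (a minimal usage sketch): `model` is assumed to be an indexable
+### container of layers such as fluid.dygraph.Sequential, since convert()
+### iterates it and replaces sublayers in place.
+#
+# sp_net_config = supernet(kernel_size=(3, 5, 7), expand_ratio=[1, 2, 4])
+# with sp_net_config as ofa_super:
+#     model = ofa_super.convert(model)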
+
+
+#def ofa_supernet(kernel_size, expand_ratio):
+# def _ofa_supernet(func):
+# @functools.wraps(func)
+# def convert(*args, **kwargs):
+# supernet_convert(*args, **kwargs)
+# return convert
+# return _ofa_supernet
diff --git a/paddleslim/nas/ofa/layers.py b/paddleslim/nas/ofa/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d91f5338a8a1f9ee67cc1d7dab2657d85348454
--- /dev/null
+++ b/paddleslim/nas/ofa/layers.py
@@ -0,0 +1,929 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import logging
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.dygraph_utils as dygraph_utils
+from paddle.fluid.data_feeder import check_variable_and_dtype
+from paddle.fluid.framework import in_dygraph_mode, _varbase_creator
+from paddle.fluid.dygraph.nn import InstanceNorm, Conv2D, Conv2DTranspose, BatchNorm
+
+from ...common import get_logger
+from .utils.utils import compute_start_end, get_same_padding, convert_to_list
+
+__all__ = [
+ 'SuperConv2D', 'SuperConv2DTranspose', 'SuperSeparableConv2D',
+ 'SuperBatchNorm', 'SuperLinear', 'SuperInstanceNorm', 'Block',
+ 'SuperGroupConv2D', 'SuperDepthwiseConv2D', 'SuperGroupConv2DTranspose',
+ 'SuperDepthwiseConv2DTranspose'
+]
+
+_logger = get_logger(__name__, level=logging.INFO)
+
+### TODO: if task is elastic width, need to add re_organize_middle_weight in 1x1 conv in MBBlock
+
+_cnt = 0
+
+
+def counter():
+ global _cnt
+ _cnt += 1
+ return _cnt
+
+
+class BaseBlock(fluid.dygraph.Layer):
+ def __init__(self, key=None):
+ super(BaseBlock, self).__init__()
+ if key is not None:
+ self._key = str(key)
+ else:
+ self._key = self.__class__.__name__ + str(counter())
+
+ # set SuperNet class
+ def set_supernet(self, supernet):
+ self.__dict__['supernet'] = supernet
+
+ @property
+ def key(self):
+ return self._key
+
+
+class Block(BaseBlock):
+ """
+ Model is composed of nested blocks.
+
+ Parameters:
+ fn(Layer): instance of super layers, such as: SuperConv2D(3, 5, 3).
+ key(str, optional): key of this layer, one-to-one correspondence between key and candidate config. Default: None.
+ """
+
+ def __init__(self, fn, key=None):
+ super(Block, self).__init__(key)
+ self.fn = fn
+ self.candidate_config = self.fn.candidate_config
+
+ def forward(self, *inputs, **kwargs):
+ out = self.supernet.layers_forward(self, *inputs, **kwargs)
+ return out
+
+
+class SuperConv2D(fluid.dygraph.Conv2D):
+ """
+ This interface is used to construct a callable object of the ``SuperConv2D`` class.
+ The difference between ```SuperConv2D``` and ```Conv2D``` is: ```SuperConv2D``` needs
+ to be fed a config dictionary with the format of {'channel': num_of_channel}, which represents
+ the channels of the outputs, used to change the first dimension of weight and bias,
+ only training the first channels of the weight and bias.
+
+ Note: the channel in config needs to be no more than the channel first defined.
+
+ The super convolution2D layer calculates the output based on the input, filter
+ and strides, paddings, dilations, groups parameters. Input and
+ Output are in NCHW format, where N is batch size, C is the number of
+ the feature map, H is the height of the feature map, and W is the width of the feature map.
+ Filter's shape is [MCHW] , where M is the number of output feature map,
+ C is the number of input feature map, H is the height of the filter,
+ and W is the width of the filter. If the groups is greater than 1,
+ C will equal the number of input feature map divided by the groups.
+ Please refer to UFLDL's convolution tutorial for more details.
+ If bias attribution and activation type are provided, bias is added to the
+ output of the convolution, and the corresponding activation function is
+ applied to the final result.
+ For each input :math:`X`, the equation is:
+ .. math::
+ Out = \\sigma (W \\ast X + b)
+ Where:
+ * :math:`X`: Input value, a ``Tensor`` with NCHW format.
+ * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] .
+ * :math:`\\ast`: Convolution operation.
+ * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1].
+ * :math:`\\sigma`: Activation function.
+ * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+
+ Example:
+ - Input:
+ Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
+ Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
+ - Output:
+ Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
+ Where
+ .. math::
+ H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+ W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+ Parameters:
+ num_channels(int): The number of channels in the input image.
+ num_filters(int): The number of filter. It is as same as the output
+ feature map.
+ filter_size (int or tuple): The filter size. If filter_size is a tuple,
+ it must contain two integers, (filter_size_H, filter_size_W).
+ Otherwise, the filter will be a square.
+ candidate_config(dict, optional): Dictionary that describes the candidate config of this layer,
+ such as {'kernel_size': (3, 5, 7), 'channel': (4, 6, 8)}, which means the kernel size of
+ this layer can be chosen from (3, 5, 7). The keys of candidate_config
+ can only be 'kernel_size', 'channel' and 'expand_ratio'; 'channel' and 'expand_ratio'
+ CANNOT be set at the same time. Default: None.
+ transform_kernel(bool, optional): Whether to use transform matrix to transform a large filter
+ to a small filter. Default: False.
+ stride (int or tuple, optional): The stride size. If stride is a tuple, it must
+ contain two integers, (stride_H, stride_W). Otherwise, the
+ stride_H = stride_W = stride. Default: 1.
+ padding (int or tuple, optional): The padding size. If padding is a tuple, it must
+ contain two integers, (padding_H, padding_W). Otherwise, the
+ padding_H = padding_W = padding. Default: 0.
+ dilation (int or tuple, optional): The dilation size. If dilation is a tuple, it must
+ contain two integers, (dilation_H, dilation_W). Otherwise, the
+ dilation_H = dilation_W = dilation. Default: 1.
+ groups (int, optional): The groups number of the Conv2d Layer. According to grouped
+ convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+ the first half of the filters is only connected to the first half
+ of the input channels, while the second half of the filters is only
+ connected to the second half of the input channels. Default: 1.
+ param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
+ of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
+ will create ParamAttr as param_attr. If the Initializer of the param_attr
+ is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
+ and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
+ bias_attr (ParamAttr or bool, optional): The attribute for the bias of conv2d.
+ If it is set to False, no bias will be added to the output units.
+ If it is set to None or one attribute of ParamAttr, conv2d
+ will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+ is not set, the bias is initialized zero. Default: None.
+ use_cudnn (bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
+ library is installed. Default: True.
+ act (str, optional): Activation type, if it is set to None, activation is not appended.
+ Default: None.
+ dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".
+ Attribute:
+ **weight** (Parameter): the learnable weights of filter of this layer.
+ **bias** (Parameter or None): the learnable bias of this layer.
+ Returns:
+ None
+
+ Raises:
+ ValueError: if ``use_cudnn`` is not a bool value.
+ Examples:
+ .. code-block:: python
+ from paddle.fluid.dygraph.base import to_variable
+ import paddle.fluid as fluid
+ from paddleslim.nas.ofa.layers import SuperConv2D
+ import numpy as np
+ data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
+ with fluid.dygraph.guard():
+ super_conv2d = SuperConv2D(3, 10, 3)
+ config = {'channel': 5}
+ data = to_variable(data)
+ conv = super_conv2d(data, **config)
+
+ """
+
+ ### NOTE: filter_size, num_channels and num_filters must be the max of the candidates to define the largest network.
+ def __init__(self,
+ num_channels,
+ num_filters,
+ filter_size,
+ candidate_config={},
+ transform_kernel=False,
+ stride=1,
+ dilation=1,
+ padding=0,
+ groups=None,
+ param_attr=None,
+ bias_attr=None,
+ use_cudnn=True,
+ act=None,
+ dtype='float32'):
+ ### NOTE: padding is always 0; padding is added in forward because the kernel size is uncertain
+ ### TODO: change padding to any padding
+ super(SuperConv2D, self).__init__(
+ num_channels, num_filters, filter_size, stride, padding, dilation,
+ groups, param_attr, bias_attr, use_cudnn, act, dtype)
+
+ if isinstance(self._filter_size, int):
+ self._filter_size = convert_to_list(self._filter_size, 2)
+
+ self.candidate_config = candidate_config
+ if len(candidate_config.items()) != 0:
+ for k, v in candidate_config.items():
+ candidate_config[k] = list(set(v))
+
+ self.ks_set = candidate_config[
+ 'kernel_size'] if 'kernel_size' in candidate_config else None
+
+ self.expand_ratio = candidate_config[
+ 'expand_ratio'] if 'expand_ratio' in candidate_config else None
+ self.channel = candidate_config[
+ 'channel'] if 'channel' in candidate_config else None
+ self.base_channel = None
+ if self.expand_ratio != None:
+ self.base_channel = int(self._num_filters / max(self.expand_ratio))
+
+ self.transform_kernel = transform_kernel
+ if self.ks_set != None:
+ self.ks_set.sort()
+ if self.transform_kernel != False:
+ scale_param = dict()
+ ### create parameter to transform kernel
+ for i in range(len(self.ks_set) - 1):
+ ks_small = self.ks_set[i]
+ ks_large = self.ks_set[i + 1]
+ param_name = '%dto%d_matrix' % (ks_large, ks_small)
+ ks_t = ks_small**2
+ scale_param[param_name] = self.create_parameter(
+ attr=fluid.ParamAttr(
+ name=self._full_name + param_name,
+ initializer=fluid.initializer.NumpyArrayInitializer(
+ np.eye(ks_t))),
+ shape=(ks_t, ks_t),
+ dtype=self._dtype)
+
+ for name, param in scale_param.items():
+ setattr(self, name, param)
+
+ def get_active_filter(self, in_nc, out_nc, kernel_size):
+ start, end = compute_start_end(self._filter_size[0], kernel_size)
+ ### if NOT transform kernel, intercept a center filter with kernel_size from largest filter
+ filters = self.weight[:out_nc, :in_nc, start:end, start:end]
+ if self.transform_kernel != False and kernel_size < self._filter_size[
+ 0]:
+ ### if transform kernel, then use matrix to transform
+ start_filter = self.weight[:out_nc, :in_nc, :, :]
+ for i in range(len(self.ks_set) - 1, 0, -1):
+ src_ks = self.ks_set[i]
+ if src_ks <= kernel_size:
+ break
+ target_ks = self.ks_set[i - 1]
+ start, end = compute_start_end(src_ks, target_ks)
+ _input_filter = start_filter[:, :, start:end, start:end]
+ _input_filter = fluid.layers.reshape(
+ _input_filter,
+ shape=[(_input_filter.shape[0] * _input_filter.shape[1]),
+ -1])
+ core.ops.matmul(_input_filter,
+ self.__getattr__('%dto%d_matrix' %
+ (src_ks, target_ks)),
+ _input_filter, 'transpose_X', False,
+ 'transpose_Y', False, "alpha", 1)
+ _input_filter = fluid.layers.reshape(
+ _input_filter,
+ shape=[
+ filters.shape[0], filters.shape[1], target_ks, target_ks
+ ])
+ start_filter = _input_filter
+ filters = start_filter
+ return filters
+
+ def get_groups_in_out_nc(self, in_nc, out_nc):
+ ### standard conv
+ return self._groups, in_nc, out_nc
+
+ def forward(self, input, kernel_size=None, expand_ratio=None, channel=None):
+
+ if not in_dygraph_mode():
+ _logger.error("NOT support static graph")
+
+ in_nc = int(input.shape[1])
+ assert (
+ expand_ratio == None or channel == None
+ ), "expand_ratio and channel CANNOT be NOT None at the same time."
+ if expand_ratio != None:
+ out_nc = int(expand_ratio * self.base_channel)
+ elif channel != None:
+ out_nc = int(channel)
+ else:
+ out_nc = self._num_filters
+ ks = int(self._filter_size[0]) if kernel_size == None else int(
+ kernel_size)
+
+ groups, weight_in_nc, weight_out_nc = self.get_groups_in_out_nc(in_nc,
+ out_nc)
+
+ weight = self.get_active_filter(weight_in_nc, weight_out_nc, ks)
+ padding = convert_to_list(get_same_padding(ks), 2)
+
+ if self._l_type == 'conv2d':
+ attrs = ('strides', self._stride, 'paddings', padding, 'dilations',
+ self._dilation, 'groups', groups
+ if groups else 1, 'use_cudnn', self._use_cudnn)
+ out = core.ops.conv2d(input, weight, *attrs)
+ elif self._l_type == 'depthwise_conv2d':
+ attrs = ('strides', self._stride, 'paddings', padding, 'dilations',
+ self._dilation, 'groups', groups
+ if groups else self._groups, 'use_cudnn', self._use_cudnn)
+ out = core.ops.depthwise_conv2d(input, weight, *attrs)
+ else:
+ raise ValueError("conv type error")
+
+ pre_bias = out
+ out_nc = int(pre_bias.shape[1])
+ if self.bias is not None:
+ bias = self.bias[:out_nc]
+ pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
+ else:
+ pre_act = pre_bias
+
+ return dygraph_utils._append_activation_in_dygraph(pre_act, self._act)
+
+
+class SuperGroupConv2D(SuperConv2D):
+ def get_groups_in_out_nc(self, in_nc, out_nc):
+ ### groups convolution
+ ### conv: weight: (Cout, Cin/G, Kh, Kw)
+ groups = self._groups
+ in_nc = int(in_nc // groups)
+ return groups, in_nc, out_nc
+
+
+class SuperDepthwiseConv2D(SuperConv2D):
+ ### depthwise convolution
+ def get_groups_in_out_nc(self, in_nc, out_nc):
+ if in_nc != out_nc:
+ _logger.debug(
+ "input channel and output channel of depthwise conv are different; changing output channel to input channel! origin channels: (in_nc {}, out_nc {})".
+ format(in_nc, out_nc))
+ groups = in_nc
+ out_nc = in_nc
+ return groups, in_nc, out_nc
+
+
+class SuperConv2DTranspose(fluid.dygraph.Conv2DTranspose):
+ """
+ This interface is used to construct a callable object of the ``SuperConv2DTranspose``
+ class.
+ The difference between ```SuperConv2DTranspose``` and ```Conv2DTranspose``` is:
+ ```SuperConv2DTranspose``` needs to be fed a config dictionary with the format of
+ {'channel': num_of_channel}, which represents the channels of the outputs, used to change
+ the first dimension of weight and bias, only training the first channels of the weight
+ and bias.
+
+ Note: the channel in config needs to be no more than the channel first defined.
+
+ The super convolution2D transpose layer calculates the output based on the input,
+ filter, and dilations, strides, paddings. Input and output
+ are in NCHW format. Where N is batch size, C is the number of feature map,
+ H is the height of the feature map, and W is the width of the feature map.
+ Filter's shape is [MCHW] , where M is the number of input feature map,
+ C is the number of output feature map, H is the height of the filter,
+ and W is the width of the filter. If the groups is greater than 1,
+ C will equal the number of input feature map divided by the groups.
+ If bias attribution and activation type are provided, bias is added to
+ the output of the convolution, and the corresponding activation function
+ is applied to the final result.
+ For the details of the convolution transpose layer, please refer to the conv2d_transpose documentation.
+ For each input :math:`X`, the equation is:
+ .. math::
+ Out = \sigma (W \\ast X + b)
+ Where:
+ * :math:`X`: Input value, a ``Tensor`` with NCHW format.
+ * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] .
+ * :math:`\\ast`: Convolution operation.
+ * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1].
+ * :math:`\\sigma`: Activation function.
+ * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+ Example:
+ - Input:
+ Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
+ Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
+ - Output:
+ Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
+ Where
+ .. math::
+ H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
+ W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
+ H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
+ W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
+ Parameters:
+ num_channels(int): The number of channels in the input image.
+ num_filters(int): The number of the filter. It is as same as the output
+ feature map.
+ filter_size(int or tuple): The filter size. If filter_size is a tuple,
+ it must contain two integers, (filter_size_H, filter_size_W).
+ Otherwise, the filter will be a square.
+ candidate_config(dict, optional): Dictionary that describes the candidate config of this layer,
+ such as {'kernel_size': (3, 5, 7), 'channel': (4, 6, 8)}, which means the kernel size of
+ this layer can be chosen from (3, 5, 7). The keys of candidate_config
+ can only be 'kernel_size', 'channel' and 'expand_ratio'; 'channel' and 'expand_ratio'
+ CANNOT be set at the same time. Default: None.
+ transform_kernel(bool, optional): Whether to use transform matrix to transform a large filter
+ to a small filter. Default: False.
+ output_size(int or tuple, optional): The output image size. If output size is a
+ tuple, it must contain two integers, (image_H, image_W). None if
+ filter_size, padding, and stride are used to calculate output_size.
+ If output_size and filter_size are specified at the same time, they
+ should follow the formula above. Default: None.
+ padding(int or tuple, optional): The padding size. If padding is a tuple, it must
+ contain two integers, (padding_H, padding_W). Otherwise, the
+ padding_H = padding_W = padding. Default: 0.
+ stride(int or tuple, optional): The stride size. If stride is a tuple, it must
+ contain two integers, (stride_H, stride_W). Otherwise, the
+ stride_H = stride_W = stride. Default: 1.
+ dilation(int or tuple, optional): The dilation size. If dilation is a tuple, it must
+ contain two integers, (dilation_H, dilation_W). Otherwise, the
+ dilation_H = dilation_W = dilation. Default: 1.
+ groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by
+ grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+ when group=2, the first half of the filters is only connected to the
+ first half of the input channels, while the second half of the
+ filters is only connected to the second half of the input channels.
+ Default: 1.
+ param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
+ of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
+ will create ParamAttr as param_attr. If the Initializer of the param_attr
+ is not set, the parameter is initialized with Xavier. Default: None.
+ bias_attr (ParamAttr or bool, optional): The attribute for the bias of conv2d_transpose.
+ If it is set to False, no bias will be added to the output units.
+ If it is set to None or one attribute of ParamAttr, conv2d_transpose
+ will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+ is not set, the bias is initialized zero. Default: None.
+ use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
+ library is installed. Default: True.
+ act (str, optional): Activation type, if it is set to None, activation is not appended.
+ Default: None.
+ dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".
+ Attribute:
+ **weight** (Parameter): the learnable weights of filters of this layer.
+ **bias** (Parameter or None): the learnable bias of this layer.
+ Returns:
+ None
+ Examples:
+ .. code-block:: python
+ import paddle.fluid as fluid
+ from paddleslim.nas.ofa.layers import SuperConv2DTranspose
+ import numpy as np
+ with fluid.dygraph.guard():
+ data = np.random.random((3, 32, 32, 5)).astype('float32')
+ config = {'channel': 5}
+ super_convtranspose = SuperConv2DTranspose(num_channels=32, num_filters=10, filter_size=3)
+ ret = super_convtranspose(fluid.dygraph.base.to_variable(data), **config)
+ """
+
+ def __init__(self,
+ num_channels,
+ num_filters,
+ filter_size,
+ output_size=None,
+ candidate_config={},
+ transform_kernel=False,
+ stride=1,
+ dilation=1,
+ padding=0,
+ groups=None,
+ param_attr=None,
+ bias_attr=None,
+ use_cudnn=True,
+ act=None,
+ dtype='float32'):
+ ### NOTE: padding is always 0; padding is added in forward because the kernel size is uncertain
+ super(SuperConv2DTranspose, self).__init__(
+ num_channels, num_filters, filter_size, output_size, padding,
+ stride, dilation, groups, param_attr, bias_attr, use_cudnn, act,
+ dtype)
+ self.candidate_config = candidate_config
+ if len(self.candidate_config.items()) != 0:
+ for k, v in candidate_config.items():
+ candidate_config[k] = list(set(v))
+ self.ks_set = candidate_config[
+ 'kernel_size'] if 'kernel_size' in candidate_config else None
+
+ if isinstance(self._filter_size, int):
+ self._filter_size = convert_to_list(self._filter_size, 2)
+
+ self.expand_ratio = candidate_config[
+ 'expand_ratio'] if 'expand_ratio' in candidate_config else None
+ self.channel = candidate_config[
+ 'channel'] if 'channel' in candidate_config else None
+ self.base_channel = None
+ if self.expand_ratio:
+ self.base_channel = int(self._num_filters / max(self.expand_ratio))
+
+ self.transform_kernel = transform_kernel
+ if self.ks_set != None:
+ self.ks_set.sort()
+ if self.transform_kernel != False:
+ scale_param = dict()
+ ### create parameter to transform kernel
+ for i in range(len(self.ks_set) - 1):
+ ks_small = self.ks_set[i]
+ ks_large = self.ks_set[i + 1]
+ param_name = '%dto%d_matrix' % (ks_large, ks_small)
+ ks_t = ks_small**2
+ scale_param[param_name] = self.create_parameter(
+ attr=fluid.ParamAttr(
+ name=self._full_name + param_name,
+ initializer=fluid.initializer.NumpyArrayInitializer(
+ np.eye(ks_t))),
+ shape=(ks_t, ks_t),
+ dtype=self._dtype)
+
+ for name, param in scale_param.items():
+ setattr(self, name, param)
+
+ def get_active_filter(self, in_nc, out_nc, kernel_size):
+ start, end = compute_start_end(self._filter_size[0], kernel_size)
+ filters = self.weight[:in_nc, :out_nc, start:end, start:end]
+ if self.transform_kernel != False and kernel_size < self._filter_size[
+ 0]:
+ start_filter = self.weight[:in_nc, :out_nc, :, :]
+ for i in range(len(self.ks_set) - 1, 0, -1):
+ src_ks = self.ks_set[i]
+ if src_ks <= kernel_size:
+ break
+ target_ks = self.ks_set[i - 1]
+ start, end = compute_start_end(src_ks, target_ks)
+ _input_filter = start_filter[:, :, start:end, start:end]
+ _input_filter = fluid.layers.reshape(
+ _input_filter,
+ shape=[(_input_filter.shape[0] * _input_filter.shape[1]),
+ -1])
+ core.ops.matmul(_input_filter,
+ self.__getattr__('%dto%d_matrix' %
+ (src_ks, target_ks)),
+ _input_filter, 'transpose_X', False,
+ 'transpose_Y', False, "alpha", 1)
+ _input_filter = fluid.layers.reshape(
+ _input_filter,
+ shape=[
+ filters.shape[0], filters.shape[1], target_ks, target_ks
+ ])
+ start_filter = _input_filter
+ filters = start_filter
+ return filters
+
+ def get_groups_in_out_nc(self, in_nc, out_nc):
+ ### standard conv
+ return self._groups, in_nc, out_nc
+
+ def forward(self, input, kernel_size=None, expand_ratio=None, channel=None):
+ if not in_dygraph_mode():
+ _logger.error("NOT support static graph")
+
+ in_nc = int(input.shape[1])
+ assert (
+ expand_ratio == None or channel == None
+ ), "expand_ratio and channel CANNOT be NOT None at the same time."
+ if expand_ratio != None:
+ out_nc = int(expand_ratio * self.base_channel)
+ elif channel != None:
+ out_nc = int(channel)
+ else:
+ out_nc = self._num_filters
+
+ ks = int(self._filter_size[0]) if kernel_size == None else int(
+ kernel_size)
+
+ groups, weight_in_nc, weight_out_nc = self.get_groups_in_out_nc(in_nc,
+ out_nc)
+
+ weight = self.get_active_filter(weight_in_nc, weight_out_nc, ks)
+ padding = convert_to_list(get_same_padding(ks), 2)
+
+ op = getattr(core.ops, self._op_type)
+ out = op(input, weight, 'output_size', self._output_size, 'strides',
+ self._stride, 'paddings', padding, 'dilations', self._dilation,
+ 'groups', groups, 'use_cudnn', self._use_cudnn)
+ pre_bias = out
+ out_nc = int(pre_bias.shape[1])
+ if self.bias is not None:
+ bias = self.bias[:out_nc]
+ pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
+ else:
+ pre_act = pre_bias
+
+ return dygraph_utils._append_activation_in_dygraph(
+ pre_act, act=self._act)
+
+
+class SuperGroupConv2DTranspose(SuperConv2DTranspose):
+ def get_groups_in_out_nc(self, in_nc, out_nc):
+ ### groups convolution
+ ### groups conv transpose: weight: (Cin, Cout/G, Kh, Kw)
+ groups = self._groups
+ out_nc = int(out_nc // groups)
+ return groups, in_nc, out_nc
+
+
+class SuperDepthwiseConv2DTranspose(SuperConv2DTranspose):
+ def get_groups_in_out_nc(self, in_nc, out_nc):
+ if in_nc != out_nc:
+ _logger.debug(
+ "input channel and output channel of depthwise conv transpose are different; changing output channel to input channel! origin channels: (in_nc {}, out_nc {})".
+ format(in_nc, out_nc))
+ groups = in_nc
+ out_nc = in_nc
+ return groups, in_nc, out_nc
+
+
+### NOTE: only searches channel; written for GAN compression, may be changed to SuperDepthwiseConv and SuperConv later.
+class SuperSeparableConv2D(fluid.dygraph.Layer):
+ """
+ This interface is used to construct a callable object of the ``SuperSeparableConv2D``
+ class.
+ The difference between ```SuperSeparableConv2D``` and ```SeparableConv2D``` is:
+ ```SuperSeparableConv2D``` needs to be fed a config dictionary with the format of
+ {'channel': num_of_channel}, which represents the channels of the first conv's outputs and
+ the second conv's inputs, used to change the first dimension of weight and bias,
+ only training the first channels of the weight and bias.
+
+ The architecture of the super separable convolution2D op is [Conv2D, norm layer (may be BatchNorm
+ or InstanceNorm), Conv2D]. The first conv is a depthwise conv whose filter number is the input channel
+ multiplied by scale_factor and whose group number equals the number of input channels. The second conv
+ is a standard conv whose filter size and stride are 1.
+
+ Parameters:
+ num_channels(int): The number of channels in the input image.
+ num_filters(int): The number of the second conv's filter. It is as same as the output
+ feature map.
+ filter_size(int or tuple): The first conv's filter size. If filter_size is a tuple,
+ it must contain two integers, (filter_size_H, filter_size_W).
+ Otherwise, the filter will be a square.
+ padding(int or tuple, optional): The first conv's padding size. If padding is a tuple,
+ it must contain two integers, (padding_H, padding_W). Otherwise, the
+ padding_H = padding_W = padding. Default: 0.
+ stride(int or tuple, optional): The first conv's stride size. If stride is a tuple,
+ it must contain two integers, (stride_H, stride_W). Otherwise, the
+ stride_H = stride_W = stride. Default: 1.
+ dilation(int or tuple, optional): The first conv's dilation size. If dilation is a tuple,
+ it must contain two integers, (dilation_H, dilation_W). Otherwise, the
+ dilation_H = dilation_W = dilation. Default: 1.
+ norm_layer(class): The normalization layer between two convolution. Default: InstanceNorm.
+ bias_attr (ParamAttr or bool, optional): The attribute for the bias of convolution.
+ If it is set to False, no bias will be added to the output units.
+ If it is set to None or one attribute of ParamAttr, convolution
+ will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+ is not set, the bias is initialized zero. Default: None.
+ scale_factor(float): The scale factor of the first conv's output channel. Default: 1.
+ use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
+ library is installed. Default: True.
+ Returns:
+ None
+ """
+
+ def __init__(self,
+ num_channels,
+ num_filters,
+ filter_size,
+ candidate_config={},
+ stride=1,
+ padding=0,
+ dilation=1,
+ norm_layer=InstanceNorm,
+ bias_attr=None,
+ scale_factor=1,
+ use_cudnn=False):
+ super(SuperSeparableConv2D, self).__init__()
+ self.conv = fluid.dygraph.LayerList([
+ fluid.dygraph.nn.Conv2D(
+ num_channels=num_channels,
+ num_filters=num_channels * scale_factor,
+ filter_size=filter_size,
+ stride=stride,
+ padding=padding,
+ use_cudnn=False,
+ groups=num_channels,
+ bias_attr=bias_attr)
+ ])
+
+ self.conv.extend([norm_layer(num_channels * scale_factor)])
+
+ self.conv.extend([
+ Conv2D(
+ num_channels=num_channels * scale_factor,
+ num_filters=num_filters,
+ filter_size=1,
+ stride=1,
+ use_cudnn=use_cudnn,
+ bias_attr=bias_attr)
+ ])
+
+ self.candidate_config = candidate_config
+ self.expand_ratio = candidate_config[
+ 'expand_ratio'] if 'expand_ratio' in candidate_config else None
+ self.base_output_dim = None
+ if self.expand_ratio != None:
+ ### NOTE: num_filters is the output channel of the second (pointwise) conv
+ self.base_output_dim = int(num_filters / max(self.expand_ratio))
+
+ def forward(self, input, expand_ratio=None, channel=None):
+ if not in_dygraph_mode():
+ _logger.error("NOT support static graph")
+
+ in_nc = int(input.shape[1])
+ assert (
+ expand_ratio == None or channel == None
+ ), "expand_ratio and channel CANNOT be NOT None at the same time."
+ if expand_ratio != None:
+ out_nc = int(expand_ratio * self.base_output_dim)
+ elif channel != None:
+ out_nc = int(channel)
+ else:
+ out_nc = self.conv[0]._num_filters
+
+ weight = self.conv[0].weight[:in_nc]
+ ### conv1
+ if self.conv[0]._l_type == 'conv2d':
+ attrs = ('strides', self.conv[0]._stride, 'paddings',
+ self.conv[0]._padding, 'dilations', self.conv[0]._dilation,
+ 'groups', in_nc, 'use_cudnn', self.conv[0]._use_cudnn)
+ out = core.ops.conv2d(input, weight, *attrs)
+ elif self.conv[0]._l_type == 'depthwise_conv2d':
+ attrs = ('strides', self.conv[0]._stride, 'paddings',
+ self.conv[0]._padding, 'dilations', self.conv[0]._dilation,
+ 'groups', in_nc, 'use_cudnn', self.conv[0]._use_cudnn)
+ out = core.ops.depthwise_conv2d(input, weight, *attrs)
+ else:
+ raise ValueError("conv type error")
+
+ pre_bias = out
+ if self.conv[0].bias is not None:
+ bias = self.conv[0].bias[:in_nc]
+ pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
+ else:
+ pre_act = pre_bias
+
+ conv0_out = dygraph_utils._append_activation_in_dygraph(
+ pre_act, self.conv[0]._act)
+
+ norm_out = self.conv[1](conv0_out)
+
+ weight = self.conv[2].weight[:out_nc, :in_nc, :, :]
+
+ if self.conv[2]._l_type == 'conv2d':
+ attrs = ('strides', self.conv[2]._stride, 'paddings',
+ self.conv[2]._padding, 'dilations', self.conv[2]._dilation,
+ 'groups', self.conv[2]._groups if self.conv[2]._groups else
+ 1, 'use_cudnn', self.conv[2]._use_cudnn)
+ out = core.ops.conv2d(norm_out, weight, *attrs)
+ elif self.conv[2]._l_type == 'depthwise_conv2d':
+ attrs = ('strides', self.conv[2]._stride, 'paddings',
+ self.conv[2]._padding, 'dilations', self.conv[2]._dilation,
+ 'groups', self.conv[2]._groups, 'use_cudnn',
+ self.conv[2]._use_cudnn)
+ out = core.ops.depthwise_conv2d(norm_out, weight, *attrs)
+ else:
+ raise ValueError("conv type error")
+
+ pre_bias = out
+ if self.conv[2].bias is not None:
+ bias = self.conv[2].bias[:out_nc]
+ pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
+ else:
+ pre_act = pre_bias
+
+ conv1_out = dygraph_utils._append_activation_in_dygraph(
+ pre_act, self.conv[2]._act)
+
+ return conv1_out
+
+
+class SuperLinear(fluid.dygraph.Linear):
+ """
+ """
+
+ def __init__(self,
+ input_dim,
+ output_dim,
+ candidate_config={},
+ param_attr=None,
+ bias_attr=None,
+ act=None,
+ dtype="float32"):
+ super(SuperLinear, self).__init__(input_dim, output_dim, param_attr,
+ bias_attr, act, dtype)
+ self._param_attr = param_attr
+ self._bias_attr = bias_attr
+ self.output_dim = output_dim
+ self.candidate_config = candidate_config
+ self.expand_ratio = candidate_config[
+ 'expand_ratio'] if 'expand_ratio' in candidate_config else None
+ self.base_output_dim = None
+ if self.expand_ratio != None:
+ self.base_output_dim = int(self.output_dim / max(self.expand_ratio))
+
+ def forward(self, input, expand_ratio=None, channel=None):
+ if not in_dygraph_mode():
+ _logger.error("NOT support static graph")
+
+ ### weight: (Cin, Cout)
+ in_nc = int(input.shape[1])
+ assert (
+ expand_ratio == None or channel == None
+ ), "expand_ratio and channel CANNOT be NOT None at the same time."
+ if expand_ratio != None:
+ out_nc = int(expand_ratio * self.base_output_dim)
+ elif channel != None:
+ out_nc = int(channel)
+ else:
+ out_nc = self.output_dim
+
+ weight = self.weight[:in_nc, :out_nc]
+ if self._bias_attr != False:
+ bias = self.bias[:out_nc]
+
+ pre_bias = _varbase_creator(dtype=input.dtype)
+ core.ops.matmul(input, weight, pre_bias, 'transpose_X', False,
+ 'transpose_Y', False, "alpha", 1)
+ if self._bias_attr != False:
+ pre_act = dygraph_utils._append_bias_in_dygraph(
+ pre_bias, bias, axis=len(input.shape) - 1)
+ else:
+ pre_act = pre_bias
+
+ return dygraph_utils._append_activation_in_dygraph(pre_act, self._act)
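+
+
+### Example (a minimal usage sketch): selecting a sub-Linear at runtime; the
+### shapes below are illustrative.
+#
+# import numpy as np
+# with fluid.dygraph.guard():
+#     fc = SuperLinear(64, 128, candidate_config={'expand_ratio': (1, 2)})
+#     x = fluid.dygraph.to_variable(np.zeros((4, 64)).astype('float32'))
+#     out = fc(x, expand_ratio=1)  # uses only the first 128 // 2 = 64 outputs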
+
+
+class SuperBatchNorm(fluid.dygraph.BatchNorm):
+ """
+ add comment
+ """
+
+ def __init__(self,
+ num_channels,
+ act=None,
+ is_test=False,
+ momentum=0.9,
+ epsilon=1e-05,
+ param_attr=None,
+ bias_attr=None,
+ dtype='float32',
+ data_layout='NCHW',
+ in_place=False,
+ moving_mean_name=None,
+ moving_variance_name=None,
+ do_model_average_for_mean_and_var=True,
+ use_global_stats=False,
+ trainable_statistics=False):
+ super(SuperBatchNorm, self).__init__(
+ num_channels, act, is_test, momentum, epsilon, param_attr,
+ bias_attr, dtype, data_layout, in_place, moving_mean_name,
+ moving_variance_name, do_model_average_for_mean_and_var,
+ use_global_stats, trainable_statistics)
+
+ def forward(self, input):
+ if not in_dygraph_mode():
+ _logger.error("NOT support static graph")
+
+ feature_dim = int(input.shape[1])
+
+ weight = self.weight[:feature_dim]
+ bias = self.bias[:feature_dim]
+ mean = self._mean[:feature_dim]
+ variance = self._variance[:feature_dim]
+
+ mean_out = mean
+ variance_out = variance
+
+ attrs = ("momentum", self._momentum, "epsilon", self._epsilon,
+ "is_test", not self.training, "data_layout", self._data_layout,
+ "use_mkldnn", False, "fuse_with_relu", self._fuse_with_relu,
+ "use_global_stats", self._use_global_stats,
+ 'trainable_statistics', self._trainable_statistics)
+ batch_norm_out, _, _, _, _, _ = core.ops.batch_norm(
+ input, weight, bias, mean, variance, mean_out, variance_out, *attrs)
+ return dygraph_utils._append_activation_in_dygraph(
+ batch_norm_out, act=self._act)
+
+
+class SuperInstanceNorm(fluid.dygraph.InstanceNorm):
+ """
+ """
+
+ def __init__(self,
+ num_channels,
+ epsilon=1e-05,
+ param_attr=None,
+ bias_attr=None,
+ dtype='float32'):
+ super(SuperInstanceNorm, self).__init__(num_channels, epsilon,
+ param_attr, bias_attr, dtype)
+
+ def forward(self, input):
+ if not in_dygraph_mode():
+ _logger.error("NOT support static graph")
+
+ feature_dim = int(input.shape[1])
+
+ if self._param_attr == False and self._bias_attr == False:
+ scale = None
+ bias = None
+ else:
+ scale = self.scale[:feature_dim]
+ bias = self.bias[:feature_dim]
+
+ out, _, _ = core.ops.instance_norm(input, scale, bias, 'epsilon',
+ self._epsilon)
+ return out
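+
+
+### Example (a minimal usage sketch): the super norm layers need no config;
+### they slice their parameters by the channel number of the runtime input.
+#
+# import numpy as np
+# with fluid.dygraph.guard():
+#     bn = SuperBatchNorm(10)
+#     x = fluid.dygraph.to_variable(np.zeros((2, 6, 8, 8)).astype('float32'))
+#     out = bn(x)  # only the first 6 of the 10 channels are used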
diff --git a/paddleslim/nas/ofa/ofa.py b/paddleslim/nas/ofa/ofa.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fd7f5ada5d0f59eabd9ac580b9453f183bd78f1
--- /dev/null
+++ b/paddleslim/nas/ofa/ofa.py
@@ -0,0 +1,319 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import numpy as np
+from collections import namedtuple
+import paddle
+import paddle.nn as nn
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import Conv2D
+from .layers import BaseBlock, Block, SuperConv2D, SuperBatchNorm
+from .utils.utils import search_idx
+from ...common import get_logger
+
+_logger = get_logger(__name__, level=logging.INFO)
+
+__all__ = ['OFA', 'RunConfig', 'DistillConfig']
+
+RunConfig = namedtuple('RunConfig', [
+ 'train_batch_size', 'eval_batch_size', 'n_epochs', 'save_frequency',
+ 'eval_frequency', 'init_learning_rate', 'total_images', 'elastic_depth',
+ 'dynamic_batch_size'
+])
+RunConfig.__new__.__defaults__ = (None, ) * len(RunConfig._fields)
+
+DistillConfig = namedtuple('DistillConfig', [
+ 'lambda_distill', 'teacher_model', 'mapping_layers', 'teacher_model_path',
+ 'distill_fn'
+])
+DistillConfig.__new__.__defaults__ = (None, ) * len(DistillConfig._fields)
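+
+# A minimal usage sketch (hedged): `MyModel` and the numeric values below are
+# illustrative placeholders, not part of this module. The lengths of
+# `n_epochs`, `init_learning_rate` and `dynamic_batch_size` must match the
+# number of elastic tasks trained in order.
+#
+#   model = MyModel()
+#   run_config = RunConfig(
+#       train_batch_size=256,
+#       n_epochs=[[1], [2, 3]],
+#       init_learning_rate=[[0.001], [0.003, 0.001]],
+#       total_images=1281167,
+#       dynamic_batch_size=[1, 1])
+#   ofa_model = OFA(model, run_config)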
+
+
+class OFABase(fluid.dygraph.Layer):
+ def __init__(self, model):
+ super(OFABase, self).__init__()
+ self.model = model
+ self._layers, self._elastic_task = self.get_layers()
+
+ def get_layers(self):
+ layers = dict()
+ elastic_task = set()
+ for name, sublayer in self.model.named_sublayers():
+ if isinstance(sublayer, BaseBlock):
+ sublayer.set_supernet(self)
+ layers[sublayer.key] = sublayer.candidate_config
+ for k in sublayer.candidate_config.keys():
+ elastic_task.add(k)
+ return layers, elastic_task
+
+ def forward(self, *inputs, **kwargs):
+ raise NotImplementedError
+
+ # NOTE: `config` sets the forward config for each layer; used in distillation.
+ def layers_forward(self, block, *inputs, **kwargs):
+ if getattr(self, 'current_config', None) is not None:
+ assert block.key in self.current_config, 'layer {} not in current config.'.format(
+ block.key)
+ config = self.current_config[block.key]
+ else:
+ config = dict()
+ _logger.debug("layer {} uses config: {}".format(block.key, config))
+
+ return block.fn(*inputs, **config)
+
+ @property
+ def layers(self):
+ return self._layers
+
+
+class OFA(OFABase):
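+ """Wrap a dygraph supernet for Once-for-All training, progressively
+ shrinking over the elastic tasks (kernel size, depth, width).
+ """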
+ def __init__(self,
+ model,
+ run_config,
+ net_config=None,
+ distill_config=None,
+ elastic_order=None,
+ train_full=False):
+ super(OFA, self).__init__(model)
+ self.net_config = net_config
+ self.run_config = run_config
+ self.distill_config = distill_config
+ self.elastic_order = elastic_order
+ self.train_full = train_full
+ self.iter_per_epochs = self.run_config.total_images // self.run_config.train_batch_size
+ self.iter = 0
+ self.dynamic_iter = 0
+ self.manual_set_task = False
+ self.task_idx = 0
+ self._add_teacher = False
+ self.netAs_param = []
+
+ for idx in range(len(run_config.n_epochs)):
+ assert isinstance(
+ run_config.init_learning_rate[idx],
+ list), "each candidate in init_learning_rate must be list"
+ assert isinstance(run_config.n_epochs[idx],
+ list), "each candidate in n_epochs must be list"
+
+ ### if elastic_order is None, use the default order
+ if self.elastic_order is not None:
+ assert isinstance(self.elastic_order,
+ list), 'elastic_order must be a list'
+
+ if self.elastic_order is None:
+ self.elastic_order = []
+ # zeroth, elastic resolution (covered in the demo)
+ # first, elastic kernel size
+ if 'kernel_size' in self._elastic_task:
+ self.elastic_order.append('kernel_size')
+
+ # second, elastic depth, such as: list(2, 3, 4)
+ if getattr(self.run_config, 'elastic_depth', None) is not None:
+ depth_list = list(set(self.run_config.elastic_depth))
+ depth_list.sort()
+ self.layers['depth'] = depth_list
+ self.elastic_order.append('depth')
+
+ # final, elastic width
+ if 'expand_ratio' in self._elastic_task:
+ self.elastic_order.append('width')
+
+ if 'channel' in self._elastic_task and 'width' not in self.elastic_order:
+ self.elastic_order.append('width')
+
+ assert len(self.run_config.n_epochs) == len(self.elastic_order)
+ assert len(self.run_config.n_epochs) == len(
+ self.run_config.dynamic_batch_size)
+ assert len(self.run_config.n_epochs) == len(
+ self.run_config.init_learning_rate)
+
+ ### ================= add distill prepare ======================
+ if self.distill_config is not None and (
+ self.distill_config.lambda_distill is not None and
+ self.distill_config.lambda_distill > 0):
+ self._add_teacher = True
+ self._prepare_distill()
+
+ self.model.train()
+
+ def _prepare_distill(self):
+ self.Tacts, self.Sacts = {}, {}
+
+ if self.distill_config.teacher_model is None:
+ _logger.error(
+ 'To enable distillation, please pass an instance of the teacher model.'
+ )
+
+ assert isinstance(self.distill_config.teacher_model,
+ paddle.fluid.dygraph.Layer)
+
+ # load teacher parameters
+ if self.distill_config.teacher_model_path is not None:
+ param_state_dict, _ = paddle.load_dygraph(
+ self.distill_config.teacher_model_path)
+ self.distill_config.teacher_model.set_dict(param_state_dict)
+
+ self.ofa_teacher_model = OFABase(self.distill_config.teacher_model)
+ self.ofa_teacher_model.model.eval()
+
+ # Register hooks only when mapping_layers is given:
+ # if mapping_layers is None, only the teacher output is returned;
+ # otherwise, hooks record the mapped activations so a distillation
+ # loss can be computed over them.
+ mapping_layers = self.distill_config.mapping_layers
+ if mapping_layers is not None:
+ self.netAs = []
+ for name, sublayer in self.model.named_sublayers():
+ if name in mapping_layers:
+ netA = SuperConv2D(
+ sublayer._num_filters,
+ sublayer._num_filters,
+ filter_size=1)
+ self.netAs_param.extend(netA.parameters())
+ self.netAs.append(netA)
+
+ def get_activation(mem, name):
+ def get_output_hook(layer, input, output):
+ mem[name] = output
+
+ return get_output_hook
+
+ def add_hook(net, mem, mapping_layers):
+ for idx, (n, m) in enumerate(net.named_sublayers()):
+ if n in mapping_layers:
+ m.register_forward_post_hook(get_activation(mem, n))
+
+ add_hook(self.model, self.Sacts, mapping_layers)
+ add_hook(self.ofa_teacher_model.model, self.Tacts, mapping_layers)
+
+ def _compute_epochs(self):
+ if getattr(self, 'epoch', None) is None:
+ epoch = self.iter // self.iter_per_epochs
+ else:
+ epoch = self.epoch
+ return epoch
+
+ def _sample_from_nestdict(self, cands, sample_type, task, phase):
+ sample_cands = dict()
+ for k, v in cands.items():
+ if isinstance(v, dict):
+ sample_cands[k] = self._sample_from_nestdict(
+ v, sample_type=sample_type, task=task, phase=phase)
+ elif isinstance(v, (list, set, tuple)):
+ if sample_type == 'largest':
+ sample_cands[k] = v[-1]
+ elif sample_type == 'smallest':
+ sample_cands[k] = v[0]
+ else:
+ if k not in task:
+ # candidate_config is sorted and deduplicated, so v[-1]
+ # is the largest candidate; keys outside the task list
+ # stay fixed at it.
+ sample_cands[k] = v[-1]
+ else:
+ # phase is None -> sample from all candidates; otherwise
+ # each phase appends smaller candidates. phase only
+ # affects the last task in the current task list.
+ if phase is not None and k == task[-1]:
+ start = -(phase + 2)
+ else:
+ start = 0
+ sample_cands[k] = np.random.choice(v[start:])
+
+ return sample_cands
+
+ def _sample_config(self, task, sample_type='random', phase=None):
+ config = self._sample_from_nestdict(
+ self.layers, sample_type=sample_type, task=task, phase=phase)
+ return config
+
+ def set_task(self, task=None, phase=None):
+ self.manual_set_task = True
+ self.task = task
+ self.phase = phase
+
+ def set_epoch(self, epoch):
+ self.epoch = epoch
+
+ def _progressive_shrinking(self):
+ epoch = self._compute_epochs()
+ self.task_idx, phase_idx = search_idx(epoch, self.run_config.n_epochs)
+ self.task = self.elastic_order[:self.task_idx + 1]
+ if 'width' in self.task:
+ ### change width in task to concrete config
+ self.task.remove('width')
+ if 'expand_ratio' in self._elastic_task:
+ self.task.append('expand_ratio')
+ if 'channel' in self._elastic_task:
+ self.task.append('channel')
+ if len(self.run_config.n_epochs[self.task_idx]) == 1:
+ phase_idx = None
+ return self._sample_config(task=self.task, phase=phase_idx)
+
+ def calc_distill_loss(self):
+ losses = []
+ assert len(self.netAs) > 0
+ for i, netA in enumerate(self.netAs):
+ assert isinstance(netA, SuperConv2D)
+ n = self.distill_config.mapping_layers[i]
+ Tact = self.Tacts[n]
+ Sact = self.Sacts[n]
+ Sact = netA(Sact, channel=netA._num_filters)
+ if self.distill_config.distill_fn is None:
+ loss = fluid.layers.mse_loss(Sact, Tact)
+ else:
+ loss = self.distill_config.distill_fn(Sact, Tact)
+ losses.append(loss)
+ return sum(losses) * self.distill_config.lambda_distill
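+
+ # A hedged sketch of how distillation fits a training step; `image`,
+ # `label` and `train_loss_fn` are illustrative names, and
+ # calc_distill_loss() requires mapping_layers in DistillConfig:
+ #
+ #   out, teacher_out = ofa_model(image)
+ #   loss = train_loss_fn(out, label) + ofa_model.calc_distill_loss()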
+
+ ### TODO: complete it
+ def search(self, eval_func, condition):
+ pass
+
+ ### TODO: complete it
+ def export(self, config):
+ pass
+
+ def forward(self, *inputs, **kwargs):
+ # ===================== teacher process =====================
+ teacher_output = None
+ if self._add_teacher:
+ teacher_output = self.ofa_teacher_model.model.forward(*inputs,
+ **kwargs)
+ # ============================================================
+
+ # ==================== student process =====================
+ self.dynamic_iter += 1
+ if self.dynamic_iter == self.run_config.dynamic_batch_size[
+ self.task_idx]:
+ self.iter += 1
+ self.dynamic_iter = 0
+
+ if self.net_config is None:
+ if self.train_full:
+ self.current_config = self._sample_config(
+ task=None, sample_type='largest')
+ else:
+ if not self.manual_set_task:
+ self.current_config = self._progressive_shrinking()
+ else:
+ self.current_config = self._sample_config(
+ self.task, phase=self.phase)
+ else:
+ self.current_config = self.net_config
+
+ _logger.debug("Current config is {}".format(self.current_config))
+ if 'depth' in self.current_config:
+ kwargs['depth'] = int(self.current_config['depth'])
+
+ return self.model.forward(*inputs, **kwargs), teacher_output
diff --git a/paddleslim/nas/ofa/utils/__init__.py b/paddleslim/nas/ofa/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..342ae0eddcff168fb62bb08708af868dbc808aa5
--- /dev/null
+++ b/paddleslim/nas/ofa/utils/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .utils import *
diff --git a/paddleslim/nas/ofa/utils/utils.py b/paddleslim/nas/ofa/utils/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..fad016a754f61df9c72c04956901d978db0b6df6
--- /dev/null
+++ b/paddleslim/nas/ofa/utils/utils.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
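+# For example, compute_start_end(7, 3) returns (2, 5): the 3x3 sub-kernel is
+# taken from the center of the 7x7 kernel.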
+def compute_start_end(kernel_size, sub_kernel_size):
+ center = kernel_size // 2
+ sub_center = sub_kernel_size // 2
+ start = center - sub_center
+ end = center + sub_center + 1
+ assert end - start == sub_kernel_size
+ return start, end
+
+
+def get_same_padding(kernel_size):
+ assert isinstance(kernel_size, int)
+ assert kernel_size % 2 > 0, "kernel size must be an odd number"
+ return kernel_size // 2
+
+
+def convert_to_list(value, n):
+ return [value, ] * n
+
+
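+# For example, search_idx(5, [[3, 7], [12]]) returns (0, 1): 5 first falls at
+# task index 0, phase index 1, since 5 <= 7.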
+def search_idx(num, sorted_nestlist):
+ max_num = -1
+ max_idx = -1
+ for idx in range(len(sorted_nestlist)):
+ task_ = sorted_nestlist[idx]
+ max_num = task_[-1]
+ max_idx = len(task_) - 1
+ for phase_idx in range(len(task_)):
+ if num <= task_[phase_idx]:
+ return idx, phase_idx
+ assert num > max_num
+ return len(sorted_nestlist) - 1, max_idx
diff --git a/paddleslim/nas/one_shot/__init__.py b/paddleslim/nas/one_shot/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8dfbe57b745501b2eae1512da8e52f69ffe95b6
--- /dev/null
+++ b/paddleslim/nas/one_shot/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from ..one_shot import one_shot_nas
+from .one_shot_nas import *
+from ..one_shot import super_mnasnet
+from .super_mnasnet import *
+__all__ = []
+__all__ += one_shot_nas.__all__
+__all__ += super_mnasnet.__all__
diff --git a/paddleslim/nas/one_shot/one_shot_nas.py b/paddleslim/nas/one_shot/one_shot_nas.py
new file mode 100644
index 0000000000000000000000000000000000000000..444f512ac8daee43e949a3ad3f917c8cf7344724
--- /dev/null
+++ b/paddleslim/nas/one_shot/one_shot_nas.py
@@ -0,0 +1,119 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.fluid as fluid
+from ...common import SAController
+
+__all__ = ['OneShotSuperNet', 'OneShotSearch']
+
+
+def OneShotSearch(model, eval_func, strategy='sa', search_steps=100):
+ """
+ Search for the best tokens, which represent a sub-network.
+
+ Args:
+ model(fluid.dygraph.Layer): A dynamic graph module whose sub-modules should contain
+ at least one instance of `OneShotSuperNet`.
+ eval_func(function): A callback function which accepts the model and tokens as arguments.
+ strategy(str): The name of strategy used to search. Default: 'sa'.
+ search_steps(int): The total steps for searching.
+
+ Returns:
+ list: The best tokens searched.
+ """
+ super_net = None
+ for layer in model.sublayers(include_sublayers=False):
+ print("layer: {}".format(layer))
+ if isinstance(layer, OneShotSuperNet):
+ super_net = layer
+ break
+ assert super_net is not None
+ controller = None
+ if strategy == "sa":
+ controller = SAController(
+ range_table=super_net.range_table(),
+ init_tokens=super_net.init_tokens())
+ assert controller is not None, "Unsupported searching strategy."
+ for i in range(search_steps):
+ tokens = controller.next_tokens()
+ reward = eval_func(model, tokens)
+ controller.update(tokens, reward, i)
+ return controller.best_tokens()
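+
+# A hedged usage sketch: `model` must contain a `OneShotSuperNet` sublayer and
+# `eval_func(model, tokens)` is user-defined; both names are illustrative.
+#
+#   best_tokens = OneShotSearch(model, eval_func, strategy='sa',
+#                               search_steps=100)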
+
+
+class OneShotSuperNet(fluid.dygraph.Layer):
+ """The base class of super net used in one-shot searching strategy.
+ A super net is a dygraph layer.
+
+ Args:
+ name_scope(str): The name scope of super net.
+ """
+
+ def __init__(self, name_scope):
+ super(OneShotSuperNet, self).__init__(name_scope)
+
+ def init_tokens(self):
+ """Get init tokens in search space.
+
+ Returns:
+ list: The init tokens which is a list of integer.
+ """
+ raise NotImplementedError('Abstract method.')
+
+ def range_table(self):
+ """Get range table of current search space.
+
+ Returns:
+ range_table(tuple): The maximum value and minimum value in each position of tokens
+ with format `(min_values, max_values)`. The `min_values` is
+ a list of integers indicating the minimum values, while `max_values`
+ indicates the maximum values.
+ """
+ raise NotImplementedError('Abstract method.')
+
+ def _forward_impl(self, *inputs, **kwargs):
+ """Defines the computation performed at every call.
+ Should be overridden by all subclasses.
+
+ Args:
+ inputs(tuple): unpacked tuple arguments
+ kwargs(dict): unpacked dict arguments
+ """
+ raise NotImplementedError('Abstract method.')
+
+ def forward(self, input, tokens=None):
+ """
+ Defines the computation performed at every call.
+
+ Args:
+ input(variable): The input of super net.
+ tokens(list): The tokens used to generate a sub-network.
+ None means computing in super net training mode.
+ Otherwise, it will execute the sub-network generated by tokens.
+ The `tokens` should be set in searching stage and final training stage.
+ Default: None.
+
+ Returns:
+ Variable: The output of super net.
+ """
+ if tokens is None:
+ tokens = self._random_tokens()
+ return self._forward_impl(input, tokens=tokens)
+
+ def _random_tokens(self):
+ tokens = []
+ for min_v, max_v in zip(self.range_table()[0], self.range_table()[1]):
+ tokens.append(np.random.randint(min_v, max_v))
+ return tokens
diff --git a/paddleslim/nas/one_shot/super_mnasnet.py b/paddleslim/nas/one_shot/super_mnasnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..852b40383af5223668b1635a5ded05f39966f5fd
--- /dev/null
+++ b/paddleslim/nas/one_shot/super_mnasnet.py
@@ -0,0 +1,257 @@
+import paddle
+from paddle import fluid
+from paddle.fluid.layer_helper import LayerHelper
+import numpy as np
+from .one_shot_nas import OneShotSuperNet
+
+__all__ = ['SuperMnasnet']
+
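+# A minimal hedged sketch of driving the supernet directly; the dygraph guard
+# and the `image` variable are illustrative:
+#
+#   with fluid.dygraph.guard():
+#       net = SuperMnasnet('mnasnet', input_channels=3, out_channels=1280)
+#       tokens = net.init_tokens()
+#       # out = net(image, tokens=tokens)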
+
+class DConvBlock(fluid.dygraph.Layer):
+ def __init__(self,
+ name_scope,
+ in_channels,
+ channels,
+ expansion,
+ stride,
+ kernel_size=3,
+ padding=1):
+ super(DConvBlock, self).__init__(name_scope)
+ self.expansion = expansion
+ self.in_channels = in_channels
+ self.channels = channels
+ self.stride = stride
+ self.flops = 0
+ self.flops_calculated = False
+ self.expand = fluid.dygraph.Conv2D(
+ in_channels,
+ num_filters=in_channels * expansion,
+ filter_size=1,
+ stride=1,
+ padding=0,
+ act=None,
+ bias_attr=False)
+ self.expand_bn = fluid.dygraph.BatchNorm(
+ num_channels=in_channels * expansion, act='relu6')
+
+ self.dconv = fluid.dygraph.Conv2D(
+ in_channels * expansion,
+ num_filters=in_channels * expansion,
+ filter_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ act=None,
+ bias_attr=False,
+ groups=in_channels * expansion,
+ use_cudnn=False)
+ self.dconv_bn = fluid.dygraph.BatchNorm(
+ num_channels=in_channels * expansion, act='relu6')
+
+ self.project = fluid.dygraph.Conv2D(
+ in_channels * expansion,
+ num_filters=channels,
+ filter_size=1,
+ stride=1,
+ padding=0,
+ act=None,
+ bias_attr=False)
+ self.project_bn = fluid.dygraph.BatchNorm(
+ num_channels=channels, act=None)
+
+ self.shortcut = fluid.dygraph.Conv2D(
+ in_channels,
+ num_filters=channels,
+ filter_size=1,
+ stride=1,
+ padding=0,
+ act=None,
+ bias_attr=False)
+ self.shortcut_bn = fluid.dygraph.BatchNorm(
+ num_channels=channels, act=None)
+
+ def get_flops(self, input, output, op):
+ if not self.flops_calculated:
+ flops = input.shape[1] * output.shape[1] * (
+ op._filter_size**2) * output.shape[2] * output.shape[3]
+ if op._groups:
+ flops /= op._groups
+ self.flops += flops
+
+ def forward(self, inputs):
+ expand_x = self.expand_bn(self.expand(inputs))
+ self.get_flops(inputs, expand_x, self.expand)
+ dconv_x = self.dconv_bn(self.dconv(expand_x))
+ self.get_flops(expand_x, dconv_x, self.dconv)
+ proj_x = self.project_bn(self.project(dconv_x))
+ self.get_flops(dconv_x, proj_x, self.project)
+ if self.in_channels != self.channels and self.stride == 1:
+ shortcut = self.shortcut_bn(self.shortcut(inputs))
+ self.get_flops(inputs, shortcut, self.shortcut)
+ elif self.stride == 1:
+ shortcut = inputs
+ self.flops_calculated = True
+ if self.stride == 1:
+ out = fluid.layers.elementwise_add(x=proj_x, y=shortcut)
+ return out
+ return proj_x
+
+
+class SearchBlock(fluid.dygraph.Layer):
+ def __init__(self,
+ name_scope,
+ in_channels,
+ channels,
+ stride,
+ kernel_size=3,
+ padding=1):
+ super(SearchBlock, self).__init__(name_scope)
+ self._stride = stride
+ self.block_list = []
+ self.flops = [0 for i in range(10)]
+ self.flops_calculated = [False if i < 6 else True for i in range(10)]
+ kernels = [3, 5, 7]
+ expansions = [3, 6]
+ for k in kernels:
+ for e in expansions:
+ self.block_list.append(
+ DConvBlock(self.full_name(), in_channels, channels, e,
+ stride, k, (k - 1) // 2))
+ self.add_sublayer("expansion_{}_kernel_{}".format(e, k),
+ self.block_list[-1])
+
+ def forward(self, inputs, arch):
+ if arch >= 6:
+ return inputs
+ out = self.block_list[arch](inputs)
+ if not self.flops_calculated[arch]:
+ self.flops[arch] = self.block_list[arch].flops
+ self.flops_calculated[arch] = True
+ return out
+
+
+class AuxiliaryHead(fluid.dygraph.Layer):
+ def __init__(self, name_scope, num_classes):
+ super(AuxiliaryHead, self).__init__(name_scope)
+
+ self.pool1 = fluid.dygraph.Pool2D(
+ 5, 'avg', pool_stride=3, pool_padding=0)
+ self.conv1 = fluid.dygraph.Conv2D(128, 1, bias_attr=False)
+ self.bn1 = fluid.dygraph.BatchNorm(128, act='relu6')
+ self.conv2 = fluid.dygraph.Conv2D(768, 2, bias_attr=False)
+ self.bn2 = fluid.dygraph.BatchNorm(768, act='relu6')
+ self.classifier = fluid.dygraph.FC(num_classes, act='softmax')
+ self.layer_helper = LayerHelper(self.full_name(), act='relu6')
+
+ def forward(self, inputs): #pylint: disable=arguments-differ
+ inputs = self.layer_helper.append_activation(inputs)
+ inputs = self.pool1(inputs)
+ inputs = self.conv1(inputs)
+ inputs = self.bn1(inputs)
+ inputs = self.conv2(inputs)
+ inputs = self.bn2(inputs)
+ inputs = self.classifier(inputs)
+ return inputs
+
+
+class SuperMnasnet(OneShotSuperNet):
+ def __init__(self,
+ name_scope,
+ input_channels=3,
+ out_channels=1280,
+ repeat_times=[6, 6, 6, 6, 6, 6],
+ stride=[1, 1, 1, 1, 2, 1],
+ channels=[16, 24, 40, 80, 96, 192, 320],
+ use_auxhead=False):
+ super(SuperMnasnet, self).__init__(name_scope)
+ self.flops = 0
+ self.repeat_times = repeat_times
+ self.flops_calculated = False
+ self.last_tokens = None
+ self._conv = fluid.dygraph.Conv2D(
+ input_channels, 32, 3, 1, 1, act=None, bias_attr=False)
+ self._bn = fluid.dygraph.BatchNorm(32, act='relu6')
+ self._sep_conv = fluid.dygraph.Conv2D(
+ 32,
+ 32,
+ 3,
+ 1,
+ 1,
+ groups=32,
+ act=None,
+ use_cudnn=False,
+ bias_attr=False)
+ self._sep_conv_bn = fluid.dygraph.BatchNorm(32, act='relu6')
+ self._sep_project = fluid.dygraph.Conv2D(
+ 32, 16, 1, 1, 0, act=None, bias_attr=False)
+ self._sep_project_bn = fluid.dygraph.BatchNorm(16, act='relu6')
+
+ self._final_conv = fluid.dygraph.Conv2D(
+ 320, out_channels, 1, 1, 0, act=None, bias_attr=False)
+ self._final_bn = fluid.dygraph.BatchNorm(out_channels, act='relu6')
+ self.stride = stride
+ self.block_list = []
+ self.use_auxhead = use_auxhead
+
+ for _iter, _stride in enumerate(self.stride):
+ repeat_block = []
+ for _ind in range(self.repeat_times[_iter]):
+ if _ind == 0:
+ block = SearchBlock(self.full_name(), channels[_iter],
+ channels[_iter + 1], _stride)
+ else:
+ block = SearchBlock(self.full_name(), channels[_iter + 1],
+ channels[_iter + 1], 1)
+ self.add_sublayer("block_{}_{}".format(_iter, _ind), block)
+ repeat_block.append(block)
+ self.block_list.append(repeat_block)
+ if self.use_auxhead:
+ self.auxhead = AuxiliaryHead(self.full_name(), 10)
+
+ def init_tokens(self):
+ return [
+ 3, 3, 6, 6, 6, 6, 3, 3, 3, 6, 6, 6, 3, 3, 3, 3, 6, 6, 3, 3, 3, 6,
+ 6, 6, 3, 3, 3, 6, 6, 6, 3, 6, 6, 6, 6, 6
+ ]
+
+ def range_table(self):
+ max_v = [
+ 6, 6, 10, 10, 10, 10, 6, 6, 6, 10, 10, 10, 6, 6, 6, 6, 10, 10, 6,
+ 6, 6, 10, 10, 10, 6, 6, 6, 10, 10, 10, 6, 10, 10, 10, 10, 10
+ ]
+ return (len(max_v) * [0], max_v)
+
+ def get_flops(self, input, output, op):
+ if not self.flops_calculated:
+ flops = input.shape[1] * output.shape[1] * (
+ op._filter_size**2) * output.shape[2] * output.shape[3]
+ if op._groups:
+ flops /= op._groups
+ self.flops += flops
+
+ def _forward_impl(self, inputs, tokens=None):
+ if (isinstance(tokens, np.ndarray) and
+ not (tokens == self.last_tokens).all()) or \
+ (not isinstance(tokens, np.ndarray) and
+ tokens != self.last_tokens):
+ self.flops_calculated = False
+ self.flops = 0
+ self.last_tokens = tokens
+ x = self._bn(self._conv(inputs))
+ self.get_flops(inputs, x, self._conv)
+ sep_x = self._sep_conv_bn(self._sep_conv(x))
+ self.get_flops(x, sep_x, self._sep_conv)
+ proj_x = self._sep_project_bn(self._sep_project(sep_x))
+ self.get_flops(sep_x, proj_x, self._sep_project)
+ x = proj_x
+ for ind in range(len(self.block_list)):
+ for b_ind, block in enumerate(self.block_list[ind]):
+ x = fluid.layers.dropout(block(x, tokens[ind * 6 + b_ind]), 0.)
+ if not self.flops_calculated:
+ self.flops += block.flops[tokens[ind * 6 + b_ind]]
+ if ind == len(self.block_list) * 2 // 3 - 1 and self.use_auxhead:
+ fc_aux = self.auxhead(x)
+ final_x = self._final_bn(self._final_conv(x))
+ self.get_flops(x, final_x, self._final_conv)
+ # x = self.global_pooling(final_x)
+ self.flops_calculated = True
+ if self.use_auxhead:
+ return final_x, fc_aux
+ return final_x
diff --git a/paddleslim/nas/rl_nas.py b/paddleslim/nas/rl_nas.py
new file mode 100644
index 0000000000000000000000000000000000000000..1718b8347def79a222c7dc016b2a2d021d380f36
--- /dev/null
+++ b/paddleslim/nas/rl_nas.py
@@ -0,0 +1,176 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import socket
+import logging
+import numpy as np
+import json
+import hashlib
+import time
+import paddle.fluid as fluid
+from ..common.rl_controller.utils import RLCONTROLLER
+from ..common import get_logger
+
+from ..common import Server
+from ..common import Client
+from .search_space import SearchSpaceFactory
+
+_logger = get_logger(__name__, level=logging.INFO)
+
+__all__ = ['RLNAS']
+
+
+class RLNAS(object):
+ """
+ Controller with Reinforcement Learning.
+ Args:
+ key(str): The reinforcement learning method to use. PaddleSlim currently supports
+ `LSTM` and `DDPG`.
+ configs(list): A list of search space configurations with format [(key, {input_size,
+ output_size, block_num, block_mask})]. `key` is the name of the search space
+ with data type str. `input_size` and `output_size` are the input size and
+ output size of the searched sub-network. `block_num` is the number of blocks
+ in the searched network. `block_mask` is a list consisting of 0s and 1s,
+ where 0 denotes a normal block and 1 a reduction block.
+ use_gpu(bool): Whether to use GPU in the controller. Default: False.
+ server_addr(tuple): Server address, including the ip and port of the server. If ip is
+ None or "", the host ip is used when is_server is True. Default: ("", 8881).
+ is_server(bool): Whether the current host is the controller server. Default: True.
+ is_sync(bool): Whether to update the controller in synchronous mode. Default: False.
+ save_controller(str|None): The directory in which to save the controller; if None, no
+ checkpoint is saved. Default: None.
+ load_controller(str|None): The directory from which to load the controller; if None, no
+ checkpoint is loaded. Default: None.
+ **kwargs: Additional keyword arguments.
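+
+ Example (a hedged sketch; the search-space entry and its sizes are illustrative):
+
+ from paddleslim.nas import RLNAS
+ rlnas = RLNAS(key='lstm', configs=[('MobileNetV2Space',
+ {'input_size': 224, 'output_size': 7, 'block_num': 5})])
+ archs = rlnas.next_archs()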
+ """
+
+ def __init__(self,
+ key,
+ configs,
+ use_gpu=False,
+ server_addr=("", 8881),
+ is_server=True,
+ is_sync=False,
+ save_controller=None,
+ load_controller=None,
+ **kwargs):
+ if not is_server:
+ assert server_addr[
+ 0] != "", "You should set the IP and port of server when is_server is False."
+
+ self._configs = configs
+ factory = SearchSpaceFactory()
+ self._search_space = factory.get_search_space(configs)
+ self.range_tables = self._search_space.range_table()
+ self.save_controller = save_controller
+ self.load_controller = load_controller
+
+ if key.upper() in ['DDPG']:
+ try:
+ import parl
+ except ImportError as e:
+ _logger.error(
+ "If you want to use DDPG in RLNAS, please pip install parl first. Now states: {}".
+ format(e))
+ os._exit(1)
+
+ cls = RLCONTROLLER.get(key.upper())
+
+ server_ip, server_port = server_addr
+ if server_ip is None or server_ip == "":
+ server_ip = self._get_host_ip()
+
+ self._controller = cls(range_tables=self.range_tables,
+ use_gpu=use_gpu,
+ **kwargs)
+
+ if is_server:
+ max_client_num = 300
+ self._controller_server = Server(
+ controller=self._controller,
+ address=(server_ip, server_port),
+ is_sync=is_sync,
+ save_controller=self.save_controller,
+ load_controller=self.load_controller)
+ self._controller_server.start()
+
+ self._client_name = hashlib.md5(
+ str(time.time() + np.random.randint(1, 10000)).encode(
+ "utf-8")).hexdigest()
+ self._controller_client = Client(
+ controller=self._controller,
+ address=(server_ip, server_port),
+ client_name=self._client_name)
+
+ self._current_tokens = None
+
+ def _get_host_ip(self):
+ try:
+ return socket.gethostbyname(socket.gethostname())
+ except socket.error:
+ return socket.gethostbyname('localhost')
+
+ def next_archs(self, obs=None):
+ """
+ Get the next batch of architectures.
+ Args:
+ obs(int|np.array): Observations from the environment.
+ """
+ archs = []
+ self._current_tokens = self._controller_client.next_tokens(obs)
+ _logger.info("current tokens: {}".format(self._current_tokens))
+ for token in self._current_tokens:
+ archs.append(self._search_space.token2arch(token))
+
+ return archs
+
+ @property
+ def tokens(self):
+ return self._current_tokens
+
+ def reward(self, rewards, **kwargs):
+ """
+ Pass rewards back to train the controller.
+ Args:
+ rewards(float|list): Rewards obtained by the current tokens.
+ **kwargs: Additional keyword arguments.
+ """
+ return self._controller_client.update(rewards, **kwargs)
+
+ def final_archs(self, batch_obs):
+ """
+ Get the final architectures.
+ Args:
+ batch_obs(int|np.array): Observations from the environment.
+ """
+ final_tokens = self._controller_client.next_tokens(
+ batch_obs, is_inference=True)
+ self._current_tokens = final_tokens
+ _logger.info("Final tokens: {}".format(final_tokens))
+ archs = []
+ for token in final_tokens:
+ arch = self._search_space.token2arch(token)
+ archs.append(arch)
+
+ return archs
+
+ def tokens2arch(self, tokens):
+ """
+ Convert tokens to model architectures.
+ Args:
+ tokens(list): A list of tokens whose length and range depend on the search space.
+ Returns:
+ list: Model architecture instances generated from the tokens.
+ """
+ return self._search_space.token2arch(tokens)
diff --git a/paddleslim/nas/sa_nas.py b/paddleslim/nas/sa_nas.py
index 73a8ecf6cf45ee7f32503dd204a3f739ca01cde6..60472cd6d94efcba79eef99e9fc96b37ca1ccf39 100644
--- a/paddleslim/nas/sa_nas.py
+++ b/paddleslim/nas/sa_nas.py
@@ -18,8 +18,8 @@ import logging
import numpy as np
import json
import hashlib
+import time
import paddle.fluid as fluid
-from ..core import VarWrapper, OpWrapper, GraphWrapper
from ..common import SAController
from ..common import get_logger
from ..analysis import flops
@@ -34,29 +34,76 @@ _logger = get_logger(__name__, level=logging.INFO)
class SANAS(object):
+ """
+ SANAS (Simulated Annealing Neural Architecture Search) is a neural architecture search
+ algorithm based on simulated annealing, generally used for discrete search tasks.
+
+ Args:
+ configs(list): A list of search space configurations with format [(key, {input_size,
+ output_size, block_num, block_mask})]. `key` is the name of the search space
+ with data type str. `input_size` and `output_size` are the input size and
+ output size of the searched sub-network. `block_num` is the number of blocks
+ in the searched network. `block_mask` is a list consisting of 0s and 1s,
+ where 0 denotes a normal block and 1 a reduction block.
+ server_addr(tuple): Server address, including the ip and port of the server. If ip is
+ None or "", the host ip is used when is_server is True. Default: ("", 8881).
+ init_temperature(float): Initial temperature in SANAS. If both init_temperature and
+ init_tokens are None, the default initial temperature is 10.0; if
+ init_temperature is None and init_tokens is not None, the default initial
+ temperature is 1.0. See the Note below for details. Default: None.
+ reduce_rate(float): Reduce rate in SANAS. See the Note below for details. Default: 0.85.
+ search_steps(int): The number of search steps. Default: 300.
+ init_tokens(list|None): Initial tokens. If init_tokens is None, SANAS randomly generates
+ the initial tokens. Default: None.
+ save_checkpoint(string|None): The directory in which to save checkpoints; if None, no
+ checkpoint is saved. Default: 'nas_checkpoint'.
+ load_checkpoint(string|None): The directory from which to load a checkpoint; if None, no
+ checkpoint is loaded. Default: None.
+ is_server(bool): Whether current host is controller server. Default: True.
+
+ .. note::
+ - Why the initial temperature and reduce rate matter:
+
+ - The SA algorithm keeps a base token (initially the init token, which can be set by
+ yourself or generated randomly) and a base score (initially -1); the next token is
+ generated from the base token. During the search, if the score obtained by the model
+ corresponding to the current token is greater than the base score stored in SA, the
+ current token always replaces the base token; if it is lower, the current token
+ replaces the base token only with a certain probability. Roughly, a worse token is
+ accepted with probability exp((current_score - base_score) / temperature), and the
+ temperature is decayed by the reduce rate as the search proceeds.
+ - A higher initial temperature makes the search more unstable: SA is very likely to
+ accept the current token as the base token even when its score is lower than the
+ stored base score.
+ - A lower initial temperature makes the search more stable: SA is unlikely to accept
+ the current token as the base token when its score is lower than the stored base score.
+ - A higher reduce rate means slower convergence of the SA algorithm.
+ - A lower reduce rate means faster convergence of the SA algorithm.
+
+ - How to set the initial temperature and reduce rate:
+
+ - If you have a good initial token and want to search starting from it, we suggest
+ starting the experiment in the steady state of the SA algorithm: set the initial
+ temperature to a small value such as 1.0 and the reduce rate to a large value such
+ as 0.85. To search from the good token with a greedy strategy, which replaces the
+ base token only when the current score is higher than the stored base score, set
+ the reduce rate to an extremely small value, such as 0.85 ** 10.
+
+ - If the initial token is generated randomly, it is likely a poor token, so we suggest
+ starting the experiment in the unstable state of the SA algorithm to explore random
+ tokens as much as possible and reach a better token: set the initial temperature to
+ a high value such as 1000.0 and the reduce rate to a small value.
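+
+ A hedged usage sketch (the search-space entry and its sizes are illustrative):
+
+ from paddleslim.nas import SANAS
+ sanas = SANAS(configs=[('MobileNetV2Space',
+ {'input_size': 224, 'output_size': 7, 'block_num': 5})])
+ archs = sanas.next_archs()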
+ """
+
def __init__(self,
configs,
server_addr=("", 8881),
- init_temperature=100,
+ init_temperature=None,
reduce_rate=0.85,
search_steps=300,
+ init_tokens=None,
save_checkpoint='nas_checkpoint',
load_checkpoint=None,
- is_server=False):
- """
- Search a group of ratios used to prune program.
- Args:
- configs(list): A list of search space configuration with format [(key, {input_size, output_size, block_num, block_mask})].
- `key` is the name of search space with data type str. `input_size` and `output_size` are
- input size and output size of searched sub-network. `block_num` is the number of blocks in searched network, `block_mask` is a list consists by 0 and 1, 0 means normal block, 1 means reduction block.
- server_addr(tuple): A tuple of server ip and server port for controller server.
- init_temperature(float): The init temperature used in simulated annealing search strategy.
- reduce_rate(float): The decay rate used in simulated annealing search strategy.
- search_steps(int): The steps of searching.
- save_checkpoint(string|None): The directory of checkpoint to save, if set to None, not save checkpoint. Default: 'nas_checkpoint'.
- load_checkpoint(string|None): The directory of checkpoint to load, if set to None, not load checkpoint. Default: None.
- is_server(bool): Whether current host is controller server. Default: True.
- """
+ is_server=True):
if not is_server:
assert server_addr[
0] != "", "You should set the IP and port of server when is_server is False."
@@ -64,18 +111,23 @@ class SANAS(object):
self._init_temperature = init_temperature
self._is_server = is_server
self._configs = configs
- self._key = hashlib.md5(str(self._configs).encode("utf-8")).hexdigest()
+ self._init_tokens = init_tokens
+ self._client_name = hashlib.md5(
+ str(time.time() + np.random.randint(1, 10000)).encode(
+ "utf-8")).hexdigest()
+ self._key = str(self._configs)
+ self._current_tokens = init_tokens
- server_ip, server_port = server_addr
- if server_ip == None or server_ip == "":
- server_ip = self._get_host_ip()
+ self._server_ip, self._server_port = server_addr
+ if self._server_ip is None or self._server_ip == "":
+ self._server_ip = self._get_host_ip()
factory = SearchSpaceFactory()
self._search_space = factory.get_search_space(configs)
# create controller server
if self._is_server:
- init_tokens = self._search_space.init_tokens()
+ init_tokens = self._search_space.init_tokens(self._init_tokens)
range_table = self._search_space.range_table()
range_table = (len(range_table) * [0], range_table)
_logger.info("range table: {}".format(range_table))
@@ -106,7 +158,7 @@ class SANAS(object):
range_table,
self._reduce_rate,
self._init_temperature,
- max_try_times=500,
+ max_try_times=50000,
init_tokens=preinit_tokens,
reward=prereward,
max_reward=premax_reward,
@@ -119,7 +171,7 @@ class SANAS(object):
max_client_num = 100
self._controller_server = ControllerServer(
controller=self._controller,
- address=(server_ip, server_port),
+ address=(self._server_ip, self._server_port),
max_client_num=max_client_num,
search_steps=search_steps,
key=self._key)
@@ -127,7 +179,10 @@ class SANAS(object):
server_port = self._controller_server.port()
self._controller_client = ControllerClient(
- server_ip, server_port, key=self._key)
+ self._server_ip,
+ self._server_port,
+ key=self._key,
+ client_name=self._client_name)
if is_server and load_checkpoint != None:
self._iter = scene['_iter']
@@ -135,9 +190,19 @@ class SANAS(object):
self._iter = 0
def _get_host_ip(self):
- return socket.gethostbyname(socket.gethostname())
+ try:
+ return socket.gethostbyname(socket.gethostname())
+ except socket.error:
+ return socket.gethostbyname('localhost')
def tokens2arch(self, tokens):
+ """
+ Convert tokens to model architectures.
+ Args:
+ tokens(list): A list of tokens whose length and range depend on the search space.
+ Returns:
+ list: Model architecture instances generated from the tokens.
+ """
return self._search_space.token2arch(tokens)
def current_info(self):
@@ -146,19 +211,17 @@ class SANAS(object):
Returns:
dict: a dictionary include best tokens, best reward and current reward.
"""
- current_dict = dict()
- current_dict['best_tokens'] = self._controller.best_tokens
- current_dict['best_reward'] = self._controller.max_reward
- current_dict['current_tokens'] = self._controller.current_tokens
+ current_dict = self._controller_client.request_current_info()
return current_dict
def next_archs(self):
"""
- Get next network architectures.
+ Get next model architectures.
Returns:
- list: A list of functions that define networks.
+ list: A list of model architecture instances.
"""
self._current_tokens = self._controller_client.next_tokens()
+ _logger.info("current tokens: {}".format(self._current_tokens))
archs = self._search_space.token2arch(self._current_tokens)
return archs
@@ -166,7 +229,7 @@ class SANAS(object):
"""
Return reward of current searched network.
Args:
- score(float): The score of current searched network.
+ score(float): The score of the current searched network; higher is better.
Returns:
bool: True means updating successfully while false means failure.
"""
diff --git a/paddleslim/nas/search_space/__init__.py b/paddleslim/nas/search_space/__init__.py
index 9556c61917406ab461fb7a0dbb071b864c5ab357..ba72463f020da484fedb75d8f27443347cbd086c 100644
--- a/paddleslim/nas/search_space/__init__.py
+++ b/paddleslim/nas/search_space/__init__.py
@@ -18,12 +18,12 @@ from .resnet import ResNetSpace
from .mobilenet_block import MobileNetV1BlockSpace, MobileNetV2BlockSpace
from .resnet_block import ResNetBlockSpace
from .inception_block import InceptionABlockSpace, InceptionCBlockSpace
+from .darts_space import DartsSpace
from .search_space_registry import SEARCHSPACE
from .search_space_factory import SearchSpaceFactory
from .search_space_base import SearchSpaceBase
-
__all__ = [
- 'MobileNetV1Space', 'MobileNetV2Space', 'ResNetSpace',
+ 'MobileNetV1Space', 'MobileNetV2Space', 'ResNetSpace', 'DartsSpace',
'MobileNetV1BlockSpace', 'MobileNetV2BlockSpace', 'ResNetBlockSpace',
'InceptionABlockSpace', 'InceptionCBlockSpace', 'SearchSpaceBase',
'SearchSpaceFactory', 'SEARCHSPACE'
diff --git a/paddleslim/nas/search_space/base_layer.py b/paddleslim/nas/search_space/base_layer.py
index b497c92a2ca57b4acab0c39c5dbd69d30083e295..76d7e3303727c1ae5d2bfe8683a571b5bf89c85e 100644
--- a/paddleslim/nas/search_space/base_layer.py
+++ b/paddleslim/nas/search_space/base_layer.py
@@ -19,7 +19,7 @@ from paddle.fluid.param_attr import ParamAttr
def conv_bn_layer(input,
filter_size,
num_filters,
- stride,
+ stride=1,
padding='SAME',
num_groups=1,
act=None,
@@ -52,9 +52,9 @@ def conv_bn_layer(input,
bias_attr=False)
bn_name = name + '_bn'
return fluid.layers.batch_norm(
- input=conv,
- act = act,
- param_attr=ParamAttr(name=bn_name + '_scale'),
- bias_attr=ParamAttr(name=bn_name + '_offset'),
- moving_mean_name=bn_name + '_mean',
- moving_variance_name=bn_name + '_variance')
+ input=conv,
+ act=act,
+ param_attr=ParamAttr(name=bn_name + '_scale'),
+ bias_attr=ParamAttr(name=bn_name + '_offset'),
+ moving_mean_name=bn_name + '_mean',
+ moving_variance_name=bn_name + '_variance')
diff --git a/paddleslim/nas/search_space/combine_search_space.py b/paddleslim/nas/search_space/combine_search_space.py
index 7bb66c00c663cfba75dcc429e5ca53270e58bed7..b6458e4cd1a013a696a7dd605a54913f0d48dac2 100644
--- a/paddleslim/nas/search_space/combine_search_space.py
+++ b/paddleslim/nas/search_space/combine_search_space.py
@@ -97,16 +97,19 @@ class CombineSearchSpace(object):
space = cls(input_size, output_size, block_num, block_mask=block_mask)
return space
- def init_tokens(self):
+ def init_tokens(self, tokens=None):
"""
Combine init tokens.
"""
- tokens = []
- self.single_token_num = []
- for space in self.spaces:
- tokens.extend(space.init_tokens())
- self.single_token_num.append(len(space.init_tokens()))
- return tokens
+ if tokens is None:
+ tokens = []
+ self.single_token_num = []
+ for space in self.spaces:
+ tokens.extend(space.init_tokens())
+ self.single_token_num.append(len(space.init_tokens()))
+ return tokens
def range_table(self):
"""
diff --git a/paddleslim/nas/search_space/darts_space.py b/paddleslim/nas/search_space/darts_space.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae2c5e3ec1fe6dfb94e38f84f74923ef92d70de7
--- /dev/null
+++ b/paddleslim/nas/search_space/darts_space.py
@@ -0,0 +1,626 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.initializer import UniformInitializer, ConstantInitializer
+from .search_space_base import SearchSpaceBase
+from .base_layer import conv_bn_layer
+from .search_space_registry import SEARCHSPACE
+
+
+@SEARCHSPACE.register
+class DartsSpace(SearchSpaceBase):
+ def __init__(self, input_size, output_size, block_num, block_mask):
+ super(DartsSpace, self).__init__(input_size, output_size, block_num,
+ block_mask)
+ self.filter_num = np.array(
+ [4, 8, 12, 16, 20, 36, 54, 72, 90, 108, 144, 180, 216, 252])
+
+ def init_tokens(self):
+ return [5] * 6 + [7] * 7 + [10] * 7
+
+ def range_table(self):
+ return [len(self.filter_num)] * 20
+
+ def token2arch(self, tokens=None):
+ if tokens is None:
+ tokens = self.init_tokens()
+
+ self.bottleneck_params_list = []
+ reduction_count = 0
+ for i in range(3):
+ for j in range(6):
+ block_idx = i * 6 + j + reduction_count
+ self.bottleneck_params_list.append(
+ (self.filter_num[tokens[block_idx]], 1))
+ if i < 2:
+ reduction_count += 1
+ block_idx = i * 6 + j + reduction_count
+ self.bottleneck_params_list.append(
+ (self.filter_num[tokens[block_idx]], 2))
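+ # NOTE: the 20 tokens cover 18 normal cells (6 per stage) plus 2
+ # reduction cells, one inserted after each of the first two stages.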
+
+ def net_arch(input, drop_prob, drop_path_mask, is_train, num_classes):
+ c_in = 36
+ stem_multiplier = 3
+ c_curr = stem_multiplier * c_in
+ x = self._conv_bn(
+ input,
+ c_curr,
+ kernel_size=3,
+ padding=1,
+ stride=1,
+ name='cifar10_darts_conv0')
+ s0 = s1 = x
+
+ logits_aux = None
+ reduction_prev = False
+
+ for i, layer_setting in enumerate(self.bottleneck_params_list):
+ filter_num, stride = layer_setting[0], layer_setting[1]
+ if stride == 2:
+ reduction = True
+ else:
+ reduction = False
+
+ if is_train:
+ drop_path_cell = drop_path_mask[:, i, :, :]
+ else:
+ drop_path_cell = drop_path_mask
+
+ s0, s1 = s1, self._cell(
+ s0,
+ s1,
+ filter_num,
+ stride,
+ reduction_prev,
+ drop_prob,
+ drop_path_cell,
+ is_train,
+ name='cifar10_darts_layer{}'.format(i + 1))
+ reduction_prev = reduction
+
+ if i == 2 * 20 // 3:
+ if is_train:
+ logits_aux = self._auxiliary_cifar(
+ s1, num_classes,
+ "cifar10_darts_/l" + str(i) + "/aux")
+
+ logits = self._classifier(s1, num_classes, name='cifar10_darts')
+
+ return logits, logits_aux
+
+ return net_arch
+
+ def _classifier(self, x, num_classes, name):
+ out = fluid.layers.pool2d(x, pool_type='avg', global_pooling=True)
+ out = fluid.layers.squeeze(out, axes=[2, 3])
+ k = (1. / out.shape[1])**0.5
+ out = fluid.layers.fc(out,
+ num_classes,
+ param_attr=fluid.ParamAttr(
+ name=name + "/fc_weights",
+ initializer=UniformInitializer(
+ low=-k, high=k)),
+ bias_attr=fluid.ParamAttr(
+ name=name + "/fc_bias",
+ initializer=UniformInitializer(
+ low=-k, high=k)))
+ return out
+
+ def _auxiliary_cifar(self, x, num_classes, name):
+ x = fluid.layers.relu(x)
+ pooled = fluid.layers.pool2d(
+ x, pool_size=5, pool_stride=3, pool_padding=0, pool_type='avg')
+ conv1 = self._conv_bn(
+ x=pooled,
+ c_out=128,
+ kernel_size=1,
+ padding=0,
+ stride=1,
+ name=name + '/conv_bn1')
+ conv1 = fluid.layers.relu(conv1)
+ conv2 = self._conv_bn(
+ x=conv1,
+ c_out=768,
+ kernel_size=2,
+ padding=0,
+ stride=1,
+ name=name + '/conv_bn2')
+ conv2 = fluid.layers.relu(conv2)
+ out = self._classifier(conv2, num_classes, name)
+ return out
+
+ def _cell(self,
+ s0,
+ s1,
+ filter_num,
+ stride,
+ reduction_prev,
+ drop_prob,
+ drop_path_cell,
+ is_train,
+ name=None):
+ if reduction_prev:
+ s0 = self._factorized_reduce(s0, filter_num, name=name + '/s-2')
+ else:
+ s0 = self._relu_conv_bn(
+ s0, filter_num, 1, 1, 0, name=name + '/s-2')
+ s1 = self._relu_conv_bn(s1, filter_num, 1, 1, 0, name=name + '/s-1')
+
+ if stride == 1:
+ out = self._normal_cell(
+ s0,
+ s1,
+ filter_num,
+ drop_prob,
+ drop_path_cell,
+ is_train,
+ name=name)
+ else:
+ out = self._reduction_cell(
+ s0,
+ s1,
+ filter_num,
+ drop_prob,
+ drop_path_cell,
+ is_train,
+ name=name)
+ return out
+
+ def _normal_cell(self,
+ s0,
+ s1,
+ filter_num,
+ drop_prob,
+ drop_path_cell,
+ is_train,
+ name=None):
+ hidden0_0 = self._dil_conv(
+ s0,
+ c_out=filter_num,
+ kernel_size=3,
+ stride=1,
+ padding=2,
+ dilation=2,
+ affine=True,
+ name=name + '_normal_cell_hidden0_0')
+ hidden0_1 = self._sep_conv(
+ s1,
+ c_out=filter_num,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ affine=True,
+ name=name + '_normal_cell_hidden0_1')
+
+ if is_train:
+ hidden0_0 = self._drop_path(
+ hidden0_0,
+ drop_prob,
+ drop_path_cell[:, 0, 0],
+ name=name + '_normal_cell_hidden0_0')
+ hidden0_1 = self._drop_path(
+ hidden0_1,
+ drop_prob,
+ drop_path_cell[:, 0, 1],
+ name=name + '_normal_cell_hidden0_1')
+ n0 = hidden0_0 + hidden0_1
+
+ hidden1_0 = self._sep_conv(
+ s0,
+ c_out=filter_num,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ affine=True,
+ name=name + '_normal_cell_hidden1_0')
+ hidden1_1 = self._sep_conv(
+ s1,
+ c_out=filter_num,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ affine=True,
+ name=name + '_normal_cell_hidden1_1')
+ if is_train:
+ hidden1_0 = self._drop_path(
+ hidden1_0,
+ drop_prob,
+ drop_path_cell[:, 1, 0],
+ name=name + '_normal_cell_hidden1_0')
+ hidden1_1 = self._drop_path(
+ hidden1_1,
+ drop_prob,
+ drop_path_cell[:, 1, 1],
+ name=name + '_normal_cell_hidden1_1')
+ n1 = hidden1_0 + hidden1_1
+
+ hidden2_0 = self._sep_conv(
+ s0,
+ c_out=filter_num,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ affine=True,
+ name=name + '_normal_cell_hidden2_0')
+ hidden2_1 = self._sep_conv(
+ s1,
+ c_out=filter_num,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ affine=True,
+ name=name + '_normal_cell_hidden2_1')
+ if is_train:
+ hidden2_0 = self._drop_path(
+ hidden2_0,
+ drop_prob,
+ drop_path_cell[:, 2, 0],
+ name=name + '_normal_cell_hidden2_0')
+ hidden2_1 = self._drop_path(
+ hidden2_1,
+ drop_prob,
+ drop_path_cell[:, 2, 1],
+ name=name + '_normal_cell_hidden2_1')
+ n2 = hidden2_0 + hidden2_1
+
+ ### skip connect => identity
+ hidden3_0 = s0
+ hidden3_1 = self._sep_conv(
+ s1,
+ c_out=filter_num,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ affine=True,
+ name=name + '_normal_cell_hidden3_1')
+ if is_train:
+ hidden3_1 = self._drop_path(
+ hidden3_1,
+ drop_prob,
+ drop_path_cell[:, 3, 1],
+ name=name + '_normal_cell_hidden3_1')
+ n3 = hidden3_0 + hidden3_1
+
+ out = fluid.layers.concat(
+ input=[n0, n1, n2, n3], axis=1, name=name + '_normal_cell_concat')
+ return out
+
+ def _reduction_cell(self,
+ s0,
+ s1,
+ filter_num,
+ drop_prob,
+ drop_path_cell,
+ is_train,
+ name=None):
+ hidden0_0 = fluid.layers.pool2d(
+ input=s0,
+ pool_size=3,
+ pool_type="max",
+ pool_stride=2,
+ pool_padding=1,
+ name=name + '_reduction_cell_hidden0_0')
+ hidden0_1 = self._factorized_reduce(
+ s1,
+ filter_num,
+ affine=True,
+ name=name + '_reduction_cell_hidden0_1')
+ if is_train:
+ hidden0_0 = self._drop_path(
+ hidden0_0,
+ drop_prob,
+ drop_path_cell[:, 0, 0],
+ name=name + '_reduction_cell_hidden0_0')
+ r0 = hidden0_0 + hidden0_1
+
+ hidden1_0 = fluid.layers.pool2d(
+ input=s1,
+ pool_size=3,
+ pool_type="max",
+ pool_stride=2,
+ pool_padding=1,
+ name=name + '_reduction_cell_hidden1_0')
+ hidden1_1 = r0
+ if is_train:
+ hidden1_0 = self._drop_path(
+ hidden1_0,
+ drop_prob,
+ drop_path_cell[:, 1, 0],
+ name=name + '_reduction_cell_hidden1_0')
+ r1 = hidden1_0 + hidden1_1
+
+ hidden2_0 = r0
+ hidden2_1 = self._dil_conv(
+ r1,
+ c_out=filter_num,
+ kernel_size=5,
+ stride=1,
+ padding=4,
+ dilation=2,
+ affine=True,
+ name=name + '_reduction_cell_hidden2_1')
+ if is_train:
+ hidden2_1 = self._drop_path(
+ hidden2_1,
+ drop_prob,
+ drop_path_cell[:, 2, 0],
+ name=name + '_reduction_cell_hidden2_1')
+ r2 = hidden2_0 + hidden2_1
+
+ hidden3_0 = r0
+ hidden3_1 = fluid.layers.pool2d(
+ input=s1,
+ pool_size=3,
+ pool_type="max",
+ pool_stride=2,
+ pool_padding=1,
+ name=name + '_reduction_cell_hidden3_1')
+ if is_train:
+ hidden3_1 = self._drop_path(
+ hidden3_1,
+ drop_prob,
+ drop_path_cell[:, 3, 0],
+ name=name + '_reduction_cell_hidden3_1')
+ r3 = hidden3_0 + hidden3_1
+
+ out = fluid.layers.concat(
+ input=[r0, r1, r2, r3],
+ axis=1,
+ name=name + '_reduction_cell_concat')
+ return out
+
+ def _conv_bn(self, x, c_out, kernel_size, padding, stride, name):
+ k = (1. / x.shape[1] / kernel_size / kernel_size)**0.5
+ conv1 = fluid.layers.conv2d(
+ x,
+ c_out,
+ kernel_size,
+ stride=stride,
+ padding=padding,
+ param_attr=fluid.ParamAttr(
+ name=name + "/conv",
+ initializer=UniformInitializer(
+ low=-k, high=k)),
+ bias_attr=False)
+ bn1 = fluid.layers.batch_norm(
+ conv1,
+ param_attr=fluid.ParamAttr(
+ name=name + "/bn_scale",
+ initializer=ConstantInitializer(value=1)),
+ bias_attr=fluid.ParamAttr(
+ name=name + "/bn_offset",
+ initializer=ConstantInitializer(value=0)),
+ moving_mean_name=name + "/bn_mean",
+ moving_variance_name=name + "/bn_variance")
+ return bn1
+
+ def _sep_conv(self,
+ x,
+ c_out,
+ kernel_size,
+ stride,
+ padding,
+ affine=True,
+ name=''):
+ c_in = x.shape[1]
+ x = fluid.layers.relu(x)
+ k = (1. / x.shape[1] / kernel_size / kernel_size)**0.5
+ x = fluid.layers.conv2d(
+ x,
+ c_in,
+ kernel_size,
+ stride=stride,
+ padding=padding,
+ groups=c_in,
+ use_cudnn=False,
+ param_attr=fluid.ParamAttr(
+ name=name + "/sep_conv_1_1",
+ initializer=UniformInitializer(
+ low=-k, high=k)),
+ bias_attr=False)
+ k = (1. / x.shape[1] / 1 / 1)**0.5
+ x = fluid.layers.conv2d(
+ x,
+ c_in,
+ 1,
+ padding=0,
+ param_attr=fluid.ParamAttr(
+ name=name + "/sep_conv_1_2",
+ initializer=UniformInitializer(
+ low=-k, high=k)),
+ bias_attr=False)
+ gama, beta = self._bn_param_config(name, affine, "sep_conv_bn1")
+ x = fluid.layers.batch_norm(
+ x,
+ param_attr=gama,
+ bias_attr=beta,
+ moving_mean_name=name + "/sep_bn1_mean",
+ moving_variance_name=name + "/sep_bn1_variance")
+
+ x = fluid.layers.relu(x)
+ k = (1. / x.shape[1] / kernel_size / kernel_size)**0.5
+ c_in = x.shape[1]
+ x = fluid.layers.conv2d(
+ x,
+ c_in,
+ kernel_size,
+ stride=1,
+ padding=padding,
+ groups=c_in,
+ use_cudnn=False,
+ param_attr=fluid.ParamAttr(
+ name=name + "/sep_conv2_1",
+ initializer=UniformInitializer(
+ low=-k, high=k)),
+ bias_attr=False)
+ k = (1. / x.shape[1] / 1 / 1)**0.5
+ x = fluid.layers.conv2d(
+ x,
+ c_out,
+ 1,
+ padding=0,
+ param_attr=fluid.ParamAttr(
+ name=name + "/sep_conv2_2",
+ initializer=UniformInitializer(
+ low=-k, high=k)),
+ bias_attr=False)
+ gama, beta = self._bn_param_config(name, affine, "sep_conv_bn2")
+ x = fluid.layers.batch_norm(
+ x,
+ param_attr=gama,
+ bias_attr=beta,
+ moving_mean_name=name + "/sep_bn2_mean",
+ moving_variance_name=name + "/sep_bn2_variance")
+ return x
+
+ def _dil_conv(self,
+ x,
+ c_out,
+ kernel_size,
+ stride,
+ padding,
+ dilation,
+ affine=True,
+ name=''):
+ c_in = x.shape[1]
+ x = fluid.layers.relu(x)
+ k = (1. / x.shape[1] / kernel_size / kernel_size)**0.5
+ x = fluid.layers.conv2d(
+ x,
+ c_in,
+ kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ groups=c_in,
+ use_cudnn=False,
+ param_attr=fluid.ParamAttr(
+ name=name + "/dil_conv1",
+ initializer=UniformInitializer(
+ low=-k, high=k)),
+ bias_attr=False)
+ k = (1. / x.shape[1] / 1 / 1)**0.5
+ x = fluid.layers.conv2d(
+ x,
+ c_out,
+ 1,
+ padding=0,
+ param_attr=fluid.ParamAttr(
+ name=name + "/dil_conv2",
+ initializer=UniformInitializer(
+ low=-k, high=k)),
+ bias_attr=False)
+ gama, beta = self._bn_param_config(name, affine, "dil_conv_bn")
+ x = fluid.layers.batch_norm(
+ x,
+ param_attr=gama,
+ bias_attr=beta,
+ moving_mean_name=name + "/dil_bn_mean",
+ moving_variance_name=name + "/dil_bn_variance")
+ return x
+
+ def _factorized_reduce(self, x, c_out, affine=True, name=''):
+ assert c_out % 2 == 0
+ x = fluid.layers.relu(x)
+ x_sliced = x[:, :, 1:, 1:]
+ k = (1. / x.shape[1] / 1 / 1)**0.5
+ conv1 = fluid.layers.conv2d(
+ x,
+ c_out // 2,
+ 1,
+ stride=2,
+ param_attr=fluid.ParamAttr(
+ name=name + "/fr_conv1",
+ initializer=UniformInitializer(
+ low=-k, high=k)),
+ bias_attr=False)
+ k = (1. / x_sliced.shape[1] / 1 / 1)**0.5
+ conv2 = fluid.layers.conv2d(
+ x_sliced,
+ c_out // 2,
+ 1,
+ stride=2,
+ param_attr=fluid.ParamAttr(
+ name=name + "/fr_conv2",
+ initializer=UniformInitializer(
+ low=-k, high=k)),
+ bias_attr=False)
+ x = fluid.layers.concat(input=[conv1, conv2], axis=1)
+ gama, beta = self._bn_param_config(name, affine, "fr_bn")
+ x = fluid.layers.batch_norm(
+ x,
+ param_attr=gama,
+ bias_attr=beta,
+ moving_mean_name=name + "/fr_mean",
+ moving_variance_name=name + "/fr_variance")
+ return x
+
+ def _relu_conv_bn(self,
+ x,
+ c_out,
+ kernel_size,
+ stride,
+ padding,
+ affine=True,
+ name=''):
+ x = fluid.layers.relu(x)
+ k = (1. / x.shape[1] / kernel_size / kernel_size)**0.5
+ x = fluid.layers.conv2d(
+ x,
+ c_out,
+ kernel_size,
+ stride=stride,
+ padding=padding,
+ param_attr=fluid.ParamAttr(
+ name=name + "/rcb_conv",
+ initializer=UniformInitializer(
+ low=-k, high=k)),
+ bias_attr=False)
+ gama, beta = self._bn_param_config(name, affine, "rcb_bn")
+ x = fluid.layers.batch_norm(
+ x,
+ param_attr=gama,
+ bias_attr=beta,
+ moving_mean_name=name + "/rcb_mean",
+ moving_variance_name=name + "/rcb_variance")
+ return x
+
+ def _bn_param_config(self, name='', affine=False, op=None):
+ gama_name = name + "/" + str(op) + "/gama"
+ beta_name = name + "/" + str(op) + "/beta"
+ gama = ParamAttr(
+ name=gama_name,
+ initializer=ConstantInitializer(value=1),
+ trainable=affine)
+ beta = ParamAttr(
+ name=beta_name,
+ initializer=ConstantInitializer(value=0),
+ trainable=affine)
+ return gama, beta
+
+ def _drop_path(self, x, drop_prob, mask, name=None):
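+        # Drop-path regularization: `mask` is a per-sample 0/1 tensor sampled
+        # by the caller; surviving paths are rescaled by 1/keep_prob so the
+        # expected activation magnitude stays unchanged.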
+ keep_prob = 1 - drop_prob[0]
+ x = fluid.layers.elementwise_mul(
+ x / keep_prob,
+ mask,
+ axis=0,
+ name=name + '_drop_path_elementwise_mul')
+ return x
diff --git a/paddleslim/nas/search_space/inception_block.py b/paddleslim/nas/search_space/inception_block.py
index 5b022d85eb9b4d80c3377bf074d645553bf67f80..1d9a2bd35c58fe72d875a68eba57c1cbd9d55193 100644
--- a/paddleslim/nas/search_space/inception_block.py
+++ b/paddleslim/nas/search_space/inception_block.py
@@ -22,7 +22,7 @@ from paddle.fluid.param_attr import ParamAttr
from .search_space_base import SearchSpaceBase
from .base_layer import conv_bn_layer
from .search_space_registry import SEARCHSPACE
-from .utils import compute_downsample_num, check_points
+from .utils import compute_downsample_num, check_points, get_random_tokens
__all__ = ["InceptionABlockSpace", "InceptionCBlockSpace"]
### TODO add asymmetric kernel of conv when paddle-lite support
@@ -58,10 +58,7 @@ class InceptionABlockSpace(SearchSpaceBase):
"""
The initial token.
"""
- if self.block_mask != None:
- return [0] * (len(self.block_mask) * 9)
- else:
- return [0] * (self.block_num * 9)
+ return get_random_tokens(self.range_table())
def range_table(self):
"""
@@ -178,7 +175,7 @@ class InceptionABlockSpace(SearchSpaceBase):
input = self._inceptionA(
input,
A_tokens=filter_nums,
- filter_size=filter_size,
+ filter_size=int(filter_size),
stride=stride,
pool_type=pool_type,
name='inceptionA_{}'.format(i + 1))
@@ -290,10 +287,7 @@ class InceptionCBlockSpace(SearchSpaceBase):
"""
The initial token.
"""
- if self.block_mask != None:
- return [0] * (len(self.block_mask) * 11)
- else:
- return [0] * (self.block_num * 11)
+ return get_random_tokens(self.range_table())
def range_table(self):
"""
@@ -414,13 +408,13 @@ class InceptionCBlockSpace(SearchSpaceBase):
pool_type = 'avg' if layer_setting[11] == 0 else 'max'
if stride == 2:
layer_count += 1
- if check_points((layer_count - 1) in return_block):
+ if check_points((layer_count - 1), return_block):
mid_layer[layer_count - 1] = input
input = self._inceptionC(
input,
C_tokens=filter_nums,
- filter_size=filter_size,
+ filter_size=int(filter_size),
stride=stride,
pool_type=pool_type,
name='inceptionC_{}'.format(i + 1))
diff --git a/paddleslim/nas/search_space/mobilenet_block.py b/paddleslim/nas/search_space/mobilenet_block.py
index 76597e3cdc0f2d613f39a51ed4dae81719c3ae78..84464ba3de7de8074ab4f3a72392eb3da290f401 100644
--- a/paddleslim/nas/search_space/mobilenet_block.py
+++ b/paddleslim/nas/search_space/mobilenet_block.py
@@ -22,7 +22,7 @@ from paddle.fluid.param_attr import ParamAttr
from .search_space_base import SearchSpaceBase
from .base_layer import conv_bn_layer
from .search_space_registry import SEARCHSPACE
-from .utils import compute_downsample_num, check_points
+from .utils import compute_downsample_num, check_points, get_random_tokens
__all__ = ["MobileNetV1BlockSpace", "MobileNetV2BlockSpace"]
@@ -60,10 +60,7 @@ class MobileNetV2BlockSpace(SearchSpaceBase):
self.scale = scale
def init_tokens(self):
- if self.block_mask != None:
- return [0] * (len(self.block_mask) * 4)
- else:
- return [0] * (self.block_num * 4)
+ return get_random_tokens(self.range_table())
def range_table(self):
range_table_base = []
@@ -156,7 +153,7 @@ class MobileNetV2BlockSpace(SearchSpaceBase):
c=int(c * self.scale),
n=n,
s=s,
- k=k,
+ k=int(k),
name='mobilenetv2_' + str(i + 1))
in_c = int(c * self.scale)
@@ -292,9 +289,11 @@ class MobileNetV1BlockSpace(SearchSpaceBase):
scale=1.0):
super(MobileNetV1BlockSpace, self).__init__(input_size, output_size,
block_num, block_mask)
- # use input_size and output_size to compute self.downsample_num
- self.downsample_num = compute_downsample_num(self.input_size,
- self.output_size)
+
+ if self.block_mask == None:
+ # use input_size and output_size to compute self.downsample_num
+ self.downsample_num = compute_downsample_num(self.input_size,
+ self.output_size)
if self.block_num != None:
assert self.downsample_num <= self.block_num, 'downsample numeber must be LESS THAN OR EQUAL TO block_num, but NOW: downsample numeber is {}, block_num is {}'.format(
self.downsample_num, self.block_num)
@@ -308,10 +307,7 @@ class MobileNetV1BlockSpace(SearchSpaceBase):
self.scale = scale
def init_tokens(self):
- if self.block_mask != None:
- return [0] * (len(self.block_mask) * 3)
- else:
- return [0] * (self.block_num * 3)
+ return get_random_tokens(self.range_table())
def range_table(self):
range_table_base = []
@@ -389,7 +385,7 @@ class MobileNetV1BlockSpace(SearchSpaceBase):
num_filters2=filter_num2,
stride=stride,
scale=self.scale,
- kernel_size=kernel_size,
+ kernel_size=int(kernel_size),
name='mobilenetv1_{}'.format(str(i + 1)))
if return_mid_layer:
diff --git a/paddleslim/nas/search_space/mobilenetv1.py b/paddleslim/nas/search_space/mobilenetv1.py
index 4a931c6f0013c8c3dd0ef4a7bc47fd2d6da718ad..28928a8cc1d399da4f271e7a638d972034ef9fd8 100644
--- a/paddleslim/nas/search_space/mobilenetv1.py
+++ b/paddleslim/nas/search_space/mobilenetv1.py
@@ -191,7 +191,7 @@ class MobileNetV1Space(SearchSpaceBase):
num_groups=filter_num1,
stride=stride,
scale=self.scale,
- kernel_size=kernel_size,
+ kernel_size=int(kernel_size),
name='mobilenetv1_{}'.format(str(i + 1)))
### return_block and end_points means block num
diff --git a/paddleslim/nas/search_space/mobilenetv2.py b/paddleslim/nas/search_space/mobilenetv2.py
index 5b8aa4feab1e5fbd5397272d4c6899f33e600ffd..6e2ffb21f863f9b219ff4bf73f9c535fc4194f21 100644
--- a/paddleslim/nas/search_space/mobilenetv2.py
+++ b/paddleslim/nas/search_space/mobilenetv2.py
@@ -182,7 +182,7 @@ class MobileNetV2Space(SearchSpaceBase):
c=int(c * self.scale),
n=n,
s=s,
- k=k,
+ k=int(k),
name='mobilenetv2_conv' + str(i))
in_c = int(c * self.scale)
diff --git a/paddleslim/nas/search_space/resnet.py b/paddleslim/nas/search_space/resnet.py
index 97cf1ffbe2f759bfbb65b8197df49e1d9698e8e7..9d4f1f50f77e4c43dcd17bf7b98f4b2861c00a76 100644
--- a/paddleslim/nas/search_space/resnet.py
+++ b/paddleslim/nas/search_space/resnet.py
@@ -22,7 +22,7 @@ from paddle.fluid.param_attr import ParamAttr
from .search_space_base import SearchSpaceBase
from .base_layer import conv_bn_layer
from .search_space_registry import SEARCHSPACE
-from .utils import check_points
+from .utils import check_points, get_random_tokens
__all__ = ["ResNetSpace"]
@@ -47,8 +47,7 @@ class ResNetSpace(SearchSpaceBase):
"""
The initial token.
"""
- init_token_base = [0, 0, 0, 0, 0, 0, 0, 0]
- return init_token_base
+ return [1, 1, 2, 2, 3, 4, 3, 1]
def range_table(self):
"""
diff --git a/paddleslim/nas/search_space/resnet_block.py b/paddleslim/nas/search_space/resnet_block.py
index 64646a3863af86afd8ca3578dfb07fb59f26e4db..5a32add4383900957ffd37cadf30fcba6d194668 100644
--- a/paddleslim/nas/search_space/resnet_block.py
+++ b/paddleslim/nas/search_space/resnet_block.py
@@ -22,7 +22,7 @@ from paddle.fluid.param_attr import ParamAttr
from .search_space_base import SearchSpaceBase
from .base_layer import conv_bn_layer
from .search_space_registry import SEARCHSPACE
-from .utils import compute_downsample_num, check_points
+from .utils import compute_downsample_num, check_points, get_random_tokens
__all__ = ["ResNetBlockSpace"]
@@ -32,22 +32,20 @@ class ResNetBlockSpace(SearchSpaceBase):
def __init__(self, input_size, output_size, block_num, block_mask=None):
super(ResNetBlockSpace, self).__init__(input_size, output_size,
block_num, block_mask)
- # use input_size and output_size to compute self.downsample_num
- self.downsample_num = compute_downsample_num(self.input_size,
- self.output_size)
+ if self.block_mask == None:
+ # use input_size and output_size to compute self.downsample_num
+ self.downsample_num = compute_downsample_num(self.input_size,
+ self.output_size)
if self.block_num != None:
assert self.downsample_num <= self.block_num, 'downsample numeber must be LESS THAN OR EQUAL TO block_num, but NOW: downsample numeber is {}, block_num is {}'.format(
self.downsample_num, self.block_num)
self.filter_num = np.array(
[48, 64, 96, 128, 160, 192, 224, 256, 320, 384, 512, 640])
- self.repeat = np.array([0, 1, 2])
+ self.repeat = np.array([0, 1, 2, 3, 4, 6, 7, 8, 10, 12, 14, 16])
self.k_size = np.array([3, 5])
def init_tokens(self):
- if self.block_mask != None:
- return [0] * (len(self.block_mask) * 6)
- else:
- return [0] * (self.block_num * 6)
+ return get_random_tokens(self.range_table())
def range_table(self):
range_table_base = []
@@ -136,7 +134,7 @@ class ResNetBlockSpace(SearchSpaceBase):
num_filters1=filter_num1,
num_filters2=filter_num3,
num_filters3=filter_num3,
- kernel_size=k_size,
+ kernel_size=int(k_size),
repeat1=repeat1,
repeat2=repeat2,
stride=stride,
diff --git a/paddleslim/nas/search_space/search_space_base.py b/paddleslim/nas/search_space/search_space_base.py
index 9dee1431d34afb2411747affc542e82ca099d4d7..af4d4a1d6a25c4b93754f5ff7a32186cc90eca00 100644
--- a/paddleslim/nas/search_space/search_space_base.py
+++ b/paddleslim/nas/search_space/search_space_base.py
@@ -19,6 +19,7 @@ __all__ = ['SearchSpaceBase']
_logger = get_logger(__name__, level=logging.INFO)
+
class SearchSpaceBase(object):
"""Controller for Neural Architecture Search.
"""
@@ -56,3 +57,7 @@ class SearchSpaceBase(object):
model arch
"""
raise NotImplementedError('Abstract method.')
+
+ def super_net(self):
+ """This function is just used in one shot NAS strategy. Return a super graph."""
+ raise NotImplementedError('Abstract method.')
diff --git a/paddleslim/nas/search_space/utils.py b/paddleslim/nas/search_space/utils.py
index c76a48cc5d3cbbc2858dd91479f7913c49d4081a..255488d4fb86913348d3314af3866fdf9d0611ba 100644
--- a/paddleslim/nas/search_space/utils.py
+++ b/paddleslim/nas/search_space/utils.py
@@ -13,6 +13,7 @@
# limitations under the License.
import math
+import numpy as np
def compute_downsample_num(input_size, output_size):
@@ -36,3 +37,11 @@ def check_points(count, points):
return (True if count in points else False)
else:
return (True if count == points else False)
+
+
+def get_random_tokens(range_table):
+    """Sample a random token in [0, max_value) for each position in range_table."""
+    tokens = []
+    for max_value in range_table:
+        tokens.append(int(np.floor(max_value * np.random.rand())))
+    return tokens
diff --git a/paddleslim/pantheon/README.md b/paddleslim/pantheon/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7cd109280d43e0813863fe4ebdab20f5d023b402
--- /dev/null
+++ b/paddleslim/pantheon/README.md
@@ -0,0 +1,206 @@
+# Pantheon: Paddle large-scale scalable knowledge distillation framework
+
+Pantheon is a universal solution for knowledge distillation in Paddle Fluid. Its design takes into account many possible behaviors of teacher models. Every teacher and student model in Pantheon runs in a separate process, and they communicate with each other via local files or TCP/IP ports. Knowledge can easily be transferred to the student model from a single teacher model or from an ensemble of multiple teacher models, where each teacher model can work in online or offline mode independently. Pantheon also provides a highly optimized interface for the large-scale prediction of teacher models. Benefiting from the low coupling between teachers and the student, users can allocate computation resources to the different roles according to their computational complexity, and build a large-scale, practical knowledge distillation system on Pantheon.
+
+The illustration below shows an application of Pantheon, where the student model is trained with knowledge from multiple online teachers. These teachers may run on the same node as the student model but on different devices, or on different nodes altogether, as long as they can communicate with each other over the network. The student model can send queries to the teachers, which take these queries as input and generate streaming knowledge data in return. Alternatively, in a simpler setup, the student model can read the training data in the **same order** as the teachers, avoiding the need to send queries.
+
+
+![Pantheon architecture](images/pantheon_arch.png)
+
+*The architecture of an online knowledge distillation system based on Pantheon*
+
+
+## Prerequisites
+
+- Python 2.7.x or 3.x
+- PaddlePaddle >= 1.7.0
+- System: MacOS/Linux
+
+## APIs
+
+Pantheon defines two classes **Teacher** and **Student** for the communication and knowledge transfer between teacher and student.
+
+- **Teacher**: used by the teacher model. It can receive queries from the student and write out the knowledge of the teacher model via a TCP/IP port (online mode) or into a local file (offline mode).
+- **Student**: used by the student model. It can receive and merge the knowledge from teachers, and feed the student model along with local data for training.
+
+Usually, the public methods of these two classes work in pairs. Their mapping relations and the supported working modes are listed in the following table.
+
+
+| Teacher | Student | Static graph | Dynamic graph | Online mode | Offline mode | Remarks |
+|:---|:---|:---:|:---:|:---:|:---:|:---:|
+| `__init__(out_path=None, out_port=None)` | `__init__(merge_strategy=None)` | ✅ | ✅ | ✅ | ✅ | [1] |
+| | `register_teacher(in_path=None, in_address=None)` | ✅ | ✅ | ✅ | ✅ | [2] |
+| `start()` | `start()` | ✅ | ✅ | ✅ | ✅ | [3] |
+| `send(data)` | `recv(teacher_id)` | ✅ | ✅ | ✅ | | [4] |
+| `recv()` | `send(data, teacher_ids=None)` | ✅ | ✅ | ✅ | | [5] |
+| `dump(knowledge)` | | ✅ | ✅ | | ✅ | [6] |
+| `start_knowledge_service(feed_list, schema, program, reader_config, exe, buf_size=10, use_fp16=False, times=1)` | `get_knowledge_desc()` | ✅ | | ✅ | ✅ | [7] |
+| | `get_knowledge_qsize()` | ✅ | | ✅ | ✅ | |
+| | `get_knowledge_generator(batch_size, drop_last=False)` | ✅ | | ✅ | ✅ | |
+
+**Remarks:**
+
+ - [1] Declare the teacher object for the teacher model with **out\_path** or **out\_port**, and the student object for the student model with a **merge\_strategy** for the knowledge from different teachers.
+ - [2] Register a teacher, and allocate an id for it starting from zero in the order of registration. **register\_teacher()** can be called multiple times for the multiple-teacher mode.
+ - [3] Establish the TCP/IP link between teachers and the student, and synchronize all of them.
+ - [4] Send one data object from teacher to student.
+ - [5] Send one data object from student to teacher.
+ - [6] Dump one batch of knowledge data into the output file.
+ - [7] Highly optimized high-level interfaces to build a service for knowledge transfer:
+   - **start\_knowledge\_service()** can perform large-scale prediction of the teacher model on multiple devices;
+   - Supports auto-merging of knowledge from different teachers;
+   - Supports auto-reconnection of student and teachers.
+
+### About the data format
+
+- **Knowledge**: A dictionary whose keys are specified by users and whose values are numpy ndarray tensors predicted by teacher models. The first dimension of each tensor should be the batch size, and LoDTensor is not supported yet. One can call **get\_knowledge\_desc()** to get the description of the knowledge, which is also a dictionary, covering the shape, data type and LoD level of the knowledge data.
+- **Offline knowledge file**: The first record is the knowledge description, and the following records are knowledge data, one record per batch of samples, all serialized with pickle.
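+
+For illustration, an offline knowledge file in this format could be inspected with a sketch like the following, where the file name `teacher_knowledge.dat` is just a placeholder:
+
+```python
+import pickle
+
+with open("teacher_knowledge.dat", "rb") as f:
+    desc = pickle.load(f)  # the first record is the knowledge description
+    print(desc)
+    while True:
+        try:
+            batch = pickle.load(f)  # each following record is one batch of knowledge
+        except EOFError:
+            break
+```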
+
+
+
+### Usage
+
+Once separately runnable teacher models and the student model are
+ready, one can build a trainable knowledge distillation system
+in two simple steps.
+
+1) Instantiate a **Teacher** object for the teacher model, and launch knowledge serving
+
+```python
+
+from paddleslim.pantheon import Teacher
+...
+
+teacher = Teacher(out_path=args.out_path, out_port=args.out_port)
+teacher.start()
+
+teacher.start_knowledge_service(
+ feed_list=[inp_x.name],
+ schema={"x": inp_x,
+ "y": y},
+ program=program,
+ reader_config={"batch_generator": batch_generator},
+ exe=exe,
+ buf_size=100,
+ times=1)
+```
+
+2) Instantiate a **Student** object, specify the way to merge knowledge, register teachers,
+ and get the knowledge description and the data generator for the student model
+
+```python
+from paddleslim.pantheon import Student
+...
+
+student = Student(merge_strategy={"result": "sum"})
+
+student.register_teacher(
+ in_address=args.in_address0, in_path=args.in_path0)
+student.register_teacher(
+ in_address=args.in_address1, in_path=args.in_path1)
+student.start()
+
+knowledge_desc = student.get_knowledge_desc()
+data_generator = student.get_knowledge_generator(
+ batch_size=32, drop_last=False)
+```
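+
+As a minimal sketch (not part of the library), the generator obtained above could be consumed in the student's training loop as follows, reusing the `"result"` schema from the snippet above; the distillation step itself is a placeholder:
+
+```python
+for knowledge in data_generator():
+    # each batch is a dict mapping schema names to numpy arrays,
+    # consistent with knowledge_desc
+    soft_labels = knowledge["result"]
+    # feed soft_labels together with the local training data into
+    # the student's distillation loss here
+```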
+
+## Examples
+
+### Toy Example
+
+A toy example is provided to show how knowledge data is transferred from teachers to the student model and merged, covering the offline mode, the online mode, and their hybrid. See [demo/pantheon/toy](../../demo/pantheon/toy).
diff --git a/paddleslim/pantheon/__init__.py b/paddleslim/pantheon/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcc99e781b6d2ae6fa921ef65636817560e98d37
--- /dev/null
+++ b/paddleslim/pantheon/__init__.py
@@ -0,0 +1,4 @@
+from . import teacher
+from . import student
+from .teacher import Teacher
+from .student import Student
+
+__all__ = teacher.__all__ + student.__all__
diff --git a/paddleslim/pantheon/images/pantheon_arch.png b/paddleslim/pantheon/images/pantheon_arch.png
new file mode 100644
index 0000000000000000000000000000000000000000..d88fc11f144e284fc64674a22cfd2554845b0176
Binary files /dev/null and b/paddleslim/pantheon/images/pantheon_arch.png differ
diff --git a/paddleslim/pantheon/student.py b/paddleslim/pantheon/student.py
new file mode 100644
index 0000000000000000000000000000000000000000..72bdd5e140871964495783814718622599f39c9e
--- /dev/null
+++ b/paddleslim/pantheon/student.py
@@ -0,0 +1,596 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import six
+import time
+if six.PY2:
+ import cPickle as pickle
+ import Queue
+else:
+ import pickle
+ import queue as Queue
+
+import numpy as np
+from collections import OrderedDict
+from multiprocessing import Process, Manager
+from multiprocessing.managers import BaseManager
+
+from threading import Thread
+
+from paddleslim.pantheon.utils import EndSignal, SyncSignal, StartSignal, public_authkey, convert_dtype
+
+__all__ = ["Student"]
+
+
+class Student(object):
+ """
+ The class defined for the student model. Receive knowledge data from
+ teacher model and carry out knowledge merging.
+
+ Args:
+ merge_strategy (dict|None): A dictionary whose keys are common
+ schemas shared by different teachers, and each corresponding
+ value specifies the merging strategy for different schemas
+ respectively, supporting 'sum' and 'mean' now.
+ """
+
+ def __init__(self, merge_strategy=None):
+ if merge_strategy:
+ for strategy in merge_strategy.values():
+ if strategy not in ["sum", "mean"]:
+ raise ValueError(
+ "Merging strategy must be 'sum' or 'mean'!")
+
+ self._merge_strategy = merge_strategy
+ self._common_schema = merge_strategy.keys() if merge_strategy else []
+
+ self._knowledge_desc = OrderedDict()
+ self._knowledge_queue = Queue.Queue(100)
+ self._teacher_knowledge_queues = []
+ self._t2s_queues = []
+ self._s2t_queues = []
+ self._cmd_queues = []
+
+ self._num_teachers = 0
+
+ self._in_paths = []
+ self._in_addresses = []
+
+ self._started = False
+ self._is_knowledge_desc_ready = False
+ self._is_knowledge_gen_locked = False
+
+ def register_teacher(self, in_path=None, in_address=None):
+ """Register one teacher model and assign the order number to it as
+ its id, with the file path (offline mode) or IP address (online
+ mode) that the teacher model wrote knowledge data to.
+
+ Args:
+ in_path (str|None): The input file path. Default None.
+ in_address (str|None): The input IP address, in the format
+ ":" (e.g. "127.0.0.1:8080"). Default None.
+ """
+ if self._started:
+ raise ValueError(
+ "The student has been started and cannot register "
+ "teacher no longer!")
+ if in_path and in_address:
+ raise ValueError("Input path and input address should not "
+ "be given at the same time!")
+ if not in_path and not in_address:
+ raise ValueError("One of input path and input address should "
+ "be given when registering teacher!")
+ if in_address:
+ if in_address in self._in_addresses:
+ print("WARNING: the teacher with input address {} has been "
+ "registered, and ignored this time!".format(in_path))
+ return
+ ip, port = in_address.strip().split(":")
+ BaseManager.register("get_knowledge_queue")
+ BaseManager.register("get_s2t_queue")
+ BaseManager.register("get_t2s_queue")
+ BaseManager.register("get_cmd_queue")
+ manager = BaseManager(
+ address=(ip, int(port)), authkey=public_authkey.encode())
+
+ print("Connecting to {}, with public key {} ...".format(
+ in_address, public_authkey))
+            # Wait until the teacher model has started, so the connection can be established
+ while True:
+ try:
+ manager.connect()
+ break
+ except:
+ time.sleep(1.0)
+
+ def merge(knowledge_queues):
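+                # A teacher may expose several knowledge queues (one per
+                # post-processing thread); receive from all of them and gather
+                # the batches into a single queue in round-robin order.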
+ num = len(knowledge_queues)
+ if num == 1:
+ return knowledge_queues[0]
+ local_queues = [Queue.Queue(100) for _ in range(num)]
+
+ def receive(queue, local_queue):
+ while True:
+ try:
+ data = queue.get()
+ queue.task_done()
+ local_queue.put(data)
+ except EOFError:
+ break
+
+ knowledge_queue = Queue.Queue(100)
+
+ def gather(local_queues, knowledge_queue):
+ num = len(local_queues)
+ end_received = [0] * num
+ while True:
+ try:
+ for i in range(num):
+ data = local_queues[i].get()
+ local_queues[i].task_done()
+
+ if isinstance(data, SyncSignal):
+ if i == 0:
+ knowledge_queue.put(data)
+ elif isinstance(data, EndSignal):
+ end_received[i] = 1
+ if i == 0:
+ knowledge_queue.put(data)
+ if sum(end_received) == num:
+ end_received = [0] * num
+ break
+ else:
+ knowledge_queue.put(data)
+ except EOFError:
+ break
+
+ # threads to receive knowledge from the online teacher
+ for i in range(num):
+ p = Thread(
+ target=receive,
+ args=(knowledge_queues[i], local_queues[i]))
+ p.daemon = True
+ p.start()
+ # thread to gather data from different local queues
+ p = Thread(target=gather, args=(local_queues, knowledge_queue))
+ p.daemon = True
+ p.start()
+ return knowledge_queue
+
+ # get knowledge queues
+ knowledge_queues, idx = [], 0
+ while True:
+ q = manager.get_knowledge_queue(idx)
+ if hasattr(q, "get"):
+ knowledge_queues.append(q)
+ idx += 1
+ else:
+ break
+ knowledge_queue = merge(knowledge_queues)
+ self._t2s_queues.append(manager.get_t2s_queue())
+ self._s2t_queues.append(manager.get_s2t_queue())
+ self._cmd_queues.append(manager.get_cmd_queue())
+ self._in_addresses.append(in_address)
+ self._in_paths.append(None)
+ print("Registered teacher {} with input address {}.".format(
+ self._num_teachers, in_address))
+ else:
+ if in_path in self._in_paths:
+ print("WARNING: th teacher with input path {} has been "
+ "registered, and ignored this time!".format(in_path))
+ return
+
+ def read_offline(in_path, cmd_queue, out_queue):
+ end_recved = False
+
+ def get_cmd():
+ cmd, end_recved = None, False
+ try:
+ if not cmd_queue.empty():
+ cmd = cmd_queue.get()
+ cmd_queue.task_done()
+ if isinstance(cmd, EndSignal):
+ end_recved = True
+ except IOError:
+ end_recved = True
+ return cmd, end_recved
+
+ # wait for the sync in start
+ while not end_recved:
+ cmd, end_recved = get_cmd()
+ if isinstance(cmd, SyncSignal):
+ out_queue.put(SyncSignal())
+ break
+ # for multiple-times offline serving
+ while not end_recved:
+ # wait for the sync in get_knowledge_desc()
+ while not end_recved:
+ cmd, end_recved = get_cmd()
+ if isinstance(cmd, SyncSignal):
+ out_queue.put(SyncSignal())
+ break
+
+ if end_recved:
+ break
+ with open(in_path, 'rb') as fin:
+ # get knowledge desc
+ desc = pickle.load(fin)
+ out_queue.put(desc)
+ # wait for the data accessing signal
+ while not end_recved:
+ cmd, end_recved = get_cmd()
+ if isinstance(cmd, StartSignal):
+ break
+ # get knowledge data
+ while not end_recved:
+ try:
+ data = pickle.load(fin)
+ out_queue.put(data)
+ _, end_recved = get_cmd()
+ except EOFError:
+ break
+ if end_recved:
+ break
+ out_queue.put(EndSignal())
+ out_queue.join()
+
+ knowledge_queue = Queue.Queue(100)
+ cmd_queue = Queue.Queue(5)
+ p = Thread(
+ target=read_offline,
+ args=(in_path, cmd_queue, knowledge_queue))
+ p.daemon = True
+ p.start()
+
+ self._t2s_queues.append(None)
+ self._s2t_queues.append(None)
+ self._cmd_queues.append(cmd_queue)
+ self._in_addresses.append(None)
+ self._in_paths.append(in_path)
+ print("Registered teacher {} with input path {}.".format(
+ self._num_teachers, in_path))
+
+ self._teacher_knowledge_queues.append(knowledge_queue)
+ self._num_teachers += 1
+
+ def _sync(self):
+ for i, queue in enumerate(self._cmd_queues):
+ if queue:
+ queue.put(SyncSignal())
+ while True:
+ cmd = self._teacher_knowledge_queues[i].get()
+ self._teacher_knowledge_queues[i].task_done()
+ if isinstance(cmd, SyncSignal):
+ break
+ queue.join()
+
+ def start(self):
+ """
+ End teachers' registration and synchronize with all of them.
+ """
+
+ if self._started:
+ raise ValueError(
+ "The student cannot be started more than one time.")
+ self._sync()
+ self._started = True
+
+ def _merge_knowledge(self, knowledge):
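+        # `knowledge` maps each schema name to the list of tensors collected
+        # from the teachers sharing that schema; reduce each list according
+        # to the merge strategy ('sum' or 'mean') given at construction.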
+ for k, tensors in list(knowledge.items()):
+ if len(tensors) == 0:
+ del knowledge[k]
+ elif len(tensors) == 1:
+ knowledge[k] = tensors[0]
+ else:
+ result = 0
+ for tensor in tensors:
+ result += tensor
+ if self._merge_strategy[k] == "sum":
+ knowledge[k] = result
+ elif self._merge_strategy[k] == "mean":
+ knowledge[k] = result / len(tensors)
+ # cast back to original data type if necessary
+ tgt_dtype = self._knowledge_desc[k]["dtype"]
+ if str(knowledge[k].dtype) != tgt_dtype:
+ knowledge[k] = knowledge[k].astype(tgt_dtype)
+ return knowledge
+
+ def send(self, data, teacher_ids=None):
+ """
+ Send data to teachers.
+
+ Args:
+ data: A Python data object.
+ teacher_ids (list|None): A list of teacher ids to send data. If
+ set to None, send the data to all teachers. Default None.
+ """
+ if not self._started:
+ raise ValueError("The method start() should be called first!")
+
+ if teacher_ids is None:
+ teacher_ids = range(self._num_teachers)
+
+ for i in teacher_ids:
+ if self._s2t_queues[i]:
+ self._s2t_queues[i].put(data)
+ else:
+ print("Warning: didn't send data to teacher {} for it is in "
+ "offline mode.".format(i))
+
+ def recv(self, teacher_id):
+ """
+ Receive data from one teacher.
+
+ Args:
+ teacher_id (int): The id of teacher that receives data from.
+
+ Return:
+ The received data object.
+ """
+ if not self._started:
+ raise ValueError("The method start() should be called first!")
+
+ if self._t2s_queues[teacher_id]:
+ data = self._t2s_queues[teacher_id].get()
+ self._t2s_queues[teacher_id].task_done()
+ return data
+ else:
+ raise ValueError("Cannot receive data from teacher {} for it is "
+ "offline.".format(teacher_id))
+
+ def get_knowledge_desc(self):
+ """
+ Get description for knowledge, including shape, data type and lod
+ level for each schema.
+
+ Return:
+ dict: Knowledge description.
+ """
+ if not self._started:
+ raise ValueError("The method start() should be called first!")
+
+ if self._is_knowledge_desc_ready == False:
+ self._sync()
+ # get knowledge description
+ knowledge_desc = OrderedDict()
+ for idx, queue in enumerate(self._teacher_knowledge_queues):
+ desc = queue.get()
+ queue.task_done()
+ inter_desc = set(knowledge_desc.keys()) & set(desc.keys())
+ if idx > 0 and (
+ not inter_desc.issubset(set(self._common_schema))):
+ raise ValueError(
+ "Teacher {} has the same schema with other existed "
+ "teachers not in the merge_strategy.".format(idx))
+ knowledge_desc.update(desc)
+
+ print("Knowledge merging strategy: {}".format(
+ self._merge_strategy))
+ print("Knowledge description after merging:")
+ for schema, desc in list(knowledge_desc.items()):
+ print("{}: {}".format(schema, desc))
+
+ self._knowledge_desc = knowledge_desc
+ self._is_knowledge_desc_ready = True
+ return self._knowledge_desc
+
+ def get_knowledge_qsize(self):
+ """
+ Get the real-time size of knowledge queue. If this size is denoted as
+ **qsize**, it means that there are **qsize** batch knowledge data
+ already pushed into knowledge queue and waiting for the knowledge
+ generator to pop out. It's dynamic and limited up to 100, the capacity
+ of the knowledge queue.
+
+ Return:
+ int: The real-time size of knowledge queue.
+ """
+ if not self._started:
+ raise ValueError("The method start() should be called first!")
+
+ return self._knowledge_queue.qsize()
+
+ def get_knowledge_generator(self, batch_size, drop_last=False):
+ """
+        Get the generator for knowledge data. Return None if the last
+        generator hasn't finished yielding all its data yet.
+
+ Args:
+ batch_size (int): The batch size of returned knowledge data.
+ drop_last (bool): Whether to drop the last batch if its size is less
+ than batch size.
+
+ Return:
+ func: The wrapper of knowledge data generator.
+ """
+ if not self._started:
+ raise ValueError("The method start() should be called first!")
+
+ if batch_size <= 0:
+ raise ValueError("batch size must be positive!")
+ self._batch_size = batch_size
+ self._drop_last = drop_last
+
+ # make sure only one generator is available at the same time
+ if self._is_knowledge_gen_locked:
+ print("WARNING: new knowledge generator is not available for the "
+ "last generator hasn't finished yielding all data yet! "
+ "Return None.")
+ return None
+ self._is_knowledge_gen_locked = True
+ self.get_knowledge_desc()
+
+ def split_batch(batch, num):
+ keys = batch.keys()
+ first, second = {}, {}
+ for key in keys:
+ first[key] = batch[key][0:num]
+ second[key] = batch[key][num:]
+ return first, second
+
+ def concat_batches(batches):
+ if len(batches) == 1:
+ return batches[0]
+ keys = batches[0].keys()
+ ret_batch = {}
+ for key in keys:
+ ret_batch[key] = np.concatenate(
+ [batches[i][key] for i in range(len(batches))])
+ return ret_batch
+
+ def listen(knowledge_queue, out_queue):
+ """
+ listen on the knowledge queue for one teacher, get knowledge data
+ and put it into a local queue (out_queue).
+ """
+ while True:
+ data = knowledge_queue.get()
+ knowledge_queue.task_done()
+ out_queue.put(data)
+ if isinstance(data, EndSignal):
+ break
+
+ def make_new_batch(in_queue, out_queue, batch_size):
+ """
+ Get knowledge data from a local queue and make a new batch data in
+ the batch size of student, then put it into the intermediate
+ queue (out_queue).
+ """
+ batches, num_samples = [], 0
+ while True:
+ batch_samples = in_queue.get()
+ in_queue.task_done()
+ if not isinstance(batch_samples, EndSignal):
+ cur_num_samples = list(batch_samples.values())[0].shape[0]
+ if num_samples + cur_num_samples < batch_size:
+ batches.append(batch_samples)
+ num_samples += cur_num_samples
+ elif num_samples + cur_num_samples == batch_size:
+ batches.append(batch_samples)
+ out_queue.put(concat_batches(batches))
+ batches, num_samples = [], 0
+ else:
+ num_splited = batch_size - num_samples
+ first, second = split_batch(batch_samples, num_splited)
+ batches.append(first)
+ out_queue.put(concat_batches(batches))
+ num_left = cur_num_samples - num_splited
+ while num_left > batch_size:
+ first, second = split_batch(second, batch_size)
+ out_queue.put(first)
+ num_left -= batch_size
+
+ if num_left == batch_size:
+ out_queue.put(second)
+ batches, num_samples = [], 0
+ else:
+ batches, num_samples = [second], num_left
+ else:
+ if len(batches) > 0:
+ out_queue.put(concat_batches(batches))
+ out_queue.put(EndSignal())
+ break
+
+ def gather_and_merge(in_queues, out_queue):
+ """
+ Gather knowledge from all intermediate queues, merge them
+ and put the final knowledge into the knowledge queue to
+ student (out_queue).
+ """
+
+ def data_receiver(queue):
+ while True:
+ batch = queue.get()
+ queue.task_done()
+ yield batch
+ if isinstance(batch, EndSignal):
+ break
+
+ data_receivers = [data_receiver(queue) for queue in in_queues]
+
+ end_received = [0] * len(in_queues)
+ while True:
+ knowledge = OrderedDict(
+ [(k, []) for k, v in list(self._knowledge_desc.items())])
+ for idx, receiver in enumerate(data_receivers):
+ if not end_received[idx]:
+ batch_samples = receiver.next(
+ ) if six.PY2 else receiver.__next__()
+ if not isinstance(batch_samples, EndSignal):
+ for k, v in list(batch_samples.items()):
+ knowledge[k].append(v)
+ else:
+ end_received[idx] = 1
+ if sum(end_received) == len(in_queues):
+ break
+ knowledge = self._merge_knowledge(knowledge)
+ out_queue.put(knowledge)
+ out_queue.put(EndSignal())
+ out_queue.join()
+
+ # acquire data from teachers
+ for i, queue in enumerate(self._cmd_queues):
+ if queue:
+ queue.put(StartSignal())
+ queue.join()
+
+ local_queues = [Queue.Queue(100) for i in range(self._num_teachers)]
+ # launch threads to listen on all knowledge queues
+ for i in range(self._num_teachers):
+ listen_thread = Thread(
+ target=listen,
+ args=(self._teacher_knowledge_queues[i], local_queues[i]))
+            listen_thread.daemon = True
+ listen_thread.start()
+
+ med_queues = [Queue.Queue(100) for i in range(self._num_teachers)]
+ # launch threads to make new batch for student
+ for i in range(self._num_teachers):
+ listen_thread = Thread(
+ target=make_new_batch,
+ args=(local_queues[i], med_queues[i], self._batch_size))
+            listen_thread.daemon = True
+ listen_thread.start()
+
+ # launch another thread to merge knowledge from different teachers.
+ merge_thread = Thread(
+ target=gather_and_merge, args=(med_queues, self._knowledge_queue))
+        merge_thread.daemon = True
+ merge_thread.start()
+
+ def wrapper():
+ while True:
+ knowledge = self._knowledge_queue.get()
+ self._knowledge_queue.task_done()
+ if not isinstance(knowledge, EndSignal):
+ batch_size = list(knowledge.values())[0].shape[0]
+ if (batch_size < self._batch_size) and drop_last:
+ continue
+ yield knowledge
+ else:
+ break
+ # After all knowledge data yielded, make current knowledge desc invalid.
+ self._is_knowledge_desc_ready = False
+ self._is_knowledge_gen_locked = False
+
+ return wrapper
+
+ def __del__(self):
+ for i, path in enumerate(self._in_paths):
+ if path:
+ try:
+ self._cmd_queues[i].put(EndSignal())
+ self._cmd_queues[i].join()
+ except:
+ pass
diff --git a/paddleslim/pantheon/teacher.py b/paddleslim/pantheon/teacher.py
new file mode 100644
index 0000000000000000000000000000000000000000..281fe23db9ea5d1037d9eb8c14c277bfea69900e
--- /dev/null
+++ b/paddleslim/pantheon/teacher.py
@@ -0,0 +1,662 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+import six
+if six.PY2:
+ import cPickle as pickle
+ import Queue
+else:
+ import pickle
+ import queue as Queue
+
+from collections import OrderedDict
+import numpy as np
+import copy
+import multiprocessing
+from multiprocessing.managers import BaseManager
+from threading import Thread
+
+import paddle.fluid as fluid
+
+from paddleslim.pantheon.utils import convert_dtype, EndSignal, SyncSignal, StartSignal, public_authkey
+
+__all__ = ["Teacher"]
+
+# Num of threads for post-processing, including generating and transferring
+# knowledge data
+num_postprocess_threads = int(os.getenv("NUM_POSTPROCESS_THREADS", 8))
+knowledge_queues = [Queue.Queue(100) for i in range(num_postprocess_threads)]
+
+t2s_queue = Queue.Queue(100)
+s2t_queue = Queue.Queue(100)
+cmd_queue = Queue.Queue(5)
+
+
+class MixedDataReader(object):
+ """
+    The wrapper for an iterable data loader, to avoid dropping the last
+    batches when their number is less than the number of devices used in
+    prediction. It implements two data generators, one for prediction on
+    all devices, and another one for predicting the remaining data on a
+    single device; the two should be called in order.
+
+ Args:
+ data_loader (fluid.io.DataLoader): The data loader.
+ base_number (int): The base number that the number of yielded data
+ batches for multiple devices should be its
+ multiple times.
+ """
+
+ def __init__(self, data_loader, base_number):
+ self._data_loader = data_loader
+ self._base_number = base_number
+ self._tail_data = []
+
+ def multi_dev_generator(self):
+ for data in self._data_loader():
+ if len(self._tail_data) < self._base_number:
+ self._tail_data += data
+ if len(self._tail_data) == self._base_number:
+ yield self._tail_data
+ self._tail_data = []
+
+ def tail_generator(self):
+ for data in self._tail_data:
+ yield data
+ self._tail_data = []
+
+
+class WorkerParallel(object):
+ """
+    Process data from the input queue with the given worker in parallel, and
+    put the results into the output queue in order.
+
+ Args:
+ num_postprocess_threads (int): Number of threads for data processing.
+ in_queue (object): The input queue.
+ out_queue (object|list): The output queue(s). Its length should be equal
+ to arg 'num_postprocess_threads' when it is a list.
+ """
+
+ def __init__(self, num_postprocess_threads, in_queue, out_queue):
+ self._num_postprocess_threads = num_postprocess_threads
+ self._in_queue = in_queue
+ self._local_in_queues = [
+ Queue.Queue(5) for i in range(num_postprocess_threads)
+ ]
+ if isinstance(out_queue, list):
+ if len(out_queue) != num_postprocess_threads:
+ raise ValueError("When out_queue is a list, its length must "
+ "equal to num_postprocess_threads!")
+ self._local_out_queues = out_queue
+ self._out_queue = None
+ else:
+ self._local_out_queues = [
+ Queue.Queue(5) for i in range(num_postprocess_threads)
+ ]
+ self._out_queue = out_queue
+
+ def _distribute(self):
+ def func():
+ idx = 0
+ while True:
+ data = self._in_queue.get()
+ self._in_queue.task_done()
+ if not isinstance(data, EndSignal):
+ self._local_in_queues[
+ idx % self._num_postprocess_threads].put(data)
+ idx += 1
+ else:
+ for q in self._local_in_queues:
+ q.put(EndSignal())
+ break
+
+ t = Thread(target=func)
+ t.daemon = True
+ t.start()
+
+ def _run(self, worker, args):
+ for i in range(self._num_postprocess_threads):
+ t = Thread(
+ target=worker,
+ args=(self._local_in_queues[i], self._local_out_queues[i]) +
+ args)
+ t.daemon = True
+ t.start()
+
+ def _gather(self):
+ def func():
+ end_received = False
+ while True:
+ for idx, q in enumerate(self._local_out_queues):
+ data = q.get()
+ q.task_done()
+ if isinstance(data, EndSignal):
+ end_received = True
+ if idx > 0:
+ continue
+ self._out_queue.put(data)
+ if end_received:
+ break
+
+ t = Thread(target=func)
+ t.daemon = True
+ t.start()
+
+ def __call__(self, worker, args):
+ self._distribute()
+ self._run(worker, args)
+ if self._out_queue:
+ self._gather()
+
+
+class Teacher(object):
+ """
+ The class defined for the teacher model. Generate knowledge data and
+ transfer them to the student model.
+
+ Args:
+ out_path (str|None): The path to dump knowledge for offline mode.
+ out_port (int|None): The IP port number to send out knowledge for
+ online mode, should be unique when launching multiple teachers in
+ the same node.
+ """
+
+ def __init__(self, out_path=None, out_port=None):
+ if out_path and out_port:
+ raise ValueError("Out path and out port should not be set at "
+ "the same time!")
+
+ self._out_path = out_path
+ self._out_port = out_port
+ # knowledge description
+ self._knowledge_desc = {}
+
+ self._sync_required = False
+ self._data_required = False
+ self._started = False
+
+ def _start_manager(self):
+ def get_knowledge_queue(idx):
+ global knowledge_queues
+ if idx < len(knowledge_queues):
+ return knowledge_queues[idx]
+ else:
+ return None
+
+ def get_s2t_queue():
+ global s2t_queue
+ return s2t_queue
+
+ def get_t2s_queue():
+ global t2s_queue
+ return t2s_queue
+
+ def get_cmd_queue():
+ global cmd_queue
+ return cmd_queue
+
+ BaseManager.register(
+ "get_knowledge_queue", callable=get_knowledge_queue)
+ BaseManager.register("get_s2t_queue", callable=get_s2t_queue)
+ BaseManager.register("get_t2s_queue", callable=get_t2s_queue)
+ BaseManager.register("get_cmd_queue", callable=get_cmd_queue)
+ manager = BaseManager(
+ address=("", self._out_port), authkey=public_authkey.encode())
+ manager.start()
+ print("listen on address: {}".format(manager._address))
+ print("public authkey: {}".format(public_authkey))
+ return manager
+
+ def start(self):
+ """
+        Start the teacher service, synchronize with the student and launch
+        the thread to monitor commands from the student.
+ """
+ if self._started:
+ raise ValueError(
+ "The teacher cannot be started more than one time.")
+ self._started = True
+ self._manager = self._start_manager() if self._out_port else None
+ if self._manager:
+ self._knowledge_queues = [
+ self._manager.get_knowledge_queue(i)
+ for i in range(num_postprocess_threads)
+ ]
+ print("Num of knowledge queues: {}".format(
+ num_postprocess_threads))
+ self._s2t_queue = self._manager.get_s2t_queue()
+ self._t2s_queue = self._manager.get_t2s_queue()
+ self._cmd_queue = self._manager.get_cmd_queue()
+ else:
+ self._knowledge_queues = None
+ self._s2t_queue = None
+ self._t2s_queue = None
+ self._cmd_queue = None
+
+ self._out_file = open(self._out_path, "wb") if self._out_path else None
+ if self._out_file:
+ return
+
+ def wrapper():
+ while True:
+ if not self._cmd_queue.empty():
+ cmd = self._cmd_queue.get()
+ self._cmd_queue.task_done()
+ if isinstance(cmd, SyncSignal):
+ self._sync_required = True
+ elif isinstance(cmd, StartSignal):
+ self._data_required = True
+ else:
+ time.sleep(1.0)
+
+ t = Thread(target=wrapper)
+ t.daemon = True
+ t.start()
+
+ while True:
+ if self._sync_required:
+ for q in self._knowledge_queues:
+ q.put(SyncSignal())
+ q.join()
+ self._sync_required = False
+ break
+
+ def send(self, data):
+ """
+ Send one data object to student.
+
+ Args:
+ data (Python data): The data to be sent, can be any type of Python data object.
+ """
+ if not self._started:
+ raise ValueError("The method start() should be called first!")
+
+ if not self._t2s_queue:
+ raise ValueError("Cannot send data to stuent for this teacher "
+ "is offline!")
+ self._t2s_queue.put(data)
+
+ def recv(self):
+ """
+        Receive one data object from the student.
+
+ Return:
+ The received data, can be any type of Python data object.
+ """
+ if not self._started:
+ raise ValueError("The method start() should be called first!")
+
+ if not self._s2t_queue:
+ raise ValueError(
+ "Cannot receive data from stuent for this teacher "
+ "is in offline mode!")
+ data = self._s2t_queue.get()
+ self._s2t_queue.task_done()
+ return data
+
+ def dump(self, knowledge):
+ """
+        Dump one batch of knowledge data into the output file, only used
+        in the offline mode.
+
+ Args:
+ knowledge (dict): The knowledge data to be dumped.
+ """
+ if not self._started:
+ raise ValueError("The method start() should be called first!")
+
+ if not self._out_file:
+ raise ValueError("Cannot dump knowledge data in online mode!")
+
+ if not isinstance(knowledge, dict) and not isinstance(knowledge,
+ OrderedDict):
+ raise ValueError(
+ "The knowledge data should be a dict or OrderedDict!")
+
+ knowledge_desc = {}
+ for name, value in list(knowledge.items()):
+ knowledge_desc[name] = {
+ "shape": [-1] + list(value.shape[1:]),
+ "dtype": str(value.dtype),
+ "lod_level": 0
+ }
+ if not self._knowledge_desc:
+ self._knowledge_desc = knowledge_desc
+ self._out_file.write(pickle.dumps(self._knowledge_desc))
+ else:
+ if self._knowledge_desc != knowledge_desc:
+ raise ValueError(
+ "Current knowledge desc {} is not the same as "
+ "historic desc {}!".format(knowledge_desc,
+ self._knowledge_desc))
+
+ self._out_file.write(pickle.dumps(knowledge))
+
+ def start_knowledge_service(self,
+ feed_list,
+ schema,
+ program,
+ reader_config,
+ exe,
+ buf_size=10,
+ use_fp16=False,
+ times=1):
+ """
+ Start the knowledge service to generate and transfer knowledge data.
+ In GPU mode, the devices to execute knowledge prediction will be
+ determined by environment variable **FLAGS_selected_gpus**, or by
+ **CUDA_VISIBLE_DEVICES** if it is not set, and by **CPU_NUM** (default
+ 1) in CPU mode. Only supported in static graph.
+
+ Args:
+ feed_list (list): A list of feed Variables or their names for the
+ input program.
+ schema (dict): A dictionary to specify names and fetched
+ Variables of knowledge.
+ program (fluid.Program): Inference program for the teacher model.
+ reader_config (dict): The config for data reader. Support all the
+ three types of generators used by `fluid.io.PyReader` and
+ `fluid.io.DataLoader`, and their configs contain the key-value
+ pair of the generator type and a generator object, plus
+ other necessary argument pairs. See the following:
+
+ 1) sample generator:
+ reader_config={"sample_generator": #some_sample_generator,
+ "batch_size": #batch_size, "drop_last": #drop_last},
+ 'drop_last' set to True by default,
+ 2) sample list generator:
+ reader_config={"sample_list_generator":
+ #some_sample_list_generator},
+ 3) batch generator:
+ reader_config={"batch_generator": #some_batch_genrator}.
+
+                The configs will be tried in the order of 1) to 3), and
+                any other unrelated keys in them will be ignored.
+ exe (fluid.Executor): The executor to run the input program.
+ buf_size (int): The size of buffers for data reader and knowledge
+ writer on each device.
+            use_fp16 (bool): Whether to transfer/store knowledge data in float16
+                         if their data type is float32/float64. In the offline
+                         mode, it will reduce the size of the dumped knowledge
+                         file, and in the online mode, it will speed up the
+                         online transfer, at some sacrifice in precision.
+                         Default False.
+            times (int): The maximum repeated serving times. Default 1. Each
+                         time the public method 'get_knowledge_generator()'
+                         of a Student object is called, the serving count
+                         increases by one, until it reaches the maximum and
+                         the service ends. Only valid in online mode, and
+                         ignored in offline mode.
+ """
+ if not self._started:
+ raise ValueError("The method start() should be called first!")
+
+ if not isinstance(program, fluid.Program):
+ raise ValueError(
+ "Input argument 'program' should be a fluid Program!")
+ self._program = program._inference_optimize(prune_read_op=True)
+
+ if not isinstance(feed_list, list):
+ raise ValueError("Input argument 'feed_list' should be a list!")
+ else:
+ self._feed_list = []
+ for feed in feed_list:
+ if isinstance(feed, fluid.framework.Variable):
+ self._feed_list.append(feed)
+                elif isinstance(feed, six.string_types):
+ self._feed_list.append(self._program.global_block().var(
+ feed))
+ else:
+ raise ValueError(
+ "Input 'feed_list' should consist of feed "
+ "Variables or their names!")
+
+ if not isinstance(schema, dict) and not isinstance(schema,
+ OrderedDict):
+ raise ValueError(
+ "Input argument 'schema' should be a dict or OrderedDict!")
+ self._schema = schema
+
+ if not isinstance(reader_config, dict):
+ raise ValueError("The reader config must be a dictionary!")
+
+ if not isinstance(exe, fluid.Executor):
+ raise ValueError("Input argument should be a fluid Executor!")
+ self._exe = exe
+
+ self._use_fp16 = use_fp16
+
+ if not buf_size > 0:
+ raise ValueError("The buffer size should be positive!")
+ self._buf_size = buf_size
+
+ if not times > 0:
+ raise ValueError("Repeated serving times should be positive!")
+ self._times = times
+ if self._times > 1 and self._out_file:
+ self._times = 1
+ print("WARNING: args 'times' will be ignored in offline mode")
+
+ desc = {}
+ for name, var in list(schema.items()):
+ if not isinstance(var, fluid.framework.Variable):
+ raise ValueError(
+ "The member of schema must be fluid Variable.")
+ desc[name] = {
+ "shape": var.shape,
+ "dtype": convert_dtype(var.dtype),
+ "lod_level": var.lod_level
+ }
+ if not self._knowledge_desc:
+ self._knowledge_desc = desc
+ else:
+ if self._out_file and not self._knowledge_desc == desc:
+ raise ValueError("The knowledge description should be kept "
+ "consistent in offline mode!")
+
+ if isinstance(self._exe.place, fluid.CUDAPlace):
+ places = fluid.cuda_places()
+ else:
+ places = fluid.cpu_places()
+ dev_count = len(places)
+
+ data_loader = fluid.io.DataLoader.from_generator(
+ feed_list=self._feed_list,
+ capacity=self._buf_size * dev_count,
+ use_double_buffer=(dev_count == 1),
+ iterable=True)
+
+ places = [fluid.CPUPlace()] if dev_count > 1 else [self._exe.place]
+ if "sample_generator" in reader_config:
+ if "batch_size" not in reader_config:
+ raise ValueError("batch size must be specified when using "
+ "sample generator!")
+ sample_generator = reader_config["sample_generator"]
+ batch_size = reader_config["batch_size"]
+ drop_last = reader_config[
+ "drop_last"] if "drop_last" in reader_config else True
+
+ data_loader.set_sample_generator(
+ reader=sample_generator,
+ batch_size=batch_size,
+ drop_last=drop_last,
+ places=places)
+ elif "sample_list_generator" in reader_config:
+ sample_list_generator = reader_config["sample_list_generator"]
+ data_loader.set_sample_list_generator(
+ reader=sample_list_generator, places=places)
+ elif "batch_generator" in reader_config:
+ batch_generator = reader_config["batch_generator"]
+ data_loader.set_batch_generator(
+ reader=batch_generator, places=places)
+ else:
+ raise ValueError(
+ "The reader config doesn't contain any valid "
+ "generator type, which should be one of 'sample_generator', "
+ "'sample_list_generator', and 'batch_generator'.")
+
+ def cast2fp16(know):
+ for k, v in list(know.items()):
+ if not isinstance(v, np.ndarray):
+ break
+ if v.dtype == np.float32 or v.dtype == np.float64:
+ v = v.astype("float16")
+ know[k] = v
+ return know
+
+ feed_var_names = [var.name for var in self._feed_list]
+ schema_in_feed, schema_in_fetch = {}, {}
+ for k, v in list(self._schema.items()):
+ if k in feed_var_names:
+ schema_in_feed[k] = v
+ else:
+ schema_in_fetch[k] = v
+ schema_in_fetch_keys, schema_in_fetch_vars = zip(
+ *list(schema_in_fetch.items()))
+
+ def know_maker(in_queue, out_queue, use_fp16):
+ while True:
+ data = in_queue.get()
+ in_queue.task_done()
+ if isinstance(data, tuple):
+ dev_batches, outputs = data
+ know = {}
+ for k in schema_in_feed.keys():
+ batch_know = [
+ np.array(batch[k]) for batch in dev_batches
+ ]
+ know[k] = np.concatenate(batch_know)
+ know.update(dict(zip(schema_in_fetch_keys, outputs)))
+ if use_fp16:
+ know = cast2fp16(know)
+ out_queue.put(know)
+ else:
+ # forward other types of data directly (maybe knowledge desc or EndSignal)
+ out_queue.put(data)
+ if isinstance(data, EndSignal):
+ break
+
+ know_make_queue = Queue.Queue(self._buf_size)
+ if self._out_file:
+ # For offline dump, write the knowledge description to the head of file
+ self._out_file.write(pickle.dumps(self._knowledge_desc))
+ print("output path: %s" % self._out_path)
+ offline_write_queue = Queue.Queue(self._buf_size)
+
+ def offline_write(queue):
+ while True:
+ know = queue.get()
+ queue.task_done()
+ if not isinstance(know, EndSignal):
+ self._out_file.write(pickle.dumps(know))
+                    else:
+                        # close the file in this child thread to make sure
+                        # all writing has finished, then stop the writer
+                        self._out_file.close()
+                        break
+
+ t = Thread(target=offline_write, args=(offline_write_queue, ))
+ t.daemon = True
+ t.start()
+ make_knowledge = WorkerParallel(
+ num_postprocess_threads, know_make_queue, offline_write_queue)
+
+ if self._knowledge_queues:
+ make_knowledge = WorkerParallel(num_postprocess_threads,
+ know_make_queue,
+ self._knowledge_queues)
+
+ compiled_program = fluid.compiler.CompiledProgram(
+ self._program).with_data_parallel()
+
+ print("Knowledge description {}".format(self._knowledge_desc))
+ print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) +
+ " Teacher begins to serve ...")
+
+ data_reader = MixedDataReader(data_loader, dev_count)
+ for repeated in range(self._times):
+ make_knowledge(worker=know_maker, args=(self._use_fp16, ))
+ if self._knowledge_queues:
+ # wait for the accessing of knowledge desc and data
+ while True:
+ if self._sync_required:
+ for q in self._knowledge_queues:
+ q.put(SyncSignal())
+ # For online mode, send knowledge description every sync
+ know_make_queue.put(self._knowledge_desc)
+ self._sync_required = False
+ if self._data_required:
+ self._data_required = False
+ break
+ for q in self._knowledge_queues:
+ q.join()
+
+ print("No.{} time serving ... ".format(repeated))
+ num_batches_sent = 0
+ for index, dev_batches in enumerate(
+ data_reader.multi_dev_generator()):
+ if self._sync_required:
+ break
+ outputs = self._exe.run(compiled_program,
+ feed=dev_batches,
+ fetch_list=schema_in_fetch_vars)
+ know_make_queue.put((dev_batches, outputs))
+
+ num_batches_sent += dev_count
+ if num_batches_sent % (100 * dev_count) == 0:
+ log = "Processed {} batch samples.".format(
+ num_batches_sent)
+ if self._knowledge_queues:
+ qsize = 0
+ for q in self._knowledge_queues:
+ qsize += q.qsize()
+ log += " Knowledge queue size {}.".format(qsize)
+ print(log)
+
+ dev_batches, outputs = [], []
+ for index, batch in enumerate(data_reader.tail_generator()):
+ if self._sync_required:
+ break
+ dev_batches.append(batch)
+ output = self._exe.run(self._program,
+ feed=batch,
+ fetch_list=schema_in_fetch_vars)
+ if outputs:
+ outputs = [
+ np.concatenate(
+ (outs, out), axis=0)
+ for (outs, out) in zip(outputs, output)
+ ]
+ else:
+ outputs = copy.deepcopy(output)
+ if dev_batches or outputs:
+ know_make_queue.put((dev_batches, outputs))
+ num_batches_sent += (index + 1)
+
+ print("Processed {} batch samples in total.".format(
+ num_batches_sent))
+ know_make_queue.put(EndSignal())
+ know_make_queue.join()
+
+ if self._knowledge_queues:
+ for q in self._knowledge_queues:
+ q.join()
+ if self._out_file:
+ offline_write_queue.join()
+ print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) +
+ " Teacher ends serving.")
+
+ def __del__(self):
+ if self._manager:
+ self._manager.shutdown()
diff --git a/paddleslim/pantheon/utils.py b/paddleslim/pantheon/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4c8001eb6f392dc9c3f9a9dc582714d9daf74ad
--- /dev/null
+++ b/paddleslim/pantheon/utils.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+
+public_authkey = u"aBcXyZ123"
+
+
+class StartSignal():
+ pass
+
+
+class EndSignal():
+ pass
+
+
+class SyncSignal():
+ pass
+
+
+def convert_dtype(dtype):
+ import paddle.fluid as fluid
+ if isinstance(dtype, fluid.core.VarDesc.VarType):
+ if dtype == fluid.core.VarDesc.VarType.BOOL:
+ return 'bool'
+ elif dtype == fluid.core.VarDesc.VarType.FP16:
+ return 'float16'
+ elif dtype == fluid.core.VarDesc.VarType.FP32:
+ return 'float32'
+ elif dtype == fluid.core.VarDesc.VarType.FP64:
+ return 'float64'
+ elif dtype == fluid.core.VarDesc.VarType.INT8:
+ return 'int8'
+ elif dtype == fluid.core.VarDesc.VarType.INT16:
+ return 'int16'
+ elif dtype == fluid.core.VarDesc.VarType.INT32:
+ return 'int32'
+ elif dtype == fluid.core.VarDesc.VarType.INT64:
+ return 'int64'
+ elif dtype == fluid.core.VarDesc.VarType.UINT8:
+ return 'uint8'
+
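+# Usage sketch (a hedged example, assuming paddle.fluid is importable):
+#
+#   import paddle.fluid as fluid
+#   convert_dtype(fluid.core.VarDesc.VarType.FP32)  # -> 'float32'
+#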
+
+def check_ip(address):
+ import IPy
+ try:
+ IPy.IP(address)
+ return True
+ except Exception as e:
+ return False
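+
+# Usage sketch (a hedged example; requires the IPy package):
+#
+#   check_ip("127.0.0.1")  # -> True
+#   check_ip("999.0.0.1")  # -> False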
diff --git a/paddleslim/prune/__init__.py b/paddleslim/prune/__init__.py
index d8c439be403ff93a24406c6caf4d2524fd17023a..361a3af13db508a1d1b697b9136e79d065d00a52 100644
--- a/paddleslim/prune/__init__.py
+++ b/paddleslim/prune/__init__.py
@@ -11,24 +11,35 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+
+from __future__ import absolute_import
from .pruner import *
-import pruner
+from ..prune import pruner
from .auto_pruner import *
-import auto_pruner
-from .controller_server import *
-import controller_server
-from .controller_client import *
-import controller_client
+from ..prune import auto_pruner
from .sensitive_pruner import *
-import sensitive_pruner
+from ..prune import sensitive_pruner
from .sensitive import *
-import sensitive
+from ..prune import sensitive
+from .prune_walker import *
+from ..prune import prune_walker
+from .prune_io import *
+from ..prune import prune_io
+from .group_param import *
+from ..prune import group_param
+from .criterion import *
+from ..prune import criterion
+from .idx_selector import *
+from ..prune import idx_selector
__all__ = []
__all__ += pruner.__all__
__all__ += auto_pruner.__all__
-__all__ += controller_server.__all__
-__all__ += controller_client.__all__
__all__ += sensitive_pruner.__all__
__all__ += sensitive.__all__
+__all__ += prune_walker.__all__
+__all__ += prune_io.__all__
+__all__ += group_param.__all__
+__all__ += criterion.__all__
+__all__ += idx_selector.__all__
diff --git a/paddleslim/prune/auto_pruner.py b/paddleslim/prune/auto_pruner.py
index 575d93c546e10717dd294004a8be80e55550ae4b..54ae8f3aab6c6047677661a66e0ddd7fd0d3d3e9 100644
--- a/paddleslim/prune/auto_pruner.py
+++ b/paddleslim/prune/auto_pruner.py
@@ -31,6 +31,39 @@ _logger = get_logger(__name__, level=logging.INFO)
class AutoPruner(object):
+ """
+ Search a group of ratios used to prune program.
+
+ Args:
+ program(Program): The program to be pruned.
+ scope(Scope): The scope to be pruned.
+ place(fluid.Place): The device place of parameters.
+ params(list): The names of parameters to be pruned.
+ init_ratios(list|float): Initial ratios used to prune parameters in `params`.
+ A list means one ratio for each parameter in `params`.
+ The length of `init_ratios` should be equal to the length of `params` when `init_ratios` is a list.
+ If it is a scalar, all the parameters in `params` will be pruned by a uniform ratio.
+ None means deriving a group of initial ratios from `pruned_flops` or `pruned_latency`. Default: None.
+ pruned_flops(float): The percent of FLOPs to be pruned. Default: None.
+ pruned_latency(float): The percent of latency to be pruned. Default: None.
+ server_addr(tuple): A tuple of the server ip and server port of the controller server.
+ init_temperature(float): The initial temperature used in the simulated annealing search strategy.
+ reduce_rate(float): The decay rate used in the simulated annealing search strategy.
+ max_try_times(int): The maximum number of attempts to generate legal tokens.
+ max_client_num(int): The maximum number of connections to the controller server.
+ search_steps(int): The number of search steps.
+ max_ratios(float|list): Maximum ratios used to prune parameters in `params`.
+ A list means one maximum ratio for each parameter in `params`.
+ The length of `max_ratios` should be equal to the length of `params` when `max_ratios` is a list.
+ If it is a scalar, it will be used for all the parameters in `params`.
+ min_ratios(float|list): Minimum ratios used to prune parameters in `params`.
+ A list means one minimum ratio for each parameter in `params`.
+ The length of `min_ratios` should be equal to the length of `params` when `min_ratios` is a list.
+ If it is a scalar, it will be used for all the parameters in `params`.
+ key(str): Identity used in communication between controller server and clients.
+ is_server(bool): Whether current host is controller server. Default: True.
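+
+ Example (a hedged usage sketch; the program, place and the `train_and_eval`
+ helper below are illustrative, not part of this API):
+
+ .. code-block:: python
+
+ pruner = AutoPruner(
+ train_program,
+ fluid.global_scope(),
+ place,
+ params=["conv1_weights"],
+ pruned_flops=0.5,
+ server_addr=("", 0),
+ search_steps=100,
+ is_server=True)
+ for step in range(100):
+ pruned_program = pruner.prune(train_program)
+ score = train_and_eval(pruned_program) # hypothetical helper
+ pruner.reward(float(score))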
+ """
+
def __init__(self,
program,
scope,
@@ -49,37 +82,6 @@ class AutoPruner(object):
min_ratios=[0],
key="auto_pruner",
is_server=True):
- """
- Search a group of ratios used to prune program.
- Args:
- program(Program): The program to be pruned.
- scope(Scope): The scope to be pruned.
- place(fluid.Place): The device place of parameters.
- params(list): The names of parameters to be pruned.
- init_ratios(list|float): Init ratios used to pruned parameters in `params`.
- List means ratios used for pruning each parameter in `params`.
- The length of `init_ratios` should be equal to length of params when `init_ratios` is a list.
- If it is a scalar, all the parameters in `params` will be pruned by uniform ratio.
- None means get a group of init ratios by `pruned_flops` of `pruned_latency`. Default: None.
- pruned_flops(float): The percent of FLOPS to be pruned. Default: None.
- pruned_latency(float): The percent of latency to be pruned. Default: None.
- server_addr(tuple): A tuple of server ip and server port for controller server.
- init_temperature(float): The init temperature used in simulated annealing search strategy.
- reduce_rate(float): The decay rate used in simulated annealing search strategy.
- max_try_times(int): The max number of trying to generate legal tokens.
- max_client_num(int): The max number of connections of controller server.
- search_steps(int): The steps of searching.
- max_ratios(float|list): Max ratios used to pruned parameters in `params`.
- List means max ratios for each parameter in `params`.
- The length of `max_ratios` should be equal to length of params when `max_ratios` is a list.
- If it is a scalar, it will used for all the parameters in `params`.
- min_ratios(float|list): Min ratios used to pruned parameters in `params`.
- List means min ratios for each parameter in `params`.
- The length of `min_ratios` should be equal to length of params when `min_ratios` is a list.
- If it is a scalar, it will used for all the parameters in `params`.
- key(str): Identity used in communication between controller server and clients.
- is_server(bool): Whether current host is controller server. Default: True.
- """
self._program = program
self._scope = scope
@@ -111,9 +113,13 @@ class AutoPruner(object):
self._pruned_latency)
init_tokens = self._ratios2tokens(self._init_ratios)
_logger.info("range table: {}".format(self._range_table))
- controller = SAController(self._range_table, self._reduce_rate,
- self._init_temperature, self._max_try_times,
- init_tokens, self._constrain_func)
+ controller = SAController(
+ self._range_table,
+ self._reduce_rate,
+ self._init_temperature,
+ self._max_try_times,
+ init_tokens,
+ constrain_func=self._constrain_func)
server_ip, server_port = server_addr
if server_ip == None or server_ip == "":
@@ -157,7 +163,7 @@ class AutoPruner(object):
def _constrain_func(self, tokens):
ratios = self._tokens2ratios(tokens)
- pruned_program = self._pruner.prune(
+ pruned_program, _, _ = self._pruner.prune(
self._program,
self._scope,
self._params,
@@ -177,13 +183,15 @@ class AutoPruner(object):
def prune(self, program, eval_program=None):
"""
Prune program with latest tokens generated by controller.
+
Args:
program(fluid.Program): The program to be pruned.
+
Returns:
- Program: The pruned program.
+ paddle.fluid.Program: The pruned program.
"""
self._current_ratios = self._next_ratios()
- pruned_program = self._pruner.prune(
+ pruned_program, _, _ = self._pruner.prune(
program,
self._scope,
self._params,
@@ -193,7 +201,7 @@ class AutoPruner(object):
param_backup=self._param_backup)
pruned_val_program = None
if eval_program is not None:
- pruned_val_program = self._pruner.prune(
+ pruned_val_program, _, _ = self._pruner.prune(
program,
self._scope,
self._params,
@@ -208,8 +216,9 @@ class AutoPruner(object):
def reward(self, score):
"""
Return reward of current pruned program.
+
Args:
- score(float): The score of pruned program.
+ score(float): The score of the pruned program.
"""
self._restore(self._scope)
self._param_backup = {}
diff --git a/paddleslim/prune/controller_client.py b/paddleslim/prune/controller_client.py
deleted file mode 100644
index f133e8b28f823bba89024fe1473630feb509a616..0000000000000000000000000000000000000000
--- a/paddleslim/prune/controller_client.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import socket
-from ..common import get_logger
-
-__all__ = ['ControllerClient']
-
-_logger = get_logger(__name__, level=logging.INFO)
-
-
-class ControllerClient(object):
- """
- Controller client.
- """
-
- def __init__(self, server_ip=None, server_port=None, key=None):
- """
- Args:
- server_ip(str): The ip that controller server listens on. None means getting the ip automatically. Default: None.
- server_port(int): The port that controller server listens on. 0 means getting usable port automatically. Default: 0.
- key(str): The key used to identify legal agent for controller server. Default: "light-nas"
- """
- self.server_ip = server_ip
- self.server_port = server_port
- self.socket_client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- self._key = key
-
- def update(self, tokens, reward):
- """
- Update the controller according to latest tokens and reward.
- Args:
- tokens(list): The tokens generated in last step.
- reward(float): The reward of tokens.
- """
- socket_client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- socket_client.connect((self.server_ip, self.server_port))
- tokens = ",".join([str(token) for token in tokens])
- socket_client.send("{}\t{}\t{}".format(self._key, tokens, reward)
- .encode())
- tokens = socket_client.recv(1024).decode()
- tokens = [int(token) for token in tokens.strip("\n").split(",")]
- return tokens
-
- def next_tokens(self):
- """
- Get next tokens.
- """
- socket_client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- socket_client.connect((self.server_ip, self.server_port))
- socket_client.send("next_tokens".encode())
- tokens = socket_client.recv(1024).decode()
- tokens = [int(token) for token in tokens.strip("\n").split(",")]
- return tokens
diff --git a/paddleslim/prune/controller_server.py b/paddleslim/prune/controller_server.py
deleted file mode 100644
index 5fc978444656d2650904eedfd37453b6b5e22207..0000000000000000000000000000000000000000
--- a/paddleslim/prune/controller_server.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import logging
-import socket
-from ..common import get_logger
-from threading import Thread
-from .lock import lock, unlock
-
-__all__ = ['ControllerServer']
-
-_logger = get_logger(__name__, level=logging.INFO)
-
-
-class ControllerServer(object):
- """
- The controller wrapper with a socket server to handle the request of search agent.
- """
-
- def __init__(self,
- controller=None,
- address=('', 0),
- max_client_num=100,
- search_steps=None,
- key=None):
- """
- Args:
- controller(slim.searcher.Controller): The controller used to generate tokens.
- address(tuple): The address of current server binding with format (ip, port). Default: ('', 0).
- which means setting ip automatically
- max_client_num(int): The maximum number of clients connecting to current server simultaneously. Default: 100.
- search_steps(int): The total steps of searching. None means never stopping. Default: None
- """
- self._controller = controller
- self._address = address
- self._max_client_num = max_client_num
- self._search_steps = search_steps
- self._closed = False
- self._port = address[1]
- self._ip = address[0]
- self._key = key
- self._socket_file = "./controller_server.socket"
-
- def start(self):
- open(self._socket_file, 'a').close()
- socket_file = open(self._socket_file, 'r+')
- lock(socket_file)
- tid = socket_file.readline()
- if tid == '':
- _logger.info("start controller server...")
- tid = self._start()
- socket_file.write("tid: {}\nip: {}\nport: {}\n".format(
- tid, self._ip, self._port))
- _logger.info("started controller server...")
- unlock(socket_file)
- socket_file.close()
-
- def _start(self):
- self._socket_server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- self._socket_server.bind(self._address)
- self._socket_server.listen(self._max_client_num)
- self._port = self._socket_server.getsockname()[1]
- self._ip = self._socket_server.getsockname()[0]
- _logger.info("ControllerServer - listen on: [{}:{}]".format(
- self._ip, self._port))
- thread = Thread(target=self.run)
- thread.start()
- return str(thread)
-
- def close(self):
- """Close the server."""
- self._closed = True
- os.remove(self._socket_file)
- _logger.info("server closed!")
-
- def port(self):
- """Get the port."""
- return self._port
-
- def ip(self):
- """Get the ip."""
- return self._ip
-
- def run(self):
- _logger.info("Controller Server run...")
- try:
- while ((self._search_steps is None) or
- (self._controller._iter <
- (self._search_steps))) and not self._closed:
- conn, addr = self._socket_server.accept()
- message = conn.recv(1024).decode()
- if message.strip("\n") == "next_tokens":
- tokens = self._controller.next_tokens()
- tokens = ",".join([str(token) for token in tokens])
- conn.send(tokens.encode())
- else:
- _logger.debug("recv message from {}: [{}]".format(addr,
- message))
- messages = message.strip('\n').split("\t")
- if (len(messages) < 3) or (messages[0] != self._key):
- _logger.debug("recv noise from {}: [{}]".format(
- addr, message))
- continue
- tokens = messages[1]
- reward = messages[2]
- tokens = [int(token) for token in tokens.split(",")]
- self._controller.update(tokens, float(reward))
- tokens = self._controller.next_tokens()
- tokens = ",".join([str(token) for token in tokens])
- conn.send(tokens.encode())
- _logger.debug("send message to {}: [{}]".format(addr,
- tokens))
- conn.close()
- finally:
- self._socket_server.close()
- self.close()
diff --git a/paddleslim/prune/criterion.py b/paddleslim/prune/criterion.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32ec6c94d3b792987e03a99b177b9e7cfef1e95
--- /dev/null
+++ b/paddleslim/prune/criterion.py
@@ -0,0 +1,122 @@
+"""Define some functions to compute the importance of structure to be pruned.
+"""
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import numpy as np
+from ..common import get_logger
+from ..core import Registry, GraphWrapper
+
+__all__ = ["l1_norm", "CRITERION"]
+
+_logger = get_logger(__name__, level=logging.INFO)
+
+CRITERION = Registry('criterion')
+
+
+@CRITERION.register
+def l1_norm(group, graph):
+ """Compute l1-norm scores of parameter on given axis.
+
+ This function return a list of parameters' l1-norm scores on given axis.
+ Each element of list is a tuple with format (name, axis, score) in which 'name' is parameter's name
+ and 'axis' is the axis reducing on and `score` is a np.array storing the l1-norm of strucure on `axis`.
+
+ Args:
+ group(list): A group of parameters. The first parameter of the group is convolution layer's weight
+ while the others are parameters affected by pruning the first one. Each parameter in group
+ is represented as tuple '(name, values, axis)' in which `name` is the parameter's name and
+ and `values` is the values of parameter and `axis` is the axis reducing on pruning on.
+ Returns:
+ list: A list of tuple storing l1-norm on given axis.
+ """
+ scores = []
+ for name, value, axis, pruned_idx in group:
+
+ reduce_dims = [i for i in range(len(value.shape)) if i != axis]
+ score = np.sum(np.abs(value), axis=tuple(reduce_dims))
+ scores.append((name, axis, score, pruned_idx))
+
+ return scores
+
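+# Usage sketch (toy values; `graph` is unused by `l1_norm` and may be None):
+#
+#   import numpy as np
+#   w = np.ones([4, 3, 3, 3])  # conv weight: [out_c, in_c, k, k]
+#   group = [("conv1_weights", w, 0, [0])]
+#   l1_norm(group, None)  # -> [("conv1_weights", 0, array of 4 scores, [0])]
+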
+
+@CRITERION.register
+def geometry_median(group, graph):
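+ """Compute scores of filters by their distances to the geometric median.
+
+ A hedged summary: a filter whose summed distance to all the other filters
+ of the same layer is small lies close to the geometric median and is
+ considered redundant (cf. the FPGM criterion). The arguments and the
+ returned value follow the same format as `l1_norm`.
+ """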
+ scores = []
+ name, value, axis, _ = group[0]
+ assert (len(value.shape) == 4)
+
+ def get_distance_sum(value, out_idx):
+ w = value.view()
+ w.shape = value.shape[0], np.product(value.shape[1:])
+ selected_filter = np.tile(w[out_idx], (w.shape[0], 1))
+ x = w - selected_filter
+ x = np.sqrt(np.sum(x * x, -1))
+ return x.sum()
+
+ dist_sum_list = []
+ for out_i in range(value.shape[0]):
+ dist_sum = get_distance_sum(value, out_i)
+ dist_sum_list.append(dist_sum)
+
+ tmp = np.array(dist_sum_list)
+
+ for name, value, axis, idx in group:
+ scores.append((name, axis, tmp, idx))
+ return scores
+
+
+@CRITERION.register
+def bn_scale(group, graph):
+ """Compute l1-norm scores of parameter on given axis.
+
+ This function return a list of parameters' l1-norm scores on given axis.
+ Each element of list is a tuple with format (name, axis, score) in which 'name' is parameter's name
+ and 'axis' is the axis reducing on and `score` is a np.array storing the l1-norm of strucure on `axis`.
+
+ Args:
+ group(list): A group of parameters. The first parameter of the group is convolution layer's weight
+ while the others are parameters affected by pruning the first one. Each parameter in group
+ is represented as tuple '(name, values, axis)' in which `name` is the parameter's name and
+ and `values` is the values of parameter and `axis` is the axis reducing on pruning on.
+ Returns:
+ list: A list of tuple storing l1-norm on given axis.
+ """
+ assert (isinstance(graph, GraphWrapper))
+
+ # step1: Get first convolution
+ conv_weight, value, axis, _ = group[0]
+ param_var = graph.var(conv_weight)
+ conv_op = param_var.outputs()[0]
+
+ # step2: Get bn layer after first convolution
+ conv_output = conv_op.outputs("Output")[0]
+ bn_op = conv_output.outputs()[0]
+ if bn_op is not None:
+ bn_scale_param = bn_op.inputs("Scale")[0].name()
+ else:
+ raise SystemExit("Can't find BatchNorm op after Conv op in Network.")
+
+ # steps3: Find scale of bn
+ score = None
+ for name, value, axis, _ in group:
+ if bn_scale_param == name:
+ score = np.abs(value.reshape([-1]))
+
+ scores = []
+ for name, value, axis, idx in group:
+ scores.append((name, axis, score, idx))
+
+ return scores
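+
+# Usage sketch (criterion functions are looked up through the registry; `group`
+# and `graph` follow the formats documented in `l1_norm`):
+#
+#   criterion_fn = CRITERION.get("l1_norm")
+#   scores = criterion_fn(group, graph)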
diff --git a/paddleslim/prune/group_param.py b/paddleslim/prune/group_param.py
new file mode 100644
index 0000000000000000000000000000000000000000..61077c2b5db88dd68e1dc0ca7b512c26f5cc6eeb
--- /dev/null
+++ b/paddleslim/prune/group_param.py
@@ -0,0 +1,91 @@
+"""Define some functions to collect ralated parameters into groups."""
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..core import GraphWrapper
+from .prune_walker import PRUNE_WORKER
+
+__all__ = ["collect_convs"]
+
+
+def collect_convs(params, graph, visited={}):
+ """Collect convolution layers of graph into groups. The layers in the same group is relative on pruning operation.
+ A group is a list of tuple with format (param_name, axis) in which `param_name` is the name of parameter and `axis` is the axis to be pruned on.
+
+ .. code-block:: text
+
+ conv1->conv2->conv3->conv4
+
+ As shown above, the demo has 4 convolution layers. The shape of a convolution's parameter is `[out_channel, in_channel, filter_size, filter_size]`. If the parameter of `conv1` is pruned on axis 0, then the parameter of `conv2` should be pruned on axis 1. So `conv1` and `conv2` form a group, which can be represented as:
+
+ .. code-block:: python
+
+ [("conv1", 0), ("conv2", 1)]
+
+ If `params` is `["conv1", "conv2"]`, then the returned groups are:
+
+ .. code-block:: python
+
+ [[("conv1", 0), ("conv2", 1)],
+ [("conv2", 0), ("conv3", 1)]]
+
+ Args:
+ params(list): A list of convolution layers' parameter names. It will collect all the groups that contain any one of these parameters.
+ graph(paddle.fluid.Program | GraphWrapper): The graph used to search the groups.
+
+ Returns:
+ list of lists: The groups.
+
+ """
+ if not isinstance(graph, GraphWrapper):
+ graph = GraphWrapper(graph)
+ groups = []
+ for param in params:
+ pruned_params = []
+ param = graph.var(param)
+
+ target_op = param.outputs()[0]
+ if target_op.type() == 'conditional_block':
+ for op in param.outputs():
+ if op.type() in PRUNE_WORKER._module_dict.keys():
+ cls = PRUNE_WORKER.get(op.type())
+ walker = cls(op,
+ pruned_params=pruned_params,
+ visited=visited)
+ break
+ else:
+ cls = PRUNE_WORKER.get(target_op.type())
+ walker = cls(target_op,
+ pruned_params=pruned_params,
+ visited=visited)
+
+ walker.prune(param, pruned_axis=0, pruned_idx=[0])
+ groups.append(pruned_params)
+ visited = set()
+ uniq_groups = []
+ for group in groups:
+ repeat_group = False
+ simple_group = []
+ for param, axis, pruned_idx in group:
+ param = param.name()
+ if axis == 0:
+ if param in visited:
+ repeat_group = True
+ else:
+ visited.add(param)
+ simple_group.append((param, axis, pruned_idx))
+ if not repeat_group:
+ uniq_groups.append(simple_group)
+
+ return uniq_groups
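+
+# Usage sketch (a hedged example; the two-conv program and all names below are
+# illustrative):
+#
+#   import paddle.fluid as fluid
+#   main = fluid.Program()
+#   with fluid.program_guard(main, fluid.Program()):
+#       x = fluid.data(name="x", shape=[None, 3, 16, 16], dtype="float32")
+#       y = fluid.layers.conv2d(
+#           x, 8, 3, param_attr=fluid.ParamAttr(name="conv1_weights"))
+#       z = fluid.layers.conv2d(
+#           y, 8, 3, param_attr=fluid.ParamAttr(name="conv2_weights"))
+#   groups = collect_convs(["conv1_weights"], main)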
diff --git a/paddleslim/prune/idx_selector.py b/paddleslim/prune/idx_selector.py
new file mode 100644
index 0000000000000000000000000000000000000000..73caf2767704399bebe2e92a1eac8ed04b0d5fd8
--- /dev/null
+++ b/paddleslim/prune/idx_selector.py
@@ -0,0 +1,117 @@
+"""Define some functions to sort substructures of parameter by importance.
+"""
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import numpy as np
+from ..core import GraphWrapper
+from ..common import get_logger
+from ..core import Registry
+
+__all__ = ["IDX_SELECTOR"]
+
+IDX_SELECTOR = Registry('idx_selector')
+
+
+@IDX_SELECTOR.register
+def default_idx_selector(group, ratio):
+ """Get the pruned indexes by given ratio.
+
+ This function return a list of parameters' pruned indexes on given axis.
+ Each element of list is a tuple with format (name, axis, indexes) in which 'name' is parameter's name
+ and 'axis' is the axis pruning on and `indexes` is indexes to be pruned.
+
+ Args:
+ group(list): A group of parameters. The first parameter of the group is convolution layer's weight
+ while the others are parameters affected by pruning the first one. Each parameter in group
+ is represented as tuple '(name, axis, score)' in which `name` is the parameter's name and
+ `axis` is the axis pruning on and `score` is a np.array storing the importance of strucure
+ on `axis`. Show as below:
+
+ .. code-block: text
+
+ [("conv1_weights", 0, [0.7, 0.5, 0.6]), ("conv1_bn.scale", 0, [0.1, 0.2, 0.4])]
+
+ The shape of "conv1_weights" is `[out_channel, in_channel, filter_size, filter_size]`, so
+ `[0.7, 0.5, 0.6]` are the importance scores of each output channel in "conv1_weights"
+ when the axis is 0.
+
+ Returns:
+
+ list: pruned indexes
+
+ """
+ name, axis, score, _ = group[
+ 0] # sort channels by the first convolution's score
+ sorted_idx = score.argsort()
+
+ pruned_num = int(round(len(sorted_idx) * ratio))
+ pruned_idx = sorted_idx[:pruned_num]
+
+ idxs = []
+ for name, axis, score, offsets in group:
+ r_idx = [i + offsets[0] for i in pruned_idx]
+ idxs.append((name, axis, r_idx))
+ return idxs
+
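+# Usage sketch (toy scores; a conv weight and the bn scale that follows it):
+#
+#   import numpy as np
+#   group = [("conv1_weights", 0, np.array([0.7, 0.5, 0.6]), [0]),
+#            ("conv1_bn.scale", 0, np.array([0.7, 0.5, 0.6]), [0])]
+#   default_idx_selector(group, 0.34)
+#   # -> [("conv1_weights", 0, [1]), ("conv1_bn.scale", 0, [1])]
+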
+
+@IDX_SELECTOR.register
+def optimal_threshold(group, ratio):
+ """Get the pruned indexes by given ratio.
+
+ This function return a list of parameters' pruned indexes on given axis.
+ Each element of list is a tuple with format (name, axis, indexes) in which 'name' is parameter's name
+ and 'axis' is the axis pruning on and `indexes` is indexes to be pruned.
+
+ Args:
+ group(list): A group of parameters. The first parameter of the group is convolution layer's weight
+ while the others are parameters affected by pruning the first one. Each parameter in group
+ is represented as tuple '(name, axis, score)' in which `name` is the parameter's name and
+ `axis` is the axis pruning on and `score` is a np.array storing the importance of strucure
+ on `axis`. Show as below:
+
+ .. code-block: text
+
+ [("conv1_weights", 0, [0.7, 0.5, 0.6]), ("conv1_bn.scale", 0, [0.1, 0.2, 0.4])]
+
+ The shape of "conv1_weights" is `[out_channel, in_channel, filter_size, filter_size]`, so
+ `[0.7, 0.5, 0.6]` are the importance scores of each output channel in "conv1_weights"
+ when the axis is 0.
+
+ Returns:
+
+ list: pruned indexes
+
+ """
+ name, axis, score, _ = group[
+ 0] # sort channels by the first convolution's score
+
+ score[score < 1e-18] = 1e-18
+ score_sorted = np.sort(score)
+ score_square = score_sorted**2
+ total_sum = score_square.sum()
+ acc_sum = 0
+ for i in range(score_square.size):
+ acc_sum += score_square[i]
+ if acc_sum / total_sum > ratio:
+ break
+ th = (score_sorted[i - 1] + score_sorted[i]) / 2 if i > 0 else 0
+
+ pruned_idx = np.squeeze(np.argwhere(score < th))
+
+ idxs = []
+ for name, axis, score, _ in group:
+ idxs.append((name, axis, pruned_idx))
+ return idxs
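+
+# Usage sketch (toy scores; channels whose scores fall below the derived
+# threshold are pruned):
+#
+#   import numpy as np
+#   group = [("conv1_bn.scale", 0, np.array([0.01, 0.8, 0.6, 0.02]), [0])]
+#   optimal_threshold(group, 0.9)
+#   # -> [("conv1_bn.scale", 0, array([0, 2, 3]))] for this toy data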
diff --git a/paddleslim/prune/prune_io.py b/paddleslim/prune/prune_io.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0e2c97557a0c43adab2676003741f379b780692
--- /dev/null
+++ b/paddleslim/prune/prune_io.py
@@ -0,0 +1,74 @@
+import os
+import paddle.fluid as fluid
+from paddle.fluid import Program
+from ..core import GraphWrapper
+from ..common import get_logger
+import json
+import logging
+
+__all__ = ["save_model", "load_model"]
+
+_logger = get_logger(__name__, level=logging.INFO)
+
+_SHAPES_FILE = "__shapes__"
+
+
+def save_model(exe, graph, dirname):
+ """
+ Save weights of model and information of shapes into filesystem.
+
+ Args:
+ exe(paddle.fluid.Executor): The executor used to save model.
+ graph(Program|Graph): The graph to be saved.
+ dirname(str): The directory that the model is saved into.
+ """
+ assert graph is not None and dirname is not None
+ graph = GraphWrapper(graph) if isinstance(graph, Program) else graph
+
+ fluid.io.save_persistables(
+ executor=exe,
+ dirname=dirname,
+ main_program=graph.program,
+ filename=None)
+ weights_file = dirname
+ _logger.info("Save model weights into {}".format(weights_file))
+ shapes = {}
+ for var in fluid.io.get_program_persistable_vars(graph.program):
+ shapes[var.name] = var.shape
+ SHAPES_FILE = os.path.join(dirname, _SHAPES_FILE)
+ with open(SHAPES_FILE, "w") as f:
+ json.dump(shapes, f)
+ _logger.info("Save shapes of weights into {}".format(SHAPES_FILE))
+
+
+def load_model(exe, graph, dirname):
+ """
+ Load weights of model and information of shapes from filesystem.
+
+ Args:
+ exe(paddle.fluid.Executor): The executor used to load the model.
+ graph(Program|Graph): The graph to be updated by the loaded information.
+ dirname(str): The directory from which the model will be loaded.
+ """
+ assert graph is not None and dirname is not None
+ graph = GraphWrapper(graph) if isinstance(graph, Program) else graph
+
+ SHAPES_FILE = os.path.join(dirname, _SHAPES_FILE)
+ _logger.info("Load shapes of weights from {}".format(SHAPES_FILE))
+ with open(SHAPES_FILE, "r") as f:
+ shapes = json.load(f)
+ for param_name, shape in shapes.items():
+ param = graph.var(param_name)
+ if param is not None:
+ param.set_shape(shape)
+ else:
+ _logger.info('{} is not loaded'.format(param_name))
+
+ _logger.info("Load shapes of weights from {}".format(SHAPES_FILE))
+ fluid.io.load_persistables(
+ executor=exe,
+ dirname=dirname,
+ main_program=graph.program,
+ filename=None)
+ graph.update_groups_of_conv()
+ graph.infer_shape()
+ _logger.info("Load weights from {}".format(dirname))
diff --git a/paddleslim/prune/prune_walker.py b/paddleslim/prune/prune_walker.py
new file mode 100644
index 0000000000000000000000000000000000000000..688bcacd74375199296e11c7586e8e3d6946cf8f
--- /dev/null
+++ b/paddleslim/prune/prune_walker.py
@@ -0,0 +1,626 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import numpy as np
+from ..core import Registry
+from ..common import get_logger
+
+__all__ = ["PRUNE_WORKER", "conv2d"]
+
+_logger = get_logger(__name__, level=logging.INFO)
+
+PRUNE_WORKER = Registry('prune_worker')
+
+SKIP_OPS = ["conditional_block"]
+
+
+class PruneWorker(object):
+ def __init__(self, op, pruned_params=[], visited={}):
+ """
+ A wrapper of an operator used to infer the pruning information of all the related variables.
+
+ Args:
+ op(Operator): The operator to be pruned.
+ pruned_params(list): The list to store the pruning information inferred by the walker.
+ visited(dict): The auxiliary dict to record the visited operators and variables. The key is an encoded string of the operator id and variable name.
+ """
+ self.op = op
+ self.pruned_params = pruned_params
+ self.visited = visited
+
+ def prune(self, var, pruned_axis, pruned_idx):
+ """
+ Infer the shapes of the variables related to the current operator, its predecessors and successors.
+ It searches the graph to find all the variables related to `var` and records the pruning information.
+
+ Args:
+ var(Variable): The root variable of the search. It can be an input or output of the current operator.
+ pruned_axis(int): The axis of the root variable to be pruned.
+ pruned_idx(list): The indexes to be pruned on `pruned_axis` of the root variable.
+ """
+ if self._visit(var, pruned_axis):
+ self._prune(var, pruned_axis, pruned_idx)
+
+ def _visit(self, var, pruned_axis):
+ key = "_".join([str(self.op.idx()), var.name()])
+ key = "_".join([key, self.op.all_inputs()[0].name()])
+ if pruned_axis not in self.visited:
+ self.visited[pruned_axis] = {}
+ if key in self.visited[pruned_axis]:
+ return False
+ else:
+ self.visited[pruned_axis][key] = True
+ return True
+
+ def _prune(self, var, pruned_axis, pruned_idx):
+ raise NotImplementedError('Abstract method.')
+
+ def _prune_op(self, op, var, pruned_axis, pruned_idx, visited=None):
+ if op.type().endswith("_grad"):
+ return
+ if visited is not None:
+ self.visited = visited
+ cls = PRUNE_WORKER.get(op.type())
+ if cls is None:
+ if op.type() in SKIP_OPS:
+ _logger.warn("Skip operator [{}]".format(op.type()))
+ return
+
+# _logger.warn(
+# "{} op will be pruned by default walker to keep the shapes of input and output being same because its walker is not registered.".
+# format(op.type()))
+ cls = PRUNE_WORKER.get("default_walker")
+ _logger.debug("\nfrom: {}\nto: {}\npruned_axis: {}; var: {}".format(
+ self.op, op, pruned_axis, var.name()))
+ walker = cls(op, pruned_params=self.pruned_params, visited=self.visited)
+ walker.prune(var, pruned_axis, pruned_idx)
+
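+# Extension sketch (a hedged example; `my_op` is a hypothetical operator type,
+# dispatched by `PRUNE_WORKER.get(op.type())` through the class name):
+#
+#   @PRUNE_WORKER.register
+#   class my_op(PruneWorker):
+#       def __init__(self, op, pruned_params, visited):
+#           super(my_op, self).__init__(op, pruned_params, visited)
+#
+#       def _prune(self, var, pruned_axis, pruned_idx):
+#           pass  # propagate pruning to the related inputs/outputs here
+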
+
+@PRUNE_WORKER.register
+class conv2d(PruneWorker):
+ def __init__(self, op, pruned_params, visited={}):
+ super(conv2d, self).__init__(op, pruned_params, visited)
+
+ def _prune(self, var, pruned_axis, pruned_idx):
+ data_format = self.op.attr("data_format")
+ channel_axis = 1
+ if data_format == "NHWC":
+ channel_axis = 3
+ if var in self.op.inputs("Input"):
+ assert pruned_axis == channel_axis, "The Input of conv2d can only be pruned at channel axis, but got {}; var: {}".format(
+ pruned_axis, var.name())
+ filter_var = self.op.inputs("Filter")[0]
+ self._visit(filter_var, 1)
+ self.pruned_params.append((filter_var, 1, pruned_idx))
+ for op in filter_var.outputs():
+ self._prune_op(op, filter_var, 1, pruned_idx)
+
+ elif var in self.op.inputs("Filter"):
+ assert pruned_axis in [0, 1]
+
+ self.pruned_params.append((var, pruned_axis, pruned_idx))
+
+ for op in var.outputs():
+ self._prune_op(op, var, pruned_axis, pruned_idx)
+
+ if pruned_axis == 0:
+ if len(self.op.inputs("Bias")) > 0:
+ self.pruned_params.append(
+ (self.op.inputs("Bias"), channel_axis, pruned_idx))
+ output_var = self.op.outputs("Output")[0]
+ self._visit(output_var, channel_axis)
+ next_ops = output_var.outputs()
+ for op in next_ops:
+ self._prune_op(op, output_var, channel_axis, pruned_idx)
+
+ elif pruned_axis == 1:
+ input_var = self.op.inputs("Input")[0]
+ self._visit(input_var, channel_axis)
+ pre_ops = input_var.inputs()
+ for op in pre_ops:
+ self._prune_op(op, input_var, channel_axis, pruned_idx)
+ elif var in self.op.outputs("Output"):
+ assert pruned_axis == channel_axis, "pruned_axis: {}; var: {}".format(
+ pruned_axis, var.name())
+
+ filter_var = self.op.inputs("Filter")[0]
+ self._visit(filter_var, 0)
+
+ self.pruned_params.append((filter_var, 0, pruned_idx))
+
+ for op in filter_var.outputs():
+ self._prune_op(op, filter_var, 0, pruned_idx)
+
+ if len(self.op.inputs("Bias")) > 0:
+ self.pruned_params.append(
+ (self.op.inputs("Bias")[0], channel_axis, pruned_idx))
+
+ output_var = self.op.outputs("Output")[0]
+ next_ops = output_var.outputs()
+ for op in next_ops:
+ self._prune_op(op, output_var, channel_axis, pruned_idx)
+
+
+@PRUNE_WORKER.register
+class conv2d_transpose(PruneWorker):
+ def __init__(self, op, pruned_params, visited={}):
+ super(conv2d_transpose, self).__init__(op, pruned_params, visited)
+
+ def _prune(self, var, pruned_axis, pruned_idx):
+ data_format = self.op.attr("data_format")
+ channel_axis = 1
+ if data_format == "NHWC":
+ channel_axis = 3
+ if var in self.op.inputs("Input"):
+ assert pruned_axis == channel_axis, "The Input of conv2d_transpose can only be pruned at the channel axis, but got {}; var: {}".format(
+ pruned_axis, var.name())
+ filter_var = self.op.inputs("Filter")[0]
+ self._visit(filter_var, 0)
+ self.pruned_params.append((filter_var, 0, pruned_idx))
+ for op in filter_var.outputs():
+ self._prune_op(op, filter_var, 0, pruned_idx)
+
+ elif var in self.op.inputs("Filter"):
+ _logger.warn("Skip pruning output channels of conv2d_transpose!")
+ return
+ elif var in self.op.outputs("Output"):
+ assert pruned_axis == channel_axis, "pruned_axis: {}; var: {}".format(
+ pruned_axis, var.name())
+
+ filter_var = self.op.inputs("Filter")[0]
+ self._visit(filter_var, 1)
+
+ self.pruned_params.append((filter_var, 1, pruned_idx))
+
+ for op in filter_var.outputs():
+ self._prune_op(op, filter_var, 1, pruned_idx)
+
+ if len(self.op.inputs("Bias")) > 0:
+ self.pruned_params.append(
+ (self.op.inputs("Bias")[0], channel_axis, pruned_idx))
+
+ output_var = self.op.outputs("Output")[0]
+ next_ops = output_var.outputs()
+ for op in next_ops:
+ self._prune_op(op, output_var, channel_axis, pruned_idx)
+
+
+@PRUNE_WORKER.register
+class batch_norm(PruneWorker):
+ def __init__(self, op, pruned_params, visited):
+ super(batch_norm, self).__init__(op, pruned_params, visited)
+
+ def _prune(self, var, pruned_axis, pruned_idx):
+ if (var not in self.op.outputs("Y")) and (
+ var not in self.op.inputs("X")):
+ return
+
+ if var in self.op.outputs("Y"):
+ in_var = self.op.inputs("X")[0]
+ self._visit(in_var, pruned_axis)
+ pre_ops = in_var.inputs()
+ for op in pre_ops:
+ self._prune_op(op, in_var, pruned_axis, pruned_idx)
+
+ for param in ["Scale", "Bias", "Mean", "Variance"]:
+ param_var = self.op.inputs(param)[0]
+ for op in param_var.outputs():
+ self._prune_op(op, param_var, 0, pruned_idx)
+ self.pruned_params.append((param_var, 0, pruned_idx))
+
+ out_var = self.op.outputs("Y")[0]
+ self._visit(out_var, pruned_axis)
+ next_ops = out_var.outputs()
+ for op in next_ops:
+ self._prune_op(op, out_var, pruned_axis, pruned_idx)
+
+
+class elementwise_op(PruneWorker):
+ def __init__(self, op, pruned_params, visited):
+ super(elementwise_op, self).__init__(op, pruned_params, visited)
+
+ def _prune(self, var, pruned_axis, pruned_idx):
+ axis = self.op.attr("axis")
+ if axis == -1: # TODO
+ axis = 0
+ if var in self.op.outputs("Out"):
+ for name in ["X", "Y"]:
+ actual_axis = pruned_axis
+ if name == "Y":
+ actual_axis = pruned_axis - axis
+ in_var = self.op.inputs(name)[0]
+ if len(in_var.shape()) == 1 and in_var.shape()[0] == 1:
+ continue
+ pre_ops = in_var.inputs()
+ for op in pre_ops:
+ self._prune_op(op, in_var, actual_axis, pruned_idx)
+
+ else:
+ if var in self.op.inputs("X"):
+ in_var = self.op.inputs("Y")[0]
+ if not (len(in_var.shape()) == 1 and in_var.shape()[0] == 1):
+ if in_var.is_parameter():
+ self.pruned_params.append(
+ (in_var, pruned_axis - axis, pruned_idx))
+ pre_ops = in_var.inputs()
+ for op in pre_ops:
+ self._prune_op(op, in_var, pruned_axis - axis,
+ pruned_idx)
+ elif var in self.op.inputs("Y"):
+ in_var = self.op.inputs("X")[0]
+ if not (len(in_var.shape()) == 1 and in_var.shape()[0] == 1):
+ pre_ops = in_var.inputs()
+ pruned_axis = pruned_axis + axis
+ for op in pre_ops:
+ self._prune_op(op, in_var, pruned_axis, pruned_idx)
+
+ out_var = self.op.outputs("Out")[0]
+ self._visit(out_var, pruned_axis)
+ next_ops = out_var.outputs()
+ for op in next_ops:
+ self._prune_op(op, out_var, pruned_axis, pruned_idx)
+
+
+@PRUNE_WORKER.register
+class elementwise_add(elementwise_op):
+ def __init__(self, op, pruned_params, visited):
+ super(elementwise_add, self).__init__(op, pruned_params, visited)
+
+
+@PRUNE_WORKER.register
+class elementwise_sub(elementwise_op):
+ def __init__(self, op, pruned_params, visited):
+ super(elementwise_sub, self).__init__(op, pruned_params, visited)
+
+
+@PRUNE_WORKER.register
+class elementwise_mul(elementwise_op):
+ def __init__(self, op, pruned_params, visited):
+ super(elementwise_mul, self).__init__(op, pruned_params, visited)
+
+
+@PRUNE_WORKER.register
+class activation(PruneWorker):
+ def __init__(self, op, pruned_params, visited):
+ super(activation, self).__init__(op, pruned_params, visited)
+ self.input_name = "X"
+ self.output_name = "Out"
+
+ def _prune(self, var, pruned_axis, pruned_idx):
+ if var in self.op.outputs(self.output_name):
+ in_var = self.op.inputs(self.input_name)[0]
+ pre_ops = in_var.inputs()
+ for op in pre_ops:
+ self._prune_op(op, in_var, pruned_axis, pruned_idx)
+
+ out_var = self.op.outputs(self.output_name)[0]
+ self._visit(out_var, pruned_axis)
+ next_ops = out_var.outputs()
+ for op in next_ops:
+ self._prune_op(op, out_var, pruned_axis, pruned_idx)
+
+
+@PRUNE_WORKER.register
+class default_walker(PruneWorker):
+ def __init__(self, op, pruned_params, visited):
+ super(default_walker, self).__init__(op, pruned_params, visited)
+
+ def _prune(self, var, pruned_axis, pruned_idx):
+ if var in self.op.all_outputs():
+ for in_var in self.op.all_inputs():
+ if len(in_var.shape()) == len(var.shape()):
+ pre_ops = in_var.inputs()
+ for op in pre_ops:
+ self._prune_op(op, in_var, pruned_axis, pruned_idx)
+
+ for out_var in self.op.all_outputs():
+ if len(out_var.shape()) == len(var.shape()):
+ self._visit(out_var, pruned_axis)
+ next_ops = out_var.outputs()
+ for op in next_ops:
+ self._prune_op(op, out_var, pruned_axis, pruned_idx)
+
+
+@PRUNE_WORKER.register
+class uniform_random_batch_size_like(activation):
+ def __init__(self, op, pruned_params, visited):
+ super(uniform_random_batch_size_like, self).__init__(op, pruned_params,
+ visited)
+ self.input_name = "Input"
+ self.output_name = "Out"
+
+
+@PRUNE_WORKER.register
+class bilinear_interp(activation):
+ def __init__(self, op, pruned_params, visited):
+ super(bilinear_interp, self).__init__(op, pruned_params, visited)
+
+
+@PRUNE_WORKER.register
+class nearest_interp(activation):
+ def __init__(self, op, pruned_params, visited):
+ super(nearest_interp, self).__init__(op, pruned_params, visited)
+
+
+@PRUNE_WORKER.register
+class relu(activation):
+ def __init__(self, op, pruned_params, visited):
+ super(relu, self).__init__(op, pruned_params, visited)
+
+
+@PRUNE_WORKER.register
+class leaky_relu(activation):
+ def __init__(self, op, pruned_params, visited):
+ super(leaky_relu, self).__init__(op, pruned_params, visited)
+
+
+@PRUNE_WORKER.register
+class floor(activation):
+ def __init__(self, op, pruned_params, visited):
+ super(floor, self).__init__(op, pruned_params, visited)
+
+
+@PRUNE_WORKER.register
+class relu6(activation):
+ def __init__(self, op, pruned_params, visited):
+ super(relu6, self).__init__(op, pruned_params, visited)
+
+
+@PRUNE_WORKER.register
+class pool2d(activation):
+ def __init__(self, op, pruned_params, visited):
+ super(pool2d, self).__init__(op, pruned_params, visited)
+
+
+@PRUNE_WORKER.register
+class sum(PruneWorker):
+ def __init__(self, op, pruned_params, visited):
+ super(sum, self).__init__(op, pruned_params, visited)
+
+ def _prune(self, var, pruned_axis, pruned_idx):
+ if var in self.op.outputs("Out"):
+ for in_var in self.op.inputs("X"):
+ pre_ops = in_var.inputs()
+ for op in pre_ops:
+ self._prune_op(op, in_var, pruned_axis, pruned_idx)
+ elif var in self.op.inputs("X"):
+ for in_var in self.op.inputs("X"):
+ if in_var != var:
+ pre_ops = in_var.inputs()
+ for op in pre_ops:
+ self._prune_op(op, in_var, pruned_axis, pruned_idx)
+ out_var = self.op.outputs("Out")[0]
+ self._visit(out_var, pruned_axis)
+ next_ops = out_var.outputs()
+ for op in next_ops:
+ self._prune_op(op, out_var, pruned_axis, pruned_idx)
+
+
+@PRUNE_WORKER.register
+class concat(PruneWorker):
+ def __init__(self, op, pruned_params, visited):
+ super(concat, self).__init__(op, pruned_params, visited)
+
+ def _prune(self, var, pruned_axis, pruned_idx):
+ idx = []
+ axis = self.op.attr("axis")
+ if var in self.op.outputs("Out"):
+ start = 0
+ if axis == pruned_axis:
+ for _, in_var in enumerate(self.op.inputs("X")):
+ idx = []
+ for i in pruned_idx:
+ r_idx = i - start
+ if r_idx < in_var.shape()[pruned_axis] and r_idx >= 0:
+ idx.append(r_idx)
+ start += in_var.shape()[pruned_axis]
+
+ pre_ops = in_var.inputs()
+ for op in pre_ops:
+ self._prune_op(op, in_var, pruned_axis, idx)
+ idx = pruned_idx[:]
+ else:
+ for _, in_var in enumerate(self.op.inputs("X")):
+ pre_ops = in_var.inputs()
+ for op in pre_ops:
+ self._prune_op(op, in_var, pruned_axis, pruned_idx)
+ elif var in self.op.inputs("X"):
+ if axis == pruned_axis:
+ idx = []
+ start = 0
+ for v in self.op.inputs("X"):
+ if v.name() == var.name():
+ idx = [i + start for i in pruned_idx]
+ else:
+ start += v.shape()[pruned_axis]
+
+ out_var = self.op.outputs("Out")[0]
+ self._visit(out_var, pruned_axis)
+ next_ops = out_var.outputs()
+ for op in next_ops:
+ self._prune_op(op, out_var, pruned_axis, idx, visited={})
+ else:
+ for v in self.op.inputs("X"):
+ for op in v.inputs():
+ self._prune_op(op, v, pruned_axis, pruned_idx)
+ out_var = self.op.outputs("Out")[0]
+ self._visit(out_var, pruned_axis)
+ next_ops = out_var.outputs()
+ for op in next_ops:
+ self._prune_op(op, out_var, pruned_axis, pruned_idx)
+
+
+@PRUNE_WORKER.register
+class depthwise_conv2d(PruneWorker):
+ def __init__(self, op, pruned_params, visited={}):
+ super(depthwise_conv2d, self).__init__(op, pruned_params, visited)
+
+ def _prune(self, var, pruned_axis, pruned_idx):
+ data_format = self.op.attr("data_format")
+ channel_axis = 1
+ if data_format == "NHWC":
+ channel_axis = 3
+ if var in self.op.inputs("Input"):
+ assert pruned_axis == channel_axis, "The Input of depthwise_conv2d can only be pruned at the channel axis, but got {}".format(
+ pruned_axis)
+
+ filter_var = self.op.inputs("Filter")[0]
+ self.pruned_params.append((filter_var, 0, pruned_idx))
+ self._visit(filter_var, 0)
+
+ for op in filter_var.outputs():
+ self._prune_op(op, filter_var, 0, pruned_idx)
+
+ output_var = self.op.outputs("Output")[0]
+ next_ops = output_var.outputs()
+ for op in next_ops:
+ self._prune_op(op, output_var, channel_axis, pruned_idx)
+
+ elif var in self.op.inputs("Filter"):
+ assert pruned_axis in [0]
+ if pruned_axis == 0:
+ if len(self.op.inputs("Bias")) > 0:
+ self.pruned_params.append(
+ (self.op.inputs("Bias"), channel_axis, pruned_idx))
+
+ self.pruned_params.append((var, 0, pruned_idx))
+
+ for op in var.outputs():
+ self._prune_op(op, var, 0, pruned_idx)
+
+ output_var = self.op.outputs("Output")[0]
+ self._visit(output_var, channel_axis)
+ next_ops = output_var.outputs()
+ for op in next_ops:
+ self._prune_op(op, output_var, channel_axis, pruned_idx)
+ for op in var.outputs():
+ self._prune_op(op, var, pruned_axis, pruned_idx)
+ elif var in self.op.outputs("Output"):
+ assert pruned_axis == channel_axis
+ filter_var = self.op.inputs("Filter")[0]
+ self.pruned_params.append((filter_var, 0, pruned_idx))
+ self._visit(filter_var, 0)
+
+ for op in filter_var.outputs():
+ self._prune_op(op, filter_var, 0, pruned_idx)
+
+ if len(self.op.inputs("Bias")) > 0:
+ self.pruned_params.append(
+ (self.op.inputs("Bias")[0], channel_axis, pruned_idx))
+
+ in_var = self.op.inputs("Input")[0]
+ self._visit(in_var, channel_axis)
+ pre_ops = in_var.inputs()
+ for op in pre_ops:
+ self._prune_op(op, in_var, channel_axis, pruned_idx)
+
+ output_var = self.op.outputs("Output")[0]
+ next_ops = output_var.outputs()
+ for op in next_ops:
+ self._prune_op(op, output_var, channel_axis, pruned_idx)
+
+
+@PRUNE_WORKER.register
+class mul(PruneWorker):
+ def __init__(self, op, pruned_params, visited={}):
+ super(mul, self).__init__(op, pruned_params, visited)
+
+ def _prune(self, var, pruned_axis, pruned_idx):
+ if var in self.op.inputs("X"):
+ assert pruned_axis == 1, "The input X of mul can only be pruned at axis 1, but got {}".format(
+ pruned_axis)
+ idx = []
+ feature_map_size = var.shape()[2] * var.shape()[3]
+ range_idx = np.array(range(feature_map_size))
+ for i in pruned_idx:
+ idx += list(range_idx + i * feature_map_size)
+ param_var = self.op.inputs("Y")[0]
+ self.pruned_params.append((param_var, 0, idx))
+
+ for op in param_var.outputs():
+ self._prune_op(op, param_var, 0, pruned_idx)
+
+
+@PRUNE_WORKER.register
+class scale(PruneWorker):
+ def __init__(self, op, pruned_params, visited={}):
+ super(scale, self).__init__(op, pruned_params, visited)
+
+ def _prune(self, var, pruned_axis, pruned_idx):
+ if var in self.op.inputs("X"):
+ out_var = self.op.outputs("Out")[0]
+ for op in out_var.outputs():
+ self._prune_op(op, out_var, pruned_axis, pruned_idx)
+ elif var in self.op.outputs("Out"):
+ in_var = self.op.inputs("X")[0]
+ for op in in_var.inputs():
+ self._prune_op(op, in_var, pruned_axis, pruned_idx)
+
+
+@PRUNE_WORKER.register
+class momentum(PruneWorker):
+ def __init__(self, op, pruned_params, visited={}):
+ super(momentum, self).__init__(op, pruned_params, visited)
+
+ def _prune(self, var, pruned_axis, pruned_idx):
+ if var in self.op.inputs("Param"):
+ _logger.debug("pruning momentum, var:{}".format(var.name()))
+ velocity_var = self.op.inputs("Velocity")[0]
+ self.pruned_params.append((velocity_var, pruned_axis, pruned_idx))
+
+
+@PRUNE_WORKER.register
+class adam(PruneWorker):
+ def __init__(self, op, pruned_params, visited={}):
+ super(adam, self).__init__(op, pruned_params, visited)
+
+ def _prune(self, var, pruned_axis, pruned_idx):
+ if var in self.op.inputs("Param"):
+ _logger.debug("pruning momentum, var:{}".format(var.name()))
+ moment1_var = self.op.inputs("Moment1")[0]
+ self.pruned_params.append((moment1_var, pruned_axis, pruned_idx))
+ moment2_var = self.op.inputs("Moment2")[0]
+ self.pruned_params.append((moment2_var, pruned_axis, pruned_idx))
+
+
+@PRUNE_WORKER.register
+class affine_channel(PruneWorker):
+ def __init__(self, op, pruned_params, visited):
+ super(affine_channel, self).__init__(op, pruned_params, visited)
+
+ def _prune(self, var, pruned_axis, pruned_idx):
+ if (var not in self.op.outputs("Out")) and (
+ var not in self.op.inputs("X")):
+ return
+
+ if var in self.op.outputs("Out"):
+ in_var = self.op.inputs("X")[0]
+ self._visit(in_var, pruned_axis)
+ pre_ops = in_var.inputs()
+ for op in pre_ops:
+ self._prune_op(op, in_var, pruned_axis, pruned_idx)
+
+ for param in ["Scale", "Bias"]:
+ param_var = self.op.inputs(param)[0]
+ for op in param_var.outputs():
+ self._prune_op(op, param_var, 0, pruned_idx)
+ self.pruned_params.append((param_var, 0, pruned_idx))
+
+ out_var = self.op.outputs("Out")[0]
+ self._visit(out_var, pruned_axis)
+ next_ops = out_var.outputs()
+ for op in next_ops:
+ self._prune_op(op, out_var, pruned_axis, pruned_idx)
diff --git a/paddleslim/prune/pruner.py b/paddleslim/prune/pruner.py
index 95f6774ce5a36b8a6aa05fd6f989f0cb23f2339c..7fbac719f9ca10b07869791ecf0efbfd086a2cba 100644
--- a/paddleslim/prune/pruner.py
+++ b/paddleslim/prune/pruner.py
@@ -13,10 +13,15 @@
# limitations under the License.
import logging
+import sys
import numpy as np
+from functools import reduce
import paddle.fluid as fluid
import copy
from ..core import VarWrapper, OpWrapper, GraphWrapper
+from .group_param import collect_convs
+from .criterion import CRITERION
+from .idx_selector import IDX_SELECTOR
from ..common import get_logger
__all__ = ["Pruner"]
@@ -25,13 +30,27 @@ _logger = get_logger(__name__, level=logging.INFO)
class Pruner():
- def __init__(self, criterion="l1_norm"):
- """
- Args:
- criterion(str): the criterion used to sort channels for pruning.
- It only supports 'l1_norm' currently.
- """
- self.criterion = criterion
+ """The pruner used to prune channels of convolution.
+
+ Args:
+ criterion(str|function): the criterion used to sort channels for pruning.
+ idx_selector(str|function):
+
+ """
+
+ def __init__(self,
+ criterion="l1_norm",
+ idx_selector="default_idx_selector"):
+ if isinstance(criterion, str):
+ self.criterion = CRITERION.get(criterion)
+ else:
+ self.criterion = criterion
+ if isinstance(idx_selector, str):
+ self.idx_selector = IDX_SELECTOR.get(idx_selector)
+ else:
+ self.idx_selector = idx_selector
+
+ self.pruned_weights = False
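+
+ # Usage sketch (a hedged example; the program and parameter names are
+ # illustrative, not part of this API):
+ #
+ # pruner = Pruner(criterion="l1_norm")
+ # pruned_program, _, _ = pruner.prune(
+ # train_program,
+ # fluid.global_scope(),
+ # params=["conv1_weights"],
+ # ratios=[0.5],
+ # place=fluid.CPUPlace())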
def prune(self,
program,
@@ -43,9 +62,10 @@ class Pruner():
only_graph=False,
param_backup=False,
param_shape_backup=False):
- """
- Pruning the given parameters.
+ """Pruning the given parameters.
+
Args:
+
program(fluid.Program): The program to be pruned.
scope(fluid.Scope): The scope storing paramaters to be pruned.
params(list): A list of parameter names to be pruned.
@@ -57,593 +77,99 @@ class Pruner():
False means modifying graph and variables in scope. Default: False.
param_backup(bool): Whether to return a dict to backup the values of parameters. Default: False.
param_shape_backup(bool): Whether to return a dict to backup the shapes of parameters. Default: False.
+
Returns:
- Program: The pruned program.
- param_backup: A dict to backup the values of parameters.
- param_shape_backup: A dict to backup the shapes of parameters.
+ tuple: ``(pruned_program, param_backup, param_shape_backup)``. ``pruned_program`` is the pruned program. ``param_backup`` is a dict to backup the values of parameters. ``param_shape_backup`` is a dict to backup the shapes of parameters.
"""
self.pruned_list = []
graph = GraphWrapper(program.clone())
param_backup = {} if param_backup else None
param_shape_backup = {} if param_shape_backup else None
- self._prune_parameters(
- graph,
- scope,
- params,
- ratios,
- place,
- lazy=lazy,
- only_graph=only_graph,
- param_backup=param_backup,
- param_shape_backup=param_shape_backup)
- for op in graph.ops():
- if op.type() == 'depthwise_conv2d' or op.type(
- ) == 'depthwise_conv2d_grad':
- op.set_attr('groups', op.inputs('Filter')[0].shape()[0])
- return graph.program, param_backup, param_shape_backup
- def _prune_filters_by_ratio(self,
- scope,
- params,
- ratio,
- place,
- lazy=False,
- only_graph=False,
- param_shape_backup=None,
- param_backup=None):
- """
- Pruning filters by given ratio.
- Args:
- scope(fluid.core.Scope): The scope used to pruning filters.
- params(list): A list of filter parameters.
- ratio(float): The ratio to be pruned.
- place(fluid.Place): The device place of filter parameters.
- lazy(bool): True means setting the pruned elements to zero.
- False means cutting down the pruned elements.
- only_graph(bool): True means only modifying the graph.
- False means modifying graph and variables in scope.
- """
- if params[0].name() in self.pruned_list[0]:
- return
-
- if only_graph:
- pruned_num = int(round(params[0].shape()[0] * ratio))
- for param in params:
- ori_shape = param.shape()
- if param_backup is not None and (
- param.name() not in param_backup):
- param_backup[param.name()] = copy.deepcopy(ori_shape)
- new_shape = list(ori_shape)
- new_shape[0] -= pruned_num
- param.set_shape(new_shape)
- _logger.debug("prune [{}] from {} to {}".format(param.name(
- ), ori_shape, new_shape))
- self.pruned_list[0].append(param.name())
- return range(pruned_num)
-
- else:
-
- param_t = scope.find_var(params[0].name()).get_tensor()
- pruned_idx = self._cal_pruned_idx(
- params[0].name(), np.array(param_t), ratio, axis=0)
- for param in params:
- assert isinstance(param, VarWrapper)
- param_t = scope.find_var(param.name()).get_tensor()
- if param_backup is not None and (
- param.name() not in param_backup):
- param_backup[param.name()] = copy.deepcopy(
- np.array(param_t))
- try:
- pruned_param = self._prune_tensor(
- np.array(param_t),
- pruned_idx,
- pruned_axis=0,
- lazy=lazy)
- except IndexError as e:
- _logger.error("Pruning {}, but get [{}]".format(param.name(
- ), e))
-
- param_t.set(pruned_param, place)
- ori_shape = param.shape()
- if param_shape_backup is not None and (
- param.name() not in param_shape_backup):
- param_shape_backup[param.name()] = copy.deepcopy(
- param.shape())
- new_shape = list(param.shape())
- new_shape[0] = pruned_param.shape[0]
- param.set_shape(new_shape)
- _logger.debug("prune [{}] from {} to {}".format(param.name(
- ), ori_shape, new_shape))
- self.pruned_list[0].append(param.name())
- return pruned_idx
-
- def _prune_parameter_by_idx(self,
- scope,
- params,
- pruned_idx,
- pruned_axis,
- place,
- lazy=False,
- only_graph=False,
- param_shape_backup=None,
- param_backup=None):
- """
- Pruning parameters in given axis.
- Args:
- scope(fluid.core.Scope): The scope storing paramaters to be pruned.
- params(VarWrapper): The parameter to be pruned.
- pruned_idx(list): The index of elements to be pruned.
- pruned_axis(int): The pruning axis.
- place(fluid.Place): The device place of filter parameters.
- lazy(bool): True means setting the pruned elements to zero.
- False means cutting down the pruned elements.
- only_graph(bool): True means only modifying the graph.
- False means modifying graph and variables in scope.
- """
- if params[0].name() in self.pruned_list[pruned_axis]:
- return
- if only_graph:
- pruned_num = len(pruned_idx)
- for param in params:
- ori_shape = param.shape()
- if param_backup is not None and (
- param.name() not in param_backup):
- param_backup[param.name()] = copy.deepcopy(ori_shape)
- new_shape = list(ori_shape)
- new_shape[pruned_axis] -= pruned_num
- param.set_shape(new_shape)
- _logger.debug("prune [{}] from {} to {}".format(param.name(
- ), ori_shape, new_shape))
- self.pruned_list[pruned_axis].append(param.name())
-
- else:
- for param in params:
- assert isinstance(param, VarWrapper)
- param_t = scope.find_var(param.name()).get_tensor()
- if param_backup is not None and (
- param.name() not in param_backup):
- param_backup[param.name()] = copy.deepcopy(
- np.array(param_t))
- pruned_param = self._prune_tensor(
- np.array(param_t), pruned_idx, pruned_axis, lazy=lazy)
- param_t.set(pruned_param, place)
- ori_shape = param.shape()
-
- if param_shape_backup is not None and (
- param.name() not in param_shape_backup):
- param_shape_backup[param.name()] = copy.deepcopy(
- param.shape())
- new_shape = list(param.shape())
- new_shape[pruned_axis] = pruned_param.shape[pruned_axis]
- param.set_shape(new_shape)
- _logger.debug("prune [{}] from {} to {}".format(param.name(
- ), ori_shape, new_shape))
- self.pruned_list[pruned_axis].append(param.name())
-
- def _forward_search_related_op(self, graph, node):
- """
- Forward search operators that will be affected by pruning of param.
- Args:
- graph(GraphWrapper): The graph to be searched.
- node(VarWrapper|OpWrapper): The current pruned parameter or operator.
- Returns:
- list: A list of operators.
- """
visited = {}
- for op in graph.ops():
- visited[op.idx()] = False
- stack = []
- visit_path = []
- if isinstance(node, VarWrapper):
- for op in graph.ops():
- if (not op.is_bwd_op()) and (node in op.all_inputs()):
- next_ops = self._get_next_unvisited_op(graph, visited, op)
- # visit_path.append(op)
- visited[op.idx()] = True
- for next_op in next_ops:
- if visited[next_op.idx()] == False:
- stack.append(next_op)
- visit_path.append(next_op)
- visited[next_op.idx()] = True
- elif isinstance(node, OpWrapper):
- next_ops = self._get_next_unvisited_op(graph, visited, node)
- for next_op in next_ops:
- if visited[next_op.idx()] == False:
- stack.append(next_op)
- visit_path.append(next_op)
- visited[next_op.idx()] = True
- while len(stack) > 0:
- #top_op = stack[len(stack) - 1]
- top_op = stack.pop(0)
- next_ops = None
- if top_op.type() in ["conv2d", "deformable_conv"]:
- next_ops = None
- elif top_op.type() in ["mul", "concat"]:
- next_ops = None
- else:
- next_ops = self._get_next_unvisited_op(graph, visited, top_op)
- if next_ops != None:
- for op in next_ops:
- if visited[op.idx()] == False:
- stack.append(op)
- visit_path.append(op)
- visited[op.idx()] = True
-
- return visit_path
-
- def _get_next_unvisited_op(self, graph, visited, top_op):
- """
- Get next unvisited adjacent operators of given operators.
- Args:
- graph(GraphWrapper): The graph used to search.
- visited(list): The ids of operators that has been visited.
- top_op: The given operator.
- Returns:
- list: A list of operators.
- """
- assert isinstance(top_op, OpWrapper)
- next_ops = []
- for op in graph.next_ops(top_op):
- if (visited[op.idx()] == False) and (not op.is_bwd_op()):
- next_ops.append(op)
- return next_ops
-
- def _get_accumulator(self, graph, param):
- """
- Get accumulators of given parameter. The accumulator was created by optimizer.
- Args:
- graph(GraphWrapper): The graph used to search.
- param(VarWrapper): The given parameter.
- Returns:
- list: A list of accumulators which are variables.
- """
- assert isinstance(param, VarWrapper)
- params = []
- for op in param.outputs():
- if op.is_opt_op():
- for out_var in op.all_outputs():
- if graph.is_persistable(out_var) and out_var.name(
- ) != param.name():
- params.append(out_var)
- return params
-
- def _forward_pruning_ralated_params(self,
- graph,
- scope,
- param,
- place,
- ratio=None,
- pruned_idxs=None,
- lazy=False,
- only_graph=False,
- param_backup=None,
- param_shape_backup=None):
- """
- Pruning all the parameters affected by the pruning of given parameter.
- Args:
- graph(GraphWrapper): The graph to be searched.
- scope(fluid.core.Scope): The scope storing paramaters to be pruned.
- param(VarWrapper): The given parameter.
- place(fluid.Place): The device place of filter parameters.
- ratio(float): The target ratio to be pruned.
- pruned_idx(list): The index of elements to be pruned.
- lazy(bool): True means setting the pruned elements to zero.
- False means cutting down the pruned elements.
- only_graph(bool): True means only modifying the graph.
- False means modifying graph and variables in scope.
- """
- assert isinstance(
- graph,
- GraphWrapper), "graph must be instance of slim.core.GraphWrapper"
- assert isinstance(
- param,
- VarWrapper), "param must be instance of slim.core.VarWrapper"
-
- if param.name() in self.pruned_list[0]:
- return
- related_ops = self._forward_search_related_op(graph, param)
- for op in related_ops:
- _logger.debug("relate op: {};".format(op))
- if ratio is None:
- assert pruned_idxs is not None
- self._prune_parameter_by_idx(
- scope, [param] + self._get_accumulator(graph, param),
- pruned_idxs,
- pruned_axis=0,
- place=place,
- lazy=lazy,
- only_graph=only_graph,
- param_backup=param_backup,
- param_shape_backup=param_shape_backup)
-
- else:
- pruned_idxs = self._prune_filters_by_ratio(
- scope, [param] + self._get_accumulator(graph, param),
- ratio,
- place,
- lazy=lazy,
- only_graph=only_graph,
- param_backup=param_backup,
- param_shape_backup=param_shape_backup)
- self._prune_ops(related_ops, pruned_idxs, graph, scope, place, lazy,
- only_graph, param_backup, param_shape_backup)
-
- def _prune_ops(self, ops, pruned_idxs, graph, scope, place, lazy,
- only_graph, param_backup, param_shape_backup):
- for idx, op in enumerate(ops):
- if op.type() in ["conv2d", "deformable_conv"]:
- for in_var in op.all_inputs():
- if graph.is_parameter(in_var):
- conv_param = in_var
- self._prune_parameter_by_idx(
- scope, [conv_param] + self._get_accumulator(
- graph, conv_param),
- pruned_idxs,
- pruned_axis=1,
- place=place,
- lazy=lazy,
- only_graph=only_graph,
- param_backup=param_backup,
- param_shape_backup=param_shape_backup)
- if op.type() == "depthwise_conv2d":
- for in_var in op.all_inputs():
- if graph.is_parameter(in_var):
- conv_param = in_var
- self._prune_parameter_by_idx(
- scope, [conv_param] + self._get_accumulator(
- graph, conv_param),
- pruned_idxs,
- pruned_axis=0,
- place=place,
- lazy=lazy,
- only_graph=only_graph,
- param_backup=param_backup,
- param_shape_backup=param_shape_backup)
- elif op.type() == "elementwise_add":
- # pruning bias
- for in_var in op.all_inputs():
- if graph.is_parameter(in_var):
- bias_param = in_var
- self._prune_parameter_by_idx(
- scope, [bias_param] + self._get_accumulator(
- graph, bias_param),
- pruned_idxs,
- pruned_axis=0,
- place=place,
- lazy=lazy,
- only_graph=only_graph,
- param_backup=param_backup,
- param_shape_backup=param_shape_backup)
- elif op.type() == "mul": # pruning fc layer
- fc_input = None
- fc_param = None
- for in_var in op.all_inputs():
- if graph.is_parameter(in_var):
- fc_param = in_var
- else:
- fc_input = in_var
-
- idx = []
- feature_map_size = fc_input.shape()[2] * fc_input.shape()[3]
- range_idx = np.array(range(feature_map_size))
- for i in pruned_idxs:
- idx += list(range_idx + i * feature_map_size)
- corrected_idxs = idx
- self._prune_parameter_by_idx(
- scope, [fc_param] + self._get_accumulator(graph, fc_param),
- corrected_idxs,
- pruned_axis=0,
- place=place,
- lazy=lazy,
- only_graph=only_graph,
- param_backup=param_backup,
- param_shape_backup=param_shape_backup)
-
- elif op.type() == "concat":
- concat_inputs = op.all_inputs()
- last_op = ops[idx - 1]
- concat_idx = None
- for last_op in reversed(ops):
- for out_var in last_op.all_outputs():
- if out_var in concat_inputs:
- concat_idx = concat_inputs.index(out_var)
- break
- if concat_idx is not None:
- break
- offset = 0
- for ci in range(concat_idx):
- offset += concat_inputs[ci].shape()[1]
- corrected_idxs = [x + offset for x in pruned_idxs]
- related_ops = self._forward_search_related_op(graph, op)
-
- for op in related_ops:
- _logger.debug("concat relate op: {};".format(op))
-
- self._prune_ops(related_ops, corrected_idxs, graph, scope,
- place, lazy, only_graph, param_backup,
- param_shape_backup)
- elif op.type() == "batch_norm":
- bn_inputs = op.all_inputs()
- in_num = len(bn_inputs)
- beta = bn_inputs[0]
- mean = bn_inputs[1]
- alpha = bn_inputs[2]
- variance = bn_inputs[3]
- self._prune_parameter_by_idx(
- scope, [mean] + self._get_accumulator(graph, mean),
- pruned_idxs,
- pruned_axis=0,
- place=place,
- lazy=lazy,
- only_graph=only_graph,
- param_backup=param_backup,
- param_shape_backup=param_shape_backup)
- self._prune_parameter_by_idx(
- scope, [variance] + self._get_accumulator(graph, variance),
- pruned_idxs,
- pruned_axis=0,
- place=place,
- lazy=lazy,
- only_graph=only_graph,
- param_backup=param_backup,
- param_shape_backup=param_shape_backup)
- self._prune_parameter_by_idx(
- scope, [alpha] + self._get_accumulator(graph, alpha),
- pruned_idxs,
- pruned_axis=0,
- place=place,
- lazy=lazy,
- only_graph=only_graph,
- param_backup=param_backup,
- param_shape_backup=param_shape_backup)
- self._prune_parameter_by_idx(
- scope, [beta] + self._get_accumulator(graph, beta),
- pruned_idxs,
- pruned_axis=0,
- place=place,
- lazy=lazy,
- only_graph=only_graph,
- param_backup=param_backup,
- param_shape_backup=param_shape_backup)
-
- def _prune_parameters(self,
- graph,
- scope,
- params,
- ratios,
- place,
- lazy=False,
- only_graph=False,
- param_backup=None,
- param_shape_backup=None):
- """
- Pruning the given parameters.
- Args:
- graph(GraphWrapper): The graph to be searched.
- scope(fluid.core.Scope): The scope storing paramaters to be pruned.
- params(list): A list of parameter names to be pruned.
- ratios(list): A list of ratios to be used to pruning parameters.
- place(fluid.Place): The device place of filter parameters.
- pruned_idx(list): The index of elements to be pruned.
- lazy(bool): True means setting the pruned elements to zero.
- False means cutting down the pruned elements.
- only_graph(bool): True means only modifying the graph.
- False means modifying graph and variables in scope.
- """
- assert len(params) == len(ratios)
- self.pruned_list = [[], []]
+ pruned_params = []
for param, ratio in zip(params, ratios):
- assert isinstance(param, str) or isinstance(param, unicode)
- if param in self.pruned_list[0]:
- _logger.info("Skip {}".format(param))
+ _logger.info("pruning: {}".format(param))
+ if graph.var(param) is None:
+ _logger.warning(
+ "Variable[{}] to be pruned is not in current graph.".format(
+ param))
continue
- _logger.info("pruning param: {}".format(param))
- param = graph.var(param)
- self._forward_pruning_ralated_params(
- graph,
- scope,
- param,
- place,
- ratio=ratio,
- lazy=lazy,
- only_graph=only_graph,
- param_backup=param_backup,
- param_shape_backup=param_shape_backup)
- ops = param.outputs()
- for op in ops:
- if op.type() in ['conv2d', 'deformable_conv']:
- brother_ops = self._search_brother_ops(graph, op)
- for broher in brother_ops:
- _logger.debug("pruning brother: {}".format(broher))
- for p in graph.get_param_by_op(broher):
- self._forward_pruning_ralated_params(
- graph,
- scope,
- p,
- place,
- ratio=ratio,
- lazy=lazy,
- only_graph=only_graph,
- param_backup=param_backup,
- param_shape_backup=param_shape_backup)
-
- def _search_brother_ops(self, graph, op_node):
- """
- Search brother operators that was affected by pruning of given operator.
- Args:
- graph(GraphWrapper): The graph to be searched.
- op_node(OpWrapper): The start node for searching.
- Returns:
- list: A list of operators.
- """
- _logger.debug("######################search: {}######################".
- format(op_node))
- visited = [op_node.idx()]
- stack = []
- brothers = []
- for op in graph.next_ops(op_node):
- if ("conv2d" not in op.type()) and (
- "concat" not in op.type()) and (
- "deformable_conv" not in op.type()) and (
- op.type() != 'fc') and (
- not op.is_bwd_op()) and (not op.is_opt_op()):
- stack.append(op)
- visited.append(op.idx())
- while len(stack) > 0:
- top_op = stack.pop()
- for parent in graph.pre_ops(top_op):
- if parent.idx() not in visited and (
- not parent.is_bwd_op()) and (not parent.is_opt_op()):
- _logger.debug("----------go back from {} to {}----------".
- format(top_op, parent))
- if (('conv2d' in parent.type()) or
- ("deformable_conv" in parent.type()) or
- (parent.type() == 'fc')):
- brothers.append(parent)
- else:
- stack.append(parent)
- visited.append(parent.idx())
+ group = collect_convs([param], graph,
+ visited)[0] # [(name, axis, pruned_idx)]
+ if group is None or len(group) == 0:
+ continue
+ if only_graph and self.idx_selector.__name__ == "default_idx_selector":
- for child in graph.next_ops(top_op):
- if ('conv2d' not in child.type()) and (
- "concat" not in child.type()) and (
- 'deformable_conv' not in child.type()) and (
- child.type() != 'fc') and (
- child.idx() not in visited) and (
- not child.is_bwd_op()) and (
- not child.is_opt_op()):
- stack.append(child)
- visited.append(child.idx())
- _logger.debug("brothers: {}".format(brothers))
- _logger.debug(
- "######################Finish search######################".format(
- op_node))
- return brothers
+ param_v = graph.var(param)
+ pruned_num = int(round(param_v.shape()[0] * ratio))
+ pruned_idx = [0] * pruned_num
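+ # Placeholder indices: in graph-only mode only the count of pruned
+ # indices is used below to shrink the parameter shapes.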
+ for name, axis, _ in group:
+ pruned_params.append((name, axis, pruned_idx))
- def _cal_pruned_idx(self, name, param, ratio, axis):
- """
- Calculate the index to be pruned on axis by given pruning ratio.
- Args:
- name(str): The name of parameter to be pruned.
- param(np.array): The data of parameter to be pruned.
- ratio(float): The ratio to be pruned.
- axis(int): The axis to be used for pruning given parameter.
- If it is None, the value in self.pruning_axis will be used.
- default: None.
- Returns:
- list: The indexes to be pruned on axis.
- """
- prune_num = int(round(param.shape[axis] * ratio))
- reduce_dims = [i for i in range(len(param.shape)) if i != axis]
- if self.criterion == 'l1_norm':
- criterions = np.sum(np.abs(param), axis=tuple(reduce_dims))
- pruned_idx = criterions.argsort()[:prune_num]
- return pruned_idx
+ else:
+ assert not self.pruned_weights, \
+ "The weights have been pruned once."
+ group_values = []
+ for name, axis, pruned_idx in group:
+ values = np.array(scope.find_var(name).get_tensor())
+ group_values.append((name, values, axis, pruned_idx))
+
+ scores = self.criterion(
+ group_values, graph) # [(name, axis, score, pruned_idx)]
+
+ pruned_params.extend(self.idx_selector(scores, ratio))
+
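+ # Merge pruned indices into {param_name: {axis: [index arrays]}} so each
+ # parameter/axis pair is cut exactly once below.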
+ merge_pruned_params = {}
+ for param, pruned_axis, pruned_idx in pruned_params:
+ if param not in merge_pruned_params:
+ merge_pruned_params[param] = {}
+ if pruned_axis not in merge_pruned_params[param]:
+ merge_pruned_params[param][pruned_axis] = []
+ merge_pruned_params[param][pruned_axis].append(pruned_idx)
+
+ for param_name in merge_pruned_params:
+ for pruned_axis in merge_pruned_params[param_name]:
+ pruned_idx = np.concatenate(merge_pruned_params[param_name][
+ pruned_axis])
+ param = graph.var(param_name)
+ if not lazy:
+ _logger.debug("{}\t{}\t{}\t{}".format(
+ param.name(), pruned_axis,
+ param.shape()[pruned_axis], len(pruned_idx)))
+ if param_shape_backup is not None:
+ origin_shape = copy.deepcopy(param.shape())
+ param_shape_backup[param.name()] = origin_shape
+ new_shape = list(param.shape())
+ new_shape[pruned_axis] -= len(pruned_idx)
+ param.set_shape(new_shape)
+ if not only_graph:
+ param_t = scope.find_var(param.name()).get_tensor()
+ if param_backup is not None and (
+ param.name() not in param_backup):
+ param_backup[param.name()] = copy.deepcopy(
+ np.array(param_t))
+ try:
+ pruned_param = self._prune_tensor(
+ np.array(param_t),
+ pruned_idx,
+ pruned_axis=pruned_axis,
+ lazy=lazy)
+ except IndexError as e:
+ _logger.error("Pruning {}, but get [{}]".format(
+ param.name(), e))
+ continue
+
+ param_t.set(pruned_param, place)
+ graph.update_groups_of_conv()
+ graph.infer_shape()
+ self.pruned_weights = (not only_graph)
+ return graph.program, param_backup, param_shape_backup
def _prune_tensor(self, tensor, pruned_idx, pruned_axis, lazy=False):
"""
Pruning an array by indexes on given axis.
+
Args:
tensor(numpy.array): The target array to be pruned.
pruned_idx(list): The indexes to be pruned.
@@ -651,6 +177,7 @@ class Pruner():
lazy(bool): True means setting the pruned elements to zero.
False means remove the pruned elements from memory.
default: False.
+
Returns:
numpy.array: The pruned array.
"""
diff --git a/paddleslim/prune/sensitive.py b/paddleslim/prune/sensitive.py
index 5b9d229d9c011f0ff495def1d0a0b6519cf39351..a5a6e3601e4a493db17de83200e48bf04109164a 100644
--- a/paddleslim/prune/sensitive.py
+++ b/paddleslim/prune/sensitive.py
@@ -26,8 +26,8 @@ from ..prune import Pruner
_logger = get_logger(__name__, level=logging.INFO)
__all__ = [
- "sensitivity", "flops_sensitivity", "load_sensitivities",
- "merge_sensitive", "get_ratios_by_loss"
+ "sensitivity", "flops_sensitivity", "load_sensitivities", "merge_sensitive",
+ "get_ratios_by_loss"
]
@@ -36,7 +36,38 @@ def sensitivity(program,
param_names,
eval_func,
sensitivities_file=None,
- pruned_ratios=None):
+ pruned_ratios=None,
+ eval_args=None,
+ criterion='l1_norm'):
+ """Compute the sensitivities of convolutions in a model. The sensitivity of a convolution is the losses of accuracy on test dataset in differenct pruned ratios. The sensitivities can be used to get a group of best ratios with some condition.
+ This function return a dict storing sensitivities as below:
+
+ .. code-block:: python
+
+ {"weight_0":
+ {0.1: 0.22,
+ 0.2: 0.33
+ },
+ "weight_1":
+ {0.1: 0.21,
+ 0.2: 0.4
+ }
+ }
+
+ ``weight_0`` is the parameter name of a convolution. ``sensitivities['weight_0']`` is a dict in which the key is the pruned ratio and the value is the percent loss.
+
+
+ Args:
+ program(paddle.fluid.Program): The program to be analyzed.
+ place(fluid.CPUPlace | fluid.CUDAPlace): The device place of filter parameters.
+ param_names(list): The parameter names of the convolutions to be analyzed.
+ eval_func(function): The callback function used to evaluate the model. It should accept an instance of `paddle.fluid.Program` as argument and return a score on the test dataset.
+ sensitivities_file(str): The file used to save the sensitivities. The latest computed sensitivities are appended to the file, and sensitivities already stored in the file are not computed again. This file can be loaded by the `pickle` library.
+ pruned_ratios(list): The ratios to be pruned. Default: ``[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]``.
+ eval_args(optional): If set, ``eval_func`` is called with ``eval_args`` instead of the program to be evaluated. Default: ``None``.
+ criterion(str): The criterion used by the ``Pruner`` to select pruned filters. Default: ``'l1_norm'``.
+
+ Returns:
+ dict: A dict storing sensitivities.
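+
+ Example (a minimal sketch; the tiny network, the parameter name and the
+ fixed metric returned by ``eval_func`` are placeholders for your own):
+
+ .. code-block:: python
+
+     import paddle.fluid as fluid
+     from paddleslim.prune import sensitivity
+
+     place = fluid.CPUPlace()
+     exe = fluid.Executor(place)
+     main_prog = fluid.Program()
+     startup_prog = fluid.Program()
+     with fluid.program_guard(main_prog, startup_prog):
+         image = fluid.data(name='image', shape=[None, 1, 28, 28], dtype='float32')
+         conv = fluid.layers.conv2d(
+             image, 8, 3, param_attr=fluid.ParamAttr(name="conv1_weights"))
+     exe.run(startup_prog)
+
+     def eval_func(program):
+         # Evaluate ``program`` on your test set; a constant keeps the sketch short.
+         return 0.9
+
+     sens = sensitivity(
+         main_prog, place, ["conv1_weights"], eval_func,
+         sensitivities_file="sensitivities.data",
+         pruned_ratios=[0.1, 0.2])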
+ """
scope = fluid.global_scope()
graph = GraphWrapper(program)
sensitivities = load_sensitivities(sensitivities_file)
@@ -54,9 +85,12 @@ def sensitivity(program,
_logger.debug('{}, {} has computed.'.format(name, ratio))
continue
if baseline is None:
- baseline = eval_func(graph.program)
+ if eval_args is None:
+ baseline = eval_func(graph.program)
+ else:
+ baseline = eval_func(eval_args)
- pruner = Pruner()
+ pruner = Pruner(criterion=criterion)
_logger.info("sensitive - param: {}; ratios: {}".format(name,
ratio))
pruned_program, param_backup, _ = pruner.prune(
@@ -68,7 +102,10 @@ def sensitivity(program,
lazy=True,
only_graph=False,
param_backup=True)
- pruned_metric = eval_func(pruned_program)
+ if eval_args is None:
+ pruned_metric = eval_func(pruned_program)
+ else:
+ pruned_metric = eval_func(eval_args)
loss = (baseline - pruned_metric) / baseline
_logger.info("pruned param: {}; {}; loss={}".format(name, ratio,
loss))
@@ -159,13 +196,13 @@ def flops_sensitivity(program,
def merge_sensitive(sensitivities):
- """
- Merge sensitivities.
+ """Merge sensitivities.
+
Args:
sensitivities(list): The sensitivities to be merged. It can be a list of sensitivities files or a list of dicts.
Returns:
- sensitivities(dict): A dict with sensitivities.
+ dict: A dict storing sensitivities.
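+
+ Example (a sketch; the two dicts stand in for sensitivities computed or
+ loaded elsewhere):
+
+ .. code-block:: python
+
+     s0 = {"conv1_weights": {0.1: 0.05}}
+     s1 = {"conv1_weights": {0.2: 0.11}}
+     merged = merge_sensitive([s0, s1])
+     # merged == {"conv1_weights": {0.1: 0.05, 0.2: 0.11}}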
"""
assert len(sensitivities) > 0
if not isinstance(sensitivities[0], dict):
@@ -182,8 +219,13 @@ def merge_sensitive(sensitivities):
def load_sensitivities(sensitivities_file):
- """
- Load sensitivities from file.
+ """Load sensitivities from file.
+
+ Args:
+ sensitivities_file(str): The file storing sensitivities.
+
+ Returns:
+ dict: A dict storing sensitivities.
"""
sensitivities = {}
if sensitivities_file and os.path.exists(sensitivities_file):
@@ -196,8 +238,11 @@ def load_sensitivities(sensitivities_file):
def _save_sensitivities(sensitivities, sensitivities_file):
- """
- Save sensitivities into file.
+ """Save sensitivities into file.
+
+ Args:
+ sensitivities(dict): The sensitivities to be saved.
+ sensitivities_file(str): The file to save the sensitivities to.
"""
with open(sensitivities_file, 'wb') as f:
pickle.dump(sensitivities, f)
@@ -217,11 +262,12 @@ def get_ratios_by_loss(sensitivities, loss):
Returns:
- ratios(dict): A group of ratios. The key of dict is name of parameters while the value is the ratio to be pruned.
+ dict: A group of ratios. The key of the dict is the parameter name and the value is the ratio to be pruned.
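+
+ Example (a sketch of the linear interpolation this function performs):
+
+ .. code-block:: python
+
+     from paddleslim.prune import get_ratios_by_loss
+
+     sens = {"conv1_weights": {0.1: 0.01, 0.2: 0.05}}
+     ratios = get_ratios_by_loss(sens, 0.03)
+     # 0.03 lies between the losses 0.01 and 0.05, so the ratio is
+     # interpolated: 0.1 + (0.03 - 0.01) * (0.2 - 0.1) / (0.05 - 0.01) = 0.15
+     # ratios == {"conv1_weights": 0.15}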
"""
ratios = {}
for param, losses in sensitivities.items():
losses = losses.items()
+ losses = list(losses)
losses.sort()
for i in range(len(losses))[::-1]:
if losses[i][1] <= loss:
@@ -236,7 +282,7 @@ def get_ratios_by_loss(sensitivities, loss):
ratio = r0 + (loss - l0) * (r1 - r0) / (l1 - l0)
ratios[param] = ratio
if ratio > 1:
- print losses, ratio, (r1 - r0) / (l1 - l0), i
+ _logger.info("{} {} {} {}".format(losses, ratio, (r1 - r0) / (l1 - l0), i))
break
return ratios
diff --git a/paddleslim/prune/sensitive_pruner.py b/paddleslim/prune/sensitive_pruner.py
index 37d5965af06f2b4fc56e5ef03429ff22fb0cbd6b..c216482271129a222ad128c14052855a34bafe80 100644
--- a/paddleslim/prune/sensitive_pruner.py
+++ b/paddleslim/prune/sensitive_pruner.py
@@ -30,18 +30,20 @@ _logger = get_logger(__name__, level=logging.INFO)
class SensitivePruner(object):
+ """
+ Pruner used to prune parameters iteratively according to the sensitivities
+ of the parameters in each step.
+
+ Args:
+ place(fluid.CUDAPlace | fluid.CPUPlace): The device place where the
+ program executes.
+ eval_func(function): A callback function used to evaluate the pruned
+ program. The argument of this function is the pruned program,
+ and it returns a score for the given program.
+ scope(fluid.Scope): The scope used to execute the program.
+ checkpoints(str): The path used to save and load checkpoints. Default: ``None``.
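+
+ Example (a sketch; the import path, checkpoint directory and constant
+ metric are assumptions for illustration):
+
+ .. code-block:: python
+
+     import paddle.fluid as fluid
+     from paddleslim.prune import SensitivePruner
+
+     def eval_func(program):
+         # Evaluate ``program`` on your test set and return a metric.
+         return 0.9
+
+     pruner = SensitivePruner(
+         fluid.CPUPlace(), eval_func, checkpoints="./checkpoints")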
+ """
+
def __init__(self, place, eval_func, scope=None, checkpoints=None):
- """
- Pruner used to prune parameters iteratively according to sensitivities
- of parameters in each step.
- Args:
- place(fluid.CUDAPlace | fluid.CPUPlace): The device place where
- program execute.
- eval_func(function): A callback function used to evaluate pruned
- program. The argument of this function is pruned program.
- And it return a score of given program.
- scope(fluid.scope): The scope used to execute program.
- """
self._eval_func = eval_func
self._iter = 0
self._place = place
@@ -64,7 +66,7 @@ class SensitivePruner(object):
exe = fluid.Executor(self._place)
checkpoints = self._checkpoints if checkpoints is None else checkpoints
- print("check points: {}".format(checkpoints))
+ _logger.info("check points: {}".format(checkpoints))
main_program = None
eval_program = None
if checkpoints is not None:
@@ -87,8 +89,9 @@ class SensitivePruner(object):
with fluid.scope_guard(self._scope):
fluid.io.load_persistables(exe, latest_ck_path,
main_program, "__params__")
- print("load checkpoint from: {}".format(latest_ck_path))
- print("flops of eval program: {}".format(flops(eval_program)))
+ _logger.info("load checkpoint from: {}".format(latest_ck_path))
+ _logger.info("flops of eval program: {}".format(
+ flops(eval_program)))
return main_program, eval_program, self._iter
def greedy_prune(self,
@@ -108,7 +111,7 @@ class SensitivePruner(object):
self._eval_func,
sensitivities_file=sensitivities_file,
pruned_flops_rate=pruned_flops_rate)
- print sensitivities
+ _logger.info(sensitivities)
params, ratios = self._greedy_ratio_by_sensitive(sensitivities, topk)
_logger.info("Pruning: {} by {}".format(params, ratios))
@@ -134,12 +137,14 @@ class SensitivePruner(object):
def prune(self, train_program, eval_program, params, pruned_flops):
"""
Pruning parameters of training and evaluation network by sensitivities in current step.
+
Args:
train_program(fluid.Program): The training program to be pruned.
eval_program(fluid.Program): The evaluation program to be pruned. And it is also used to calculate sensitivities of parameters.
params(list): The parameters to be pruned.
pruned_flops(float): The ratio of FLOPS to be pruned in current step.
- Return:
+
+ Returns:
tuple: A tuple of pruned training program and pruned evaluation program.
"""
_logger.info("Pruning: {}".format(params))
@@ -152,7 +157,7 @@ class SensitivePruner(object):
self._eval_func,
sensitivities_file=sensitivities_file,
step_size=0.1)
- print sensitivities
+ _logger.info(sensitivities)
_, ratios = self.get_ratios_by_sensitive(sensitivities, pruned_flops,
eval_program)
@@ -198,9 +203,9 @@ class SensitivePruner(object):
pruned_flops(float): The percent of FLOPS to be pruned.
eval_program(Program): The program whose FLOPS is considered.
- Return:
+ Returns:
- ratios(dict): A group of ratios. The key of dict is name of parameters while the value is the ratio to be pruned.
+ dict: A group of ratios. The key of the dict is the parameter name and the value is the ratio to be pruned.
"""
min_loss = 0.
diff --git a/paddleslim/quant/__init__.py b/paddleslim/quant/__init__.py
index 5f5f9a300630abac32a9c0301328e344da082c55..3c9a52ed6a1f274c6e250dadd6d61e4aa6581217 100644
--- a/paddleslim/quant/__init__.py
+++ b/paddleslim/quant/__init__.py
@@ -12,5 +12,28 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from .quanter import quant_aware, quant_post, convert
+import logging
+
+import paddle.fluid as fluid
+import paddle.version as fluid_version
+from ..common import get_logger
+
+_logger = get_logger(__name__, level=logging.INFO)
+
+try:
+ fluid.require_version('1.8.4')
+ version_installed = [
+ fluid_version.major, fluid_version.minor, fluid_version.patch,
+ fluid_version.rc
+ ]
+ assert version_installed != [
+ '2', '0', '0-alpha0', '0'
+ ], "training-aware and post-training quant is not supported in 2.0 alpha version paddle"
+ from .quanter import quant_aware, convert, quant_post_static, quant_post_dynamic
+ from .quanter import quant_post, quant_post_only_weight
+except Exception as e:
+ _logger.warning(
+ "If you want to use training-aware and post-training quantization, "
+ "please use Paddle >= 1.8.4 or the develop version")
+
from .quant_embedding import quant_embedding
diff --git a/paddleslim/quant/quant_embedding.py b/paddleslim/quant/quant_embedding.py
index 46a81db65c55f91fdf5525bf0da25414598a0b71..2e4f229859097ec26d849f0f2c8839ba80b8c164 100755
--- a/paddleslim/quant/quant_embedding.py
+++ b/paddleslim/quant/quant_embedding.py
@@ -18,24 +18,29 @@ from __future__ import print_function
import logging
import copy
import numpy as np
+import math
+from multiprocessing.dummy import Pool as ThreadPool
import paddle.fluid as fluid
from paddle.fluid.framework import IrGraph
from paddle.fluid import core
-#_logger = logging.basicConfig(level=logging.DEBUG)
+from ..common import get_logger
+_logger = get_logger(__name__, level=logging.INFO)
__all__ = ['quant_embedding']
-default_config = {
+_default_single_config = {
"quantize_type": "abs_max",
"quantize_bits": 8,
"dtype": "int8"
}
+SUPPORT_OP_TYPES = ['lookup_table', 'fused_embedding_seq_pool', 'pyramid_hash']
+SUPPORT_QUANTIZE_TYPES = ['abs_max', 'log']
+SUPPORT_QUANTIZE_BITS = [8]
+SUPPORT_DTYPE = ['int8']
-support_quantize_types = ['abs_max']
-support_quantize_bits = [8]
-support_dtype = ['int8']
+_default_config = {"quantize_op_types": SUPPORT_OP_TYPES, }
def _merge_config(old_config, new_config):
@@ -49,32 +54,47 @@ def _merge_config(old_config, new_config):
"""
old_config.update(new_config)
keys = old_config.keys()
- assert 'params_name' in keys, "params_name must be set"
-
- quantize_type = old_config['quantize_type']
- assert isinstance(quantize_type, str), "quantize_type must be \
+ assert isinstance(old_config['quantize_op_types'], (str, list)), \
+ 'quantize_op_types can only be str or list[str]'
+ if isinstance(old_config['quantize_op_types'], str):
+ old_config['quantize_op_types'] = [old_config['quantize_op_types']]
+ for op_type in old_config['quantize_op_types']:
+ assert op_type in SUPPORT_OP_TYPES, \
+ '{} is not supported, supported op types are {}'.format(
+ op_type, SUPPORT_OP_TYPES)
+ if op_type not in keys:
+ old_config[op_type] = _default_single_config
+ continue
+ else:
+ assert isinstance(old_config[op_type], dict), \
+ "op type {}'s config must be dict"
+ config_tmp = copy.deepcopy(_default_single_config)
+ config_tmp.update(old_config[op_type])
+ old_config[op_type] = config_tmp
+
+ quantize_type = old_config[op_type]['quantize_type']
+ assert isinstance(quantize_type, str), "quantize_type must be \
str"
- assert quantize_type in support_quantize_types, " \
- quantize_type {} is not supported, now supported quantize type \
- are {}.".format(quantize_type, support_quantize_types)
-
- quantize_bits = old_config['quantize_bits']
- assert isinstance(quantize_bits, int), "quantize_bits must be int"
- assert quantize_bits in support_quantize_bits, " quantize_bits {} \
- is not supported, now supported quantize bits are \
- {}. ".format(quantize_bits, support_quantize_bits)
-
- dtype = old_config['dtype']
- assert isinstance(dtype, str), "dtype must be str"
- assert dtype in support_dtype, " dtype {} is not \
- supported, now supported dtypes are {} \
- ".format(dtype, support_dtype)
- if 'threshold' in keys:
- assert isinstance(old_config['threshold'], (float, int)), "threshold \
- must be number."
-
- print("quant_embedding config {}".format(old_config))
+ assert quantize_type in SUPPORT_QUANTIZE_TYPES, \
+ "quantize_type {} is not supported, now supported quantize types" \
+ " are {}.".format(quantize_type, SUPPORT_QUANTIZE_TYPES)
+
+ quantize_bits = old_config[op_type]['quantize_bits']
+ assert isinstance(quantize_bits, int), "quantize_bits must be int"
+ assert quantize_bits in SUPPORT_QUANTIZE_BITS, \
+ "quantize_bits {} is not supported, now supported quantize bits" \
+ " are {}.".format(quantize_bits, SUPPORT_QUANTIZE_BITS)
+
+ dtype = old_config[op_type]['dtype']
+ assert isinstance(dtype, str), "dtype must be str"
+ assert dtype in SUPPORT_DTYPE, \
+ "dtype {} is not supported, now supported dtypes are {}.".format(
+ dtype, SUPPORT_DTYPE)
+ if 'threshold' in old_config[op_type].keys():
+ assert isinstance(old_config[op_type]['threshold'], (float, int)), \
+ "threshold must be a number."
+
+ _logger.info("quant_embedding config {}".format(old_config))
return old_config
@@ -90,18 +110,6 @@ def _get_var_tensor(scope, var_name):
return np.array(scope.find_var(var_name).get_tensor())
-def _clip_tensor(tensor_array, threshold):
- """
- when 'threshold' is set, clip tensor by 'threshold' and '-threshold'
- Args:
- tensor_array(np.array): array to clip
- config(dict): config dict
- """
- tensor_array[tensor_array > threshold] = threshold
- tensor_array[tensor_array < -threshold] = -threshold
- return tensor_array
-
-
def _get_scale_var_name(var_name):
"""
get scale var name
@@ -109,6 +117,10 @@ def _get_scale_var_name(var_name):
return var_name + '.scale'
+def _get_dict_var_name(var_name):
+ return var_name + '.dict'
+
+
def _get_quant_var_name(var_name):
"""
get quantized var name
@@ -139,7 +151,8 @@ def _clear_var(var_name, scope):
tensor._clear()
-def _quant_embedding_abs_max(graph, scope, place, config):
+def _quant_embedding_abs_max(graph, scope, place, config, var_name,
+ embedding_node):
"""
quantize embedding using abs_max
@@ -190,16 +203,22 @@ def _quant_embedding_abs_max(graph, scope, place, config):
for node in output_ops:
graph.update_input_link(var_node, dequant_var_node, node)
- all_var_nodes = graph.all_var_nodes()
- var_name = config['params_name']
- # find embedding var node by 'params_name'
- embedding_node = graph._find_node_by_name(all_var_nodes, var_name)
- embedding_tensor = _get_var_tensor(scope, var_name)
- if 'threshold' in config.keys():
- embedding_tensor = _clip_tensor(embedding_tensor, config['threshold'])
+ def _clip_array(array, config):
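+ # Clip to the user-provided threshold when one is given; otherwise clip
+ # outliers at the 99.99th percentile of |array| (skipped when the largest
+ # magnitude is already below 1.0).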
+ if 'threshold' in config.keys():
+ threshold = config['threshold']
+ else:
+ abs_array = np.max(np.abs(array))
+ if abs_array < 1.0:
+ return array
+ threshold = np.percentile(np.abs(array), 99.99)
+ return np.clip(array, -threshold, threshold)
+ _logger.info("Embedding {}: abs_max quantization".format(var_name))
+
+ embedding_tensor = _get_var_tensor(scope, var_name)
+ embedding_array = _clip_array(embedding_tensor, config)
# get scale and quanted tensor
- scale, quanted_tensor = _quant_abs_max(embedding_tensor, config)
+ scale, quanted_tensor = _quant_abs_max(embedding_array, config)
#create params must to use create_persistable_node
scale_var = graph.create_persistable_node(
@@ -221,39 +240,233 @@ def _quant_embedding_abs_max(graph, scope, place, config):
# insert dequantize_abs_max op
for op_node in embedding_node.outputs:
- if op_node.name() == 'lookup_table':
- graph.update_input_link(embedding_node, quant_tensor_var, op_node)
- var_node = op_node.outputs[0]
- _insert_dequant_abs_max_op(graph, scope, var_node, scale_var,
- config)
+ graph.update_input_link(embedding_node, quant_tensor_var, op_node)
+ out_name = op_node.output('Out')[0]
+ var_node = graph._find_node_by_name(op_node.outputs, out_name)
+ _insert_dequant_abs_max_op(graph, scope, var_node, scale_var, config)
# free float embedding params memory
_clear_var(embedding_node.name(), scope)
graph.safe_remove_nodes(embedding_node)
-def quant_embedding(program, place, config, scope=None):
+def _quant_embedding_log(graph, scope, place, config, var_name,
+ embedding_node):
"""
- quant lookup_table op parameters
+ quantize embedding using log
+
+ Args:
+ graph(IrGraph): graph that includes Embedding Parameter
+ scope(fluid.Scope): scope
+ place(fluid.CPUPlace or fluid.CUDAPlace): place to run program
+ config(dict): config to quant Embedding
+ var_name(str): name of the embedding parameter to quantize
+ embedding_node(IrNode): the var node of the embedding parameter
+ """
+
+ _interval = 0.125
+ _dict_len = 256
+ _dict = np.zeros(_dict_len)
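+
+ # Log-domain quantization sketch: |x| is quantized to
+ # round(log2(|x|) / _interval) * _interval, the 128 largest log values form
+ # the dequantization dict, and negative inputs are offset by -128 before
+ # casting to int8 so the sign survives the 8-bit code.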
+
+ def _search(array, num_array):
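+ # Nearest-value lookup: np.searchsorted returns the insertion point;
+ # step back one slot when the left neighbor is closer to the query.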
+ length = len(array)
+ res = np.searchsorted(array, num_array)
+ res_refine = []
+ for i in range(len(num_array)):
+ value = num_array[i]
+ idx = res[i]
+ if idx > 0 and ((idx == length) or (
+ abs(array[idx - 1] - value) < abs(array[idx] - value))):
+ res_refine.append(idx - 1)
+ else:
+ res_refine.append(idx)
+ return np.array(res_refine)
+
+ def _quant_log(tensor_array, config):
+ """
+ quant array using log op
+ """
+ bit_length = config['quantize_bits']
+ log_and_quant = np.round(np.log2(np.abs(tensor_array)) /
+ _interval) * _interval
+ unique, counts = np.unique(log_and_quant, return_counts=True)
+ topk_num = np.sort(unique)[-int(_dict_len / 2):]
+
+ pool = ThreadPool(8)
+ quanted_array = pool.map(lambda x: _search(topk_num, x), log_and_quant)
+ quanted_array = np.array(quanted_array)
+ pool.close()
+ pool.join()
+ index_tmp = tensor_array < 0
+ quanted_array_tmp = quanted_array[index_tmp]
+ quanted_array_tmp = quanted_array_tmp - 128
+ quanted_array[index_tmp] = quanted_array_tmp
+ quanted_array = quanted_array.astype(config['dtype'])
+ return topk_num, quanted_array
+
+ def _insert_dequant_log_op(graph, scope, var_node, topk_num_node, config):
+ """
+ Insert dequantize_log op in graph
+ """
+ assert var_node.is_var(), "{} is not a var".format(var_node.name())
+
+ dequant_var_node = graph.create_var_node(
+ name=_get_dequant_var_name(var_node.name()),
+ var_type=var_node.type(),
+ shape=var_node.shape(),
+ var_dtype=core.VarDesc.VarType.FP32)
+ scope.var(dequant_var_node.name())
+
+ output_ops = var_node.outputs
+ dequant_op = graph.create_op_node(
+ op_type='dequantize_log',
+ attrs={'op_role': core.op_proto_and_checker_maker.OpRole.Forward},
+ inputs={'X': var_node,
+ 'Dict': topk_num_node},
+ outputs={'Out': dequant_var_node})
+ graph.link_to(var_node, dequant_op)
+ graph.link_to(topk_num_node, dequant_op)
+ graph.link_to(dequant_op, dequant_var_node)
+ for node in output_ops:
+ graph.update_input_link(var_node, dequant_var_node, node)
+
+ _logger.info("Embedding {}: log quantization".format(var_name))
+ # find embedding var node by 'var_name'
+ embedding_tensor = _get_var_tensor(scope, var_name)
+
+ # get quantize dict and quanted tensor
+ topk_num, quanted_tensor = _quant_log(embedding_tensor, config)
+ topk_num = np.power(2, topk_num)
+
+ #create params must use create_persistable_node
+ topk_num_var = graph.create_persistable_node(
+ _get_dict_var_name(var_name),
+ var_type=embedding_node.type(),
+ shape=topk_num.shape,
+ var_dtype=core.VarDesc.VarType.FP32)
+ quant_tensor_var = graph.create_persistable_node(
+ _get_quant_var_name(var_name),
+ var_type=embedding_node.type(),
+ shape=embedding_node.shape(),
+ var_dtype=core.VarDesc.VarType.INT8)
+ # create var in scope
+ scope.var(_get_quant_var_name(var_name))
+ scope.var(_get_dict_var_name(var_name))
+ #set var by tensor array or dict
+ _restore_var(_get_quant_var_name(var_name), quanted_tensor, scope, place)
+ _restore_var(_get_dict_var_name(var_name), topk_num, scope, place)
+
+ # insert dequantize_log op
+ for op_node in embedding_node.outputs:
+ graph.update_input_link(embedding_node, quant_tensor_var, op_node)
+ out_name = op_node.output('Out')[0]
+ var_node = graph._find_node_by_name(op_node.outputs, out_name)
+
+ _insert_dequant_log_op(graph, scope, var_node, topk_num_var, config)
+
+ # free float embedding params memory
+ _clear_var(embedding_node.name(), scope)
+ graph.safe_remove_nodes(embedding_node)
+
+
+def _remove_link(in_node, out_node):
+ in_node.remove_output(out_node)
+ out_node.remove_input(in_node)
+
+
+def _split_embedding_seq_pool(graph, op):
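+ """
+ Replace a ``fused_embedding_seq_pool`` op with an equivalent
+ ``lookup_table`` + ``sequence_pool`` pair so that the embedding weight
+ can be quantized like a plain lookup table.
+ """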
+ inputs = op.inputs
+ outputs = op.outputs
+ op_desc = op.node.op()
+ combiner = op_desc.attr("combiner")
+ padding_idx = op_desc.attr("padding_idx")
+ is_sparse = op_desc.attr("is_sparse")
+ ids = graph._find_node_by_name(inputs, op.input('Ids')[0])
+ weight = graph._find_node_by_name(inputs, op.input('W')[0])
+ out = outputs[0]
+ lookup_out = graph.create_var_node(
+ name=ids.name() + '.look_up_table.out',
+ var_type=core.VarDesc.VarType.LOD_TENSOR,
+ shape=[1],
+ var_dtype=weight.dtype())
+ lookup_table_op = graph.create_op_node(
+ op_type='lookup_table',
+ attrs={'is_sparse': is_sparse,
+ 'padding_idx': padding_idx},
+ inputs={'W': weight,
+ 'Ids': ids},
+ outputs={'Out': lookup_out})
+ _remove_link(ids, op)
+ _remove_link(weight, op)
+ _remove_link(op, out)
+ graph.link_to(ids, lookup_table_op)
+ graph.link_to(weight, lookup_table_op)
+ graph.link_to(lookup_table_op, lookup_out)
+ max_index = graph.create_var_node(
+ name=ids.name() + '.seq_pool_op.max_index',
+ var_type=core.VarDesc.VarType.LOD_TENSOR,
+ shape=[1],
+ var_dtype=weight.dtype())
+
+ seq_pool_op = graph.create_op_node(
+ op_type='sequence_pool',
+ inputs={'X': lookup_out},
+ outputs={'Out': out,
+ 'MaxIndex': max_index},
+ attrs={'pooltype': combiner.upper(),
+ 'is_test': True})
+ if combiner == 'max':
+ max_index.stop_gradient = True
+ graph.link_to(lookup_out, seq_pool_op)
+ graph.link_to(seq_pool_op, out)
+ graph.link_to(seq_pool_op, max_index)
+
+
+def quant_embedding(program, place, config=None, scope=None):
+ """quantize lookup_table op parameters
+
Args:
program(fluid.Program): infer program
- scope(fluid.Scope): the scope to store var, when is None will use fluid.global_scope()
- place(fluid.CPUPlace or fluid.CUDAPlace): place
- config(dict): config to quant. The keys are 'params_name', 'quantize_type', \
+ place(fluid.CPUPlace or fluid.CUDAPlace): The device on which the executor runs.
+ scope(fluid.Scope, optional): Scope records the mapping between variable names and variables, similar to brackets in programming languages. Usually users can use ``fluid.global_scope()``. When ``None``, ``fluid.global_scope()`` will be used. Default: ``None``.
+ config(dict, optional): The config for quantization. The key 'quantize_op_types' lists the op types to quantize. For each op in 'quantize_op_types', you can define 'quantize_type', \
'quantize_bits', 'dtype', 'threshold'. \
- 'params_name': parameter name to quant, must be set.
- 'quantize_type': quantize type, supported types are ['abs_max']. default is "abs_max".
- 'quantize_bits': quantize bits, supported bits are [8]. default is 8.
- 'dtype': quantize dtype, supported dtype are ['int8']. default is 'int8'.
- 'threshold': threshold to clip tensor before quant. When threshold is not set, \
+ ``quantize_type`` is the quantize type; supported types are ['abs_max', 'log'], default is 'abs_max'.
+ ``quantize_bits`` is the number of quantize bits; supported bits are [8], default is 8.
+ ``dtype`` is the quantize dtype; supported dtypes are ['int8'], default is 'int8'.
+ ``threshold`` is the threshold used to clip the tensor before quantization. When threshold is not set, \
tensor will not be clipped.
+
+ Returns:
+ None
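+
+ Example (a minimal sketch; the tiny embedding network is a placeholder,
+ and the config shown relies only on keys documented above):
+
+ .. code-block:: python
+
+     import paddle.fluid as fluid
+     from paddleslim.quant import quant_embedding
+
+     place = fluid.CPUPlace()
+     main_prog = fluid.Program()
+     startup_prog = fluid.Program()
+     with fluid.program_guard(main_prog, startup_prog):
+         ids = fluid.data(name='ids', shape=[None, 1], dtype='int64')
+         emb = fluid.layers.embedding(input=ids, size=[100, 8])
+     exe = fluid.Executor(place)
+     exe.run(startup_prog)
+
+     config = {'quantize_op_types': ['lookup_table'],
+               'lookup_table': {'quantize_type': 'abs_max'}}
+     quant_prog = quant_embedding(main_prog, place, config)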
"""
- assert isinstance(config, dict), "config must be dict"
- config = _merge_config(copy.deepcopy(default_config), config)
+ config = config or {}
+ config = _merge_config(copy.deepcopy(_default_config), config)
scope = fluid.global_scope() if scope is None else scope
graph = IrGraph(core.Graph(program.desc), for_test=True)
- if config['quantize_type'] == 'abs_max':
- _quant_embedding_abs_max(graph, scope, place, config)
+ quantize_params_map = {}
+ all_op = graph.all_op_nodes()
+ for op in all_op:
+ if op.inputs == [] and op.outputs == []:
+ continue
+ op_type = op.name()
+ if op_type in config['quantize_op_types']:
+ weight_name = op.input('W')[0]
+ if weight_name in quantize_params_map.values():
+ continue
+ embedding_node = graph._find_node_by_name(op.inputs,
+ op.input('W')[0])
+ for op_node in embedding_node.outputs:
+ if op_node.name() == 'fused_embedding_seq_pool':
+ _split_embedding_seq_pool(graph, op_node)
+ if config[op_type]['quantize_type'] == 'abs_max':
+ _quant_embedding_abs_max(graph, scope, place, config[op_type],
+ weight_name, embedding_node)
+ elif config[op_type]['quantize_type'] == 'log':
+ _quant_embedding_log(graph, scope, place, config[op_type],
+ weight_name, embedding_node)
+ quantize_params_map[weight_name] = _get_quant_var_name(weight_name)
+ for op in all_op:
+ if op.name() == 'fused_embedding_seq_pool':
+ graph.safe_remove_nodes(op)
return graph.to_program()
diff --git a/paddleslim/quant/quanter.py b/paddleslim/quant/quanter.py
index 254cf4958643ef5e4d4e6cd625028baef964e222..328861720ba92e5742fa4ae9e70bc0c329a7acb3 100755
--- a/paddleslim/quant/quanter.py
+++ b/paddleslim/quant/quanter.py
@@ -12,7 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import os
import copy
+import json
+import logging
+
import paddle
import paddle.fluid as fluid
from paddle.fluid.framework import IrGraph
@@ -22,24 +26,43 @@ from paddle.fluid.contrib.slim.quantization import ConvertToInt8Pass
from paddle.fluid.contrib.slim.quantization import TransformForMobilePass
from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization
from paddle.fluid.contrib.slim.quantization import AddQuantDequantPass
+from paddle.fluid.contrib.slim.quantization import OutScaleForTrainingPass
+from paddle.fluid.contrib.slim.quantization import OutScaleForInferencePass
from paddle.fluid import core
+from paddle.fluid.contrib.slim.quantization import WeightQuantization
+
+from ..common import get_logger
+_logger = get_logger(__name__, level=logging.INFO)
WEIGHT_QUANTIZATION_TYPES = [
- 'abs_max', 'channel_wise_abs_max', 'range_abs_max',
- 'moving_average_abs_max'
+ 'abs_max', 'channel_wise_abs_max', 'range_abs_max', 'moving_average_abs_max'
]
+WEIGHT_QUANTIZATION_TYPES_TENSORRT = ['channel_wise_abs_max']
+
ACTIVATION_QUANTIZATION_TYPES = [
'abs_max', 'range_abs_max', 'moving_average_abs_max'
]
+
+ACTIVATION_QUANTIZATION_TYPES_TENSORRT = [
+ 'range_abs_max', 'moving_average_abs_max'
+]
+
VALID_DTYPES = ['int8']
-TRANSFORM_PASS_OP_TYPES = ['conv2d', 'depthwise_conv2d', 'mul']
-QUANT_DEQUANT_PASS_OP_TYPES = ['elementwise_add', 'pool2d']
+TRANSFORM_PASS_OP_TYPES = QuantizationTransformPass._supported_quantizable_op_type
+QUANT_DEQUANT_PASS_OP_TYPES = AddQuantDequantPass._supported_quantizable_op_type
+
+TENSORRT_OP_TYPES = [
+ 'mul', 'conv2d', 'pool2d', 'depthwise_conv2d', 'elementwise_add',
+ 'leaky_relu'
+]
+
+VARS_MAPPING_TABLE = './mapping_table_for_saving_inference_model'
_quant_config_default = {
- # weight quantize type, default is 'abs_max'
- 'weight_quantize_type': 'abs_max',
- # activation quantize type, default is 'abs_max'
- 'activation_quantize_type': 'abs_max',
+ # weight quantize type, default is 'channel_wise_abs_max'
+ 'weight_quantize_type': 'channel_wise_abs_max',
+ # activation quantize type, default is 'moving_average_abs_max'
+ 'activation_quantize_type': 'moving_average_abs_max',
# weight quantize bit num, default is 8
'weight_bits': 8,
# activation quantize bit num, default is 8
@@ -47,25 +70,37 @@ _quant_config_default = {
# ops of name_scope in not_quant_pattern list, will not be quantized
'not_quant_pattern': ['skip_quant'],
# ops of type in quantize_op_types, will be quantized
- 'quantize_op_types':
- ['conv2d', 'depthwise_conv2d', 'mul', 'elementwise_add', 'pool2d'],
+ 'quantize_op_types': ['conv2d', 'depthwise_conv2d', 'mul'],
# data type after quantization, such as 'uint8', 'int8', etc. default is 'int8'
'dtype': 'int8',
# window size for 'range_abs_max' quantization. defaulf is 10000
'window_size': 10000,
# The decay coefficient of moving average, default is 0.9
'moving_rate': 0.9,
- # if set quant_weight_only True, then only quantize parameters of layers which need to be quantized,
- # and activations will not be quantized.
- 'quant_weight_only': False
+ # if True, 'quantize_op_types' will be TENSORRT_OP_TYPES
+ 'for_tensorrt': False,
+ # if True, 'quantize_op_types' will be TRANSFORM_PASS_OP_TYPES + QUANT_DEQUANT_PASS_OP_TYPES
+ 'is_full_quantize': False
}
+def load_dict():
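+ """Load the variable-name mapping table saved by ``quant_aware`` from ``VARS_MAPPING_TABLE``."""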
+ with open(VARS_MAPPING_TABLE, 'r') as file:
+ data = file.read()
+ data = json.loads(data)
+ return data
+
+
+def save_dict(table):
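+ """Save the variable-name mapping table as JSON to ``VARS_MAPPING_TABLE``."""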
+ with open(VARS_MAPPING_TABLE, 'w') as file:
+ file.write(json.dumps(table))
+
+
def _parse_configs(user_config):
"""
- check user configs is valid, and set default value if user not config.
+ Check if the user's configs are valid.
Args:
- user_config(dict):the config of user.
+ user_config(dict): user's config.
Returns:
configs(dict): the final configs to be used.
"""
@@ -73,12 +108,26 @@ def _parse_configs(user_config):
configs = copy.deepcopy(_quant_config_default)
configs.update(user_config)
- # check configs is valid
- assert configs['weight_quantize_type'] in WEIGHT_QUANTIZATION_TYPES, \
- "Unknown weight_quantize_type: '%s'. It can only be " + " ".join(WEIGHT_QUANTIZATION_TYPES)
+ assert isinstance(configs['for_tensorrt'], bool) and isinstance(
+ configs['is_full_quantize'],
+ bool), "'for_tensorrt' and 'is_full_quantize' must both be bool'"
+
+ # check if configs is valid
+ if configs['for_tensorrt']:
+ weight_types = WEIGHT_QUANTIZATION_TYPES_TENSORRT
+ activation_types = ACTIVATION_QUANTIZATION_TYPES_TENSORRT
+ platform = 'TensorRT'
+ else:
+ weight_types = WEIGHT_QUANTIZATION_TYPES
+ activation_types = ACTIVATION_QUANTIZATION_TYPES
+ platform = 'PaddleLite'
+ assert configs['weight_quantize_type'] in weight_types, \
+ "Unknown weight_quantize_type: {}. {} only supports {} ".format(configs['weight_quantize_type'],
+ platform, weight_types)
- assert configs['activation_quantize_type'] in ACTIVATION_QUANTIZATION_TYPES, \
- "Unknown activation_quantize_type: '%s'. It can only be " + " ".join(ACTIVATION_QUANTIZATION_TYPES)
+ assert configs['activation_quantize_type'] in activation_types, \
+ "Unknown activation_quantize_type: {}. {} only supports {}".format(configs['activation_quantize_type'],
+ platform, activation_types)
assert isinstance(configs['weight_bits'], int), \
"weight_bits must be int value."
@@ -92,17 +141,24 @@ def _parse_configs(user_config):
assert (configs['activation_bits'] >= 1 and configs['activation_bits'] <= 16), \
"activation_bits should be between 1 and 16."
- assert isinstance(configs['not_quant_pattern'], list), \
- "not_quant_pattern must be a list"
+ assert isinstance(configs['not_quant_pattern'], (list, str)), \
+ "not_quant_pattern must be list or str"
assert isinstance(configs['quantize_op_types'], list), \
"quantize_op_types must be a list"
- for op_type in configs['quantize_op_types']:
- assert (op_type in QUANT_DEQUANT_PASS_OP_TYPES) or (
- op_type in TRANSFORM_PASS_OP_TYPES), "{} is not support, \
- now support op types are {}".format(
- op_type, TRANSFORM_PASS_OP_TYPES + QUANT_DEQUANT_PASS_OP_TYPES)
+ if configs['for_tensorrt']:
+ configs['quantize_op_types'] = TENSORRT_OP_TYPES
+ elif configs['is_full_quantize']:
+ configs[
+ 'quantize_op_types'] = TRANSFORM_PASS_OP_TYPES + QUANT_DEQUANT_PASS_OP_TYPES
+ else:
+ for op_type in configs['quantize_op_types']:
+ assert (op_type in QUANT_DEQUANT_PASS_OP_TYPES) or (
+ op_type in TRANSFORM_PASS_OP_TYPES), \
+ "{} is not supported, supported op types are {}".format(
+ op_type,
+ TRANSFORM_PASS_OP_TYPES + QUANT_DEQUANT_PASS_OP_TYPES)
assert isinstance(configs['dtype'], str), \
"dtype must be a str."
@@ -116,36 +172,75 @@ def _parse_configs(user_config):
assert isinstance(configs['moving_rate'], float), \
"moving_rate must be float value, The decay coefficient of moving average, default is 0.9."
- assert isinstance(configs['quant_weight_only'], bool), \
- "quant_weight_only must be bool value, if set quant_weight_only True, " \
- "then only quantize parameters of layers which need to be quantized, " \
- " and activations will not be quantized."
-
return configs
-def quant_aware(program, place, config, scope=None, for_test=False):
- """
- add trainable quantization ops in program.
+def quant_aware(program,
+ place,
+ config=None,
+ scope=None,
+ for_test=False,
+ weight_quantize_func=None,
+ act_quantize_func=None,
+ weight_preprocess_func=None,
+ act_preprocess_func=None,
+ optimizer_func=None,
+ executor=None,
+ return_program=False):
+ """Add quantization and dequantization operators to "program"
+ for quantization training or testing.
+
Args:
- program(fluid.Program): program
- scope(fluid.Scope): the scope to store var, it's should be the value of program's scope, usually it's fluid.global_scope().
- place(fluid.CPUPlace or fluid.CUDAPlace): place
- config(dict): configs for quantization, default values are in quant_config_default dict.
- for_test: if program is test program, for_test should be set True, else False.
- Return:
- fluid.Program: user can finetune this quantization program to enhance the accuracy.
+ program(fluid.Program): training or testing ``program``.
+ place(fluid.CPUPlace or fluid.CUDAPlace): This parameter represents
+ the executor run on which device.
+ config(dict, optional): configs for quantization. If None, the default config will be used.
+ Default: None.
+ scope(fluid.Scope): Scope records the mapping between variable names and variables,
+ similar to brackets in programming languages. Usually users can use
+ ``fluid.global_scope()``. When ``None``, ``fluid.global_scope()`` will be used. Default: ``None``.
+ for_test(bool): If the 'program' parameter is a test program, this parameter should be set to ``True``.
+ Otherwise, set to ``False``. Default: False.
+ weight_quantize_func(function): Function that defines how to quantize weight. Using this
+ can quickly test if user's quantization method works or not. In this function, user should
+ both define quantization function and dequantization function, that is, the function's input
+ is non-quantized weight and function returns dequantized weight. If None, will use
+ quantization op defined by 'weight_quantize_type'.
+ Default is None.
+ act_quantize_func(function): Function that defines how to quantize activation. Using this
+ can quickly test if user's quantization method works or not. In this function, user should
+ both define quantization and dequantization process, that is, the function's input
+ is non-quantized activation and function returns dequantized activation. If None, will use
+ quantization op defined by 'activation_quantize_type'.
+ Default is None.
+ weight_preprocess_func(function): Function that defines how to preprocess weight before quantization. Using this
+ can quickly test if user's preprocess method works or not. The function's input
+ is non-quantized weight and function returns processed weight to be quantized. If None, the weight will
+ be quantized directly.
+ Default is None.
+ act_preprocess_func(function): Function that defines how to preprocess activation before quantization. Using this
+ can quickly test if user's preprocess method works or not. The function's input
+ is non-quantized activation and function returns processed activation to be quantized. If None, the activation will
+ be quantized directly.
+ Default is None.
+ optimizer_func(function): A function that returns an optimizer. When 'is_test' is False and the user wants to use a self-defined
+ quantization function and preprocess function, this function must be set. Default is None.
+ executor(fluid.Executor): If the user wants to use a self-defined quantization function and preprocess function, executor must be set for
+ initialization. Default is None.
+ return_program(bool): If the user wants the return value to be a ``Program`` rather than a ``CompiledProgram``, this argument should be set to True.
+ Default is False.
+ Returns:
+ fluid.CompiledProgram | fluid.Program: Program with quantization and dequantization ``operators``
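+
+ Example (a minimal sketch; the tiny network is a placeholder and the
+ default config is used):
+
+ .. code-block:: python
+
+     import paddle.fluid as fluid
+     from paddleslim.quant import quant_aware
+
+     place = fluid.CPUPlace()
+     main_prog = fluid.Program()
+     startup_prog = fluid.Program()
+     with fluid.program_guard(main_prog, startup_prog):
+         image = fluid.data(name='image', shape=[None, 1, 28, 28], dtype='float32')
+         out = fluid.layers.conv2d(image, 8, 3)
+     exe = fluid.Executor(place)
+     exe.run(startup_prog)
+
+     # config=None picks the default config; a CompiledProgram is returned
+     # unless for_test or return_program is True.
+     quant_prog = quant_aware(main_prog, place, config=None, for_test=True)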
"""
scope = fluid.global_scope() if not scope else scope
- assert isinstance(config, dict), "config must be dict"
-
- assert 'weight_quantize_type' in config.keys(
- ), 'weight_quantize_type must be configured'
- assert 'activation_quantize_type' in config.keys(
- ), 'activation_quantize_type must be configured'
+ if config is None:
+ config = _quant_config_default
+ else:
+ assert isinstance(config, dict), "config must be dict"
+ config = _parse_configs(config)
+ _logger.info("quant_aware config {}".format(config))
- config = _parse_configs(config)
main_graph = IrGraph(core.Graph(program.desc), for_test=for_test)
transform_pass_ops = []
@@ -166,7 +261,13 @@ def quant_aware(program, place, config, scope=None, for_test=False):
window_size=config['window_size'],
moving_rate=config['moving_rate'],
quantizable_op_type=transform_pass_ops,
- skip_pattern=config['not_quant_pattern'])
+ skip_pattern=config['not_quant_pattern'],
+ weight_quantize_func=weight_quantize_func,
+ act_quantize_func=act_quantize_func,
+ weight_preprocess_func=weight_preprocess_func,
+ act_preprocess_func=act_preprocess_func,
+ optimizer_func=optimizer_func,
+ executor=executor)
transform_pass.apply(main_graph)
@@ -180,46 +281,77 @@ def quant_aware(program, place, config, scope=None, for_test=False):
quantizable_op_type=quant_dequant_ops)
quant_dequant_pass.apply(main_graph)
- if for_test:
+ out_scale_training_pass = OutScaleForTrainingPass(
+ scope=scope, place=place, moving_rate=config['moving_rate'])
+ out_scale_training_pass.apply(main_graph)
+
+ if (weight_preprocess_func is not None or
+ act_preprocess_func is not None) and not for_test:
+ _logger.info(
+ "When a preprocess_func is used in quant_aware, Need to save a mapping table to match variable names in the convert phase."
+ )
+ _logger.info("The mapping table is saved as '{}'.".format(
+ VARS_MAPPING_TABLE))
+ save_dict(main_graph.out_node_mapping_table)
+
+ if for_test or return_program:
quant_program = main_graph.to_program()
else:
quant_program = fluid.CompiledProgram(main_graph.graph)
return quant_program
-def quant_post(executor,
- model_dir,
- quantize_model_path,
- sample_generator,
- model_filename=None,
- params_filename=None,
- batch_size=16,
- batch_nums=None,
- scope=None,
- algo='KL',
- quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"]):
+def quant_post_static(
+ executor,
+ model_dir,
+ quantize_model_path,
+ batch_generator=None,
+ sample_generator=None,
+ model_filename=None,
+ params_filename=None,
+ save_model_filename='__model__',
+ save_params_filename='__params__',
+ batch_size=16,
+ batch_nums=None,
+ scope=None,
+ algo='KL',
+ quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"],
+ is_full_quantize=False,
+ weight_bits=8,
+ activation_bits=8,
+ activation_quantize_type='range_abs_max',
+ weight_quantize_type='channel_wise_abs_max',
+ is_use_cache_file=False,
+ cache_dir="./temp_post_training"):
"""
- The function utilizes post training quantization method to quantize the
- fp32 model. It uses calibrate data to calculate the scale factor of
- quantized variables, and inserts fake quant/dequant op to obtain the
- quantized model.
+ The function utilizes static post training quantization method to
+ quantize the fp32 model. It uses calibrate data to calculate the
+ scale factor of quantized variables, and inserts fake quantization
+ and dequantization operators to obtain the quantized model.
Args:
executor(fluid.Executor): The executor to load, run and save the
quantized model.
model_dir(str): The path of fp32 model that will be quantized, and
- the model and params that saved by fluid.io.save_inference_model
+ the model and params that were saved by ``fluid.io.save_inference_model``
are under the path.
quantize_model_path(str): The path to save quantized model using api
- fluid.io.save_inference_model.
+ ``fluid.io.save_inference_model``.
+ batch_generator(Python Generator): The batch generator provides
+ calibrate data for DataLoader, and it returns a batch every
+ time. Only one of sample_generator and batch_generator can
+ be set. Besides, batch_generator supports LoDTensor.
sample_generator(Python Generator): The sample generator provides
calibrate data for DataLoader, and it only returns a sample every time.
model_filename(str, optional): The name of model file. If parameters
- are saved in separate files, set it as 'None'. Default is 'None'.
+ are saved in separate files, set it as 'None'. Default: 'None'.
params_filename(str, optional): The name of params file.
When all parameters are saved in a single file, set it
as filename. If parameters are saved in separate files,
- set it as 'None'. Default is 'None'.
+ set it as 'None'. Default: 'None'.
+ save_model_filename(str): The name of model file to save the quantized inference program. Default: '__model__'.
+ save_params_filename(str): The name of file to save all related parameters.
+ If it is set None, parameters will be saved in separate files. Default: '__params__'.
batch_size(int, optional): The batch size of DataLoader, default is 16.
batch_nums(int, optional): If batch_nums is not None, the number of calibrate
data is 'batch_size*batch_nums'. If batch_nums is None, use all data
@@ -228,16 +360,34 @@ def quant_post(executor,
and save variables. If scope is None, will use fluid.global_scope().
algo(str, optional): If algo='KL', use the KL-divergence method to
get a more precise scale factor. If algo='direct', use the
- abs_max method to get the scale factor. Default is 'KL'.
+ abs_max method to get the scale factor. Default: 'KL'.
quantizable_op_type(list[str], optional): The list of op types
- that will be quantized. Default is ["conv2d", "depthwise_conv2d",
+ that will be quantized. Default: ["conv2d", "depthwise_conv2d",
"mul"].
+ weight_bits(int, optional): quantization bit number for weights.
+ activation_bits(int): quantization bit number for activation.
+ activation_quantize_type(str): quantization type for activation;
+ supported types are 'range_abs_max', 'moving_average_abs_max' and 'abs_max'.
+ This parameter only specifies the fake ops in quantized model.
+ If it is 'range_abs_max' or 'moving_average_abs_max', we save the scale
+ obtained by post training quantization in fake ops. If it
+ is 'abs_max', the scale will not be saved in fake ops.
+ weight_quantize_type(str): quantization type for weights,
+ support 'abs_max' and 'channel_wise_abs_max'. Compared to 'abs_max',
+ the model accuracy is usually higher when using 'channel_wise_abs_max'.
+ is_full_quantize(bool): if True, apply quantization to all supported quantizable op types.
+ If False, only apply quantization to the ops listed in quantizable_op_type. Default is False.
+ is_use_cache_file(bool): If False, all temp data will be saved in memory. If True,
+ all temp data will be saved to disk. Default: False.
+ cache_dir(str): When 'is_use_cache_file' is True, temp data will be saved in 'cache_dir'. Default is './temp_post_training'.
+
Returns:
None
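+ Examples:
+ A minimal usage sketch; the paths './fp32_model' and
+ './quant_model' and the ``sample_reader`` below are assumed for
+ illustration:
+ .. code-block:: python
+ import paddle.fluid as fluid
+ from paddleslim.quant import quant_post_static
+ exe = fluid.Executor(fluid.CPUPlace())
+ quant_post_static(
+ executor=exe,
+ model_dir='./fp32_model',
+ quantize_model_path='./quant_model',
+ sample_generator=sample_reader,
+ batch_size=16,
+ batch_nums=10)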
"""
post_training_quantization = PostTrainingQuantization(
executor=executor,
sample_generator=sample_generator,
+ batch_generator=batch_generator,
model_dir=model_dir,
model_filename=model_filename,
params_filename=params_filename,
@@ -246,43 +396,157 @@ def quant_post(executor,
scope=scope,
algo=algo,
quantizable_op_type=quantizable_op_type,
- is_full_quantize=False)
+ is_full_quantize=is_full_quantize,
+ weight_bits=weight_bits,
+ activation_bits=activation_bits,
+ activation_quantize_type=activation_quantize_type,
+ weight_quantize_type=weight_quantize_type,
+ is_use_cache_file=is_use_cache_file,
+ cache_dir=cache_dir)
post_training_quantization.quantize()
- post_training_quantization.save_quantized_model(quantize_model_path)
+ post_training_quantization.save_quantized_model(
+ quantize_model_path,
+ model_filename=save_model_filename,
+ params_filename=save_params_filename)
-def convert(program, place, config, scope=None, save_int8=False):
+# quant_post has been renamed to quant_post_static. For compatibility,
+# the quant_post API is kept for now; it will be deprecated in the
+# future.
+quant_post = quant_post_static
+
+
+def convert(program, place, config=None, scope=None, save_int8=False):
"""
- add quantization ops in program. the program returned is not trainable.
+ Convert a quantized and well-trained ``program`` to a final quantized
+ ``program`` that can be used to save an ``inference model``.
+
Args:
- program(fluid.Program): program
- scope(fluid.Scope): the scope to store var, when is None will use fluid.global_scope()
- place(fluid.CPUPlace or fluid.CUDAPlace): place
- config(dict): configs for quantization, default values are in quant_config_default dict.
- save_int8: is export int8 freezed program.
- Return:
- fluid.Program: freezed program which can be used for inference.
- parameters is float32 type, but it's value in int8 range.
- fluid.Program: freezed int8 program which can be used for inference.
- if save_int8 is False, this value is None.
+ program(fluid.Program): quantized and well-trained ``test program``.
+ place(fluid.CPUPlace or fluid.CUDAPlace): This parameter represents
+ the device on which the executor runs.
+ config(dict, optional): configs for convert. If set to None, the
+ default config will be used. It must be the same as the config
+ used in 'quant_aware'. Default is None.
+ scope(fluid.Scope, optional): Scope records the mapping between
+ variable names and variables, similar to brackets in
+ programming languages. Usually users can use
+ ``fluid.global_scope()``. When set to ``None``,
+ ``fluid.global_scope()`` will be used. Default: ``None``.
+ save_int8(bool, optional): Whether to additionally return a
+ ``program`` whose parameters are stored as ``int8``. That
+ program can only be used to measure model size. Default:
+ ``False``.
+
+ Returns:
+ Tuple: freezed program which can be used for inference.
+ When ``save_int8`` is False, returns ``freezed_program(fluid.Program)``.
+ When ``save_int8`` is True, returns ``freezed_program(fluid.Program)``
+ and ``freezed_program_int8(fluid.Program)``.
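+ Examples:
+ A minimal sketch, assuming ``val_program`` is the test program
+ returned by ``quant_aware`` after training and ``config`` is the
+ same dict that was passed to ``quant_aware``:
+ .. code-block:: python
+ import paddle.fluid as fluid
+ from paddleslim.quant import convert
+ place = fluid.CPUPlace()
+ freezed_program = convert(val_program, place, config)
+ # ``feed_names`` and ``fetch_targets`` are assumed to come
+ # from the user's network definition.
+ fluid.io.save_inference_model(
+ dirname='./quant_infer_model',
+ feeded_var_names=feed_names,
+ target_vars=fetch_targets,
+ executor=fluid.Executor(place),
+ main_program=freezed_program)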
"""
scope = fluid.global_scope() if not scope else scope
+
+ if config is None:
+ config = _quant_config_default
+ else:
+ assert isinstance(config, dict), "config must be dict"
+ config = _parse_configs(config)
+ _logger.info("convert config {}".format(config))
test_graph = IrGraph(core.Graph(program.desc), for_test=True)
+ out_scale_infer_pass = OutScaleForInferencePass(scope=scope)
+ out_scale_infer_pass.apply(test_graph)
+
# Freeze the graph after training by adjusting the quantize
# operators' order for the inference.
freeze_pass = QuantizationFreezePass(
scope=scope,
place=place,
+ weight_bits=config['weight_bits'],
+ activation_bits=config['activation_bits'],
weight_quantize_type=config['weight_quantize_type'])
+
+ if os.path.exists(VARS_MAPPING_TABLE):
+ test_graph.out_node_mapping_table = load_dict()
+
freeze_pass.apply(test_graph)
freezed_program = test_graph.to_program()
if save_int8:
- convert_int8_pass = ConvertToInt8Pass(
- scope=fluid.global_scope(), place=place)
+ convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
convert_int8_pass.apply(test_graph)
freezed_program_int8 = test_graph.to_program()
return freezed_program, freezed_program_int8
else:
return freezed_program
+
+
+def quant_post_dynamic(model_dir,
+ save_model_dir,
+ model_filename=None,
+ params_filename=None,
+ save_model_filename=None,
+ save_params_filename=None,
+ quantizable_op_type=["conv2d", "mul"],
+ weight_bits=8,
+ generate_test_model=False):
+ '''
+ The function utilizes the dynamic (weight-only) post training
+ quantization method to quantize the fp32 model. In detail, it
+ quantizes the weights of some ops from float32 to int8/16. For the
+ quantized model, there are two kinds of calculation methods in the
+ inference stage. First, the quantized weight is dequantized to
+ float32, and then the float32 calculation is applied. Second, the
+ quantized scales of the inputs are collected, and then the int8
+ calculation is applied.
+
+ Args:
+ model_dir(str): The path of the fp32 model that will be quantized,
+ and the model and params files are under the path.
+ save_model_dir(str): The path to save the quantized model.
+ model_filename(str, optional): The name of file used to load the
+ inference program. If it is None, the default filename
+ '__model__' will be used. Default is 'None'.
+ params_filename(str, optional): The name of file used to load all
+ parameters. When all parameters were saved in a single
+ binary file, set it as the real filename. If parameters
+ were saved in separate files, set it as 'None'. Default is
+ 'None'.
+ save_model_filename(str, optional): The name of file to
+ save the inference program. If it is None, the default
+ filename '__model__' will be used. Default is 'None'.
+ save_params_filename(str, optional): The name of the file to
+ save all parameters. If it is None, parameters will be
+ saved in separate files. If it is not None, all
+ parameters will be saved in a single binary file.
+ quantizable_op_type(list[str], optional): The list of ops
+ that will be quantized, and the quantized ops should be
+ contained in ["conv2d", "depthwise_conv2d", "mul"].
+ Default is ["conv2d", "mul"].
+ weight_bits(int, optional): The bits for the quantized weight,
+ and it should be 8 or 16. Default is 8.
+ generate_test_model(bool, optional): If generate_test_model
+ is True, a fake quantized model is saved as well, in which
+ the weights are quantized and dequantized. PaddlePaddle can
+ load the fake quantized model to test its accuracy on GPU
+ or CPU. Default is False.
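+
+ Examples:
+ A minimal usage sketch; the paths below are assumed for
+ illustration:
+ .. code-block:: python
+ from paddleslim.quant import quant_post_dynamic
+ quant_post_dynamic(
+ model_dir='./fp32_model',
+ save_model_dir='./quant_model',
+ weight_bits=8,
+ quantizable_op_type=['conv2d', 'mul'])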
+ '''
+
+ weight_quant = WeightQuantization(
+ model_dir=model_dir,
+ model_filename=model_filename,
+ params_filename=params_filename)
+
+ weight_quant.quantize_weight_to_int(
+ save_model_dir=save_model_dir,
+ save_model_filename=save_model_filename,
+ save_params_filename=save_params_filename,
+ quantizable_op_type=quantizable_op_type,
+ weight_bits=weight_bits,
+ generate_test_model=generate_test_model)
+
+
+# quant_post_only_weight has been renamed to quant_post_dynamic. For
+# compatibility, the quant_post_only_weight API is kept for now; it
+# will be deprecated in the future.
+quant_post_only_weight = quant_post_dynamic
diff --git a/paddleslim/teachers/__init__.py b/paddleslim/teachers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d0531501ca43921438ee5b2fb58ac0ad2396d1b
--- /dev/null
+++ b/paddleslim/teachers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddleslim/teachers/bert/__init__.py b/paddleslim/teachers/bert/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..59b1fca6811dd4a42f3bc8d6606c366b4f081f15
--- /dev/null
+++ b/paddleslim/teachers/bert/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from ..bert import cls
+from .cls import *
+
+__all__ = []
+__all__ += cls.__all__
diff --git a/paddleslim/teachers/bert/cls.py b/paddleslim/teachers/bert/cls.py
new file mode 100755
index 0000000000000000000000000000000000000000..8f7e1a4b0775c27f9623dfc3f98dd832a4925cf7
--- /dev/null
+++ b/paddleslim/teachers/bert/cls.py
@@ -0,0 +1,249 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT fine-tuning in Paddle Dygraph Mode."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import six
+import sys
+if six.PY2:
+ reload(sys)
+ sys.setdefaultencoding('utf8')
+import ast
+import time
+import argparse
+import numpy as np
+import multiprocessing
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import to_variable, Layer
+from .reader.cls import *
+from .model.bert import BertConfig
+from .model.cls import ClsModelLayer
+from .optimization import Optimizer
+from .utils.init import init_from_static_model
+
+__all__ = ["BERTClassifier"]
+
+
+def create_data(batch):
+ """
+ convert data to variable
+ """
+ src_ids = to_variable(batch[0], "src_ids")
+ position_ids = to_variable(batch[1], "position_ids")
+ sentence_ids = to_variable(batch[2], "sentence_ids")
+ input_mask = to_variable(batch[3], "input_mask")
+ labels = to_variable(batch[4], "labels")
+ labels.stop_gradient = True
+ return src_ids, position_ids, sentence_ids, input_mask, labels
+
+
+class BERTClassifier(Layer):
+ def __init__(self,
+ num_labels,
+ task_name="mnli",
+ model_path=None,
+ use_cuda=True,
+ return_pooled_out=True):
+ super(BERTClassifier, self).__init__()
+ self.task_name = task_name.lower()
+ BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12/"
+ bert_config_path = BERT_BASE_PATH + "/bert_config.json"
+ self.vocab_path = BERT_BASE_PATH + "/vocab.txt"
+ self.init_pretraining_params = BERT_BASE_PATH + "/dygraph_params/"
+ self.do_lower_case = True
+ self.bert_config = BertConfig(bert_config_path)
+
+ if use_cuda:
+ self.dev_count = fluid.core.get_cuda_device_count()
+ else:
+ self.dev_count = int(
+ os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+
+ self.trainer_count = fluid.dygraph.parallel.Env().nranks
+
+ self.processors = {
+ 'xnli': XnliProcessor,
+ 'cola': ColaProcessor,
+ 'mrpc': MrpcProcessor,
+ 'mnli': MnliProcessor,
+ }
+
+ self.cls_model = ClsModelLayer(
+ self.bert_config, num_labels, return_pooled_out=return_pooled_out)
+
+ if model_path is not None:
+ #restore the model
+ print("Load params from %s" % model_path)
+ model_dict, _ = fluid.load_dygraph(model_path)
+ self.cls_model.load_dict(model_dict)
+ elif self.init_pretraining_params:
+ print("Load pre-trained model from %s" %
+ self.init_pretraining_params)
+ init_from_static_model(self.init_pretraining_params,
+ self.cls_model, self.bert_config)
+ else:
+ raise Exception(
+ "You should load pretrained model for training this teacher model."
+ )
+
+ def emb_names(self):
+ return self.cls_model.emb_names()
+
+ def forward(self, input):
+ return self.cls_model(input)
+
+ def test(self, data_dir, batch_size=64, max_seq_len=512):
+
+ processor = self.processors[self.task_name](
+ data_dir=data_dir,
+ vocab_path=self.vocab_path,
+ max_seq_len=max_seq_len,
+ do_lower_case=self.do_lower_case,
+ in_tokens=False)
+
+ test_data_generator = processor.data_generator(
+ batch_size=batch_size, phase='dev', epoch=1, shuffle=False)
+
+ self.cls_model.eval()
+ total_cost, final_acc, avg_acc, total_num_seqs = [], [], [], []
+ for batch in test_data_generator():
+ data_ids = create_data(batch)
+
+ total_loss, _, _, np_acces, np_num_seqs = self.cls_model(data_ids)
+
+ np_loss = total_loss.numpy()
+ np_acc = np_acces[-1].numpy()
+ np_avg_acc = np.mean([acc.numpy() for acc in np_acces])
+ np_num_seqs = np_num_seqs.numpy()
+
+ total_cost.extend(np_loss * np_num_seqs)
+ final_acc.extend(np_acc * np_num_seqs)
+ avg_acc.extend(np_avg_acc * np_num_seqs)
+ total_num_seqs.extend(np_num_seqs)
+
+ print("[evaluation] classifier[-1] average acc: %f; average acc: %f" %
+ (np.sum(final_acc) / np.sum(total_num_seqs),
+ np.sum(avg_acc) / np.sum(total_num_seqs)))
+ self.cls_model.train()
+
+ def fit(self,
+ data_dir,
+ epoch,
+ batch_size=64,
+ use_cuda=True,
+ max_seq_len=512,
+ warmup_proportion=0.1,
+ use_data_parallel=False,
+ learning_rate=0.00005,
+ weight_decay=0.01,
+ lr_scheduler="linear_warmup_decay",
+ skip_steps=10,
+ save_steps=1000,
+ checkpoints="checkpoints"):
+
+ processor = self.processors[self.task_name](
+ data_dir=data_dir,
+ vocab_path=self.vocab_path,
+ max_seq_len=max_seq_len,
+ do_lower_case=self.do_lower_case,
+ in_tokens=False,
+ random_seed=5512)
+ shuffle_seed = 1 if self.trainer_count > 1 else None
+
+ train_data_generator = processor.data_generator(
+ batch_size=batch_size,
+ phase='train',
+ epoch=epoch,
+ dev_count=self.trainer_count,
+ shuffle=True,
+ shuffle_seed=shuffle_seed)
+ num_train_examples = processor.get_num_examples(phase='train')
+ max_train_steps = epoch * num_train_examples // batch_size // self.trainer_count
+ warmup_steps = int(max_train_steps * warmup_proportion)
+
+ print("Device count: %d" % self.dev_count)
+ print("Trainer count: %d" % self.trainer_count)
+ print("Num train examples: %d" % num_train_examples)
+ print("Max train steps: %d" % max_train_steps)
+ print("Num warmup steps: %d" % warmup_steps)
+
+ if use_data_parallel:
+ strategy = fluid.dygraph.parallel.prepare_context()
+
+ optimizer = Optimizer(
+ warmup_steps=warmup_steps,
+ num_train_steps=max_train_steps,
+ learning_rate=learning_rate,
+ model_cls=self.cls_model,
+ weight_decay=weight_decay,
+ scheduler=lr_scheduler,
+ loss_scaling=1.0,
+ parameter_list=self.cls_model.parameters())
+
+ if use_data_parallel:
+ self.cls_model = fluid.dygraph.parallel.DataParallel(
+ self.cls_model, strategy)
+ train_data_generator = fluid.contrib.reader.distributed_batch_reader(
+ train_data_generator)
+
+ steps = 0
+ time_begin = time.time()
+
+ for batch in train_data_generator():
+ data_ids = create_data(batch)
+ total_loss, logits, losses, accuracys, num_seqs = self.cls_model(
+ data_ids)
+
+ optimizer.optimization(
+ total_loss,
+ use_data_parallel=use_data_parallel,
+ model=self.cls_model)
+ self.cls_model.clear_gradients()
+
+ if steps != 0 and steps % skip_steps == 0:
+ time_end = time.time()
+ used_time = time_end - time_begin
+ current_example, current_epoch = processor.get_train_progress()
+ localtime = time.asctime(time.localtime(time.time()))
+ print(
+ "%s, epoch: %s, steps: %s, dy_graph loss: %f, acc: %f, speed: %f steps/s"
+ % (localtime, current_epoch, steps, total_loss.numpy(),
+ accuracys[-1].numpy(), skip_steps / used_time))
+ time_begin = time.time()
+
+ if steps != 0 and steps % save_steps == 0 and fluid.dygraph.parallel.Env(
+ ).local_rank == 0:
+
+ self.test(data_dir, batch_size=64, max_seq_len=512)
+
+ save_path = os.path.join(checkpoints,
+ "steps" + "_" + str(steps))
+ fluid.save_dygraph(self.cls_model.state_dict(), save_path)
+ fluid.save_dygraph(optimizer.optimizer.state_dict(), save_path)
+ print("Save model parameters and optimizer status at %s" %
+ save_path)
+
+ steps += 1
+
+ if fluid.dygraph.parallel.Env().local_rank == 0:
+ save_path = os.path.join(checkpoints, "final")
+ fluid.save_dygraph(self.cls_model.state_dict(), save_path)
+ fluid.save_dygraph(optimizer.optimizer.state_dict(), save_path)
+ print("Save model parameters and optimizer status at %s" %
+ save_path)
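+
+
+# A minimal usage sketch (the GLUE data path and label count below are
+# assumed for illustration, not fixed by this class):
+#
+# import paddle.fluid as fluid
+# with fluid.dygraph.guard():
+# teacher = BERTClassifier(num_labels=3, task_name="mnli")
+# teacher.fit(data_dir="./data/glue/MNLI", epoch=3, batch_size=64)
+# teacher.test(data_dir="./data/glue/MNLI")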
diff --git a/paddleslim/teachers/bert/model/__init__.py b/paddleslim/teachers/bert/model/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/paddleslim/teachers/bert/model/bert.py b/paddleslim/teachers/bert/model/bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..d09c0dea7805ebf3bfb6d4d8a4d3430a0e9844fa
--- /dev/null
+++ b/paddleslim/teachers/bert/model/bert.py
@@ -0,0 +1,273 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"dygraph transformer layers"
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+import json
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, to_variable, Layer, guard
+
+from .transformer_encoder import EncoderLayer, PrePostProcessLayer
+
+
+class BertConfig(object):
+ def __init__(self, config_path):
+ self._config_dict = self._parse(config_path)
+
+ def _parse(self, config_path):
+ try:
+ with open(config_path) as json_file:
+ config_dict = json.load(json_file)
+ except Exception:
+ raise IOError("Error in parsing bert model config file '%s'" %
+ config_path)
+ else:
+ return config_dict
+
+ def __getitem__(self, key):
+ return self._config_dict[key]
+
+ def print_config(self):
+ for arg, value in sorted(six.iteritems(self._config_dict)):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+class BertModelLayer(Layer):
+ """
+ bert
+ """
+
+ def __init__(self, config, return_pooled_out=True, use_fp16=False):
+ super(BertModelLayer, self).__init__()
+
+ self._emb_size = config['hidden_size']
+ self._n_layer = config['num_hidden_layers']
+ self._n_head = config['num_attention_heads']
+ self._voc_size = config['vocab_size']
+ self._max_position_seq_len = config['max_position_embeddings']
+ self._sent_types = config['type_vocab_size']
+ self._hidden_act = config['hidden_act']
+ self._prepostprocess_dropout = config['hidden_dropout_prob']
+ self._attention_dropout = config['attention_probs_dropout_prob']
+ self.return_pooled_out = return_pooled_out
+
+ self._word_emb_name = "word_embedding"
+ self._pos_emb_name = "pos_embedding"
+ self._sent_emb_name = "sent_embedding"
+ self._dtype = "float16" if use_fp16 else "float32"
+
+ self._param_initializer = fluid.initializer.TruncatedNormal(
+ scale=config['initializer_range'])
+
+ self._src_emb = Embedding(
+ size=[self._voc_size, self._emb_size],
+ param_attr=fluid.ParamAttr(
+ name=self._word_emb_name, initializer=self._param_initializer),
+ dtype=self._dtype)
+
+ self._pos_emb = Embedding(
+ size=[self._max_position_seq_len, self._emb_size],
+ param_attr=fluid.ParamAttr(
+ name=self._pos_emb_name, initializer=self._param_initializer),
+ dtype=self._dtype)
+
+ self._sent_emb = Embedding(
+ size=[self._sent_types, self._emb_size],
+ param_attr=fluid.ParamAttr(
+ name=self._sent_emb_name, initializer=self._param_initializer),
+ dtype=self._dtype)
+
+ self.pooled_fc = Linear(
+ input_dim=self._emb_size,
+ output_dim=self._emb_size,
+ param_attr=fluid.ParamAttr(
+ name="pooled_fc.w_0", initializer=self._param_initializer),
+ bias_attr="pooled_fc.b_0",
+ act="tanh")
+
+ self.pre_process_layer = PrePostProcessLayer(
+ "nd", self._emb_size, self._prepostprocess_dropout, "")
+
+ self._encoder = EncoderLayer(
+ hidden_act=self._hidden_act,
+ n_layer=self._n_layer,
+ n_head=self._n_head,
+ d_key=self._emb_size // self._n_head,
+ d_value=self._emb_size // self._n_head,
+ d_model=self._emb_size,
+ d_inner_hid=self._emb_size * 4,
+ prepostprocess_dropout=self._prepostprocess_dropout,
+ attention_dropout=self._attention_dropout,
+ relu_dropout=0,
+ preprocess_cmd="",
+ postprocess_cmd="dan",
+ param_initializer=self._param_initializer)
+
+ def emb_names(self):
+ return self._src_emb.parameters() + self._pos_emb.parameters(
+ ) + self._sent_emb.parameters()
+
+ def forward(self, src_ids, position_ids, sentence_ids, input_mask):
+ """
+ forward
+ """
+ src_emb = self._src_emb(src_ids)
+ pos_emb = self._pos_emb(position_ids)
+ sent_emb = self._sent_emb(sentence_ids)
+
+ emb_out = src_emb + pos_emb
+ emb_out = emb_out + sent_emb
+
+ emb_out = self.pre_process_layer(emb_out)
+
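+ # Build the additive attention bias from the padding mask:
+ # (mask x mask^T - 1) * 10000 yields 0 for valid token pairs and
+ # -10000 for pairs involving padding, which suppresses attention
+ # to padded positions after the softmax.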
+ self_attn_mask = fluid.layers.matmul(
+ x=input_mask, y=input_mask, transpose_y=True)
+ self_attn_mask = fluid.layers.scale(
+ x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
+ n_head_self_attn_mask = fluid.layers.stack(
+ x=[self_attn_mask] * self._n_head, axis=1)
+ n_head_self_attn_mask.stop_gradient = True
+
+ enc_outputs = self._encoder(emb_out, n_head_self_attn_mask)
+
+ if not self.return_pooled_out:
+ return enc_outputs
+ next_sent_feats = []
+ for enc_output in enc_outputs:
+ next_sent_feat = fluid.layers.slice(
+ input=enc_output, axes=[1], starts=[0], ends=[1])
+ next_sent_feat = self.pooled_fc(next_sent_feat)
+ next_sent_feat = fluid.layers.reshape(
+ next_sent_feat, shape=[-1, self._emb_size])
+ next_sent_feats.append(next_sent_feat)
+
+ return enc_outputs, next_sent_feats
+
+
+class PretrainModelLayer(Layer):
+ """
+ pretrain model
+ """
+
+ def __init__(self,
+ config,
+ return_pooled_out=True,
+ weight_sharing=True,
+ use_fp16=False):
+ super(PretrainModelLayer, self).__init__()
+ self.config = config
+ self._voc_size = config['vocab_size']
+ self._emb_size = config['hidden_size']
+ self._hidden_act = config['hidden_act']
+ self._prepostprocess_dropout = config['hidden_dropout_prob']
+
+ self._word_emb_name = "word_embedding"
+ self._param_initializer = fluid.initializer.TruncatedNormal(
+ scale=config['initializer_range'])
+ self._weight_sharing = weight_sharing
+ self.use_fp16 = use_fp16
+ self._dtype = "float16" if use_fp16 else "float32"
+
+ self.bert_layer = BertModelLayer(
+ config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
+
+ self.pre_process_layer = PrePostProcessLayer(
+ "n", self._emb_size, self._prepostprocess_dropout, "pre_encoder")
+
+ self.pooled_fc = Linear(
+ input_dim=self._emb_size,
+ output_dim=self._emb_size,
+ param_attr=fluid.ParamAttr(
+ name="mask_lm_trans_fc.w_0",
+ initializer=self._param_initializer),
+ bias_attr="mask_lm_trans_fc.b_0",
+ act="tanh")
+
+ self.mask_lm_out_bias_attr = fluid.ParamAttr(
+ name="mask_lm_out_fc.b_0",
+ initializer=fluid.initializer.Constant(value=0.0))
+
+ if not self._weight_sharing:
+ self.out_fc = Linear(
+ input_dim=self._emb_size,
+ output_dim=self._voc_size,
+ param_attr=fluid.ParamAttr(
+ name="mask_lm_out_fc.w_0",
+ initializer=self._param_initializer),
+ bias_attr=self.mask_lm_out_bias_attr)
+ else:
+ self.fc_create_params = self.create_parameter(
+ shape=[self._voc_size],
+ dtype=self._dtype,
+ attr=self.mask_lm_out_bias_attr,
+ is_bias=True)
+
+ self.next_sent_fc = Linear(
+ input_dim=self._emb_size,
+ output_dim=2,
+ param_attr=fluid.ParamAttr(
+ name="next_sent_fc.w_0", initializer=self._param_initializer),
+ bias_attr="next_sent_fc.b_0")
+
+ def forward(self, src_ids, position_ids, sentence_ids, input_mask,
+ mask_label, mask_pos, labels):
+ """
+ forward
+ """
+ mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
+
+ enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
+ sentence_ids, input_mask)
+ reshaped_emb_out = fluid.layers.reshape(
+ x=enc_output, shape=[-1, self._emb_size])
+
+ mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
+
+ mask_trans_feat = self.pooled_fc(mask_feat)
+ mask_trans_feat = self.pre_process_layer(mask_trans_feat)
+
+ if self._weight_sharing:
+ fc_out = fluid.layers.matmul(
+ x=mask_trans_feat,
+ y=self.bert_layer._src_emb._w,
+ transpose_y=True)
+ fc_out += self.fc_create_params
+ else:
+ fc_out = self.out_fc(mask_trans_feat)
+
+ mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
+ logits=fc_out, label=mask_label)
+ mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)
+
+ next_sent_fc_out = self.next_sent_fc(next_sent_feat)
+
+ next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(
+ logits=next_sent_fc_out, label=labels, return_softmax=True)
+
+ next_sent_acc = fluid.layers.accuracy(
+ input=next_sent_softmax, label=labels)
+
+ mean_next_sent_loss = fluid.layers.mean(next_sent_loss)
+
+ loss = mean_next_sent_loss + mean_mask_lm_loss
+ return next_sent_acc, mean_mask_lm_loss, loss
diff --git a/paddleslim/teachers/bert/model/cls.py b/paddleslim/teachers/bert/model/cls.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdfef8b5b4d6c6133e176a146f170bbb633701aa
--- /dev/null
+++ b/paddleslim/teachers/bert/model/cls.py
@@ -0,0 +1,121 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"dygraph transformer layers"
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+import json
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import Linear, Layer
+
+from .bert import BertModelLayer
+
+
+class ClsModelLayer(Layer):
+ """
+ classify model
+ """
+
+ def __init__(self,
+ config,
+ num_labels,
+ is_training=True,
+ return_pooled_out=True,
+ loss_scaling=1.0,
+ use_fp16=False):
+ super(ClsModelLayer, self).__init__()
+ self.config = config
+ self.is_training = is_training
+ self.use_fp16 = use_fp16
+ self.loss_scaling = loss_scaling
+ self.n_layers = config['num_hidden_layers']
+ self.return_pooled_out = return_pooled_out
+
+ self.bert_layer = BertModelLayer(
+ config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
+
+ self.cls_fc = list()
+ for i in range(self.n_layers):
+ fc = Linear(
+ input_dim=self.config["hidden_size"],
+ output_dim=num_labels,
+ param_attr=fluid.ParamAttr(
+ name="cls_out_%d_w" % i,
+ initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
+ bias_attr=fluid.ParamAttr(
+ name="cls_out_%d_b" % i,
+ initializer=fluid.initializer.Constant(0.)))
+ fc = self.add_sublayer("cls_fc_%d" % i, fc)
+ self.cls_fc.append(fc)
+
+ def emb_names(self):
+ return self.bert_layer.emb_names()
+
+ def forward(self, data_ids):
+ """
+ forward
+ """
+ src_ids = data_ids[0]
+ position_ids = data_ids[1]
+ sentence_ids = data_ids[2]
+ input_mask = data_ids[3]
+ labels = data_ids[4]
+
+ enc_outputs, next_sent_feats = self.bert_layer(
+ src_ids, position_ids, sentence_ids, input_mask)
+
+ if not self.return_pooled_out:
+ cls_feat = fluid.layers.dropout(
+ x=next_sent_feats[-1],
+ dropout_prob=0.1,
+ dropout_implementation="upscale_in_train")
+ logits = self.cls_fc[-1](cls_feat)
+ probs = fluid.layers.softmax(logits)
+ num_seqs = fluid.layers.create_tensor(dtype='int64')
+ accuracy = fluid.layers.accuracy(
+ input=probs, label=labels, total=num_seqs)
+ return enc_outputs, logits, accuracy, num_seqs
+
+ logits = []
+ losses = []
+ accuracys = []
+ for next_sent_feat, fc in zip(next_sent_feats, self.cls_fc):
+ cls_feat = fluid.layers.dropout(
+ x=next_sent_feat,
+ dropout_prob=0.1,
+ dropout_implementation="upscale_in_train")
+ logit = fc(cls_feat)
+ logits.append(logit)
+
+ ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
+ logits=logit, label=labels, return_softmax=True)
+ loss = fluid.layers.mean(x=ce_loss)
+ losses.append(loss)
+
+ if self.use_fp16 and self.loss_scaling > 1.0:
+ loss *= self.loss_scaling
+
+ num_seqs = fluid.layers.create_tensor(dtype='int64')
+ accuracy = fluid.layers.accuracy(
+ input=probs, label=labels, total=num_seqs)
+ accuracys.append(accuracy)
+ total_loss = fluid.layers.sum(losses)
+
+ return total_loss, logits, losses, accuracys, num_seqs
diff --git a/paddleslim/teachers/bert/model/transformer_encoder.py b/paddleslim/teachers/bert/model/transformer_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff6e2b092b48b730238c7515b96f634f6226e597
--- /dev/null
+++ b/paddleslim/teachers/bert/model/transformer_encoder.py
@@ -0,0 +1,398 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"dygraph transformer layers"
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer
+
+
+class PrePostProcessLayer(Layer):
+ """
+ PrePostProcessLayer
+ """
+
+ def __init__(self, process_cmd, d_model, dropout_rate, name):
+ super(PrePostProcessLayer, self).__init__()
+ self.process_cmd = process_cmd
+ self.functors = []
+ self.exec_order = ""
+
+ for cmd in self.process_cmd:
+ if cmd == "a": # add residual connection
+ self.functors.append(
+ lambda x, y: x + y if y is not None else x)
+ self.exec_order += "a"
+ elif cmd == "n": # add layer normalization
+ self.functors.append(
+ self.add_sublayer(
+ "layer_norm_%d" % len(
+ self.sublayers(include_sublayers=False)),
+ LayerNorm(
+ normalized_shape=d_model,
+ param_attr=fluid.ParamAttr(
+ name=name + "_layer_norm_scale",
+ initializer=fluid.initializer.Constant(1.)),
+ bias_attr=fluid.ParamAttr(
+ name=name + "_layer_norm_bias",
+ initializer=fluid.initializer.Constant(0.)))))
+ self.exec_order += "n"
+ elif cmd == "d": # add dropout
+ if dropout_rate:
+ self.functors.append(lambda x: fluid.layers.dropout(
+ x, dropout_prob=dropout_rate, is_test=False))
+ self.exec_order += "d"
+
+ def forward(self, x, residual=None):
+ for i, cmd in enumerate(self.exec_order):
+ if cmd == "a":
+ x = self.functors[i](x, residual)
+ else:
+ x = self.functors[i](x)
+ return x
+
+
+class PositionwiseFeedForwardLayer(Layer):
+ """
+ PositionwiseFeedForwardLayer
+ """
+
+ def __init__(self,
+ hidden_act,
+ d_inner_hid,
+ d_model,
+ dropout_rate,
+ param_initializer=None,
+ name=""):
+ super(PositionwiseFeedForwardLayer, self).__init__()
+
+ self._i2h = Linear(
+ input_dim=d_model,
+ output_dim=d_inner_hid,
+ param_attr=fluid.ParamAttr(
+ name=name + '_fc_0.w_0', initializer=param_initializer),
+ bias_attr=name + '_fc_0.b_0',
+ act=hidden_act)
+
+ self._h2o = Linear(
+ input_dim=d_inner_hid,
+ output_dim=d_model,
+ param_attr=fluid.ParamAttr(
+ name=name + '_fc_1.w_0', initializer=param_initializer),
+ bias_attr=name + '_fc_1.b_0')
+
+ self._dropout_rate = dropout_rate
+
+ def forward(self, x):
+ """
+ forward
+ :param x:
+ :return:
+ """
+ hidden = self._i2h(x)
+ if self._dropout_rate:
+ hidden = fluid.layers.dropout(
+ hidden,
+ dropout_prob=self._dropout_rate,
+ dropout_implementation="upscale_in_train",
+ is_test=False)
+ out = self._h2o(hidden)
+ return out
+
+
+class MultiHeadAttentionLayer(Layer):
+ """
+ MultiHeadAttentionLayer
+ """
+
+ def __init__(self,
+ d_key,
+ d_value,
+ d_model,
+ n_head=1,
+ dropout_rate=0.,
+ cache=None,
+ gather_idx=None,
+ static_kv=False,
+ param_initializer=None,
+ name=""):
+ super(MultiHeadAttentionLayer, self).__init__()
+ self._n_head = n_head
+ self._d_key = d_key
+ self._d_value = d_value
+ self._d_model = d_model
+ self._dropout_rate = dropout_rate
+
+ self._q_fc = Linear(
+ input_dim=d_model,
+ output_dim=d_key * n_head,
+ param_attr=fluid.ParamAttr(
+ name=name + '_query_fc.w_0', initializer=param_initializer),
+ bias_attr=name + '_query_fc.b_0')
+
+ self._k_fc = Linear(
+ input_dim=d_model,
+ output_dim=d_key * n_head,
+ param_attr=fluid.ParamAttr(
+ name=name + '_key_fc.w_0', initializer=param_initializer),
+ bias_attr=name + '_key_fc.b_0')
+
+ self._v_fc = Linear(
+ input_dim=d_model,
+ output_dim=d_value * n_head,
+ param_attr=fluid.ParamAttr(
+ name=name + '_value_fc.w_0', initializer=param_initializer),
+ bias_attr=name + '_value_fc.b_0')
+
+ self._proj_fc = Linear(
+ input_dim=d_value * n_head,
+ output_dim=d_model,
+ param_attr=fluid.ParamAttr(
+ name=name + '_output_fc.w_0', initializer=param_initializer),
+ bias_attr=name + '_output_fc.b_0')
+
+ def forward(self, queries, keys, values, attn_bias):
+ """
+ forward
+ :param queries:
+ :param keys:
+ :param values:
+ :param attn_bias:
+ :return:
+ """
+ # compute q ,k ,v
+ keys = queries if keys is None else keys
+ values = keys if values is None else values
+
+ q = self._q_fc(queries)
+ k = self._k_fc(keys)
+ v = self._v_fc(values)
+
+ # split head
+
+ q_hidden_size = q.shape[-1]
+ reshaped_q = fluid.layers.reshape(
+ x=q,
+ shape=[0, 0, self._n_head, q_hidden_size // self._n_head],
+ inplace=False)
+ transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])
+
+ k_hidden_size = k.shape[-1]
+ reshaped_k = fluid.layers.reshape(
+ x=k,
+ shape=[0, 0, self._n_head, k_hidden_size // self._n_head],
+ inplace=False)
+ transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3])
+
+ v_hidden_size = v.shape[-1]
+ reshaped_v = fluid.layers.reshape(
+ x=v,
+ shape=[0, 0, self._n_head, v_hidden_size // self._n_head],
+ inplace=False)
+ transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3])
+
+ scaled_q = fluid.layers.scale(x=transpose_q, scale=self._d_key**-0.5)
+ # scale dot product attention
+ product = fluid.layers.matmul(
+ #x=transpose_q,
+ x=scaled_q,
+ y=transpose_k,
+ transpose_y=True)
+ #alpha=self._d_model**-0.5)
+ if attn_bias is not None:
+ product += attn_bias
+ weights = fluid.layers.softmax(product)
+ if self._dropout_rate:
+ weights_droped = fluid.layers.dropout(
+ weights,
+ dropout_prob=self._dropout_rate,
+ dropout_implementation="upscale_in_train",
+ is_test=False)
+ out = fluid.layers.matmul(weights_droped, transpose_v)
+ else:
+ out = fluid.layers.matmul(weights, transpose_v)
+
+ # combine heads
+ if len(out.shape) != 4:
+ raise ValueError("Input(x) should be a 4-D Tensor.")
+ trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3])
+ final_out = fluid.layers.reshape(
+ x=trans_x,
+ shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
+ inplace=False)
+
+ # fc to output
+ proj_out = self._proj_fc(final_out)
+ return proj_out
+
+
+class EncoderSubLayer(Layer):
+ """
+ EncoderSubLayer
+ """
+
+ def __init__(self,
+ hidden_act,
+ n_head,
+ d_key,
+ d_value,
+ d_model,
+ d_inner_hid,
+ prepostprocess_dropout,
+ attention_dropout,
+ relu_dropout,
+ preprocess_cmd="n",
+ postprocess_cmd="da",
+ param_initializer=None,
+ name=""):
+
+ super(EncoderSubLayer, self).__init__()
+ self.name = name
+ self._preprocess_cmd = preprocess_cmd
+ self._postprocess_cmd = postprocess_cmd
+ self._prepostprocess_dropout = prepostprocess_dropout
+
+ self._preprocess_layer = PrePostProcessLayer(
+ self._preprocess_cmd,
+ d_model,
+ prepostprocess_dropout,
+ name=name + "_pre_att")
+
+ self._multihead_attention_layer = MultiHeadAttentionLayer(
+ d_key,
+ d_value,
+ d_model,
+ n_head,
+ attention_dropout,
+ None,
+ None,
+ False,
+ param_initializer,
+ name=name + "_multi_head_att")
+
+ self._postprocess_layer = PrePostProcessLayer(
+ self._postprocess_cmd,
+ d_model,
+ self._prepostprocess_dropout,
+ name=name + "_post_att")
+ self._preprocess_layer2 = PrePostProcessLayer(
+ self._preprocess_cmd,
+ d_model,
+ self._prepostprocess_dropout,
+ name=name + "_pre_ffn")
+
+ self._positionwise_feed_forward = PositionwiseFeedForwardLayer(
+ hidden_act,
+ d_inner_hid,
+ d_model,
+ relu_dropout,
+ param_initializer,
+ name=name + "_ffn")
+
+ self._postprocess_layer2 = PrePostProcessLayer(
+ self._postprocess_cmd,
+ d_model,
+ self._prepostprocess_dropout,
+ name=name + "_post_ffn")
+
+ def forward(self, enc_input, attn_bias):
+ """
+ forward
+ :param enc_input:
+ :param attn_bias:
+ :return:
+ """
+ pre_process_multihead = self._preprocess_layer(enc_input)
+
+ attn_output = self._multihead_attention_layer(pre_process_multihead,
+ None, None, attn_bias)
+ attn_output = self._postprocess_layer(attn_output, enc_input)
+
+ pre_process2_output = self._preprocess_layer2(attn_output)
+
+ ffd_output = self._positionwise_feed_forward(pre_process2_output)
+
+ return self._postprocess_layer2(ffd_output, attn_output)
+
+
+class EncoderLayer(Layer):
+ """
+ encoder
+ """
+
+ def __init__(self,
+ hidden_act,
+ n_layer,
+ n_head,
+ d_key,
+ d_value,
+ d_model,
+ d_inner_hid,
+ prepostprocess_dropout,
+ attention_dropout,
+ relu_dropout,
+ preprocess_cmd="n",
+ postprocess_cmd="da",
+ param_initializer=None,
+ name=""):
+
+ super(EncoderLayer, self).__init__()
+ self._preprocess_cmd = preprocess_cmd
+ self._encoder_sublayers = list()
+ self._prepostprocess_dropout = prepostprocess_dropout
+ self._n_layer = n_layer
+ self._hidden_act = hidden_act
+ self._preprocess_layer = PrePostProcessLayer(
+ self._preprocess_cmd, 3, self._prepostprocess_dropout,
+ "post_encoder")
+
+ for i in range(n_layer):
+ self._encoder_sublayers.append(
+ self.add_sublayer(
+ 'esl_%d' % i,
+ EncoderSubLayer(
+ hidden_act,
+ n_head,
+ d_key,
+ d_value,
+ d_model,
+ d_inner_hid,
+ prepostprocess_dropout,
+ attention_dropout,
+ relu_dropout,
+ preprocess_cmd,
+ postprocess_cmd,
+ param_initializer,
+ name=name + '_layer_' + str(i))))
+
+ def forward(self, enc_input, attn_bias):
+ """
+ forward
+ :param enc_input:
+ :param attn_bias:
+ :return:
+ """
+ outputs = []
+ for i in range(self._n_layer):
+ enc_output = self._encoder_sublayers[i](enc_input, attn_bias)
+ outputs.append(enc_output)
+ enc_input = enc_output
+
+ return outputs
diff --git a/paddleslim/teachers/bert/optimization.py b/paddleslim/teachers/bert/optimization.py
new file mode 100755
index 0000000000000000000000000000000000000000..bf004ae030b6235910e13bb01f538a117a21043a
--- /dev/null
+++ b/paddleslim/teachers/bert/optimization.py
@@ -0,0 +1,170 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Optimization and learning rate scheduling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle.fluid as fluid
+
+from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
+
+
+class ConstantLR(LearningRateDecay):
+ def __init__(self, learning_rate, begin=0, step=1, dtype='float32'):
+ super(ConstantLR, self).__init__(begin, step, dtype)
+ self.learning_rate = learning_rate
+
+ def step(self):
+ return self.learning_rate
+
+
+class LinearDecay(LearningRateDecay):
+ def __init__(self,
+ learning_rate,
+ warmup_steps,
+ decay_steps,
+ end_learning_rate=0.0001,
+ power=1.0,
+ cycle=False,
+ begin=0,
+ step=1,
+ dtype='float32'):
+ super(LinearDecay, self).__init__(begin, step, dtype)
+ self.learning_rate = learning_rate
+ self.warmup_steps = warmup_steps
+ self.decay_steps = decay_steps
+ self.end_learning_rate = end_learning_rate
+ self.power = power
+ self.cycle = cycle
+
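+ # Linear warmup from 0 to ``learning_rate`` over ``warmup_steps``
+ # steps, then polynomial decay (linear when power=1.0) towards
+ # ``end_learning_rate`` over ``decay_steps`` steps.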
+ def step(self):
+ if self.step_num < self.warmup_steps:
+ decayed_lr = self.learning_rate * (self.step_num /
+ self.warmup_steps)
+ decayed_lr = self.create_lr_var(decayed_lr)
+ else:
+ tmp_step_num = self.step_num
+ tmp_decay_steps = self.decay_steps
+ if self.cycle:
+ div_res = fluid.layers.ceil(
+ self.create_lr_var(tmp_step_num / float(self.decay_steps)))
+ if tmp_step_num == 0:
+ div_res = self.create_lr_var(1.0)
+ tmp_decay_steps = self.decay_steps * div_res
+ else:
+ tmp_step_num = self.create_lr_var(
+ tmp_step_num
+ if tmp_step_num < self.decay_steps else self.decay_steps)
+ decayed_lr = (self.learning_rate - self.end_learning_rate) * \
+ ((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate
+
+ return decayed_lr
+
+
+class Optimizer(object):
+ def __init__(self,
+ warmup_steps,
+ num_train_steps,
+ learning_rate,
+ model_cls,
+ weight_decay,
+ scheduler='linear_warmup_decay',
+ loss_scaling=1.0,
+ parameter_list=None):
+ self.warmup_steps = warmup_steps
+ self.num_train_steps = num_train_steps
+ self.learning_rate = learning_rate
+ self.model_cls = model_cls
+ self.weight_decay = weight_decay
+ self.scheduler = scheduler
+ self.loss_scaling = loss_scaling
+ self.parameter_list = parameter_list
+
+ self.scheduled_lr = 0.0
+ self.optimizer = self.lr_schedule()
+
+ def lr_schedule(self):
+ if self.warmup_steps > 0:
+ if self.scheduler == 'noam_decay':
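+ # With d_model set to 1 / (warmup_steps * learning_rate**2),
+ # the Noam schedule peaks at exactly ``learning_rate`` after
+ # ``warmup_steps`` steps.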
+ self.scheduled_lr = fluid.dygraph.NoamDecay(1 / (
+ self.warmup_steps * (self.learning_rate**2)),
+ self.warmup_steps)
+ elif self.scheduler == 'linear_warmup_decay':
+ self.scheduled_lr = LinearDecay(self.learning_rate,
+ self.warmup_steps,
+ self.num_train_steps, 0.0)
+ else:
+ raise ValueError("Unkown learning rate scheduler, should be "
+ "'noam_decay' or 'linear_warmup_decay'")
+ optimizer = fluid.optimizer.Adam(
+ learning_rate=self.scheduled_lr,
+ parameter_list=self.parameter_list)
+ else:
+ self.scheduled_lr = ConstantLR(self.learning_rate)
+ optimizer = fluid.optimizer.Adam(
+ learning_rate=self.scheduled_lr,
+ parameter_list=self.parameter_list)
+
+ return optimizer
+
+ def exclude_from_weight_decay(self, name):
+ if name.find("layer_norm") > -1:
+ return True
+ bias_suffix = ["_bias", "_b", ".b_0"]
+ for suffix in bias_suffix:
+ if name.endswith(suffix):
+ return True
+ return False
+
+ def optimization(self, loss, use_data_parallel=False, model=None):
+ param_list = dict()
+
+ clip_norm_thres = 1.0
+ #grad_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm_thres)
+
+ if use_data_parallel:
+ loss = model.scale_loss(loss)
+
+ loss.backward()
+
+ if self.weight_decay > 0:
+ for param in self.model_cls.parameters():
+ param_list[param.name] = param * 1.0
+ param_list[param.name].stop_gradient = True
+
+ if use_data_parallel:
+ assert model is not None
+ model.apply_collective_grads()
+
+ #_, param_grads = self.optimizer.minimize(loss, grad_clip=grad_clip)
+ _, param_grads = self.optimizer.minimize(loss)
+
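+ # Decoupled weight decay (AdamW-style): after the Adam step, shrink
+ # each parameter by ``weight_decay * lr`` times its pre-update
+ # value, skipping layer-norm and bias parameters.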
+ if self.weight_decay > 0:
+ for param, grad in param_grads:
+ if self.exclude_from_weight_decay(param.name):
+ continue
+ if isinstance(self.scheduled_lr.step(), float):
+ updated_param = param.numpy() - param_list[
+ param.name].numpy(
+ ) * self.weight_decay * self.scheduled_lr.step()
+ else:
+ updated_param = param.numpy(
+ ) - param_list[param.name].numpy(
+ ) * self.weight_decay * self.scheduled_lr.step().numpy()
+ updated_param_var = fluid.dygraph.to_variable(updated_param)
+ param = updated_param_var
+ #param = fluid.layers.reshape(x=updated_param_var, shape=list(updated_param_var.shape))
diff --git a/paddleslim/teachers/bert/reader/__init__.py b/paddleslim/teachers/bert/reader/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/paddleslim/teachers/bert/reader/batching.py b/paddleslim/teachers/bert/reader/batching.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a214700a9e2db27900602c235c32e435e7b85fb
--- /dev/null
+++ b/paddleslim/teachers/bert/reader/batching.py
@@ -0,0 +1,189 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Mask, padding and batching."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
+ """
+ Add masks to batch_tokens; return out, mask_label and mask_pos.
+ Note: mask_pos corresponds to batch_tokens after padding.
+ """
+ max_len = max([len(sent) for sent in batch_tokens])
+ mask_label = []
+ mask_pos = []
+ prob_mask = np.random.rand(total_token_num)
+ # Note: the first token is [CLS], so [low=1]
+ replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
+ pre_sent_len = 0
+ prob_index = 0
+ for sent_index, sent in enumerate(batch_tokens):
+ mask_flag = False
+ prob_index += pre_sent_len
+ for token_index, token in enumerate(sent):
+ prob = prob_mask[prob_index + token_index]
+ if prob > 0.15:
+ continue
+ elif 0.03 < prob <= 0.15:
+ # mask
+ if token != SEP and token != CLS:
+ mask_label.append(sent[token_index])
+ sent[token_index] = MASK
+ mask_flag = True
+ mask_pos.append(sent_index * max_len + token_index)
+ elif 0.015 < prob <= 0.03:
+ # random replace
+ if token != SEP and token != CLS:
+ mask_label.append(sent[token_index])
+ sent[token_index] = replace_ids[prob_index + token_index]
+ mask_flag = True
+ mask_pos.append(sent_index * max_len + token_index)
+ else:
+ # keep the original token
+ if token != SEP and token != CLS:
+ mask_label.append(sent[token_index])
+ mask_pos.append(sent_index * max_len + token_index)
+ pre_sent_len = len(sent)
+
+ # ensure that at least one word in each sentence is masked
+ while not mask_flag:
+ token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
+ if sent[token_index] != SEP and sent[token_index] != CLS:
+ mask_label.append(sent[token_index])
+ sent[token_index] = MASK
+ mask_flag = True
+ mask_pos.append(sent_index * max_len + token_index)
+ mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
+ mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
+ return batch_tokens, mask_label, mask_pos
+
+
+def prepare_batch_data(insts,
+ total_token_num,
+ voc_size=0,
+ pad_id=None,
+ cls_id=None,
+ sep_id=None,
+ mask_id=None,
+ return_input_mask=True,
+ return_max_len=True,
+ return_num_token=False):
+ """
+ 1. generate Tensor of data
+ 2. generate Tensor of position
+ 3. generate self attention mask, [shape: batch_size * max_len * max_len]
+ """
+
+ batch_src_ids = [inst[0] for inst in insts]
+ batch_sent_ids = [inst[1] for inst in insts]
+ batch_pos_ids = [inst[2] for inst in insts]
+ labels_list = []
+ # compatible with squad, whose example includes start/end positions,
+ # or unique id
+
+ for i in range(3, len(insts[0]), 1):
+ labels = [inst[i] for inst in insts]
+ labels = np.array(labels).astype("int64").reshape([-1, 1])
+ labels_list.append(labels)
+
+ # First step: do mask without padding
+ if mask_id >= 0:
+ out, mask_label, mask_pos = mask(
+ batch_src_ids,
+ total_token_num,
+ vocab_size=voc_size,
+ CLS=cls_id,
+ SEP=sep_id,
+ MASK=mask_id)
+ else:
+ out = batch_src_ids
+ # Second step: padding
+ src_id, self_input_mask = pad_batch_data(
+ out, pad_idx=pad_id, return_input_mask=True)
+ pos_id = pad_batch_data(
+ batch_pos_ids,
+ pad_idx=pad_id,
+ return_pos=False,
+ return_input_mask=False)
+ sent_id = pad_batch_data(
+ batch_sent_ids,
+ pad_idx=pad_id,
+ return_pos=False,
+ return_input_mask=False)
+
+ if mask_id >= 0:
+ return_list = [
+ src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos
+ ] + labels_list
+ else:
+ return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list
+
+ return return_list if len(return_list) > 1 else return_list[0]
+
+
+def pad_batch_data(insts,
+ pad_idx=0,
+ return_pos=False,
+ return_input_mask=False,
+ return_max_len=False,
+ return_num_token=False):
+ """
+ Pad the instances to the max sequence length in batch, and generate the
+ corresponding position data and input mask.
+ """
+ return_list = []
+ max_len = max(len(inst) for inst in insts)
+ # Any token included in dict can be used to pad, since the paddings' loss
+ # will be masked out by weights and make no effect on parameter gradients.
+
+ inst_data = np.array([
+ list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
+ ])
+ return_list += [inst_data.astype("int64").reshape([-1, max_len])]
+
+ # position data
+ if return_pos:
+ inst_pos = np.array([
+ list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
+ for inst in insts
+ ])
+
+ return_list += [inst_pos.astype("int64").reshape([-1, max_len])]
+
+ if return_input_mask:
+ # This is used to avoid attention on paddings.
+ input_mask_data = np.array([[1] * len(inst) + [0] *
+ (max_len - len(inst)) for inst in insts])
+ input_mask_data = np.expand_dims(input_mask_data, axis=-1)
+ return_list += [input_mask_data.astype("float32")]
+
+ if return_max_len:
+ return_list += [max_len]
+
+ if return_num_token:
+ num_token = 0
+ for inst in insts:
+ num_token += len(inst)
+ return_list += [num_token]
+
+ return return_list if len(return_list) > 1 else return_list[0]
+
+
+if __name__ == "__main__":
+ pass
diff --git a/paddleslim/teachers/bert/reader/cls.py b/paddleslim/teachers/bert/reader/cls.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfdfaf38bee20c8296c5c45a7e9ff120c0d0048c
--- /dev/null
+++ b/paddleslim/teachers/bert/reader/cls.py
@@ -0,0 +1,612 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import io
+import os
+import types
+import csv
+import random
+import numpy as np
+from . import tokenization
+from .batching import prepare_batch_data
+
+
+class DataProcessor(object):
+ """Base class for data converters for sequence classification data sets."""
+
+ def __init__(self,
+ data_dir,
+ vocab_path,
+ max_seq_len,
+ do_lower_case,
+ in_tokens,
+ random_seed=None):
+ self.data_dir = data_dir
+ self.max_seq_len = max_seq_len
+ self.tokenizer = tokenization.FullTokenizer(
+ vocab_file=vocab_path, do_lower_case=do_lower_case)
+ self.vocab = self.tokenizer.vocab
+ self.in_tokens = in_tokens
+
+ np.random.seed(random_seed)
+
+ self.current_train_example = -1
+ self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
+ self.current_train_epoch = -1
+
+ def get_train_aug_examples(self, data_dir):
+ """Gets a collection of `InputExample`s for the train set."""
+ raise NotImplementedError()
+
+ def get_train_examples(self, data_dir):
+ """Gets a collection of `InputExample`s for the train set."""
+ raise NotImplementedError()
+
+ def get_dev_examples(self, data_dir):
+ """Gets a collection of `InputExample`s for the dev set."""
+ raise NotImplementedError()
+
+ def get_test_examples(self, data_dir):
+ """Gets a collection of `InputExample`s for prediction."""
+ raise NotImplementedError()
+
+ def get_labels(self):
+ """Gets the list of labels for this data set."""
+ raise NotImplementedError()
+
+ def convert_example(self, index, example, labels, max_seq_len, tokenizer):
+ """Converts a single `InputExample` into a single `InputFeatures`."""
+ feature = convert_single_example(index, example, labels, max_seq_len,
+ tokenizer)
+ return feature
+
+ def generate_instance(self, feature):
+ """
+        Generate an instance from the given feature.
+
+ Args:
+ feature: InputFeatures(object). A single set of features of data.
+ """
+ input_pos = list(range(len(feature.input_ids)))
+ return [
+ feature.input_ids, feature.segment_ids, input_pos, feature.label_id
+ ]
+
+ def generate_batch_data(self,
+ batch_data,
+ total_token_num,
+ voc_size=-1,
+ mask_id=-1,
+ return_input_mask=True,
+ return_max_len=False,
+ return_num_token=False):
+        return prepare_batch_data(
+            batch_data,
+            total_token_num,
+            voc_size=voc_size,
+            pad_id=self.vocab["[PAD]"],
+            cls_id=self.vocab["[CLS]"],
+            sep_id=self.vocab["[SEP]"],
+            mask_id=mask_id,
+            return_input_mask=return_input_mask,
+            return_max_len=return_max_len,
+            return_num_token=return_num_token)
+
+ @classmethod
+ def _read_tsv(cls, input_file, quotechar=None):
+ """Reads a tab separated value file."""
+ with io.open(input_file, "r", encoding="utf8") as f:
+ reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+ lines = []
+ for line in reader:
+ lines.append(line)
+ return lines
+
+ def get_num_examples(self, phase):
+ """Get number of examples for train, dev or test."""
+ if phase not in ['train', 'dev', 'test', 'train_aug']:
+            raise ValueError(
+                "Unknown phase, which should be in "
+                "['train', 'dev', 'test', 'train_aug'].")
+ return self.num_examples[phase]
+
+ def get_train_progress(self):
+ """Gets progress for training phase."""
+ return self.current_train_example, self.current_train_epoch
+
+ def data_generator(self,
+ batch_size,
+ phase='train',
+ epoch=1,
+ dev_count=1,
+ shuffle=True,
+ shuffle_seed=None):
+ """
+        Generate data for the train, dev or test phase.
+
+        Args:
+            batch_size: int. The batch size of generated data.
+            phase: string. The phase for which to generate data.
+            epoch: int. Total number of epochs to generate data for.
+            dev_count: int. The number of devices; batches are yielded in
+                groups of this size.
+            shuffle: bool. Whether to shuffle examples.
+            shuffle_seed: int. Optional seed for shuffling.
+        """
+        if phase in ('search_train', 'search_valid'):
+            # The search phases split a shuffled copy of the training set in
+            # half.
+            search_examples = self.get_train_examples(self.data_dir)
+            random.shuffle(search_examples)
+ if phase == 'train':
+ examples = self.get_train_examples(self.data_dir)
+ self.num_examples['train'] = len(examples)
+ elif phase == 'train_aug':
+ examples = self.get_train_aug_examples(self.data_dir)
+ self.num_examples['train'] = len(examples)
+ elif phase == 'dev':
+ examples = self.get_dev_examples(self.data_dir)
+ self.num_examples['dev'] = len(examples)
+ elif phase == 'test':
+ examples = self.get_test_examples(self.data_dir)
+ self.num_examples['test'] = len(examples)
+        elif phase == 'search_train':
+            self.num_examples['search_train'] = len(search_examples) // 2
+            examples = search_examples[:self.num_examples['search_train']]
+        elif phase == 'search_valid':
+            self.num_examples['search_valid'] = len(search_examples) // 2
+            examples = search_examples[self.num_examples['search_valid']:]
+ else:
+            raise ValueError(
+                "Unknown phase, which should be in ['train', 'train_aug', "
+                "'dev', 'test', 'search_train', 'search_valid'].")
+
+ def instance_reader():
+ for epoch_index in range(epoch):
+ if shuffle:
+ if shuffle_seed is not None:
+ np.random.seed(shuffle_seed)
+ np.random.shuffle(examples)
+ if phase == 'train' or phase == 'search_train':
+ self.current_train_epoch = epoch_index
+ for (index, example) in enumerate(examples):
+ if phase == 'train' or phase == "search_train":
+ self.current_train_example = index + 1
+ feature = self.convert_example(
+ index, example,
+ self.get_labels(), self.max_seq_len, self.tokenizer)
+
+ instance = self.generate_instance(feature)
+ yield instance
+
+ def batch_reader(reader, batch_size, in_tokens):
+ batch, total_token_num, max_len = [], 0, 0
+ for instance in reader():
+ token_ids, sent_ids, pos_ids, label = instance[:4]
+ max_len = max(max_len, len(token_ids))
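+                # In token-based batching, `batch_size` caps the padded token
+                # count of the batch ((len(batch)+1) * max_len); otherwise it
+                # caps the number of instances.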
+ if in_tokens:
+ to_append = (len(batch) + 1) * max_len <= batch_size
+ else:
+ to_append = len(batch) < batch_size
+ if to_append:
+ batch.append(instance)
+ total_token_num += len(token_ids)
+ else:
+ yield batch, total_token_num
+ batch, total_token_num, max_len = [instance], len(
+ token_ids), len(token_ids)
+
+ if len(batch) > 0:
+ yield batch, total_token_num
+
+ def wrapper():
+ all_dev_batches = []
+ for batch_data, total_token_num in batch_reader(
+ instance_reader, batch_size, self.in_tokens):
+ batch_data = self.generate_batch_data(
+ batch_data,
+ total_token_num,
+ voc_size=-1,
+ mask_id=-1,
+ return_input_mask=True,
+ return_max_len=False,
+ return_num_token=False)
+
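+            # Buffer dev_count batches so that each device receives one batch
+            # per step.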
+ if len(all_dev_batches) < dev_count:
+ all_dev_batches.append(batch_data)
+
+ if len(all_dev_batches) == dev_count:
+ for batch in all_dev_batches:
+ batch = self.split_seq_pair(batch)
+ yield batch
+ all_dev_batches = []
+
+ return wrapper
+
+ def split_seq_pair(self, data_ids):
+ src_ids = data_ids[0]
+ sentence_ids = data_ids[2]
+
+ ids = np.squeeze(src_ids)
+ sids = np.squeeze(sentence_ids)
+ batchsize = ids.shape[0]
+
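+        # Gather the tokens of segment 0 (sentence id == 0, non-pad), then
+        # scatter them into a dense [batch, max_seg0_len] matrix; x/y hold the
+        # destination row/column index for each gathered token.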
+ ids_0 = ids[((sids == 0) & (ids != 0))]
+ seqlen_0 = ((sids == 0) & (ids != 0)).astype(np.int64).sum(1)
+ y_0 = np.concatenate([np.arange(s) for s in seqlen_0])
+ x_0 = np.concatenate([
+ np.ones(
+ [s], dtype=np.int64) * i for i, s in enumerate(seqlen_0)
+ ])
+ ids0 = np.zeros([batchsize, seqlen_0.max()], dtype=np.int64)
+ ids0[(x_0, y_0)] = ids_0
+
+ ids_1 = ids[(sids == 1) & (ids != 0)]
+ seqlen_1 = ((sids == 1) & (ids != 0)).astype(np.int64).sum(1)
+ y_1 = np.concatenate([np.arange(s) for s in seqlen_1])
+ x_1 = np.concatenate([
+ np.ones(
+ [s], dtype=np.int64) * i for i, s in enumerate(seqlen_1)
+ ])
+ ids1 = np.zeros([batchsize, seqlen_1.max()], dtype=np.int64)
+ ids1[(x_1, y_1)] = ids_1
+
+ msl = max(seqlen_0.max(), seqlen_1.max())
+ ids0 = np.pad(ids0, [[0, 0], [0, msl - seqlen_0.max()]],
+ mode='constant')
+ ids1 = np.pad(ids1, [[0, 0], [0, msl - seqlen_1.max()]],
+ mode='constant')
+ return data_ids + [ids0, ids1]
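+
+    # Sketch of driving a concrete subclass (hypothetical paths):
+    #   processor = MnliProcessor(data_dir="data/MNLI", vocab_path="vocab.txt",
+    #                             max_seq_len=128, do_lower_case=True,
+    #                             in_tokens=False)
+    #   train_reader = processor.data_generator(batch_size=32, phase='train')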
+
+
+class InputExample(object):
+ """A single training/test example for simple sequence classification."""
+
+ def __init__(self, guid, text_a, text_b=None, label=None):
+ """Constructs a InputExample.
+
+ Args:
+ guid: Unique id for the example.
+ text_a: string. The untokenized text of the first sequence. For single
+ sequence tasks, only this sequence must be specified.
+ text_b: (Optional) string. The untokenized text of the second sequence.
+ Only must be specified for sequence pair tasks.
+ label: (Optional) string. The label of the example. This should be
+ specified for train and dev examples, but not for test examples.
+ """
+ self.guid = guid
+ self.text_a = text_a
+ self.text_b = text_b
+ self.label = label
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+ """Truncates a sequence pair in place to the maximum length."""
+
+ # This is a simple heuristic which will always truncate the longer sequence
+ # one token at a time. This makes more sense than truncating an equal percent
+ # of tokens from each, since if one sequence is very short then each token
+ # that's truncated likely contains more information than a longer sequence.
+ while True:
+ total_length = len(tokens_a) + len(tokens_b)
+ if total_length <= max_length:
+ break
+ if len(tokens_a) > len(tokens_b):
+ tokens_a.pop()
+ else:
+ tokens_b.pop()
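+    # Example: max_length=8 with len(tokens_a)=6 and len(tokens_b)=5
+    # truncates them to lengths 4 and 4 (the longer side loses one token per
+    # pass).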
+
+
+class InputFeatures(object):
+ """A single set of features of data."""
+
+ def __init__(self, input_ids, input_mask, segment_ids, label_id):
+ self.input_ids = input_ids
+ self.input_mask = input_mask
+ self.segment_ids = segment_ids
+ self.label_id = label_id
+
+
+class XnliProcessor(DataProcessor):
+ """Processor for the XNLI data set."""
+
+ def get_train_examples(self, data_dir):
+ """See base class."""
+ self.language = "zh"
+ lines = self._read_tsv(
+ os.path.join(data_dir, "multinli", "multinli.train.%s.tsv" %
+ self.language))
+ examples = []
+ for (i, line) in enumerate(lines):
+ if i == 0:
+ continue
+ guid = "train-%d" % (i)
+ text_a = tokenization.convert_to_unicode(line[0])
+ text_b = tokenization.convert_to_unicode(line[1])
+ label = tokenization.convert_to_unicode(line[2])
+ if label == tokenization.convert_to_unicode("contradictory"):
+ label = tokenization.convert_to_unicode("contradiction")
+ examples.append(
+ InputExample(
+ guid=guid, text_a=text_a, text_b=text_b, label=label))
+ return examples
+
+ def get_dev_examples(self, data_dir):
+ """See base class."""
+ self.language = "zh"
+ lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
+ examples = []
+ for (i, line) in enumerate(lines):
+ if i == 0:
+ continue
+ guid = "dev-%d" % (i)
+ language = tokenization.convert_to_unicode(line[0])
+ if language != tokenization.convert_to_unicode(self.language):
+ continue
+ text_a = tokenization.convert_to_unicode(line[6])
+ text_b = tokenization.convert_to_unicode(line[7])
+ label = tokenization.convert_to_unicode(line[1])
+ examples.append(
+ InputExample(
+ guid=guid, text_a=text_a, text_b=text_b, label=label))
+ return examples
+
+ def get_test_examples(self, data_dir):
+ """See base class."""
+ self.language = "zh"
+ lines = self._read_tsv(os.path.join(data_dir, "xnli.test.tsv"))
+ examples = []
+ for (i, line) in enumerate(lines):
+ if i == 0:
+ continue
+ guid = "test-%d" % (i)
+ language = tokenization.convert_to_unicode(line[0])
+ if language != tokenization.convert_to_unicode(self.language):
+ continue
+ text_a = tokenization.convert_to_unicode(line[6])
+ text_b = tokenization.convert_to_unicode(line[7])
+ label = tokenization.convert_to_unicode(line[1])
+ examples.append(
+ InputExample(
+ guid=guid, text_a=text_a, text_b=text_b, label=label))
+ return examples
+
+ def get_labels(self):
+ """See base class."""
+ return ["contradiction", "entailment", "neutral"]
+
+
+class MnliProcessor(DataProcessor):
+ """Processor for the MultiNLI data set (GLUE version)."""
+
+ def get_train_aug_examples(self, data_dir):
+ """See base class."""
+ return self._create_examples(
+ self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "train")
+
+ def get_train_examples(self, data_dir):
+ """See base class."""
+ return self._create_examples(
+ self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+ def get_dev_examples(self, data_dir):
+ """See base class."""
+ return self._create_examples(
+ self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
+ "dev_matched")
+
+ def get_test_examples(self, data_dir):
+ """See base class."""
+ return self._create_examples(
+ self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test")
+
+ def get_labels(self):
+ """See base class."""
+ return ["contradiction", "entailment", "neutral"]
+
+ def _create_examples(self, lines, set_type):
+ """Creates examples for the training and dev sets."""
+ examples = []
+ for (i, line) in enumerate(lines):
+ if i == 0:
+ continue
+ guid = "%s-%s" % (set_type,
+ tokenization.convert_to_unicode(line[0]))
+ text_a = tokenization.convert_to_unicode(line[8])
+ text_b = tokenization.convert_to_unicode(line[9])
+ if set_type == "test":
+ label = "contradiction"
+ else:
+ label = tokenization.convert_to_unicode(line[-1])
+ examples.append(
+ InputExample(
+ guid=guid, text_a=text_a, text_b=text_b, label=label))
+ return examples
+
+
+class MrpcProcessor(DataProcessor):
+ """Processor for the MRPC data set (GLUE version)."""
+
+ def get_train_examples(self, data_dir):
+ """See base class."""
+ return self._create_examples(
+ self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+ def get_dev_examples(self, data_dir):
+ """See base class."""
+ return self._create_examples(
+ self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+ def get_test_examples(self, data_dir):
+ """See base class."""
+ return self._create_examples(
+ self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
+
+ def get_labels(self):
+ """See base class."""
+ return ["0", "1"]
+
+ def _create_examples(self, lines, set_type):
+ """Creates examples for the training and dev sets."""
+ examples = []
+ for (i, line) in enumerate(lines):
+ if i == 0:
+ continue
+ guid = "%s-%s" % (set_type, i)
+ text_a = tokenization.convert_to_unicode(line[3])
+ text_b = tokenization.convert_to_unicode(line[4])
+ if set_type == "test":
+ label = "0"
+ else:
+ label = tokenization.convert_to_unicode(line[0])
+ examples.append(
+ InputExample(
+ guid=guid, text_a=text_a, text_b=text_b, label=label))
+ return examples
+
+
+class ColaProcessor(DataProcessor):
+ """Processor for the CoLA data set (GLUE version)."""
+
+ def get_train_examples(self, data_dir):
+ """See base class."""
+ return self._create_examples(
+ self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+ def get_dev_examples(self, data_dir):
+ """See base class."""
+ return self._create_examples(
+ self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+ def get_test_examples(self, data_dir):
+ """See base class."""
+ return self._create_examples(
+ self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
+
+ def get_labels(self):
+ """See base class."""
+ return ["0", "1"]
+
+ def _create_examples(self, lines, set_type):
+ """Creates examples for the training and dev sets."""
+ examples = []
+ for (i, line) in enumerate(lines):
+ # Only the test set has a header
+ if set_type == "test" and i == 0:
+ continue
+ guid = "%s-%s" % (set_type, i)
+ if set_type == "test":
+ text_a = tokenization.convert_to_unicode(line[1])
+ label = "0"
+ else:
+ text_a = tokenization.convert_to_unicode(line[3])
+ label = tokenization.convert_to_unicode(line[1])
+ examples.append(
+ InputExample(
+ guid=guid, text_a=text_a, text_b=None, label=label))
+ return examples
+
+
+def convert_single_example_to_unicode(guid, single_example):
+ text_a = tokenization.convert_to_unicode(single_example[0])
+ text_b = tokenization.convert_to_unicode(single_example[1])
+ label = tokenization.convert_to_unicode(single_example[2])
+ return InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
+
+
+def convert_single_example(ex_index, example, label_list, max_seq_length,
+ tokenizer):
+ """Converts a single `InputExample` into a single `InputFeatures`."""
+ label_map = {}
+ for (i, label) in enumerate(label_list):
+ label_map[label] = i
+
+ tokens_a = tokenizer.tokenize(example.text_a)
+ tokens_b = None
+ if example.text_b:
+ tokens_b = tokenizer.tokenize(example.text_b)
+
+ if tokens_b:
+ # Modifies `tokens_a` and `tokens_b` in place so that the total
+ # length is less than the specified length.
+ # Account for [CLS], [SEP], [SEP] with "- 3"
+ _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+ else:
+ # Account for [CLS] and [SEP] with "- 2"
+ if len(tokens_a) > max_seq_length - 2:
+ tokens_a = tokens_a[0:(max_seq_length - 2)]
+
+ # The convention in BERT is:
+ # (a) For sequence pairs:
+ # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+ # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
+ # (b) For single sequences:
+ # tokens: [CLS] the dog is hairy . [SEP]
+ # type_ids: 0 0 0 0 0 0 0
+ #
+ # Where "type_ids" are used to indicate whether this is the first
+ # sequence or the second sequence. The embedding vectors for `type=0` and
+ # `type=1` were learned during pre-training and are added to the wordpiece
+ # embedding vector (and position vector). This is not *strictly* necessary
+ # since the [SEP] token unambiguously separates the sequences, but it makes
+ # it easier for the model to learn the concept of sequences.
+ #
+ # For classification tasks, the first vector (corresponding to [CLS]) is
+ # used as as the "sentence vector". Note that this only makes sense because
+ # the entire model is fine-tuned.
+ tokens = []
+ segment_ids = []
+ tokens.append("[CLS]")
+ segment_ids.append(0)
+ for token in tokens_a:
+ tokens.append(token)
+ segment_ids.append(0)
+ tokens.append("[SEP]")
+ segment_ids.append(0)
+
+ if tokens_b:
+ for token in tokens_b:
+ tokens.append(token)
+ segment_ids.append(1)
+ tokens.append("[SEP]")
+ segment_ids.append(1)
+
+ input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+ # The mask has 1 for real tokens and 0 for padding tokens. Only real
+ # tokens are attended to.
+ input_mask = [1] * len(input_ids)
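+    # No padding is applied here; sequences are padded (and the mask rebuilt)
+    # per batch by `prepare_batch_data`/`pad_batch_data`.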
+
+ label_id = label_map[example.label]
+
+ feature = InputFeatures(
+ input_ids=input_ids,
+ input_mask=input_mask,
+ segment_ids=segment_ids,
+ label_id=label_id)
+ return feature
+
+
+def convert_examples_to_features(examples, label_list, max_seq_length,
+ tokenizer):
+ """Convert a set of `InputExample`s to a list of `InputFeatures`."""
+
+ features = []
+ for (ex_index, example) in enumerate(examples):
+ if ex_index % 10000 == 0:
+ print("Writing example %d of %d" % (ex_index, len(examples)))
+
+ feature = convert_single_example(ex_index, example, label_list,
+ max_seq_length, tokenizer)
+
+ features.append(feature)
+ return features
+
+
+if __name__ == '__main__':
+ pass
diff --git a/paddleslim/teachers/bert/reader/tokenization.py b/paddleslim/teachers/bert/reader/tokenization.py
new file mode 100644
index 0000000000000000000000000000000000000000..08570f30fe9e6a8036a15095e67e6e8dd8686c14
--- /dev/null
+++ b/paddleslim/teachers/bert/reader/tokenization.py
@@ -0,0 +1,371 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import unicodedata
+import six
+import io
+
+
+def convert_to_unicode(text):
+ """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+ if six.PY3:
+ if isinstance(text, str):
+ return text
+ elif isinstance(text, bytes):
+ return text.decode("utf-8", "ignore")
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
+ elif six.PY2:
+ if isinstance(text, str):
+ return text.decode("utf-8", "ignore")
+ elif isinstance(text, unicode):
+ return text
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
+ else:
+ raise ValueError("Not running on Python2 or Python 3?")
+
+
+def printable_text(text):
+ """Returns text encoded in a way suitable for print or `tf.logging`."""
+
+ # These functions want `str` for both Python2 and Python3, but in one case
+ # it's a Unicode string and in the other it's a byte string.
+ if six.PY3:
+ if isinstance(text, str):
+ return text
+ elif isinstance(text, bytes):
+ return text.decode("utf-8", "ignore")
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
+ elif six.PY2:
+ if isinstance(text, str):
+ return text
+ elif isinstance(text, unicode):
+ return text.encode("utf-8")
+ else:
+ raise ValueError("Unsupported string type: %s" % (type(text)))
+ else:
+ raise ValueError("Not running on Python2 or Python 3?")
+
+
+def load_vocab(vocab_file):
+ """Loads a vocabulary file into a dictionary."""
+ vocab = collections.OrderedDict()
+ fin = io.open(vocab_file, encoding="utf8")
+ for num, line in enumerate(fin):
+ items = convert_to_unicode(line.strip()).split("\t")
+ if len(items) > 2:
+ break
+ token = items[0]
+ index = items[1] if len(items) == 2 else num
+ token = token.strip()
+ vocab[token] = int(index)
+ return vocab
+
+
+def convert_by_vocab(vocab, items):
+ """Converts a sequence of [tokens|ids] using the vocab."""
+ output = []
+ for item in items:
+ output.append(vocab[item])
+ return output
+
+
+def convert_tokens_to_ids(vocab, tokens):
+ return convert_by_vocab(vocab, tokens)
+
+
+def convert_ids_to_tokens(inv_vocab, ids):
+ return convert_by_vocab(inv_vocab, ids)
+
+
+def whitespace_tokenize(text):
+ """Runs basic whitespace cleaning and splitting on a peice of text."""
+ text = text.strip()
+ if not text:
+ return []
+ tokens = text.split()
+ return tokens
+
+
+class FullTokenizer(object):
+ """Runs end-to-end tokenziation."""
+
+ def __init__(self, vocab_file, do_lower_case=True):
+ self.vocab = load_vocab(vocab_file)
+ self.inv_vocab = {v: k for k, v in self.vocab.items()}
+ self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+ def tokenize(self, text):
+ split_tokens = []
+ for token in self.basic_tokenizer.tokenize(text):
+ for sub_token in self.wordpiece_tokenizer.tokenize(token):
+ split_tokens.append(sub_token)
+
+ return split_tokens
+
+ def convert_tokens_to_ids(self, tokens):
+ return convert_by_vocab(self.vocab, tokens)
+
+ def convert_ids_to_tokens(self, ids):
+ return convert_by_vocab(self.inv_vocab, ids)
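+
+    # Minimal usage sketch (assumes a BERT-style vocab file on disk):
+    #   tokenizer = FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
+    #   ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("unaffable"))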
+
+
+class CharTokenizer(object):
+ """Runs end-to-end tokenziation."""
+
+ def __init__(self, vocab_file, do_lower_case=True):
+ self.vocab = load_vocab(vocab_file)
+ self.inv_vocab = {v: k for k, v in self.vocab.items()}
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+ def tokenize(self, text):
+ split_tokens = []
+ for token in text.lower().split(" "):
+ for sub_token in self.wordpiece_tokenizer.tokenize(token):
+ split_tokens.append(sub_token)
+
+ return split_tokens
+
+ def convert_tokens_to_ids(self, tokens):
+ return convert_by_vocab(self.vocab, tokens)
+
+ def convert_ids_to_tokens(self, ids):
+ return convert_by_vocab(self.inv_vocab, ids)
+
+
+class BasicTokenizer(object):
+ """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+
+ def __init__(self, do_lower_case=True):
+ """Constructs a BasicTokenizer.
+
+ Args:
+ do_lower_case: Whether to lower case the input.
+ """
+ self.do_lower_case = do_lower_case
+
+ def tokenize(self, text):
+ """Tokenizes a piece of text."""
+ text = convert_to_unicode(text)
+ text = self._clean_text(text)
+
+ # This was added on November 1st, 2018 for the multilingual and Chinese
+ # models. This is also applied to the English models now, but it doesn't
+ # matter since the English models were not trained on any Chinese data
+ # and generally don't have any Chinese data in them (there are Chinese
+ # characters in the vocabulary because Wikipedia does have some Chinese
+ # words in the English Wikipedia.).
+ text = self._tokenize_chinese_chars(text)
+
+ orig_tokens = whitespace_tokenize(text)
+ split_tokens = []
+ for token in orig_tokens:
+ if self.do_lower_case:
+ token = token.lower()
+ token = self._run_strip_accents(token)
+ split_tokens.extend(self._run_split_on_punc(token))
+
+ output_tokens = whitespace_tokenize(" ".join(split_tokens))
+ return output_tokens
+
+ def _run_strip_accents(self, text):
+ """Strips accents from a piece of text."""
+ text = unicodedata.normalize("NFD", text)
+ output = []
+ for char in text:
+ cat = unicodedata.category(char)
+ if cat == "Mn":
+ continue
+ output.append(char)
+ return "".join(output)
+
+ def _run_split_on_punc(self, text):
+ """Splits punctuation on a piece of text."""
+ chars = list(text)
+ i = 0
+ start_new_word = True
+ output = []
+ while i < len(chars):
+ char = chars[i]
+ if _is_punctuation(char):
+ output.append([char])
+ start_new_word = True
+ else:
+ if start_new_word:
+ output.append([])
+ start_new_word = False
+ output[-1].append(char)
+ i += 1
+
+ return ["".join(x) for x in output]
+
+ def _tokenize_chinese_chars(self, text):
+ """Adds whitespace around any CJK character."""
+ output = []
+ for char in text:
+ cp = ord(char)
+ if self._is_chinese_char(cp):
+ output.append(" ")
+ output.append(char)
+ output.append(" ")
+ else:
+ output.append(char)
+ return "".join(output)
+
+ def _is_chinese_char(self, cp):
+ """Checks whether CP is the codepoint of a CJK character."""
+ # This defines a "chinese character" as anything in the CJK Unicode block:
+ # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+ #
+ # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+ # despite its name. The modern Korean Hangul alphabet is a different block,
+ # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+ # space-separated words, so they are not treated specially and handled
+        # like all of the other languages.
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
+ (cp >= 0x3400 and cp <= 0x4DBF) or #
+ (cp >= 0x20000 and cp <= 0x2A6DF) or #
+ (cp >= 0x2A700 and cp <= 0x2B73F) or #
+ (cp >= 0x2B740 and cp <= 0x2B81F) or #
+ (cp >= 0x2B820 and cp <= 0x2CEAF) or
+ (cp >= 0xF900 and cp <= 0xFAFF) or #
+ (cp >= 0x2F800 and cp <= 0x2FA1F)): #
+ return True
+
+ return False
+
+ def _clean_text(self, text):
+ """Performs invalid character removal and whitespace cleanup on text."""
+ output = []
+ for char in text:
+ cp = ord(char)
+ if cp == 0 or cp == 0xfffd or _is_control(char):
+ continue
+ if _is_whitespace(char):
+ output.append(" ")
+ else:
+ output.append(char)
+ return "".join(output)
+
+
+class WordpieceTokenizer(object):
+ """Runs WordPiece tokenziation."""
+
+ def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
+ self.vocab = vocab
+ self.unk_token = unk_token
+ self.max_input_chars_per_word = max_input_chars_per_word
+
+ def tokenize(self, text):
+ """Tokenizes a piece of text into its word pieces.
+
+ This uses a greedy longest-match-first algorithm to perform tokenization
+ using the given vocabulary.
+
+ For example:
+ input = "unaffable"
+ output = ["un", "##aff", "##able"]
+
+ Args:
+          text: A single token or whitespace separated tokens. This should have
+            already been passed through `BasicTokenizer`.
+
+ Returns:
+ A list of wordpiece tokens.
+ """
+
+ text = convert_to_unicode(text)
+
+ output_tokens = []
+ for token in whitespace_tokenize(text):
+ chars = list(token)
+ if len(chars) > self.max_input_chars_per_word:
+ output_tokens.append(self.unk_token)
+ continue
+
+ is_bad = False
+ start = 0
+ sub_tokens = []
+ while start < len(chars):
+ end = len(chars)
+ cur_substr = None
+ while start < end:
+ substr = "".join(chars[start:end])
+ if start > 0:
+ substr = "##" + substr
+ if substr in self.vocab:
+ cur_substr = substr
+ break
+ end -= 1
+ if cur_substr is None:
+ is_bad = True
+ break
+ sub_tokens.append(cur_substr)
+ start = end
+
+ if is_bad:
+ output_tokens.append(self.unk_token)
+ else:
+ output_tokens.extend(sub_tokens)
+ return output_tokens
+
+
+def _is_whitespace(char):
+ """Checks whether `chars` is a whitespace character."""
+    # \t, \n, and \r are technically control characters but we treat them
+ # as whitespace since they are generally considered as such.
+ if char == " " or char == "\t" or char == "\n" or char == "\r":
+ return True
+ cat = unicodedata.category(char)
+ if cat == "Zs":
+ return True
+ return False
+
+
+def _is_control(char):
+ """Checks whether `chars` is a control character."""
+ # These are technically control characters but we count them as whitespace
+ # characters.
+ if char == "\t" or char == "\n" or char == "\r":
+ return False
+ cat = unicodedata.category(char)
+ if cat.startswith("C"):
+ return True
+ return False
+
+
+def _is_punctuation(char):
+ """Checks whether `chars` is a punctuation character."""
+ cp = ord(char)
+ # We treat all non-letter/number ASCII as punctuation.
+ # Characters such as "^", "$", and "`" are not in the Unicode
+ # Punctuation class but we treat them as punctuation anyways, for
+ # consistency.
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
+ (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
+ return True
+ cat = unicodedata.category(char)
+ if cat.startswith("P"):
+ return True
+ return False
diff --git a/paddleslim/teachers/bert/utils/__init__.py b/paddleslim/teachers/bert/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/paddleslim/teachers/bert/utils/convert_static_to_dygraph.py b/paddleslim/teachers/bert/utils/convert_static_to_dygraph.py
new file mode 100755
index 0000000000000000000000000000000000000000..cbd4f7f74003cbcb1f7f800e7f72e69fbbb3a5f9
--- /dev/null
+++ b/paddleslim/teachers/bert/utils/convert_static_to_dygraph.py
@@ -0,0 +1,228 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import shutil
+import sys
+import os
+
+
+def usage():
+ """
+ usage information
+ """
+ print
+ print("please use command: ")
+ print(
+ "python convert_static_to_dygraph.py input_params_dir output_params_dir"
+ )
+ print
+
+
+def convert_static_to_dygraph(static_model_path, dygraph_model_path):
+ """
+    Convert a static-graph Paddle BERT model to a dygraph model.
+ """
+
+ def mkdir(path):
+ if not os.path.isdir(path):
+ if os.path.split(path)[0]:
+ mkdir(os.path.split(path)[0])
+ else:
+ return
+ os.mkdir(path)
+
+ if os.path.exists(dygraph_model_path):
+ shutil.rmtree(dygraph_model_path)
+ mkdir(dygraph_model_path)
+
+ if not os.path.exists(static_model_path):
+ print("paddle static model path doesn't exist.....")
+ return -1
+
+ file_list = []
+ for root, dirs, files in os.walk(static_model_path):
+ file_list.extend(files)
+
+ os.makedirs(os.path.join(dygraph_model_path, "PretrainModelLayer_0"))
+ os.makedirs(
+ os.path.join(dygraph_model_path,
+ "PretrainModelLayer_0/BertModelLayer_0"))
+ os.makedirs(
+ os.path.join(dygraph_model_path,
+ "PretrainModelLayer_0/PrePostProcessLayer_0"))
+ os.makedirs(
+ os.path.join(
+ dygraph_model_path,
+ "PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0"))
+
+    # convert the embedding files
+ embedding_type = ["word", "pos", "sent"]
+ for i in range(3):
+ src_name = embedding_type[i] + "_embedding"
+ trg_name = "Embedding_" + str(i) + "." + src_name
+ shutil.copyfile(
+ os.path.join(static_model_path, src_name),
+ os.path.join(dygraph_model_path,
+ "PretrainModelLayer_0/BertModelLayer_0/" + trg_name))
+
+    # convert the pre_encoder files
+ shutil.copyfile(
+ os.path.join(static_model_path, "pre_encoder_layer_norm_scale"),
+ os.path.join(
+ dygraph_model_path,
+ "PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_scale"
+ ))
+ shutil.copyfile(
+ os.path.join(static_model_path, "pre_encoder_layer_norm_bias"),
+ os.path.join(
+ dygraph_model_path,
+ "PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_bias"
+ ))
+
+    # convert the mask-lm parameter files
+ shutil.copyfile(
+ os.path.join(static_model_path, "mask_lm_out_fc.b_0"),
+ os.path.join(dygraph_model_path,
+ "PretrainModelLayer_0/Layer_0.mask_lm_out_fc.b_0"))
+ shutil.copyfile(
+ os.path.join(static_model_path, "mask_lm_trans_fc.b_0"),
+ os.path.join(dygraph_model_path,
+ "PretrainModelLayer_0/FC_0.mask_lm_trans_fc.b_0"))
+ shutil.copyfile(
+ os.path.join(static_model_path, "mask_lm_trans_fc.w_0"),
+ os.path.join(dygraph_model_path,
+ "PretrainModelLayer_0/FC_0.mask_lm_trans_fc.w_0"))
+ shutil.copyfile(
+ os.path.join(static_model_path, "mask_lm_trans_layer_norm_bias"),
+ os.path.join(
+ dygraph_model_path,
+ "PretrainModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_bias"
+ ))
+ shutil.copyfile(
+ os.path.join(static_model_path, "mask_lm_trans_layer_norm_scale"),
+ os.path.join(
+ dygraph_model_path,
+ "PretrainModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_scale"
+ ))
+ shutil.copyfile(
+ os.path.join(static_model_path, "next_sent_fc.b_0"),
+ os.path.join(dygraph_model_path,
+ "PretrainModelLayer_0/FC_1.next_sent_fc.b_0"))
+ shutil.copyfile(
+ os.path.join(static_model_path, "next_sent_fc.w_0"),
+ os.path.join(dygraph_model_path,
+ "PretrainModelLayer_0/FC_1.next_sent_fc.w_0"))
+ shutil.copyfile(
+ os.path.join(static_model_path, "pooled_fc.b_0"),
+ os.path.join(
+ dygraph_model_path,
+ "PretrainModelLayer_0/BertModelLayer_0/FC_0.pooled_fc.b_0"))
+ shutil.copyfile(
+ os.path.join(static_model_path, "pooled_fc.w_0"),
+ os.path.join(
+ dygraph_model_path,
+ "PretrainModelLayer_0/BertModelLayer_0/FC_0.pooled_fc.w_0"))
+
+ encoder_num = 0
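+    # Infer the number of encoder layers from the largest index that appears
+    # in the "encoder_layer_<i>_*" parameter file names.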
+ for f in file_list:
+ if not f.startswith("encoder_layer"):
+ continue
+ layer_num = f.split('_')[2]
+ if int(layer_num) > encoder_num:
+ encoder_num = int(layer_num)
+
+ encoder_num += 1
+ for i in range(encoder_num):
+ encoder_dir = "EncoderSubLayer_" + str(i)
+ os.makedirs(
+ os.path.join(dygraph_model_path,
+ "PretrainModelLayer_0/BertModelLayer_0/" +
+ "EncoderLayer_0/", encoder_dir))
+ os.makedirs(
+ os.path.join(dygraph_model_path,
+ "PretrainModelLayer_0/BertModelLayer_0/" +
+ "EncoderLayer_0/", encoder_dir +
+ "/PositionwiseFeedForwardLayer_0"))
+ os.makedirs(
+ os.path.join(
+ dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
+ "EncoderLayer_0/", encoder_dir + "/MultiHeadAttentionLayer_0"))
+ os.makedirs(
+ os.path.join(
+ dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
+ "EncoderLayer_0/", encoder_dir + "/PrePostProcessLayer_1"))
+ os.makedirs(
+ os.path.join(
+ dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
+ "EncoderLayer_0/", encoder_dir + "/PrePostProcessLayer_3"))
+
+ encoder_map_dict = {
+ "ffn_fc_0.b_0":
+ ("PositionwiseFeedForwardLayer_0", "FC_0.ffn_fc_0.b_0"),
+ "ffn_fc_0.w_0":
+ ("PositionwiseFeedForwardLayer_0", "FC_0.ffn_fc_0.w_0"),
+ "ffn_fc_1.b_0":
+ ("PositionwiseFeedForwardLayer_0", "FC_1.ffn_fc_1.b_0"),
+ "ffn_fc_1.w_0":
+ ("PositionwiseFeedForwardLayer_0", "FC_1.ffn_fc_1.w_0"),
+ "multi_head_att_key_fc.b_0":
+ ("MultiHeadAttentionLayer_0", "FC_1.key_fc.b_0"),
+ "multi_head_att_key_fc.w_0":
+ ("MultiHeadAttentionLayer_0", "FC_1.key_fc.w_0"),
+ "multi_head_att_output_fc.b_0":
+ ("MultiHeadAttentionLayer_0", "FC_3.output_fc.b_0"),
+ "multi_head_att_output_fc.w_0":
+ ("MultiHeadAttentionLayer_0", "FC_3.output_fc.w_0"),
+ "multi_head_att_query_fc.b_0":
+ ("MultiHeadAttentionLayer_0", "FC_0.query_fc.b_0"),
+ "multi_head_att_query_fc.w_0":
+ ("MultiHeadAttentionLayer_0", "FC_0.query_fc.w_0"),
+ "multi_head_att_value_fc.b_0":
+ ("MultiHeadAttentionLayer_0", "FC_2.value_fc.b_0"),
+ "multi_head_att_value_fc.w_0":
+ ("MultiHeadAttentionLayer_0", "FC_2.value_fc.w_0"),
+ "post_att_layer_norm_bias":
+ ("PrePostProcessLayer_1", "LayerNorm_0.post_att_layer_norm_bias"),
+ "post_att_layer_norm_scale":
+ ("PrePostProcessLayer_1", "LayerNorm_0.post_att_layer_norm_scale"),
+ "post_ffn_layer_norm_bias":
+ ("PrePostProcessLayer_3", "LayerNorm_0.post_ffn_layer_norm_bias"),
+ "post_ffn_layer_norm_scale":
+ ("PrePostProcessLayer_3", "LayerNorm_0.post_ffn_layer_norm_scale")
+ }
+
+ for f in file_list:
+ if not f.startswith("encoder_layer"):
+ continue
+ layer_num = f.split('_')[2]
+ suffix_name = "_".join(f.split('_')[3:])
+ in_dir = encoder_map_dict[suffix_name][0]
+ rename = encoder_map_dict[suffix_name][1]
+ encoder_layer = "EncoderSubLayer_" + layer_num
+ shutil.copyfile(
+ os.path.join(static_model_path, f),
+ os.path.join(
+ dygraph_model_path,
+ "PretrainModelLayer_0/BertModelLayer_0/EncoderLayer_0/" +
+ encoder_layer + "/" + in_dir + "/" + rename))
+
+
+if __name__ == "__main__":
+
+ if len(sys.argv) < 3:
+ usage()
+        sys.exit(1)
+ static_model_path = sys.argv[1]
+ dygraph_model_path = sys.argv[2]
+ convert_static_to_dygraph(static_model_path, dygraph_model_path)
diff --git a/paddleslim/teachers/bert/utils/fp16.py b/paddleslim/teachers/bert/utils/fp16.py
new file mode 100644
index 0000000000000000000000000000000000000000..e153c2b9a1029897def264278c5dbe72e1f369f5
--- /dev/null
+++ b/paddleslim/teachers/bert/utils/fp16.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import paddle
+import paddle.fluid as fluid
+
+
+def cast_fp16_to_fp32(i, o, prog):
+ prog.global_block().append_op(
+ type="cast",
+ inputs={"X": i},
+ outputs={"Out": o},
+ attrs={
+ "in_dtype": fluid.core.VarDesc.VarType.FP16,
+ "out_dtype": fluid.core.VarDesc.VarType.FP32
+ })
+
+
+def cast_fp32_to_fp16(i, o, prog):
+ prog.global_block().append_op(
+ type="cast",
+ inputs={"X": i},
+ outputs={"Out": o},
+ attrs={
+ "in_dtype": fluid.core.VarDesc.VarType.FP32,
+ "out_dtype": fluid.core.VarDesc.VarType.FP16
+ })
+
+
+def copy_to_master_param(p, block):
+ v = block.vars.get(p.name, None)
+ if v is None:
+ raise ValueError("no param name %s found!" % p.name)
+ new_p = fluid.framework.Parameter(
+ block=block,
+ shape=v.shape,
+ dtype=fluid.core.VarDesc.VarType.FP32,
+ type=v.type,
+ lod_level=v.lod_level,
+ stop_gradient=p.stop_gradient,
+ trainable=p.trainable,
+ optimize_attr=p.optimize_attr,
+ regularizer=p.regularizer,
+ gradient_clip_attr=p.gradient_clip_attr,
+ error_clip=p.error_clip,
+ name=v.name + ".master")
+ return new_p
+
+
+def create_master_params_grads(params_grads, main_prog, startup_prog,
+ loss_scaling):
+ master_params_grads = []
+ tmp_role = main_prog._current_role
+ OpRole = fluid.core.op_proto_and_checker_maker.OpRole
+ main_prog._current_role = OpRole.Backward
+ for p, g in params_grads:
+ # create master parameters
+ master_param = copy_to_master_param(p, main_prog.global_block())
+ startup_master_param = startup_prog.global_block()._clone_variable(
+ master_param)
+ startup_p = startup_prog.global_block().var(p.name)
+ cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog)
+ # cast fp16 gradients to fp32 before apply gradients
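+        # LayerNorm parameters stay in FP32 throughout, so their gradients
+        # only need the loss-scale correction, not a cast.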
+ if g.name.find("layer_norm") > -1:
+ if loss_scaling > 1:
+ scaled_g = g / float(loss_scaling)
+ else:
+ scaled_g = g
+ master_params_grads.append([p, scaled_g])
+ continue
+ master_grad = fluid.layers.cast(g, "float32")
+ if loss_scaling > 1:
+ master_grad = master_grad / float(loss_scaling)
+ master_params_grads.append([master_param, master_grad])
+ main_prog._current_role = tmp_role
+ return master_params_grads
+
+
+def master_param_to_train_param(master_params_grads, params_grads, main_prog):
+ for idx, m_p_g in enumerate(master_params_grads):
+ train_p, _ = params_grads[idx]
+ if train_p.name.find("layer_norm") > -1:
+ continue
+ with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
+ cast_fp32_to_fp16(m_p_g[0], train_p, main_prog)
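+
+
+# Typical flow (a sketch, not a fixed API):
+#   master = create_master_params_grads(params_grads, main_prog,
+#                                       startup_prog, loss_scaling)
+#   ... apply the optimizer update to `master` ...
+#   master_param_to_train_param(master, params_grads, main_prog)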
diff --git a/paddleslim/teachers/bert/utils/init.py b/paddleslim/teachers/bert/utils/init.py
new file mode 100644
index 0000000000000000000000000000000000000000..52f9b38082fd79258c292c9970e3d65ffb9a2d52
--- /dev/null
+++ b/paddleslim/teachers/bert/utils/init.py
@@ -0,0 +1,245 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import six
+import ast
+import copy
+
+import numpy as np
+import paddle.fluid as fluid
+
+
+def cast_fp32_to_fp16(exe, main_program):
+ print("Cast parameters to float16 data format.")
+ for param in main_program.global_block().all_parameters():
+ if not param.name.endswith(".master"):
+ param_t = fluid.global_scope().find_var(param.name).get_tensor()
+ data = np.array(param_t)
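+            # LayerNorm parameters are left in FP32; all others are cast to
+            # FP16 (stored through a uint16 view of the data).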
+ if param.name.find("layer_norm") == -1:
+ param_t.set(np.float16(data).view(np.uint16), exe.place)
+ master_param_var = fluid.global_scope().find_var(param.name +
+ ".master")
+ if master_param_var is not None:
+ master_param_var.get_tensor().set(data, exe.place)
+
+
+def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
+    assert os.path.exists(
+        init_checkpoint_path), "[%s] can't be found." % init_checkpoint_path
+
+    def existed_persistables(var):
+ if not fluid.io.is_persistable(var):
+ return False
+ return os.path.exists(os.path.join(init_checkpoint_path, var.name))
+
+ fluid.io.load_vars(
+ exe,
+ init_checkpoint_path,
+ main_program=main_program,
+        predicate=existed_persistables)
+ print("Load model from {}".format(init_checkpoint_path))
+
+ if use_fp16:
+ cast_fp32_to_fp16(exe, main_program)
+
+
+def init_pretraining_params(exe,
+ pretraining_params_path,
+ main_program,
+ use_fp16=False):
+    assert os.path.exists(pretraining_params_path
+                          ), "[%s] can't be found." % pretraining_params_path
+
+ def existed_params(var):
+ if not isinstance(var, fluid.framework.Parameter):
+ return False
+ return os.path.exists(os.path.join(pretraining_params_path, var.name))
+
+ fluid.io.load_vars(
+ exe,
+ pretraining_params_path,
+ main_program=main_program,
+ predicate=existed_params)
+ print("Load pretraining parameters from {}.".format(
+ pretraining_params_path))
+
+ if use_fp16:
+ cast_fp32_to_fp16(exe, main_program)
+
+
+def init_from_static_model(dir_path, cls_model, bert_config):
+ def load_numpy_weight(file_name):
+ if six.PY2:
+ res = np.load(os.path.join(dir_path, file_name), allow_pickle=True)
+ else:
+ res = np.load(
+ os.path.join(dir_path, file_name),
+ allow_pickle=True,
+ encoding='latin1')
+ assert res is not None
+ return res
+
+ # load word embedding
+ _param = load_numpy_weight("word_embedding")
+ cls_model.bert_layer._src_emb.set_dict({"weight": _param})
+ print("INIT word embedding")
+
+ _param = load_numpy_weight("pos_embedding")
+ cls_model.bert_layer._pos_emb.set_dict({"weight": _param})
+ print("INIT pos embedding")
+
+ _param = load_numpy_weight("sent_embedding")
+ cls_model.bert_layer._sent_emb.set_dict({"weight": _param})
+ print("INIT sent embedding")
+
+ _param0 = load_numpy_weight("pooled_fc.w_0")
+ _param1 = load_numpy_weight("pooled_fc.b_0")
+ cls_model.bert_layer.pooled_fc.set_dict({
+ "weight": _param0,
+ "bias": _param1
+ })
+ print("INIT pooled_fc")
+
+ _param0 = load_numpy_weight("pre_encoder_layer_norm_scale")
+ _param1 = load_numpy_weight("pre_encoder_layer_norm_bias")
+ cls_model.bert_layer.pre_process_layer._sub_layers[
+ "layer_norm_0"].set_dict({
+ "weight": _param0,
+ "bias": _param1
+ })
+ print("INIT pre_encoder layer norm")
+
+ for _i in range(bert_config["num_hidden_layers"]):
+ _param_weight = "encoder_layer_%d_multi_head_att_query_fc.w_0" % _i
+ _param_bias = "encoder_layer_%d_multi_head_att_query_fc.b_0" % _i
+
+ _param_weight = load_numpy_weight(_param_weight)
+ _param_bias = load_numpy_weight(_param_bias)
+
+ cls_model.bert_layer._encoder._sub_layers[
+ "esl_%d" % _i]._multihead_attention_layer._q_fc.set_dict({
+ "weight": _param_weight,
+ "bias": _param_bias
+ })
+ print("INIT multi_head_att_query_fc %d" % _i)
+
+ _param_weight = "encoder_layer_%d_multi_head_att_key_fc.w_0" % _i
+ _param_bias = "encoder_layer_%d_multi_head_att_key_fc.b_0" % _i
+
+ _param_weight = load_numpy_weight(_param_weight)
+ _param_bias = load_numpy_weight(_param_bias)
+
+ cls_model.bert_layer._encoder._sub_layers[
+ "esl_%d" % _i]._multihead_attention_layer._k_fc.set_dict({
+ "weight": _param_weight,
+ "bias": _param_bias
+ })
+ print("INIT multi_head_att_key_fc %d" % _i)
+
+ _param_weight = "encoder_layer_%d_multi_head_att_value_fc.w_0" % _i
+ _param_bias = "encoder_layer_%d_multi_head_att_value_fc.b_0" % _i
+
+ _param_weight = load_numpy_weight(_param_weight)
+ _param_bias = load_numpy_weight(_param_bias)
+
+ cls_model.bert_layer._encoder._sub_layers[
+ "esl_%d" % _i]._multihead_attention_layer._v_fc.set_dict({
+ "weight": _param_weight,
+ "bias": _param_bias
+ })
+ print("INIT multi_head_att_value_fc %d" % _i)
+
+ # init output fc
+ _param_weight = "encoder_layer_%d_multi_head_att_output_fc.w_0" % _i
+ _param_bias = "encoder_layer_%d_multi_head_att_output_fc.b_0" % _i
+
+ _param_weight = load_numpy_weight(_param_weight)
+ _param_bias = load_numpy_weight(_param_bias)
+
+ cls_model.bert_layer._encoder._sub_layers[
+ "esl_%d" % _i]._multihead_attention_layer._proj_fc.set_dict({
+ "weight": _param_weight,
+ "bias": _param_bias
+ })
+ print("INIT multi_head_att_output_fc %d" % _i)
+
+ # init layer_norm 1
+ _param_weight = "encoder_layer_%d_post_att_layer_norm_scale" % _i
+ _param_bias = "encoder_layer_%d_post_att_layer_norm_bias" % _i
+
+ _param_weight = load_numpy_weight(_param_weight)
+ _param_bias = load_numpy_weight(_param_bias)
+
+ cls_model.bert_layer._encoder._sub_layers[
+ "esl_%d" % _i]._postprocess_layer.layer_norm_0.set_dict({
+ "weight": _param_weight,
+ "bias": _param_bias
+ })
+ print("INIT layer norm in attention at %d layer" % _i)
+
+ # init layer_norm 2
+ _param_weight = "encoder_layer_%d_post_ffn_layer_norm_scale" % _i
+ _param_bias = "encoder_layer_%d_post_ffn_layer_norm_bias" % _i
+
+ _param_weight = load_numpy_weight(_param_weight)
+ _param_bias = load_numpy_weight(_param_bias)
+
+ cls_model.bert_layer._encoder._sub_layers[
+ "esl_%d" % _i]._postprocess_layer2.layer_norm_0.set_dict({
+ "weight": _param_weight,
+ "bias": _param_bias
+ })
+ print("INIT layer norm in FFN at %d layer" % _i)
+
+ # init FFN 1
+ _param_weight = "encoder_layer_%d_ffn_fc_0.w_0" % _i
+ _param_bias = "encoder_layer_%d_ffn_fc_0.b_0" % _i
+
+ _param_weight = load_numpy_weight(_param_weight)
+ _param_bias = load_numpy_weight(_param_bias)
+
+ cls_model.bert_layer._encoder._sub_layers[
+ "esl_%d" % _i]._positionwise_feed_forward._i2h.set_dict({
+ "weight": _param_weight,
+ "bias": _param_bias
+ })
+ print("INIT FFN-1 at %d layer" % _i)
+
+ # init FFN 2
+ _param_weight = "encoder_layer_%d_ffn_fc_1.w_0" % _i
+ _param_bias = "encoder_layer_%d_ffn_fc_1.b_0" % _i
+
+ _param_weight = load_numpy_weight(_param_weight)
+ _param_bias = load_numpy_weight(_param_bias)
+
+ cls_model.bert_layer._encoder._sub_layers[
+ "esl_%d" % _i]._positionwise_feed_forward._h2o.set_dict({
+ "weight": _param_weight,
+ "bias": _param_bias
+ })
+ print("INIT FFN-2 at %d layer" % _i)
+
+ # init cls fc
+ #_param_weight = "cls_out_w"
+ #_param_bias = "cls_out_b"
+
+ #_param_weight = load_numpy_weight(_param_weight)
+ #_param_bias = load_numpy_weight(_param_bias)
+
+ #cls_model.cls_fc.set_dict({"weight":_param_weight, "bias":_param_bias})
+ #print("INIT CLS FC layer")
+ return True
diff --git a/paddleslim/version.py b/paddleslim/version.py
index 3e95d57aa8f5b558d818dfff8b5d85daba2c6068..baf27205046c005b85adc7b74a9e533d972a5128 100644
--- a/paddleslim/version.py
+++ b/paddleslim/version.py
@@ -14,4 +14,4 @@
# limitations under the License.
""" PaddleSlim version string """
__all__ = ["slim_version"]
-slim_version = "0.1"
+slim_version = "1.0.0"
diff --git a/requirements.txt b/requirements.txt
index 8e3fcf66561d1965a047b8debd38a543373f534f..8b4fa54954796d32a96ec46e29fda484e18e94ec 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,3 @@
#paddlepaddle == 1.6.0rc0
+tqdm
+pyzmq
diff --git a/setup.py b/setup.py
index 5ff0a92fdd48668c9447d8625f122d93a168444c..dc5be3fabefef59f6cb1919b5b85412765b7d0c6 100644
--- a/setup.py
+++ b/setup.py
@@ -32,17 +32,6 @@ max_version, mid_version, min_version = python_version()
with open('./requirements.txt') as f:
setup_requires = f.read().splitlines()
-packages = [
- 'paddleslim',
- 'paddleslim.prune',
- 'paddleslim.dist',
- 'paddleslim.nas',
- 'paddleslim.analysis',
- 'paddleslim.quant',
- 'paddleslim.core',
- 'paddleslim.common',
-]
-
setup(
name='paddleslim',
version=slim_version,
@@ -52,7 +41,7 @@ setup(
author='PaddlePaddle Author',
author_email='dltp-all@baidu.com',
install_requires=setup_requires,
- packages=packages,
+ packages=find_packages(),
# PyPI package information.
classifiers=[
'Development Status :: 4 - Beta',
diff --git a/tests/layers.py b/tests/layers.py
index 140ff5919b9d8c9821b371db5ca4896db28bf7f0..a5f0b37d846f2789ac9a87754a36782b2e5244c4 100644
--- a/tests/layers.py
+++ b/tests/layers.py
@@ -21,7 +21,9 @@ def conv_bn_layer(input,
name,
stride=1,
groups=1,
- act=None):
+ act=None,
+ bias=False,
+ use_cudnn=True):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
@@ -31,8 +33,9 @@ def conv_bn_layer(input,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
- bias_attr=False,
- name=name + "_out")
+ bias_attr=bias,
+ name=name + "_out",
+ use_cudnn=use_cudnn)
bn_name = name + "_bn"
return fluid.layers.batch_norm(
input=conv,
diff --git a/tests/test_autoprune.py b/tests/test_autoprune.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8a2111789fc86de898a63eb77666d93572714fa
--- /dev/null
+++ b/tests/test_autoprune.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import unittest
+import paddle.fluid as fluid
+from paddleslim.prune import Pruner
+from paddleslim.prune import AutoPruner
+from layers import conv_bn_layer
+
+
+class TestPrune(unittest.TestCase):
+ def test_prune(self):
+ main_program = fluid.Program()
+ startup_program = fluid.Program()
+ # X X O X O
+ # conv1-->conv2-->sum1-->conv3-->conv4-->sum2-->conv5-->conv6
+ # | ^ | ^
+ # |____________| |____________________|
+ #
+ # X: prune output channels
+ # O: prune input channels
+ with fluid.program_guard(main_program, startup_program):
+ input = fluid.data(name="image", shape=[None, 3, 16, 16])
+ conv1 = conv_bn_layer(input, 8, 3, "conv1")
+ conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+ sum1 = conv1 + conv2
+ conv3 = conv_bn_layer(sum1, 8, 3, "conv3")
+ conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
+ sum2 = conv4 + sum1
+ conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
+ conv6 = conv_bn_layer(conv5, 8, 3, "conv6")
+
+ shapes = {}
+ params = []
+ for param in main_program.global_block().all_parameters():
+ shapes[param.name] = param.shape
+ if 'weights' in param.name:
+ params.append(param.name)
+
+        val_program = main_program.clone(for_test=True)
+ place = fluid.CPUPlace()
+ exe = fluid.Executor(place)
+        exe.run(startup_program)
+
+ pruner = AutoPruner(
+ val_program,
+ fluid.global_scope(),
+ place,
+ params=params,
+ init_ratios=[0.33] * len(params),
+ pruned_flops=0.5,
+ pruned_latency=None,
+ server_addr=("", 0),
+ init_temperature=100,
+ reduce_rate=0.85,
+ max_try_times=300,
+ max_client_num=10,
+ search_steps=100,
+ max_ratios=0.9,
+ min_ratios=0.,
+ is_server=True,
+ key="auto_pruner")
+ baseratio = None
+ lastratio = None
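+        # Run several search steps with a constant reward; the controller
+        # should still perturb the ratios while exploring.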
+ for i in range(10):
+            pruned_program, pruned_val_program = pruner.prune(
+                main_program, val_program)
+ score = 0.2
+ pruner.reward(score)
+ if i == 0:
+ baseratio = pruner._current_ratios
+ if i == 9:
+ lastratio = pruner._current_ratios
+ changed = False
+ for i in range(len(baseratio)):
+ if baseratio[i] != lastratio[i]:
+ changed = True
+        self.assertTrue(changed)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_client_connect.py b/tests/test_client_connect.py
new file mode 100644
index 0000000000000000000000000000000000000000..a008acfedf5ccd5fd1c392e662f5b625f878fb9f
--- /dev/null
+++ b/tests/test_client_connect.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import os
+import time
+import signal
+import unittest
+import paddle.fluid as fluid
+from paddleslim.nas import SANAS
+from paddleslim.common.controller_client import ControllerClient
+import numpy as np
+from multiprocessing import Process
+import socket
+
+
+def start_client(configs, addr, port):
+ client_sanas = SANAS(
+ configs=configs,
+ server_addr=(addr, port),
+ save_checkpoint=None,
+ is_server=False)
+ for _ in range(2):
+ arch = client_sanas.next_archs()[0]
+ time.sleep(1)
+ client_sanas.reward(0.1)
+
+
+def start_server(configs, port):
+ server_sanas = SANAS(
+ configs=configs, server_addr=("", port), save_checkpoint=None)
+ server_sanas.next_archs()[0]
+ return server_sanas
+
+
+class TestClientConnect(unittest.TestCase):
+ def setUp(self):
+ self.configs = [('MobileNetV2BlockSpace', {'block_mask': [0]})]
+ self.port = np.random.randint(8337, 8773)
+ self.addr = socket.gethostbyname(socket.gethostname())
+
+ def test_client_start_first(self):
+ p = Process(
+ target=start_client, args=(self.configs, self.addr, self.port))
+ p.start()
+
+ start_server(self.configs, self.port)
+
+
+class TestClientConnectCase1(unittest.TestCase):
+ def setUp(self):
+ self.configs = [('MobileNetV2BlockSpace', {'block_mask': [0]})]
+ self.port = np.random.randint(8337, 8773)
+ self.addr = socket.gethostbyname(socket.gethostname())
+
+ def test_client_start_first(self):
+ p = Process(
+ target=start_client, args=(self.configs, self.addr, self.port))
+ p.start()
+
+ time.sleep(60)
+ server_sanas = start_server(self.configs, self.port)
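+        # Signal 0 does not kill the process; it only checks it is alive.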
+ os.kill(os.getpid(), 0)
+
+
+class TestClientConnectCase2(unittest.TestCase):
+ def setUp(self):
+ self.port = np.random.randint(8337, 8773)
+ self.addr = socket.gethostbyname(socket.gethostname())
+
+ def test_request_current_info(self):
+ client = ControllerClient(self.addr, self.port)
+ client.request_current_info()
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_darts.py b/tests/test_darts.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c6a5817c466aa11ae7f440fae64df9e6960f60d
--- /dev/null
+++ b/tests/test_darts.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import paddle
+import unittest
+import paddle.fluid as fluid
+import numpy as np
+from paddleslim.nas.darts import DARTSearch
+from layers import conv_bn_layer
+
+
+class TestDARTS(unittest.TestCase):
+ def test_darts(self):
+ class SuperNet(fluid.dygraph.Layer):
+ def __init__(self):
+ super(SuperNet, self).__init__()
+ self._method = 'DARTS'
+ self._steps = 1
+ self.stem = fluid.dygraph.nn.Conv2D(
+ num_channels=1, num_filters=3, filter_size=3, padding=1)
+ self.classifier = fluid.dygraph.nn.Linear(
+ input_dim=2352, output_dim=10)
+ self._multiplier = 4
+ self._primitives = [
+ 'none', 'max_pool_3x3', 'avg_pool_3x3', 'skip_connect',
+ 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3',
+ 'dil_conv_5x5'
+ ]
+ self._initialize_alphas()
+
+ def _initialize_alphas(self):
+ self.alphas_normal = fluid.layers.create_parameter(
+ shape=[14, 8], dtype="float32")
+ self.alphas_reduce = fluid.layers.create_parameter(
+ shape=[14, 8], dtype="float32")
+ self._arch_parameters = [
+ self.alphas_normal,
+ self.alphas_reduce,
+ ]
+
+ def arch_parameters(self):
+ return self._arch_parameters
+
+ def forward(self, input):
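+                # The toy forward only has to depend on the alphas so that
+                # DARTSearch's architecture updates receive non-zero gradients.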
+ out = self.stem(input) * self.alphas_normal[0][
+ 0] * self.alphas_reduce[0][0]
+ out = fluid.layers.reshape(out, [0, -1])
+ logits = self.classifier(out)
+ return logits
+
+ def _loss(self, input, label):
+ logits = self.forward(input)
+ return fluid.layers.reduce_mean(
+ fluid.layers.softmax_with_cross_entropy(logits, label))
+
+ def batch_generator(reader):
+ def wrapper():
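+                # Accumulate fixed batches of 128 samples; any trailing
+                # partial batch is simply dropped.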
+ batch_data = []
+ batch_label = []
+ for sample in reader():
+ image = np.array(sample[0]).reshape(1, 28, 28)
+ label = np.array(sample[1]).reshape(1)
+ batch_data.append(image)
+ batch_label.append(label)
+ if len(batch_data) == 128:
+ batch_data = np.array(batch_data, dtype='float32')
+ batch_label = np.array(batch_label, dtype='int64')
+ yield [batch_data, batch_label]
+ batch_data = []
+ batch_label = []
+
+ return wrapper
+
+        place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+ with fluid.dygraph.guard(place):
+ model = SuperNet()
+ trainset = paddle.dataset.mnist.train()
+ validset = paddle.dataset.mnist.test()
+ train_reader = batch_generator(trainset)
+ valid_reader = batch_generator(validset)
+ searcher = DARTSearch(
+ model, train_reader, valid_reader, place, num_epochs=5)
+ searcher.train()
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_deep_mutual_learning.py b/tests/test_deep_mutual_learning.py
new file mode 100755
index 0000000000000000000000000000000000000000..60762e330b25d868dcaba98b1e45cc0c1d47dedc
--- /dev/null
+++ b/tests/test_deep_mutual_learning.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import unittest
+import logging
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.dataset.mnist as reader
+from paddle.fluid.dygraph.base import to_variable
+from paddleslim.models.dygraph import MobileNetV1
+from paddleslim.dist import DML
+from paddleslim.common import get_logger
+logger = get_logger(__name__, level=logging.INFO)
+
+
+class Model(fluid.dygraph.Layer):
+ def __init__(self):
+ super(Model, self).__init__()
+ self.conv = fluid.dygraph.nn.Conv2D(
+ num_channels=1,
+ num_filters=256,
+ filter_size=3,
+ stride=1,
+ padding=1,
+ use_cudnn=False)
+ self.pool2d_avg = fluid.dygraph.nn.Pool2D(
+ pool_type='avg', global_pooling=True)
+ self.out = fluid.dygraph.nn.Linear(256, 10)
+
+ def forward(self, inputs):
+ inputs = fluid.layers.reshape(inputs, shape=[0, 1, 28, 28])
+ y = self.conv(inputs)
+ y = self.pool2d_avg(y)
+ y = fluid.layers.reshape(y, shape=[-1, 256])
+ y = self.out(y)
+ return y
+
+
+class TestDML(unittest.TestCase):
+ def test_dml(self):
+ place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
+ ) else fluid.CPUPlace()
+ with fluid.dygraph.guard(place):
+ train_reader = paddle.fluid.io.batch(
+ paddle.dataset.mnist.train(), batch_size=256)
+ train_loader = fluid.io.DataLoader.from_generator(
+ capacity=1024, return_list=True)
+ train_loader.set_sample_list_generator(train_reader, places=place)
+
+ models = [Model(), Model()]
+ optimizers = []
+ for cur_model in models:
+ opt = fluid.optimizer.MomentumOptimizer(
+ 0.1, 0.9, parameter_list=cur_model.parameters())
+ optimizers.append(opt)
+ dml_model = DML(models)
+ dml_optimizer = dml_model.opt(optimizers)
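+            # Deep mutual learning: the peer models are optimized jointly,
+            # each against the labels plus a mimicry term toward the other's
+            # predictions (the exact formulation lives in DML.loss).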
+
+ def train(train_loader, dml_model, dml_optimizer):
+ dml_model.train()
+ for step_id, (images, labels) in enumerate(train_loader):
+ images, labels = to_variable(images), to_variable(labels)
+ labels = fluid.layers.reshape(labels, [0, 1])
+
+ logits = dml_model.forward(images)
+ precs = [
+ fluid.layers.accuracy(
+ input=l, label=labels, k=1).numpy() for l in logits
+ ]
+ losses = dml_model.loss(logits, labels)
+ dml_optimizer.minimize(losses)
+ if step_id % 10 == 0:
+ print(step_id, precs)
+
+ for epoch_id in range(10):
+ current_step_lr = dml_optimizer.get_lr()
+ lr_msg = "Epoch {}".format(epoch_id)
+ for model_id, lr in enumerate(current_step_lr):
+ lr_msg += ", {} lr: {:.6f}".format(
+ dml_model.full_name()[model_id], lr)
+ logger.info(lr_msg)
+ train(train_loader, dml_model, dml_optimizer)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_earlystop.py b/tests/test_earlystop.py
new file mode 100644
index 0000000000000000000000000000000000000000..62ea3ed08b2a09b6ab58b794021b7c7eeaee936b
--- /dev/null
+++ b/tests/test_earlystop.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import unittest
+import paddle
+from paddleslim.nas import SANAS
+from paddleslim.nas.early_stop import MedianStop
+steps = 5
+epochs = 5
+
+
+class TestMedianStop(unittest.TestCase):
+ def test_median_stop(self):
+ config = [('MobileNetV2Space')]
+ sanas = SANAS(config, server_addr=("", 8732), save_checkpoint=None)
+ earlystop = MedianStop(sanas, 2)
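+        # MedianStop presumably compares a trial's running result with the
+        # median of earlier trials at the same step: the first run (no
+        # history) should always read GOOD, while the second run's 0.5
+        # falls below the median once past the start step of 2.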
+ avg_loss = 1.0
+ for step in range(steps):
+ status = earlystop.get_status(step, avg_loss, epochs)
+            self.assertEqual(status, 'GOOD')
+
+ avg_loss = 0.5
+ for step in range(steps):
+ status = earlystop.get_status(step, avg_loss, epochs)
+ if step < 2:
+                self.assertEqual(status, 'GOOD')
+ else:
+                self.assertEqual(status, 'BAD')
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_flops.py b/tests/test_flops.py
index cd16b8618d0271e6a0b7e609f8820e16c380b9db..9d50ebc573a4320c1343874309ce75815bd53bd2 100644
--- a/tests/test_flops.py
+++ b/tests/test_flops.py
@@ -33,7 +33,7 @@ class TestPrune(unittest.TestCase):
sum2 = conv4 + sum1
conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
conv6 = conv_bn_layer(conv5, 8, 3, "conv6")
- self.assertTrue(1597440 == flops(main_program))
+ self.assertTrue(792576 == flops(main_program))
if __name__ == '__main__':
diff --git a/tests/test_fpgm_prune.py b/tests/test_fpgm_prune.py
new file mode 100644
index 0000000000000000000000000000000000000000..092ff8d8e0c0540ec6069ce73d20c9301c6d2cbb
--- /dev/null
+++ b/tests/test_fpgm_prune.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import unittest
+import paddle.fluid as fluid
+from paddleslim.prune import Pruner
+from layers import conv_bn_layer
+
+
+class TestPrune(unittest.TestCase):
+ def test_prune(self):
+ main_program = fluid.Program()
+ startup_program = fluid.Program()
+ # X X O X O
+ # conv1-->conv2-->sum1-->conv3-->conv4-->sum2-->conv5-->conv6
+ # | ^ | ^
+ # |____________| |____________________|
+ #
+ # X: prune output channels
+ # O: prune input channels
+ with fluid.program_guard(main_program, startup_program):
+ input = fluid.data(name="image", shape=[None, 3, 16, 16])
+ conv1 = conv_bn_layer(input, 8, 3, "conv1")
+ conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+ sum1 = conv1 + conv2
+ conv3 = conv_bn_layer(sum1, 8, 3, "conv3")
+ conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
+ sum2 = conv4 + sum1
+ conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
+ conv6 = conv_bn_layer(conv5, 8, 3, "conv6")
+
+ shapes = {}
+ for param in main_program.global_block().all_parameters():
+ shapes[param.name] = param.shape
+
+ place = fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ scope = fluid.Scope()
+ exe.run(startup_program, scope=scope)
+ criterion = 'geometry_median'
+ pruner = Pruner(criterion)
+ main_program, _, _ = pruner.prune(
+ main_program,
+ scope,
+ params=["conv4_weights"],
+ ratios=[0.5],
+ place=place,
+ lazy=False,
+ only_graph=False,
+ param_backup=None,
+ param_shape_backup=None)
+
+ shapes = {
+ "conv1_weights": (4, 3, 3, 3),
+ "conv2_weights": (4, 4, 3, 3),
+ "conv3_weights": (8, 4, 3, 3),
+ "conv4_weights": (4, 8, 3, 3),
+ "conv5_weights": (8, 4, 3, 3),
+ "conv6_weights": (8, 8, 3, 3)
+ }
+
+ for param in main_program.global_block().all_parameters():
+ if "weights" in param.name:
+ print("param: {}; param shape: {}".format(param.name,
+ param.shape))
+ self.assertTrue(param.shape == shapes[param.name])
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_fsp_loss.py b/tests/test_fsp_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..a71cf143da5bbe300666d60e73b7f1ef0566e70d
--- /dev/null
+++ b/tests/test_fsp_loss.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import unittest
+import paddle.fluid as fluid
+from paddleslim.dist import merge, fsp_loss
+from layers import conv_bn_layer
+
+
+class TestFSPLoss(unittest.TestCase):
+ def test_fsp_loss(self):
+ student_main = fluid.Program()
+ student_startup = fluid.Program()
+ with fluid.program_guard(student_main, student_startup):
+ input = fluid.data(name="image", shape=[None, 3, 224, 224])
+ conv1 = conv_bn_layer(input, 8, 3, "conv1")
+ conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+ student_predict = conv1 + conv2
+
+ teacher_main = fluid.Program()
+ teacher_startup = fluid.Program()
+ with fluid.program_guard(teacher_main, teacher_startup):
+ input = fluid.data(name="image", shape=[None, 3, 224, 224])
+ conv1 = conv_bn_layer(input, 8, 3, "conv1")
+ conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+ sum1 = conv1 + conv2
+ conv3 = conv_bn_layer(sum1, 8, 3, "conv3")
+ conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
+ sum2 = conv4 + sum1
+ conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
+ teacher_predict = conv_bn_layer(conv5, 8, 3, "conv6")
+
+ place = fluid.CPUPlace()
+ data_name_map = {'image': 'image'}
+ merge(teacher_main, student_main, data_name_map, place)
+ merged_ops = []
+ for block in student_main.blocks:
+ for op in block.ops:
+ merged_ops.append(op.type)
+ with fluid.program_guard(student_main):
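+            # fsp_loss builds an FSP (flow of solution procedure) matrix from
+            # each pair of feature maps and takes the mean squared difference
+            # between the teacher and student matrices; hence the extra
+            # {fsp, elementwise_sub, square, reduce_mean} ops asserted below.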
+ distill_loss = fsp_loss('teacher_conv5_bn_output.tmp_2',
+ 'teacher_conv6_bn_output.tmp_2',
+ 'conv1_bn_output.tmp_2',
+ 'conv2_bn_output.tmp_2', student_main)
+ loss_ops = []
+ for block in student_main.blocks:
+ for op in block.ops:
+ loss_ops.append(op.type)
+ self.assertTrue(set(merged_ops).difference(set(loss_ops)) == set())
+ self.assertTrue(
+ set(loss_ops).difference(set(merged_ops)) ==
+ {'elementwise_sub', 'reduce_mean', 'square', 'fsp'})
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_group_param.py b/tests/test_group_param.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0fb73611b8143ae7e570f213382cc7931130ff2
--- /dev/null
+++ b/tests/test_group_param.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import unittest
+import paddle.fluid as fluid
+from layers import conv_bn_layer
+from paddleslim.prune import collect_convs
+
+
+class TestPrune(unittest.TestCase):
+ def test_prune(self):
+ main_program = fluid.Program()
+ startup_program = fluid.Program()
+ # X X O X O
+ # conv1-->conv2-->sum1-->conv3-->conv4-->sum2-->conv5-->conv6
+ # | ^ | ^
+ # |____________| |____________________|
+ #
+ # X: prune output channels
+ # O: prune input channels
+ with fluid.program_guard(main_program, startup_program):
+ input = fluid.data(name="image", shape=[None, 3, 16, 16])
+ conv1 = conv_bn_layer(input, 8, 3, "conv1")
+ conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+ sum1 = conv1 + conv2
+ conv3 = conv_bn_layer(sum1, 8, 3, "conv3")
+ conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
+ sum2 = conv4 + sum1
+ conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
+ conv6 = conv_bn_layer(conv5, 8, 3, "conv6")
+ groups = collect_convs(
+ ["conv1_weights", "conv2_weights", "conv3_weights"], main_program)
+ while [] in groups:
+ groups.remove([])
+ self.assertTrue(len(groups) == 2)
+ self.assertTrue(len(groups[0]) == 18)
+ self.assertTrue(len(groups[1]) == 6)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_l2_loss.py b/tests/test_l2_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9f50479f233d03d3a68c7eefe4cc82bb0e1ea57
--- /dev/null
+++ b/tests/test_l2_loss.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import unittest
+import paddle.fluid as fluid
+from paddleslim.dist import merge, l2_loss
+from layers import conv_bn_layer
+
+
+class TestL2Loss(unittest.TestCase):
+ def test_l2_loss(self):
+ student_main = fluid.Program()
+ student_startup = fluid.Program()
+ with fluid.program_guard(student_main, student_startup):
+ input = fluid.data(name="image", shape=[None, 3, 224, 224])
+ conv1 = conv_bn_layer(input, 8, 3, "conv1")
+ conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+ student_predict = conv1 + conv2
+
+ teacher_main = fluid.Program()
+ teacher_startup = fluid.Program()
+ with fluid.program_guard(teacher_main, teacher_startup):
+ input = fluid.data(name="image", shape=[None, 3, 224, 224])
+ conv1 = conv_bn_layer(input, 8, 3, "conv1")
+ conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+ sum1 = conv1 + conv2
+ conv3 = conv_bn_layer(sum1, 8, 3, "conv3")
+ conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
+ sum2 = conv4 + sum1
+ conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
+ teacher_predict = conv_bn_layer(conv5, 8, 3, "conv6")
+
+ place = fluid.CPUPlace()
+ data_name_map = {'image': 'image'}
+ merge(teacher_main, student_main, data_name_map, place)
+ merged_ops = []
+ for block in student_main.blocks:
+ for op in block.ops:
+ merged_ops.append(op.type)
+ with fluid.program_guard(student_main):
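+            # l2_loss reduces to mean(square(teacher - student)), which is why
+            # only {elementwise_sub, square, reduce_mean} appear as new ops.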
+ distill_loss = l2_loss('teacher_conv6_bn_output.tmp_2',
+ 'conv2_bn_output.tmp_2', student_main)
+ loss_ops = []
+ for block in student_main.blocks:
+ for op in block.ops:
+ loss_ops.append(op.type)
+ self.assertTrue(set(merged_ops).difference(set(loss_ops)) == set())
+ self.assertTrue(
+ set(loss_ops).difference(set(merged_ops)) ==
+ {'reduce_mean', 'square', 'elementwise_sub'})
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_loss.py b/tests/test_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..8afa5018ba0d015bcd038f1205e48b5e3f371476
--- /dev/null
+++ b/tests/test_loss.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import unittest
+import paddle.fluid as fluid
+from paddleslim.dist import merge, loss
+from layers import conv_bn_layer
+
+
+class TestLoss(unittest.TestCase):
+ def test_loss(self):
+ student_main = fluid.Program()
+ student_startup = fluid.Program()
+ with fluid.program_guard(student_main, student_startup):
+ input = fluid.data(name="image", shape=[None, 3, 224, 224])
+ conv1 = conv_bn_layer(input, 8, 3, "conv1")
+ conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+ student_predict = conv1 + conv2
+
+ teacher_main = fluid.Program()
+ teacher_startup = fluid.Program()
+ with fluid.program_guard(teacher_main, teacher_startup):
+ input = fluid.data(name="image", shape=[None, 3, 224, 224])
+ conv1 = conv_bn_layer(input, 8, 3, "conv1")
+ conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+ sum1 = conv1 + conv2
+ conv3 = conv_bn_layer(sum1, 8, 3, "conv3")
+ conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
+ sum2 = conv4 + sum1
+ conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
+ teacher_predict = conv_bn_layer(conv5, 8, 3, "conv6")
+
+ place = fluid.CPUPlace()
+ data_name_map = {'image': 'image'}
+ merge(teacher_main, student_main, data_name_map, place)
+ merged_ops = []
+ for block in student_main.blocks:
+ for op in block.ops:
+ merged_ops.append(op.type)
+
+ def adaptation_loss(t_var, s_var):
+ teacher_channel = t_var.shape[1]
+ s_hint = fluid.layers.conv2d(s_var, teacher_channel, 1)
+ hint_loss = fluid.layers.reduce_mean(
+ fluid.layers.square(s_hint - t_var))
+ return hint_loss
+
+ with fluid.program_guard(student_main):
+ distill_loss = loss(
+ adaptation_loss,
+ student_main,
+ t_var='teacher_conv6_bn_output.tmp_2',
+ s_var='conv2_bn_output.tmp_2')
+ loss_ops = []
+ for block in student_main.blocks:
+ for op in block.ops:
+ loss_ops.append(op.type)
+ self.assertTrue(set(merged_ops).difference(set(loss_ops)) == set())
+ self.assertTrue(
+ set(loss_ops).difference(set(merged_ops)) ==
+ {'reduce_mean', 'elementwise_sub', 'square'})
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_merge.py b/tests/test_merge.py
new file mode 100644
index 0000000000000000000000000000000000000000..070a7febb8fd5ef6a3676e72f609528fba95e0ce
--- /dev/null
+++ b/tests/test_merge.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import unittest
+import paddle.fluid as fluid
+from paddleslim.dist import merge
+from layers import conv_bn_layer
+
+
+class TestMerge(unittest.TestCase):
+ def test_merge(self):
+ student_main = fluid.Program()
+ student_startup = fluid.Program()
+ with fluid.program_guard(student_main, student_startup):
+ input = fluid.data(name="image", shape=[None, 3, 224, 224])
+ conv1 = conv_bn_layer(input, 8, 3, "conv1")
+ conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+ student_predict = conv1 + conv2
+ student_ops = []
+ for block in student_main.blocks:
+ for op in block.ops:
+ student_ops.append(op)
+
+ teacher_main = fluid.Program()
+ teacher_startup = fluid.Program()
+ with fluid.program_guard(teacher_main, teacher_startup):
+ input = fluid.data(name="image", shape=[None, 3, 224, 224])
+ conv1 = conv_bn_layer(input, 8, 3, "conv1")
+ conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+ sum1 = conv1 + conv2
+ conv3 = conv_bn_layer(sum1, 8, 3, "conv3")
+ conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
+ sum2 = conv4 + sum1
+ conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
+ teacher_predict = conv_bn_layer(conv5, 8, 3, "conv6")
+ teacher_ops = []
+ for block in teacher_main.blocks:
+ for op in block.ops:
+ teacher_ops.append(op)
+
+ place = fluid.CPUPlace()
+ data_name_map = {'image': 'image'}
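+        # merge() copies the teacher graph into student_main, prefixing the
+        # teacher's variables (e.g. "teacher_conv6_bn_output.tmp_2") so both
+        # graphs coexist; the merged op count should be the exact sum.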
+ merge(teacher_main, student_main, data_name_map, place)
+ merged_ops = []
+ for block in student_main.blocks:
+ for op in block.ops:
+ merged_ops.append(op)
+ self.assertTrue(len(student_ops) + len(teacher_ops) == len(merged_ops))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_nas_search_space.py b/tests/test_nas_search_space.py
deleted file mode 100644
index ad373cf146fecb1cf9ea2b3681eaf73e9e65dd3d..0000000000000000000000000000000000000000
--- a/tests/test_nas_search_space.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-sys.path.append('..')
-import unittest
-import paddle.fluid as fluid
-from nas.search_space_factory import SearchSpaceFactory
-
-
-class TestSearchSpace(unittest.TestCase):
- def test_searchspace(self):
- # if output_size is 1, the model will add fc layer in the end.
- config = {'input_size': 224, 'output_size': 7, 'block_num': 5}
- space = SearchSpaceFactory()
-
- my_space = space.get_search_space([('MobileNetV2Space', config)])
- model_arch = my_space.token2arch()
-
- train_prog = fluid.Program()
- startup_prog = fluid.Program()
- with fluid.program_guard(train_prog, startup_prog):
- input_size = config['input_size']
- model_input = fluid.layers.data(
- name='model_in',
- shape=[1, 3, input_size, input_size],
- dtype='float32',
- append_batch_size=False)
- predict = model_arch[0](model_input)
- self.assertTrue(predict.shape[2] == config['output_size'])
-
-
-class TestMultiSearchSpace(unittest.TestCase):
- space = SearchSpaceFactory()
-
- config0 = {'input_size': 224, 'output_size': 7, 'block_num': 5}
- config1 = {'input_size': 7, 'output_size': 1, 'block_num': 2}
- my_space = space.get_search_space(
- [('MobileNetV2Space', config0), ('ResNetSpace', config1)])
- model_archs = my_space.token2arch()
-
- train_prog = fluid.Program()
- startup_prog = fluid.Program()
- with fluid.program_guard(train_prog, startup_prog):
- input_size = config0['input_size']
- model_input = fluid.layers.data(
- name='model_in',
- shape=[1, 3, input_size, input_size],
- dtype='float32',
- append_batch_size=False)
- for model_arch in model_archs:
- predict = model_arch(model_input)
- model_input = predict
- print(predict)
-
-
-if __name__ == '__main__':
- unittest.main()
diff --git a/tests/test_ofa.py b/tests/test_ofa.py
new file mode 100644
index 0000000000000000000000000000000000000000..b65d12e74a6f9ece7866db8468f7e8a1337e485c
--- /dev/null
+++ b/tests/test_ofa.py
@@ -0,0 +1,216 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+sys.path.append("../")
+import numpy as np
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.dygraph.nn as nn
+from paddle.nn import ReLU
+from paddleslim.nas import ofa
+from paddleslim.nas.ofa import OFA, RunConfig, DistillConfig
+from paddleslim.nas.ofa.convert_super import supernet
+from paddleslim.nas.ofa.layers import Block, SuperSeparableConv2D
+
+
+class ModelConv(fluid.dygraph.Layer):
+ def __init__(self):
+ super(ModelConv, self).__init__()
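+        # Layers declared inside a supernet() context are rewritten by
+        # ofa_super.convert into elastic "super" layers, so each conv can
+        # later sample any of the listed kernel sizes and channel widths.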
+ with supernet(
+ kernel_size=(3, 5, 7),
+ channel=((4, 8, 12), (8, 12, 16), (8, 12, 16),
+ (8, 12, 16))) as ofa_super:
+ models = []
+ models += [nn.Conv2D(3, 4, 3)]
+ models += [nn.InstanceNorm(4)]
+ models += [ReLU()]
+ models += [nn.Conv2D(4, 4, 3, groups=4)]
+ models += [nn.InstanceNorm(4)]
+ models += [ReLU()]
+ models += [nn.Conv2DTranspose(4, 4, 3, groups=4, use_cudnn=True)]
+ models += [nn.BatchNorm(4)]
+ models += [ReLU()]
+ models += [nn.Conv2D(4, 3, 3)]
+ models += [ReLU()]
+ models = ofa_super.convert(models)
+
+ models += [
+ Block(
+ SuperSeparableConv2D(
+ 3, 6, 1, candidate_config={'channel': (3, 6)}))
+ ]
+ with supernet(
+ kernel_size=(3, 5, 7), expand_ratio=(1, 2, 4)) as ofa_super:
+ models1 = []
+ models1 += [nn.Conv2D(6, 4, 3)]
+ models1 += [nn.BatchNorm(4)]
+ models1 += [ReLU()]
+ models1 += [nn.Conv2D(4, 4, 3, groups=2)]
+ models1 += [nn.InstanceNorm(4)]
+ models1 += [ReLU()]
+ models1 += [nn.Conv2DTranspose(4, 4, 3, groups=2)]
+ models1 += [nn.BatchNorm(4)]
+ models1 += [ReLU()]
+ models1 += [nn.Conv2DTranspose(4, 4, 3)]
+ models1 += [nn.BatchNorm(4)]
+ models1 += [ReLU()]
+ models1 = ofa_super.convert(models1)
+
+ models += models1
+
+ self.models = paddle.nn.Sequential(*models)
+
+ def forward(self, inputs, depth=None):
+        if depth is not None:
+ assert isinstance(depth, int)
+ assert depth <= len(self.models)
+ else:
+ depth = len(self.models)
+ for idx in range(depth):
+ layer = self.models[idx]
+ inputs = layer(inputs)
+ return inputs
+
+
+class ModelLinear(fluid.dygraph.Layer):
+ def __init__(self):
+ super(ModelLinear, self).__init__()
+ models = []
+ with supernet(expand_ratio=(1, 2, 4)) as ofa_super:
+ models1 = []
+ models1 += [nn.Linear(64, 128)]
+ models1 += [nn.Linear(128, 256)]
+ models1 = ofa_super.convert(models1)
+
+ models += models1
+
+ with supernet(channel=((64, 128, 256), (64, 128, 256))) as ofa_super:
+ models1 = []
+ models1 += [nn.Linear(256, 128)]
+ models1 += [nn.Linear(128, 256)]
+ models1 = ofa_super.convert(models1)
+
+ models += models1
+
+ self.models = paddle.nn.Sequential(*models)
+
+ def forward(self, inputs, depth=None):
+        if depth is not None:
+ assert isinstance(depth, int)
+ assert depth < len(self.models)
+ else:
+ depth = len(self.models)
+ for idx in range(depth):
+ layer = self.models[idx]
+ inputs = layer(inputs)
+ return inputs
+
+
+class TestOFA(unittest.TestCase):
+ def setUp(self):
+ fluid.enable_dygraph()
+ self.init_model_and_data()
+ self.init_config()
+
+ def init_model_and_data(self):
+ self.model = ModelConv()
+ self.teacher_model = ModelConv()
+ data_np = np.random.random((1, 3, 10, 10)).astype(np.float32)
+ label_np = np.random.random((1)).astype(np.float32)
+
+ self.data = fluid.dygraph.to_variable(data_np)
+
+ def init_config(self):
+ default_run_config = {
+ 'train_batch_size': 1,
+ 'eval_batch_size': 1,
+ 'n_epochs': [[1], [2, 3], [4, 5]],
+ 'init_learning_rate': [[0.001], [0.003, 0.001], [0.003, 0.001]],
+ 'dynamic_batch_size': [1, 1, 1],
+ 'total_images': 1,
+ 'elastic_depth': (5, 15, 24)
+ }
+ self.run_config = RunConfig(**default_run_config)
+
+ default_distill_config = {
+ 'lambda_distill': 0.01,
+ 'teacher_model': self.teacher_model,
+ 'mapping_layers': ['models.0.fn']
+ }
+ self.distill_config = DistillConfig(**default_distill_config)
+
+ def test_ofa(self):
+ ofa_model = OFA(self.model,
+ self.run_config,
+ distill_config=self.distill_config)
+
+ start_epoch = 0
+ for idx in range(len(self.run_config.n_epochs)):
+ cur_idx = self.run_config.n_epochs[idx]
+ for ph_idx in range(len(cur_idx)):
+ cur_lr = self.run_config.init_learning_rate[idx][ph_idx]
+ adam = fluid.optimizer.Adam(
+ learning_rate=cur_lr,
+ parameter_list=(
+ ofa_model.parameters() + ofa_model.netAs_param))
+ for epoch_id in range(start_epoch,
+ self.run_config.n_epochs[idx][ph_idx]):
+ for model_no in range(self.run_config.dynamic_batch_size[
+ idx]):
+ output, _ = ofa_model(self.data)
+ loss = fluid.layers.reduce_mean(output)
+                        if self.distill_config.mapping_layers is not None:
+ dis_loss = ofa_model.calc_distill_loss()
+ loss += dis_loss
+ dis_loss = dis_loss.numpy()[0]
+ else:
+ dis_loss = 0
+ print('epoch: {}, loss: {}, distill loss: {}'.format(
+ epoch_id, loss.numpy()[0], dis_loss))
+ loss.backward()
+ adam.minimize(loss)
+ adam.clear_gradients()
+ start_epoch = self.run_config.n_epochs[idx][ph_idx]
+
+
+class TestOFACase1(TestOFA):
+ def init_model_and_data(self):
+ self.model = ModelLinear()
+ self.teacher_model = ModelLinear()
+ data_np = np.random.random((3, 64)).astype(np.float32)
+
+ self.data = fluid.dygraph.to_variable(data_np)
+
+ def init_config(self):
+ default_run_config = {
+ 'train_batch_size': 1,
+ 'eval_batch_size': 1,
+ 'n_epochs': [[2, 5]],
+ 'init_learning_rate': [[0.003, 0.001]],
+ 'dynamic_batch_size': [1],
+ 'total_images': 1,
+ }
+ self.run_config = RunConfig(**default_run_config)
+
+ default_distill_config = {
+ 'lambda_distill': 0.01,
+ 'teacher_model': self.teacher_model,
+ }
+ self.distill_config = DistillConfig(**default_distill_config)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_optimal_threshold.py b/tests/test_optimal_threshold.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e5d19d4dc71235d89cfdccede99f7ddf0ebcae0
--- /dev/null
+++ b/tests/test_optimal_threshold.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import unittest
+import paddle.fluid as fluid
+from paddleslim.prune import Pruner
+from layers import conv_bn_layer
+
+
+class TestPrune(unittest.TestCase):
+ def test_prune(self):
+ main_program = fluid.Program()
+ startup_program = fluid.Program()
+ # X X O X O
+ # conv1-->conv2-->sum1-->conv3-->conv4-->sum2-->conv5-->conv6
+ # | ^ | ^
+ # |____________| |____________________|
+ #
+ # X: prune output channels
+ # O: prune input channels
+ with fluid.program_guard(main_program, startup_program):
+ input = fluid.data(name="image", shape=[None, 3, 16, 16])
+ conv1 = conv_bn_layer(input, 8, 3, "conv1")
+ conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+ sum1 = conv1 + conv2
+ conv3 = conv_bn_layer(sum1, 8, 3, "conv3")
+ conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
+ sum2 = conv4 + sum1
+ conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
+ conv6 = conv_bn_layer(conv5, 8, 3, "conv6")
+
+ shapes = {}
+ for param in main_program.global_block().all_parameters():
+ shapes[param.name] = param.shape
+
+ place = fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ scope = fluid.Scope()
+ exe.run(startup_program, scope=scope)
+ criterion = 'bn_scale'
+ idx_selector = 'optimal_threshold'
+ pruner = Pruner(criterion, idx_selector=idx_selector)
+ main_program, _, _ = pruner.prune(
+ main_program,
+ scope,
+ params=["conv4_weights"],
+ ratios=[0.5],
+ place=place,
+ lazy=False,
+ only_graph=False,
+ param_backup=None,
+ param_shape_backup=None)
+
+ shapes = {
+ "conv1_weights": (4, 3, 3, 3),
+ "conv2_weights": (4, 4, 3, 3),
+ "conv3_weights": (8, 4, 3, 3),
+ "conv4_weights": (4, 8, 3, 3),
+ "conv5_weights": (8, 4, 3, 3),
+ "conv6_weights": (8, 8, 3, 3)
+ }
+
+ for param in main_program.global_block().all_parameters():
+ if "weights" in param.name:
+ print("param: {}; param shape: {}".format(param.name,
+ param.shape))
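+                # With 'optimal_threshold' the pruned channel count is derived
+                # from the BN scale distribution rather than the requested
+                # ratio, so exact shapes are not asserted here.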
+ #self.assertTrue(param.shape == shapes[param.name])
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_prune.py b/tests/test_prune.py
index 931cf9cf35429a1aa9ca53b5b5d8444f71d3ec39..74e78192a785950733adcbbed6c79fe6d1f1e7db 100644
--- a/tests/test_prune.py
+++ b/tests/test_prune.py
@@ -41,6 +41,9 @@ class TestPrune(unittest.TestCase):
conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
conv6 = conv_bn_layer(conv5, 8, 3, "conv6")
+ conv7 = fluid.layers.conv2d_transpose(
+ input=conv6, num_filters=16, filter_size=2, stride=2)
+
shapes = {}
for param in main_program.global_block().all_parameters():
shapes[param.name] = param.shape
@@ -53,8 +56,8 @@ class TestPrune(unittest.TestCase):
main_program, _, _ = pruner.prune(
main_program,
scope,
- params=["conv4_weights"],
- ratios=[0.5],
+ params=["conv4_weights", "conv2d_transpose_0.w_0"],
+ ratios=[0.5, 0.6],
place=place,
lazy=False,
only_graph=False,
@@ -62,16 +65,19 @@ class TestPrune(unittest.TestCase):
param_shape_backup=None)
shapes = {
- "conv1_weights": (4L, 3L, 3L, 3L),
- "conv2_weights": (4L, 4L, 3L, 3L),
- "conv3_weights": (8L, 4L, 3L, 3L),
- "conv4_weights": (4L, 8L, 3L, 3L),
- "conv5_weights": (8L, 4L, 3L, 3L),
- "conv6_weights": (8L, 8L, 3L, 3L)
+ "conv1_weights": (4, 3, 3, 3),
+ "conv2_weights": (4, 4, 3, 3),
+ "conv3_weights": (8, 4, 3, 3),
+ "conv4_weights": (4, 8, 3, 3),
+ "conv5_weights": (8, 4, 3, 3),
+ "conv6_weights": (8, 8, 3, 3),
+ "conv2d_transpose_0.w_0": (8, 16, 2, 2),
}
for param in main_program.global_block().all_parameters():
- if "weights" in param.name:
+ if param.name in shapes:
+ print("param: {}; param shape: {}".format(param.name,
+ param.shape))
self.assertTrue(param.shape == shapes[param.name])
diff --git a/tests/test_prune_walker.py b/tests/test_prune_walker.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d4855ee0dc137bfa5e1795a45cd012a4c2ff602
--- /dev/null
+++ b/tests/test_prune_walker.py
@@ -0,0 +1,120 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddleslim.prune import Pruner
+from layers import conv_bn_layer
+import random
+from paddleslim.core import GraphWrapper
+
+
+class TestPrune(unittest.TestCase):
+ def test_prune(self):
+ main_program = fluid.Program()
+ startup_program = fluid.Program()
+ # X X O X O
+ # conv1-->conv2-->sum1-->conv3-->conv4-->sum2-->conv5-->conv6
+ # | ^ | ^
+ # |____________| |____________________|
+ #
+ # X: prune output channels
+ # O: prune input channels
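+        # The graph below deliberately chains many op types (cond blocks,
+        # elementwise sub/mul, depthwise conv, floor, scale, concat) so the
+        # prune walkers must propagate channel pruning through each of them.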
+ with fluid.program_guard(main_program, startup_program):
+ input = fluid.data(name="image", shape=[None, 3, 16, 16])
+ label = fluid.data(name='label', shape=[None, 1], dtype='int64')
+ conv1 = conv_bn_layer(input, 8, 3, "conv1", act='relu')
+ conv2 = conv_bn_layer(conv1, 8, 3, "conv2", act='leaky_relu')
+ sum1 = conv1 + conv2
+ conv3 = conv_bn_layer(sum1, 8, 3, "conv3", act='relu6')
+ conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
+ sum2 = conv4 + sum1
+ conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
+
+ flag = fluid.layers.fill_constant([1], value=1, dtype='int32')
+ rand_flag = paddle.randint(2, dtype='int32')
+ cond = fluid.layers.less_than(x=flag, y=rand_flag)
+ cond_output = fluid.layers.create_global_var(
+ shape=[1],
+ value=0.0,
+ dtype='float32',
+ persistable=False,
+ name='cond_output')
+
+ def cond_block1():
+ cond_conv = conv_bn_layer(conv5, 8, 3, "conv_cond1_1")
+ fluid.layers.assign(input=cond_conv, output=cond_output)
+
+ def cond_block2():
+ cond_conv1 = conv_bn_layer(conv5, 8, 3, "conv_cond2_1")
+ cond_conv2 = conv_bn_layer(cond_conv1, 8, 3, "conv_cond2_2")
+ fluid.layers.assign(input=cond_conv2, output=cond_output)
+
+ fluid.layers.cond(cond, cond_block1, cond_block2)
+ sum3 = fluid.layers.sum([sum2, cond_output])
+
+ conv6 = conv_bn_layer(sum3, 8, 3, "conv6")
+ sub1 = conv6 - sum3
+ mult = sub1 * sub1
+ conv7 = conv_bn_layer(
+ mult, 8, 3, "Depthwise_Conv7", groups=8, use_cudnn=False)
+ floored = fluid.layers.floor(conv7)
+ scaled = fluid.layers.scale(floored)
+ concated = fluid.layers.concat([scaled, mult], axis=1)
+ conv8 = conv_bn_layer(concated, 8, 3, "conv8")
+ predict = fluid.layers.fc(input=conv8, size=10, act='softmax')
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ adam_optimizer = fluid.optimizer.AdamOptimizer(0.01)
+ avg_cost = fluid.layers.mean(cost)
+ adam_optimizer.minimize(avg_cost)
+
+ params = []
+ for param in main_program.all_parameters():
+ if 'conv' in param.name:
+ params.append(param.name)
+        # TODO: support pruning the convolution before the fc layer.
+ params.remove('conv8_weights')
+
+        place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(startup_program)
+ x = np.random.random(size=(10, 3, 16, 16)).astype('float32')
+ label = np.random.random(size=(10, 1)).astype('int64')
+ loss_data, = exe.run(main_program,
+ feed={"image": x,
+ "label": label},
+ fetch_list=[cost.name])
+ pruner = Pruner()
+ main_program, _, _ = pruner.prune(
+ main_program,
+ fluid.global_scope(),
+ params=params,
+ ratios=[0.5] * len(params),
+ place=place,
+ lazy=False,
+ only_graph=False,
+ param_backup=None,
+ param_shape_backup=None)
+
+ loss_data, = exe.run(main_program,
+ feed={"image": x,
+ "label": label},
+ fetch_list=[cost.name])
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_pruned_model_save_load.py b/tests/test_pruned_model_save_load.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3ee10c8b7876cbe1dfeb6066a2e5b0168d07579
--- /dev/null
+++ b/tests/test_pruned_model_save_load.py
@@ -0,0 +1,112 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import unittest
+import paddle.fluid as fluid
+from paddleslim.prune import Pruner, save_model, load_model
+from layers import conv_bn_layer
+import numpy as np
+
+
+class TestSaveAndLoad(unittest.TestCase):
+ def test_prune(self):
+ train_program = fluid.Program()
+ startup_program = fluid.Program()
+ with fluid.program_guard(train_program, startup_program):
+ input = fluid.data(name="image", shape=[None, 3, 16, 16])
+ conv1 = conv_bn_layer(input, 8, 3, "conv1")
+ conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+ sum1 = conv1 + conv2
+ conv3 = conv_bn_layer(sum1, 8, 3, "conv3")
+ conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
+ sum2 = conv4 + sum1
+ conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
+ conv6 = conv_bn_layer(conv5, 8, 3, "conv6")
+ feature = fluid.layers.reshape(conv6, [-1, 128, 16])
+ predict = fluid.layers.fc(input=feature, size=10, act='softmax')
+ label = fluid.data(name='label', shape=[None, 1], dtype='int64')
+ print(label.shape)
+ print(predict.shape)
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(cost)
+ adam_optimizer = fluid.optimizer.AdamOptimizer(0.01)
+ adam_optimizer.minimize(avg_cost)
+
+ place = fluid.CPUPlace()
+ exe = fluid.Executor(place)
+
+ scope = fluid.global_scope()
+ exe.run(startup_program, scope=scope)
+ criterion = 'bn_scale'
+ pruner = Pruner(criterion)
+ main_program, _, _ = pruner.prune(
+ train_program,
+ scope,
+ params=["conv4_weights"],
+ ratios=[0.5],
+ place=place,
+ lazy=False,
+ only_graph=False,
+ param_backup=None,
+ param_shape_backup=None)
+
+        x = np.random.random(size=(10, 3, 16, 16)).astype('float32')
+        label = np.random.random(size=(10, 1)).astype('int64')
+ loss_data, = exe.run(train_program,
+ feed={"image": x,
+ "label": label},
+ fetch_list=[cost.name])
+
+ save_model(exe, main_program, 'model_file')
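+        # Rebuild the same graph definition and reload the saved parameters;
+        # load_model should restore the pruned shapes, which the checks below
+        # verify for both the train program and its test-mode clone.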
+ pruned_program = fluid.Program()
+ pruned_startup_program = fluid.Program()
+ with fluid.program_guard(pruned_program, pruned_startup_program):
+ input = fluid.data(name="image", shape=[None, 3, 16, 16])
+ conv1 = conv_bn_layer(input, 8, 3, "conv1")
+ conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+ sum1 = conv1 + conv2
+ conv3 = conv_bn_layer(sum1, 8, 3, "conv3")
+ conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
+ sum2 = conv4 + sum1
+ conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
+ conv6 = conv_bn_layer(conv5, 8, 3, "conv6")
+ pruned_test_program = pruned_program.clone(for_test=True)
+ exe.run(pruned_startup_program)
+ load_model(exe, pruned_program, 'model_file')
+ load_model(exe, pruned_test_program, 'model_file')
+ shapes = {
+ "conv1_weights": (4, 3, 3, 3),
+ "conv2_weights": (4, 4, 3, 3),
+ "conv3_weights": (8, 4, 3, 3),
+ "conv4_weights": (4, 8, 3, 3),
+ "conv5_weights": (8, 4, 3, 3),
+ "conv6_weights": (8, 8, 3, 3)
+ }
+
+ for param in pruned_program.global_block().all_parameters():
+ if "weights" in param.name:
+ print("param: {}; param shape: {}".format(param.name,
+ param.shape))
+ self.assertTrue(param.shape == shapes[param.name])
+ for param in pruned_test_program.global_block().all_parameters():
+ if "weights" in param.name:
+ print("param: {}; param shape: {}".format(param.name,
+ param.shape))
+ self.assertTrue(param.shape == shapes[param.name])
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_quant_aware.py b/tests/test_quant_aware.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0a8464f7e688625ff87279ce3fb030f7ba9cff3
--- /dev/null
+++ b/tests/test_quant_aware.py
@@ -0,0 +1,174 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import unittest
+import paddle
+import paddle.fluid as fluid
+from paddleslim.quant import quant_aware, convert
+sys.path.append("../demo")
+from models import MobileNet
+from layers import conv_bn_layer
+import paddle.dataset.mnist as reader
+from paddle.fluid.framework import IrGraph
+from paddle.fluid import core
+import numpy as np
+
+
+class TestQuantAwareCase1(unittest.TestCase):
+ def get_model(self):
+ image = fluid.layers.data(
+ name='image', shape=[1, 28, 28], dtype='float32')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+ model = MobileNet()
+ out = model.net(input=image, class_dim=10)
+ cost = fluid.layers.cross_entropy(input=out, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+ startup_prog = fluid.default_startup_program()
+ train_prog = fluid.default_main_program()
+ return startup_prog, train_prog
+
+ def get_op_number(self, prog):
+ graph = IrGraph(core.Graph(prog.desc), for_test=False)
+ quant_op_nums = 0
+ op_nums = 0
+ for op in graph.all_op_nodes():
+ if op.name() in ['conv2d', 'depthwise_conv2d', 'mul']:
+ op_nums += 1
+ elif 'fake_' in op.name():
+ quant_op_nums += 1
+ return op_nums, quant_op_nums
+
+ def test_quant_op(self):
+ startup_prog, train_prog = self.get_model()
+ place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
+ ) else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(startup_prog)
+ config_1 = {
+ 'weight_quantize_type': 'channel_wise_abs_max',
+ 'activation_quantize_type': 'moving_average_abs_max',
+ 'quantize_op_types': ['depthwise_conv2d', 'mul', 'conv2d'],
+ }
+
+ quant_prog_1 = quant_aware(
+ train_prog, place, config=config_1, for_test=True)
+ op_nums_1, quant_op_nums_1 = self.get_op_number(quant_prog_1)
+ convert_prog_1 = convert(quant_prog_1, place, config=config_1)
+ convert_op_nums_1, convert_quant_op_nums_1 = self.get_op_number(
+ convert_prog_1)
+
+ config_1['not_quant_pattern'] = ['last_fc']
+ quant_prog_2 = quant_aware(
+ train_prog, place, config=config_1, for_test=True)
+ op_nums_2, quant_op_nums_2 = self.get_op_number(quant_prog_2)
+ convert_prog_2 = convert(quant_prog_2, place, config=config_1)
+ convert_op_nums_2, convert_quant_op_nums_2 = self.get_op_number(
+ convert_prog_2)
+
+ self.assertTrue(op_nums_1 == op_nums_2)
+ # test quant_aware op numbers
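+        # (each quantized op presumably gains fake quantize/dequantize pairs
+        # for its weight and its input activation, hence the factor of 4)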
+ self.assertTrue(op_nums_1 * 4 == quant_op_nums_1)
+ # test convert op numbers
+ self.assertTrue(convert_op_nums_1 * 2 == convert_quant_op_nums_1)
+ # test skip_quant
+ self.assertTrue(quant_op_nums_1 - 4 == quant_op_nums_2)
+ self.assertTrue(convert_quant_op_nums_1 - 2 == convert_quant_op_nums_2)
+
+
+class TestQuantAwareCase2(unittest.TestCase):
+ def test_accuracy(self):
+ image = fluid.layers.data(
+ name='image', shape=[1, 28, 28], dtype='float32')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+ model = MobileNet()
+ out = model.net(input=image, class_dim=10)
+ cost = fluid.layers.cross_entropy(input=out, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+ acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+ acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+ optimizer = fluid.optimizer.Momentum(
+ momentum=0.9,
+ learning_rate=0.01,
+ regularization=fluid.regularizer.L2Decay(4e-5))
+ optimizer.minimize(avg_cost)
+ main_prog = fluid.default_main_program()
+ val_prog = main_prog.clone(for_test=True)
+
+ place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
+ ) else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+ feeder = fluid.DataFeeder([image, label], place, program=main_prog)
+ train_reader = paddle.fluid.io.batch(
+ paddle.dataset.mnist.train(), batch_size=64)
+ eval_reader = paddle.fluid.io.batch(
+ paddle.dataset.mnist.test(), batch_size=64)
+
+ def train(program):
+ iter = 0
+ for data in train_reader():
+ cost, top1, top5 = exe.run(
+ program,
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost, acc_top1, acc_top5])
+ iter += 1
+ if iter % 100 == 0:
+ print(
+ 'train iter={}, avg loss {}, acc_top1 {}, acc_top5 {}'.
+ format(iter, cost, top1, top5))
+
+ def test(program):
+ iter = 0
+ result = [[], [], []]
+ for data in eval_reader():
+ cost, top1, top5 = exe.run(
+ program,
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost, acc_top1, acc_top5])
+ iter += 1
+ if iter % 100 == 0:
+ print(
+ 'eval iter={}, avg loss {}, acc_top1 {}, acc_top5 {}'.
+ format(iter, cost, top1, top5))
+ result[0].append(cost)
+ result[1].append(top1)
+ result[2].append(top5)
+ print(' avg loss {}, acc_top1 {}, acc_top5 {}'.format(
+ np.mean(result[0]), np.mean(result[1]), np.mean(result[2])))
+ return np.mean(result[1]), np.mean(result[2])
+
+ train(main_prog)
+ top1_1, top5_1 = test(main_prog)
+
+ config = {
+ 'weight_quantize_type': 'channel_wise_abs_max',
+ 'activation_quantize_type': 'moving_average_abs_max',
+ 'quantize_op_types': ['depthwise_conv2d', 'mul', 'conv2d'],
+ }
+ quant_train_prog = quant_aware(
+ main_prog, place, config, for_test=False)
+ quant_eval_prog = quant_aware(val_prog, place, config, for_test=True)
+ train(quant_train_prog)
+ quant_eval_prog, int8_prog = convert(
+ quant_eval_prog, place, config, save_int8=True)
+ top1_2, top5_2 = test(quant_eval_prog)
+ # values before quantization and after quantization should be close
+ print("before quantization: top1: {}, top5: {}".format(top1_1, top5_1))
+ print("after quantization: top1: {}, top5: {}".format(top1_2, top5_2))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_quant_aware_user_defined.py b/tests/test_quant_aware_user_defined.py
new file mode 100644
index 0000000000000000000000000000000000000000..d59c2ee16a9c920d86265b8eb7f8ca7517e3c329
--- /dev/null
+++ b/tests/test_quant_aware_user_defined.py
@@ -0,0 +1,157 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import unittest
+import paddle
+import paddle.fluid as fluid
+from paddleslim.quant import quant_aware, convert
+sys.path.append("../demo")
+from models import MobileNet
+from layers import conv_bn_layer
+import paddle.dataset.mnist as reader
+from paddle.fluid.framework import IrGraph
+from paddle.fluid import core
+import numpy as np
+
+from paddle.fluid.layer_helper import LayerHelper
+
+
+def pact(x, name=None):
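+    # PACT-style clipping with a learnable threshold u: the two elementwise
+    # steps below compute clip(x, -u, u), since x - relu(x - u) caps values
+    # above u and adding relu(-u - x) floors values below -u.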
+ helper = LayerHelper("pact", **locals())
+ dtype = 'float32'
+ init_thres = 20
+ u_param_attr = fluid.ParamAttr(
+ name=x.name + '_pact',
+ initializer=fluid.initializer.ConstantInitializer(value=init_thres),
+ regularizer=fluid.regularizer.L2Decay(0.0001),
+ learning_rate=1)
+ u_param = helper.create_parameter(
+ attr=u_param_attr, shape=[1], dtype=dtype)
+ x = fluid.layers.elementwise_sub(
+ x, fluid.layers.relu(fluid.layers.elementwise_sub(x, u_param)))
+ x = fluid.layers.elementwise_add(
+ x, fluid.layers.relu(fluid.layers.elementwise_sub(-u_param, x)))
+
+ return x
+
+
+def get_optimizer():
+ return fluid.optimizer.MomentumOptimizer(0.0001, 0.9)
+
+
+class TestQuantAwareCase1(unittest.TestCase):
+ def get_model(self):
+ image = fluid.layers.data(
+ name='image', shape=[1, 28, 28], dtype='float32')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+ model = MobileNet()
+ out = model.net(input=image, class_dim=10)
+ cost = fluid.layers.cross_entropy(input=out, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+ startup_prog = fluid.default_startup_program()
+ train_prog = fluid.default_main_program()
+ return startup_prog, train_prog
+
+ def test_accuracy(self):
+ image = fluid.layers.data(
+ name='image', shape=[1, 28, 28], dtype='float32')
+ image.stop_gradient = False
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+ model = MobileNet()
+ out = model.net(input=image, class_dim=10)
+ cost = fluid.layers.cross_entropy(input=out, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+ acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+ acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+ optimizer = fluid.optimizer.Momentum(
+ momentum=0.9,
+ learning_rate=0.01,
+ regularization=fluid.regularizer.L2Decay(4e-5))
+ optimizer.minimize(avg_cost)
+ main_prog = fluid.default_main_program()
+ val_prog = main_prog.clone(for_test=True)
+
+ place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
+ ) else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+ feeder = fluid.DataFeeder([image, label], place, program=main_prog)
+ train_reader = paddle.fluid.io.batch(
+ paddle.dataset.mnist.train(), batch_size=64)
+ eval_reader = paddle.fluid.io.batch(
+ paddle.dataset.mnist.test(), batch_size=64)
+
+ def train(program):
+ iter = 0
+ for data in train_reader():
+ cost, top1, top5 = exe.run(
+ program,
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost, acc_top1, acc_top5])
+ iter += 1
+ if iter % 100 == 0:
+ print(
+ 'train iter={}, avg loss {}, acc_top1 {}, acc_top5 {}'.
+ format(iter, cost, top1, top5))
+
+ def test(program):
+ iter = 0
+ result = [[], [], []]
+ for data in eval_reader():
+ cost, top1, top5 = exe.run(
+ program,
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost, acc_top1, acc_top5])
+ iter += 1
+ if iter % 100 == 0:
+ print(
+ 'eval iter={}, avg loss {}, acc_top1 {}, acc_top5 {}'.
+ format(iter, cost, top1, top5))
+ result[0].append(cost)
+ result[1].append(top1)
+ result[2].append(top5)
+ print(' avg loss {}, acc_top1 {}, acc_top5 {}'.format(
+ np.mean(result[0]), np.mean(result[1]), np.mean(result[2])))
+ return np.mean(result[1]), np.mean(result[2])
+
+ train(main_prog)
+ top1_1, top5_1 = test(main_prog)
+
+ config = {
+ 'weight_quantize_type': 'channel_wise_abs_max',
+ 'activation_quantize_type': 'moving_average_abs_max',
+ 'quantize_op_types': ['depthwise_conv2d', 'mul', 'conv2d'],
+ }
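+ # quant_aware inserts fake quant/dequant ops into the program; pact
+ # preprocesses the activations before quantization and get_optimizer
+ # supplies the optimizer that updates the PACT thresholds.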
+ quant_train_prog_pact = quant_aware(
+ main_prog,
+ place,
+ config,
+ for_test=False,
+ act_preprocess_func=pact,
+ optimizer_func=get_optimizer,
+ executor=exe)
+
+ quant_eval_prog = quant_aware(val_prog, place, config, for_test=True)
+ train(quant_train_prog_pact)
+ quant_eval_prog, int8_prog = convert(
+ quant_eval_prog, place, config, save_int8=True)
+ top1_2, top5_2 = test(quant_eval_prog)
+ # values before quantization and after quantization should be close
+ print("before quantization: top1: {}, top5: {}".format(top1_1, top5_1))
+ print("after quantization: top1: {}, top5: {}".format(top1_2, top5_2))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_quant_embedding.py b/tests/test_quant_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..028d4a6183676ec596cf5cb80a5b1ba76b7bb90a
--- /dev/null
+++ b/tests/test_quant_embedding.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import paddle.fluid as fluid
+import paddleslim.quant as quant
+import unittest
+
+
+class TestQuantEmbedding(unittest.TestCase):
+ def test_quant_embedding(self):
+ train_program = fluid.Program()
+ with fluid.program_guard(train_program):
+ input_word = fluid.data(
+ name="input_word", shape=[None, 1], dtype='int64')
+ input_emb = fluid.embedding(
+ input=input_word,
+ is_sparse=False,
+ size=[100, 128],
+ param_attr=fluid.ParamAttr(
+ name='emb',
+ initializer=fluid.initializer.Uniform(-0.005, 0.005)))
+
+ infer_program = train_program.clone(for_test=True)
+
+ use_gpu = fluid.is_compiled_with_cuda()
+ place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+
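+ # Quantize the embedding table of the inference program (8-bit abs_max
+ # under the default config).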
+ quant_program = quant.quant_embedding(infer_program, place)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_quant_post.py b/tests/test_quant_post.py
new file mode 100644
index 0000000000000000000000000000000000000000..858fd29c5203198b8eb2b46b3454d5be20557397
--- /dev/null
+++ b/tests/test_quant_post.py
@@ -0,0 +1,120 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import unittest
+import paddle
+import paddle.fluid as fluid
+from paddleslim.quant import quant_post_static
+sys.path.append("../demo")
+from models import MobileNet
+from layers import conv_bn_layer
+import paddle.dataset.mnist as reader
+from paddle.fluid.framework import IrGraph
+from paddle.fluid import core
+import numpy as np
+
+
+class TestQuantPostCase1(unittest.TestCase):
+ def test_accuracy(self):
+ image = fluid.layers.data(
+ name='image', shape=[1, 28, 28], dtype='float32')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+ model = MobileNet()
+ out = model.net(input=image, class_dim=10)
+ cost = fluid.layers.cross_entropy(input=out, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+ acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+ acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+ optimizer = fluid.optimizer.Momentum(
+ momentum=0.9,
+ learning_rate=0.01,
+ regularization=fluid.regularizer.L2Decay(4e-5))
+ optimizer.minimize(avg_cost)
+ main_prog = fluid.default_main_program()
+ val_prog = main_prog.clone(for_test=True)
+
+ place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
+ ) else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+ feeder = fluid.DataFeeder([image, label], place, program=main_prog)
+ train_reader = paddle.fluid.io.batch(
+ paddle.dataset.mnist.train(), batch_size=64)
+ eval_reader = paddle.fluid.io.batch(
+ paddle.dataset.mnist.test(), batch_size=64)
+
+ def train(program):
+ iter = 0
+ for data in train_reader():
+ cost, top1, top5 = exe.run(
+ program,
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost, acc_top1, acc_top5])
+ iter += 1
+ if iter % 100 == 0:
+ print(
+ 'train iter={}, avg loss {}, acc_top1 {}, acc_top5 {}'.
+ format(iter, cost, top1, top5))
+
+ def test(program, outputs=[avg_cost, acc_top1, acc_top5]):
+ iter = 0
+ result = [[], [], []]
+ for data in eval_reader():
+ cost, top1, top5 = exe.run(program,
+ feed=feeder.feed(data),
+ fetch_list=outputs)
+ iter += 1
+ if iter % 100 == 0:
+ print(
+ 'eval iter={}, avg loss {}, acc_top1 {}, acc_top5 {}'.
+ format(iter, cost, top1, top5))
+ result[0].append(cost)
+ result[1].append(top1)
+ result[2].append(top5)
+ print(' avg loss {}, acc_top1 {}, acc_top5 {}'.format(
+ np.mean(result[0]), np.mean(result[1]), np.mean(result[2])))
+ return np.mean(result[1]), np.mean(result[2])
+
+ train(main_prog)
+ top1_1, top5_1 = test(val_prog)
+ fluid.io.save_inference_model(
+ dirname='./test_quant_post',
+ feeded_var_names=[image.name, label.name],
+ target_vars=[avg_cost, acc_top1, acc_top5],
+ main_program=val_prog,
+ executor=exe,
+ model_filename='model',
+ params_filename='params')
+
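+ # Post-training static quantization: run batch_nums calibration batches
+ # from the sample generator to collect activation scales, then save the
+ # quantized inference model.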
+ quant_post_static(
+ exe,
+ './test_quant_post',
+ './test_quant_post_inference',
+ sample_generator=paddle.dataset.mnist.test(),
+ model_filename='model',
+ params_filename='params',
+ batch_nums=10)
+ quant_post_prog, feed_target_names, fetch_targets = fluid.io.load_inference_model(
+ dirname='./test_quant_post_inference',
+ executor=exe,
+ model_filename='__model__',
+ params_filename='__params__')
+ top1_2, top5_2 = test(quant_post_prog, fetch_targets)
+ print("before quantization: top1: {}, top5: {}".format(top1_1, top5_1))
+ print("after quantization: top1: {}, top5: {}".format(top1_2, top5_2))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_quant_post_only_weight.py b/tests/test_quant_post_only_weight.py
new file mode 100644
index 0000000000000000000000000000000000000000..ede4094dc2c7017cdad36669c51c4195e17fa498
--- /dev/null
+++ b/tests/test_quant_post_only_weight.py
@@ -0,0 +1,114 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import unittest
+import paddle
+import paddle.fluid as fluid
+from paddleslim.quant import quant_post_dynamic
+sys.path.append("../demo")
+from models import MobileNet
+from layers import conv_bn_layer
+import paddle.dataset.mnist as reader
+from paddle.fluid.framework import IrGraph
+from paddle.fluid import core
+import numpy as np
+
+
+class TestQuantPostOnlyWeightCase1(unittest.TestCase):
+ def test_accuracy(self):
+ image = fluid.layers.data(
+ name='image', shape=[1, 28, 28], dtype='float32')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+ model = MobileNet()
+ out = model.net(input=image, class_dim=10)
+ cost = fluid.layers.cross_entropy(input=out, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+ acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+ acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+ optimizer = fluid.optimizer.Momentum(
+ momentum=0.9,
+ learning_rate=0.01,
+ regularization=fluid.regularizer.L2Decay(4e-5))
+ optimizer.minimize(avg_cost)
+ main_prog = fluid.default_main_program()
+ val_prog = main_prog.clone(for_test=True)
+
+ place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
+ ) else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+ feeder = fluid.DataFeeder([image, label], place, program=main_prog)
+ train_reader = paddle.fluid.io.batch(
+ paddle.dataset.mnist.train(), batch_size=64)
+ eval_reader = paddle.fluid.io.batch(
+ paddle.dataset.mnist.test(), batch_size=64)
+
+ def train(program):
+ iter = 0
+ for data in train_reader():
+ cost, top1, top5 = exe.run(
+ program,
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost, acc_top1, acc_top5])
+ iter += 1
+ if iter % 100 == 0:
+ print(
+ 'train iter={}, avg loss {}, acc_top1 {}, acc_top5 {}'.
+ format(iter, cost, top1, top5))
+
+ def test(program, outputs=[avg_cost, acc_top1, acc_top5]):
+ iter = 0
+ result = [[], [], []]
+ for data in eval_reader():
+ cost, top1, top5 = exe.run(program,
+ feed=feeder.feed(data),
+ fetch_list=outputs)
+ iter += 1
+ if iter % 100 == 0:
+ print('eval iter={}, avg loss {}, acc_top1 {}, acc_top5 {}'.
+ format(iter, cost, top1, top5))
+ result[0].append(cost)
+ result[1].append(top1)
+ result[2].append(top5)
+ print(' avg loss {}, acc_top1 {}, acc_top5 {}'.format(
+ np.mean(result[0]), np.mean(result[1]), np.mean(result[2])))
+ return np.mean(result[1]), np.mean(result[2])
+
+ train(main_prog)
+ top1_1, top5_1 = test(val_prog)
+ fluid.io.save_inference_model(
+ dirname='./test_quant_post_dynamic',
+ feeded_var_names=[image.name, label.name],
+ target_vars=[avg_cost, acc_top1, acc_top5],
+ main_program=val_prog,
+ executor=exe,
+ model_filename='model',
+ params_filename='params')
+
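+ # Post-training dynamic (weight-only) quantization: weights are
+ # quantized offline without calibration data; generate_test_model=True
+ # also saves a test model with quant-dequant weights under test_model.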
+ quant_post_dynamic(
+ model_dir='./test_quant_post_dynamic',
+ save_model_dir='./test_quant_post_inference',
+ model_filename='model',
+ params_filename='params',
+ generate_test_model=True)
+ quant_post_prog, feed_target_names, fetch_targets = fluid.io.load_inference_model(
+ dirname='./test_quant_post_inference/test_model', executor=exe)
+ top1_2, top5_2 = test(quant_post_prog, fetch_targets)
+ print("before quantization: top1: {}, top5: {}".format(top1_1, top5_1))
+ print("after quantization: top1: {}, top5: {}".format(top1_2, top5_2))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_rl_nas.py b/tests/test_rl_nas.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba9c92210acb1bf0f6de6c1795d5255215dbfd16
--- /dev/null
+++ b/tests/test_rl_nas.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import unittest
+import paddle.fluid as fluid
+from paddleslim.nas import RLNAS
+from paddleslim.analysis import flops
+import numpy as np
+
+
+def compute_op_num(program):
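+ # Gather the shapes of all 4-D (convolution) parameters and their
+ # output channel counts so tests can check the generated architecture.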
+ params = {}
+ ch_list = []
+ for block in program.blocks:
+ for param in block.all_parameters():
+ if len(param.shape) == 4:
+ params[param.name] = param.shape
+ ch_list.append(int(param.shape[0]))
+ return params, ch_list
+
+
+class TestRLNAS(unittest.TestCase):
+ def setUp(self):
+ self.init_test_case()
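+ # Use a random port so concurrent test runs are unlikely to collide
+ # on the controller server address.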
+ port = np.random.randint(8337, 8773)
+ self.rlnas = RLNAS(
+ key='lstm',
+ configs=self.configs,
+ server_addr=("", port),
+ is_sync=False,
+ controller_batch_size=1,
+ lstm_num_layers=1,
+ hidden_size=10,
+ temperature=1.0,
+ save_controller=False)
+
+ def init_test_case(self):
+ self.configs = [('MobileNetV2BlockSpace', {'block_mask': [0]})]
+ self.filter_num = np.array([
+ 3, 4, 8, 12, 16, 24, 32, 48, 64, 80, 96, 128, 144, 160, 192, 224,
+ 256, 320, 384, 512
+ ])
+ self.k_size = np.array([3, 5])
+ self.multiply = np.array([1, 2, 3, 4, 5, 6])
+ self.repeat = np.array([1, 2, 3, 4, 5, 6])
+
+ def check_chnum_convnum(self, program, current_tokens):
+ channel_exp = self.multiply[current_tokens[0]]
+ filter_num = self.filter_num[current_tokens[1]]
+ repeat_num = self.repeat[current_tokens[2]]
+
+ conv_list, ch_pro = compute_op_num(program)
+ ### assert conv number
+ self.assertTrue(
+ (repeat_num * 3) == len(conv_list),
+ "the number of convs does not match: expected from tokens: {}, actual: {}".
+ format(repeat_num * 3, len(conv_list)))
+
+ ### assert number of channels
+ ch_token = []
+ init_ch_num = 32
+ for i in range(repeat_num):
+ ch_token.append(init_ch_num * channel_exp)
+ ch_token.append(init_ch_num * channel_exp)
+ ch_token.append(filter_num)
+ init_ch_num = filter_num
+
+ self.assertTrue(
+ str(ch_token) == str(ch_pro),
+ "channel num is WRONG, channel num from token is {}, channel num come fom program is {}".
+ format(str(ch_token), str(ch_pro)))
+
+ def test_all_function(self):
+ ### unittest for next_archs
+ next_program = fluid.Program()
+ startup_program = fluid.Program()
+ token2arch_program = fluid.Program()
+
+ with fluid.program_guard(next_program, startup_program):
+ inputs = fluid.data(
+ name='input', shape=[None, 3, 32, 32], dtype='float32')
+ archs = self.rlnas.next_archs(1)[0]
+ current_tokens = self.rlnas.tokens
+ for arch in archs:
+ output = arch(inputs)
+ inputs = output
+ self.check_chnum_convnum(next_program, current_tokens[0])
+
+ ### unittest for reward
+ self.assertTrue(self.rlnas.reward(float(1.0)), "reward is False")
+
+ ### unittest for tokens2arch
+ with fluid.program_guard(token2arch_program, startup_program):
+ inputs = fluid.data(
+ name='input', shape=[None, 3, 32, 32], dtype='float32')
+ archs = self.rlnas.tokens2arch(self.rlnas.tokens[0])
+ for arch in archs:
+ output = arch(inputs)
+ inputs = output
+ self.check_chnum_convnum(token2arch_program, self.rlnas.tokens[0])
+
+ def test_final_archs(self):
+ ### unittest for final_archs
+ final_program = fluid.Program()
+ final_startup_program = fluid.Program()
+ with fluid.program_guard(final_program, final_startup_program):
+ inputs = fluid.data(
+ name='input', shape=[None, 3, 32, 32], dtype='float32')
+ archs = self.rlnas.final_archs(1)[0]
+ current_tokens = self.rlnas.tokens
+ for arch in archs:
+ output = arch(inputs)
+ inputs = output
+ self.check_chnum_convnum(final_program, current_tokens[0])
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_sa_nas.py b/tests/test_sa_nas.py
index a4203a85a898632ac2102eb61ab7dd7b475e73ef..1ba7df9befbcd1def0f5353c0c6f164494620ff9 100644
--- a/tests/test_sa_nas.py
+++ b/tests/test_sa_nas.py
@@ -13,49 +13,105 @@
# limitations under the License.
import sys
sys.path.append("../")
+import os
+import sys
import unittest
import paddle.fluid as fluid
from paddleslim.nas import SANAS
-from paddleslim.nas import SearchSpaceFactory
from paddleslim.analysis import flops
+import numpy as np
+
+
+def compute_op_num(program):
+ params = {}
+ ch_list = []
+ for block in program.blocks:
+ for param in block.all_parameters():
+ if len(param.shape) == 4:
+ params[param.name] = param.shape
+ ch_list.append(int(param.shape[0]))
+ return params, ch_list
class TestSANAS(unittest.TestCase):
- def test_nas(self):
-
- factory = SearchSpaceFactory()
- config0 = {'input_size': 224, 'output_size': 7, 'block_num': 5}
- config1 = {'input_size': 7, 'output_size': 1, 'block_num': 2}
- configs = [('MobileNetV2Space', config0), ('ResNetSpace', config1)]
-
- space = factory.get_search_space([('MobileNetV2Space', config0)])
- origin_arch = space.token2arch()[0]
-
- main_program = fluid.Program()
- s_program = fluid.Program()
- with fluid.program_guard(main_program, s_program):
- input = fluid.data(
- name="input", shape=[None, 3, 224, 224], dtype="float32")
- origin_arch(input)
- base_flops = flops(main_program)
-
- search_steps = 3
- sa_nas = SANAS(
- configs,
- search_steps=search_steps,
- server_addr=("", 0),
- is_server=True)
-
- for i in range(search_steps):
- archs = sa_nas.next_archs()
- main_program = fluid.Program()
- s_program = fluid.Program()
- with fluid.program_guard(main_program, s_program):
- input = fluid.data(
- name="input", shape=[None, 3, 224, 224], dtype="float32")
- archs[0](input)
- sa_nas.reward(1)
- self.assertTrue(flops(main_program) < base_flops)
+ def setUp(self):
+ self.init_test_case()
+ port = np.random.randint(8337, 8773)
+ self.sanas = SANAS(
+ configs=self.configs, server_addr=("", port), save_checkpoint=None)
+
+ def init_test_case(self):
+ self.configs = [('MobileNetV2BlockSpace', {'block_mask': [0]})]
+ self.filter_num = np.array([
+ 3, 4, 8, 12, 16, 24, 32, 48, 64, 80, 96, 128, 144, 160, 192, 224,
+ 256, 320, 384, 512
+ ])
+ self.k_size = np.array([3, 5])
+ self.multiply = np.array([1, 2, 3, 4, 5, 6])
+ self.repeat = np.array([1, 2, 3, 4, 5, 6])
+
+ def check_chnum_convnum(self, program):
+ current_tokens = self.sanas.current_info()['current_tokens']
+ channel_exp = self.multiply[current_tokens[0]]
+ filter_num = self.filter_num[current_tokens[1]]
+ repeat_num = self.repeat[current_tokens[2]]
+
+ conv_list, ch_pro = compute_op_num(program)
+ ### assert conv number
+ self.assertTrue(
+ (repeat_num * 3) == len(conv_list),
+ "the number of convs does not match: expected from tokens: {}, actual: {}".
+ format(repeat_num * 3, len(conv_list)))
+
+ ### assert number of channels
+ ch_token = []
+ init_ch_num = 32
+ for i in range(repeat_num):
+ ch_token.append(init_ch_num * channel_exp)
+ ch_token.append(init_ch_num * channel_exp)
+ ch_token.append(filter_num)
+ init_ch_num = filter_num
+
+ self.assertTrue(
+ str(ch_token) == str(ch_pro),
+ "channel num is WRONG, channel num from token is {}, channel num come fom program is {}".
+ format(str(ch_token), str(ch_pro)))
+
+ def test_all_function(self):
+ ### unittest for next_archs
+ next_program = fluid.Program()
+ startup_program = fluid.Program()
+ token2arch_program = fluid.Program()
+
+ with fluid.program_guard(next_program, startup_program):
+ inputs = fluid.data(
+ name='input', shape=[None, 3, 32, 32], dtype='float32')
+ archs = self.sanas.next_archs()
+ for arch in archs:
+ output = arch(inputs)
+ inputs = output
+ self.check_chnum_convnum(next_program)
+
+ ### unittest for reward
+ self.assertTrue(self.sanas.reward(float(1.0)), "reward is False")
+
+ ### unittest for tokens2arch
+ with fluid.program_guard(token2arch_program, startup_program):
+ inputs = fluid.data(
+ name='input', shape=[None, 3, 32, 32], dtype='float32')
+ archs = self.sanas.tokens2arch(self.sanas.current_info()[
+ 'current_tokens'])
+ for arch in archs:
+ output = arch(inputs)
+ inputs = output
+ self.check_chnum_convnum(token2arch_program)
+
+ ### unittest for current_info
+ current_info = self.sanas.current_info()
+ self.assertTrue(
+ isinstance(current_info, dict),
+ "the type of current info must be dict, but now is {}".format(
+ type(current_info)))
if __name__ == '__main__':
diff --git a/tests/test_sensitivity.py b/tests/test_sensitivity.py
index e2cfa01d889db2891fd7507b2d4d9aec018a1163..948a57e1095d0190e1b9f79782c257fbc88fae57 100644
--- a/tests/test_sensitivity.py
+++ b/tests/test_sensitivity.py
@@ -17,7 +17,7 @@ import unittest
import numpy
import paddle
import paddle.fluid as fluid
-from paddleslim.analysis import sensitivity
+from paddleslim.prune import sensitivity, merge_sensitive, load_sensitivities
from layers import conv_bn_layer
@@ -44,15 +44,15 @@ class TestSensitivity(unittest.TestCase):
exe = fluid.Executor(place)
exe.run(startup_program)
- val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
+ val_reader = paddle.fluid.io.batch(
+ paddle.dataset.mnist.test(), batch_size=128)
- def eval_func(program, scope):
+ def eval_func(program):
feeder = fluid.DataFeeder(
feed_list=['image', 'label'], place=place, program=program)
acc_set = []
for data in val_reader():
acc_np = exe.run(program=program,
- scope=scope,
feed=feeder.feed(data),
fetch_list=[acc_top1])
acc_set.append(float(acc_np[0]))
@@ -60,9 +60,53 @@ class TestSensitivity(unittest.TestCase):
print("acc_val_mean: {}".format(acc_val_mean))
return acc_val_mean
- sensitivity(eval_program,
- fluid.global_scope(), place, ["conv4_weights"], eval_func,
- "./sensitivities_file")
+ def eval_func_for_args(args):
+ program = args[0]
+ feeder = fluid.DataFeeder(
+ feed_list=['image', 'label'], place=place, program=program)
+ acc_set = []
+ for data in val_reader():
+ acc_np = exe.run(program=program,
+ feed=feeder.feed(data),
+ fetch_list=[acc_top1])
+ acc_set.append(float(acc_np[0]))
+ acc_val_mean = numpy.array(acc_set).mean()
+ print("acc_val_mean: {}".format(acc_val_mean))
+ return acc_val_mean
+
+ sensitivity(
+ eval_program,
+ place, ["conv4_weights"],
+ eval_func,
+ sensitivities_file="./sensitivities_file_0",
+ pruned_ratios=[0.1, 0.2])
+
+ sensitivity(
+ eval_program,
+ place, ["conv4_weights"],
+ eval_func,
+ sensitivities_file="./sensitivities_file_1",
+ pruned_ratios=[0.3, 0.4])
+
+ params_sens = sensitivity(
+ eval_program,
+ place, ["conv4_weights"],
+ eval_func_for_args,
+ eval_args=[eval_program],
+ sensitivities_file="./sensitivites_file_params",
+ pruned_ratios=[0.1, 0.2, 0.3, 0.4])
+
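+ # Sensitivities computed over disjoint pruned_ratios can be merged and
+ # should equal a single run over the union of the ratios.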
+ sens_0 = load_sensitivities('./sensitivities_file_0')
+ sens_1 = load_sensitivities('./sensitivities_file_1')
+ sens = merge_sensitive([sens_0, sens_1])
+ origin_sens = sensitivity(
+ eval_program,
+ place, ["conv4_weights"],
+ eval_func,
+ sensitivities_file="./sensitivities_file_2",
+ pruned_ratios=[0.1, 0.2, 0.3, 0.4])
+ self.assertTrue(params_sens == origin_sens)
+ self.assertTrue(sens == origin_sens)
if __name__ == '__main__':
diff --git a/tests/test_auto_prune.py b/tests/test_slim_prune.py
similarity index 67%
rename from tests/test_auto_prune.py
rename to tests/test_slim_prune.py
index c9cdc72c33ce683f2dc3ecbfdf406740ef6e69a8..73216cfe1ea1b62bb5c381c12862b64580335ac7 100644
--- a/tests/test_auto_prune.py
+++ b/tests/test_slim_prune.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
@@ -15,8 +15,7 @@ import sys
sys.path.append("../")
import unittest
import paddle.fluid as fluid
-from paddleslim.prune import AutoPruner
-from paddleslim.analysis import flops
+from paddleslim.prune import Pruner
from layers import conv_bn_layer
@@ -50,34 +49,33 @@ class TestPrune(unittest.TestCase):
exe = fluid.Executor(place)
scope = fluid.Scope()
exe.run(startup_program, scope=scope)
-
- pruned_flops = 0.5
- pruner = AutoPruner(
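+ # Rank channels by the magnitude of their batch-norm scales when
+ # selecting which to prune.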
+ criterion = 'bn_scale'
+ pruner = Pruner(criterion)
+ main_program, _, _ = pruner.prune(
main_program,
scope,
- place,
params=["conv4_weights"],
- init_ratios=[0.5],
- pruned_flops=0.5,
- pruned_latency=None,
- server_addr=("", 0),
- init_temperature=100,
- reduce_rate=0.85,
- max_try_number=300,
- max_client_num=10,
- search_steps=2,
- max_ratios=[0.9],
- min_ratios=[0],
- key="auto_pruner")
+ ratios=[0.5],
+ place=place,
+ lazy=False,
+ only_graph=False,
+ param_backup=None,
+ param_shape_backup=None)
- base_flops = flops(main_program)
- program = pruner.prune(main_program)
- self.assertTrue(flops(program) <= base_flops * (1 - pruned_flops))
- pruner.reward(1)
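+ # Expected shapes after pruning conv4_weights by 50%: conv4 loses half
+ # of its output channels (8 -> 4) and conv5, which consumes conv4's
+ # output, loses the matching input channels.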
+ shapes = {
+ "conv1_weights": (4, 3, 3, 3),
+ "conv2_weights": (4, 4, 3, 3),
+ "conv3_weights": (8, 4, 3, 3),
+ "conv4_weights": (4, 8, 3, 3),
+ "conv5_weights": (8, 4, 3, 3),
+ "conv6_weights": (8, 8, 3, 3)
+ }
- program = pruner.prune(main_program)
- self.assertTrue(flops(program) <= base_flops * (1 - pruned_flops))
- pruner.reward(1)
+ for param in main_program.global_block().all_parameters():
+ if "weights" in param.name:
+ print("param: {}; param shape: {}".format(param.name,
+ param.shape))
+ self.assertTrue(param.shape == shapes[param.name])
if __name__ == '__main__':
diff --git a/tests/test_soft_label_loss.py b/tests/test_soft_label_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..965edee3130c58375821db630aa53182ac0489f5
--- /dev/null
+++ b/tests/test_soft_label_loss.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import unittest
+import paddle.fluid as fluid
+from paddleslim.dist import merge, soft_label_loss
+from layers import conv_bn_layer
+
+
+class TestSoftLabelLoss(unittest.TestCase):
+ def test_soft_label_loss(self):
+ student_main = fluid.Program()
+ student_startup = fluid.Program()
+ with fluid.program_guard(student_main, student_startup):
+ input = fluid.data(name="image", shape=[None, 3, 224, 224])
+ conv1 = conv_bn_layer(input, 8, 3, "conv1")
+ conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+ student_predict = conv1 + conv2
+
+ teacher_main = fluid.Program()
+ teacher_startup = fluid.Program()
+ with fluid.program_guard(teacher_main, teacher_startup):
+ input = fluid.data(name="image", shape=[None, 3, 224, 224])
+ conv1 = conv_bn_layer(input, 8, 3, "conv1")
+ conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
+ sum1 = conv1 + conv2
+ conv3 = conv_bn_layer(sum1, 8, 3, "conv3")
+ conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
+ sum2 = conv4 + sum1
+ conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
+ teacher_predict = conv_bn_layer(conv5, 8, 3, "conv6")
+
+ place = fluid.CPUPlace()
+ data_name_map = {'image': 'image'}
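+ # merge() grafts the teacher program into the student program, feeding
+ # both from the shared 'image' input and prefixing teacher variables
+ # with 'teacher_'.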
+ merge(teacher_main, student_main, data_name_map, place)
+ merged_ops = []
+ for block in student_main.blocks:
+ for op in block.ops:
+ merged_ops.append(op.type)
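+ # soft_label_loss applies softmax to the teacher and student outputs
+ # and takes the mean cross-entropy between them; the op-set assertion
+ # below pins down exactly which ops the loss adds.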
+ with fluid.program_guard(student_main):
+ distill_loss = soft_label_loss('teacher_conv6_bn_output.tmp_2',
+ 'conv2_bn_output.tmp_2',
+ student_main)
+ loss_ops = []
+ for block in student_main.blocks:
+ for op in block.ops:
+ loss_ops.append(op.type)
+ self.assertTrue(set(merged_ops).difference(set(loss_ops)) == set())
+ self.assertTrue(
+ set(loss_ops).difference(set(merged_ops)) ==
+ {'cross_entropy', 'softmax', 'reduce_mean', 'scale'})
+
+
+if __name__ == '__main__':
+ unittest.main()