diff --git a/demo/quick_start/data/README.md b/demo/quick_start/data/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..63abcf7ebf31903213e44cf492b93e09f61db14e
--- /dev/null
+++ b/demo/quick_start/data/README.md
@@ -0,0 +1,9 @@
+This dataset consists of electronics product reviews associated with
+binary labels (positive/negative) for sentiment classification.
+
+The preprocessed data can be downloaded with the script `get_data.sh`.
+The data was derived from reviews_Electronics_5.json.gz at
+
+http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+
+If you want to process the raw data, you can use the script `proc_from_raw_data/get_data.sh`.
diff --git a/demo/quick_start/data/get_data.sh b/demo/quick_start/data/get_data.sh
index f355d63225b28ab495b34e72dd3be8d237ae08f4..952de3f3c8f52a7a6f84412f9b38f16ac2503ac2 100755
--- a/demo/quick_start/data/get_data.sh
+++ b/demo/quick_start/data/get_data.sh
@@ -17,14 +17,11 @@ set -e
 DIR="$( cd "$(dirname "$0")" ; pwd -P )"
 cd $DIR
 
-echo "Downloading Amazon Electronics reviews data..."
-# http://jmcauley.ucsd.edu/data/amazon/
-wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+# Download the preprocessed data
+wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz
 
-echo "Downloading mosesdecoder..."
-#https://github.com/moses-smt/mosesdecoder
-wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
+# Extract package
+tar zxvf preprocessed_data.tar.gz
 
-unzip master.zip
-rm master.zip
-echo "Done."
+# Remove compressed package
+rm preprocessed_data.tar.gz
diff --git a/demo/quick_start/data/pred.list b/demo/quick_start/data/pred.list
deleted file mode 100644
index d88b2b63851101a8b40e706b32d8c17b5fabb201..0000000000000000000000000000000000000000
--- a/demo/quick_start/data/pred.list
+++ /dev/null
@@ -1 +0,0 @@
-./data/pred.txt
diff --git a/demo/quick_start/data/pred.txt b/demo/quick_start/data/pred.txt
deleted file mode 100644
index 6ed5f738ddaff6645448d5e606dcef1baf01b282..0000000000000000000000000000000000000000
--- a/demo/quick_start/data/pred.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-the device is cute , but that &apos;s just about all that &apos;s good. the specs are what you &apos;d expect : it &apos;s a wifi mic , with some noise filter options. the app has the option to upload your baby &apos;s name and photo , which is a cutesy touch. but the app is otherwise unstable and useless unless you upgrade for $ 60 / year.set up involves downloading the app , turning on the mic , switching your phone to the wifi network of the mic , telling the app your wifi settings , switching your wifi back to your home router. the app is then directly connected to your mic.the app is adware ! the main screen says &quot; cry notifications on / off : upgrade to evoz premium and receive a text message of email when your baby is crying &quot; .but the adware points out an important limitation , this monitor is only intended to be used from your home network. if you want to access it remotely , get a webcam. this app would make a lot more sense of the premium features were included with the hardware .
-don &apos;t be fooled by my one star rating. if there was a zero , i would have selected it. this product was a waste of my money.it has never worked like the company said it supposed to. i only have one device , an iphone 4gs. after charging the the iphone mid way , the i.sound portable power max 16,000 mah is completely drained. the led light no longer lit up. when plugging the isound portable power max into a wall outlet to charge , it would charge for about 20-30 minutes and then all four battery led indicator lit up showing a full charge. i would leave it on to charge for the full 8 hours or more but each time with the same result upon using. don &apos;t buy this thing. put your money to good use elsewhere .
diff --git a/demo/quick_start/preprocess.sh b/demo/quick_start/data/proc_from_raw_data/get_data.sh
similarity index 65%
rename from demo/quick_start/preprocess.sh
rename to demo/quick_start/data/proc_from_raw_data/get_data.sh
index c9190e2dd2ef754bf3c7287006322b52493dc3a0..cd85e26842dfccea78e4f26bdfee938887021f03 100755
--- a/demo/quick_start/preprocess.sh
+++ b/demo/quick_start/data/proc_from_raw_data/get_data.sh
@@ -16,10 +16,26 @@
 # 1. size of pos : neg = 1:1.
 # 2. size of testing set = min(25k, len(all_data) * 0.1), others is traning set.
 # 3. distinct train set and test set.
-# 4. build dict
 
 set -e
 
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+cd $DIR
+
+# Download data
+echo "Downloading Amazon Electronics reviews data..."
+# http://jmcauley.ucsd.edu/data/amazon/
+wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+echo "Downloading mosesdecoder..."
+# https://github.com/moses-smt/mosesdecoder
+wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
+
+unzip master.zip
+rm master.zip
+
+##################
+# Preprocess data 
+echo "Preprocess data..."
 export LC_ALL=C
 UNAME_STR=`uname`
 
@@ -29,11 +45,11 @@ else
   SHUF_PROG='gshuf'
 fi
 
-mkdir -p data/tmp
-python preprocess.py -i data/reviews_Electronics_5.json.gz
+mkdir -p tmp
+python preprocess.py -i reviews_Electronics_5.json.gz
 # uniq and shuffle
-cd data/tmp
-echo 'uniq and shuffle...'
+cd tmp
+echo 'Uniq and shuffle...'
 cat pos_*|sort|uniq|${SHUF_PROG}> pos.shuffed
 cat neg_*|sort|uniq|${SHUF_PROG}> neg.shuffed
 
@@ -53,11 +69,11 @@ cat train.pos train.neg | ${SHUF_PROG} >../train.txt
 cat test.pos test.neg | ${SHUF_PROG} >../test.txt
 
 cd -
-echo 'data/train.txt' > data/train.list
-echo 'data/test.txt' > data/test.list
+echo 'train.txt' > train.list
+echo 'test.txt' > test.list
 
 # use 30k dict
-rm -rf data/tmp
-mv data/dict.txt data/dict_all.txt
-cat data/dict_all.txt | head -n 30001 > data/dict.txt
-echo 'preprocess finished'
+rm -rf tmp
+mv dict.txt dict_all.txt
+cat dict_all.txt | head -n 30001 > dict.txt
+echo 'Done.'
diff --git a/demo/quick_start/preprocess.py b/demo/quick_start/data/proc_from_raw_data/preprocess.py
similarity index 95%
rename from demo/quick_start/preprocess.py
rename to demo/quick_start/data/proc_from_raw_data/preprocess.py
index d87fad632a7429f7d9682badabe4c72ca127354f..56c2c5f16ceb63ff88fa51ed78c2e77ea5b64592 100755
--- a/demo/quick_start/preprocess.py
+++ b/demo/quick_start/data/proc_from_raw_data/preprocess.py
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-1. (remove HTML before or not)tokensizing
+1. Tokenize the words and punctuation 
 2. pos sample : rating score 5; neg sample: rating score 1-2.
 
 Usage:
@@ -76,7 +76,11 @@ def tokenize(sentences):
     sentences : a list of input sentences.
     return: a list of processed text.
     """
-    dir = './data/mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
+    dir = './mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
+    if not os.path.exists(dir):
+        sys.exit(
+            "The ./mosesdecoder-master/scripts/tokenizer/tokenizer.perl does not exists."
+        )
     tokenizer_cmd = [dir, '-l', 'en', '-q', '-']
     assert isinstance(sentences, list)
     text = "\n".join(sentences)
@@ -104,7 +108,7 @@ def tokenize_batch(id):
         num_batch, instance, pre_fix = parse_queue.get()
         if num_batch == -1:  ### parse_queue finished
             tokenize_queue.put((-1, None, None))
-            sys.stderr.write("tokenize theread %s finish\n" % (id))
+            sys.stderr.write("Thread %s finish\n" % (id))
             break
         tokenize_instance = tokenize(instance)
         tokenize_queue.put((num_batch, tokenize_instance, pre_fix))
diff --git a/demo/semantic_role_labeling/data/get_data.sh b/demo/semantic_role_labeling/data/get_data.sh
index 55e33f4685627ed483aa6642c518a33558091531..99487e0d9a8c31d884c4a338386ad0ff8e5d9dc7 100644
--- a/demo/semantic_role_labeling/data/get_data.sh
+++ b/demo/semantic_role_labeling/data/get_data.sh
@@ -14,10 +14,10 @@
 # limitations under the License.
 set -e
 wget http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/verbDict.txt --no-check-certificate
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/targetDict.txt --no-check-certificate
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/wordDict.txt --no-check-certificate
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/emb --no-check-certificate
+wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt
+wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt 
+wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt 
+wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb
 tar -xzvf conll05st-tests.tar.gz
 rm conll05st-tests.tar.gz
 cp ./conll05st-release/test.wsj/words/test.wsj.words.gz  .
diff --git a/doc/tutorials/quick_start/index_en.md b/doc/tutorials/quick_start/index_en.md
index 659485d9be1b6a3e9759a2fd040cb09d1f2a3005..ec548b5393d7b210d6409328c00917aeb679a451 100644
--- a/doc/tutorials/quick_start/index_en.md
+++ b/doc/tutorials/quick_start/index_en.md
@@ -59,12 +59,11 @@ To build your text classification system, your code will need to perform five st
 ## Preprocess data into standardized format
 In this example, you are going to use [Amazon electronic product review dataset](http://jmcauley.ucsd.edu/data/amazon/) to build a bunch of deep neural network models for text classification. Each text in this dataset is a product review. This dataset has two categories: “positive” and “negative”. Positive means the reviewer likes the product, while negative means the reviewer does not like the product.
 
-`demo/quick_start` in the [source code](https://github.com/baidu/Paddle) provides scripts for downloading data and preprocessing data as shown below. The data process takes several minutes (about 3 minutes in our machine).
+`demo/quick_start` in the [source code](https://github.com/PaddlePaddle/Paddle) provides a script for downloading the preprocessed data, as shown below. (If you want to process the raw data, you can use the script `demo/quick_start/data/proc_from_raw_data/get_data.sh`.)
 
 ```bash
 cd demo/quick_start
 ./data/get_data.sh
-./preprocess.sh
 ```
 
 ## Transfer Data to Model
diff --git a/doc_cn/build_and_install/install/docker_install.rst b/doc_cn/build_and_install/install/docker_install.rst
index a5f5fb117e11e8ac1ae49e4271e826fa12d5e810..40339659be406ec72da8ad89b6d5dd38d72bb5ae 100644
--- a/doc_cn/build_and_install/install/docker_install.rst
+++ b/doc_cn/build_and_install/install/docker_install.rst
@@ -1,9 +1,7 @@
 安装PaddlePaddle的Docker镜像
 ============================
 
-PaddlePaddle提供了Docker的使用镜像。PaddlePaddle推荐使用Docker进行PaddlePaddle的部署和
-运行。Docker是一个基于容器的轻量级虚拟环境。具有和宿主机相近的运行效率，并提供
-了非常方便的二进制分发手段。
+PaddlePaddle项目提供官方 `Docker <https://www.docker.com/>`_ 镜像。Docker镜像是我们目前唯一官方支持的部署和运行方式。
 
 下述内容将分为如下几个类别描述。
 
@@ -41,7 +39,7 @@ PaddlePaddle提供的Docker镜像版本
 * CPU WITHOUT AVX: CPU版本，不支持AVX指令集的CPU也可以运行
 * GPU WITHOUT AVX: GPU版本，不需要AVX指令集的CPU也可以运行。
 
-用户可以选择对应版本的docker image。使用如下脚本可以确定本机的CPU知否支持 :code:`AVX` 指令集\:
+用户可以选择对应版本的docker image。使用如下脚本可以确定本机的CPU是否支持 :code:`AVX` 指令集\:
 
 ..  code-block:: bash
 
@@ -67,7 +65,7 @@ mac osx或者是windows机器，请参考
 
 ..  code-block:: bash
     
-    $ docker run -it paddledev/paddlepaddle:cpu-latest
+    $ docker run -it paddledev/paddle:cpu-latest
 
 即可启动和进入PaddlePaddle的container。如果运行GPU版本的PaddlePaddle，则需要先将
 cuda相关的Driver和设备映射进container中，脚本类似于
@@ -76,7 +74,7 @@ cuda相关的Driver和设备映射进container中，脚本类似于
 
     $ export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
     $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    $ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddlepaddle:latest-gpu
+    $ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest
 
 进入Docker container后，运行 :code:`paddle version` 即可打印出PaddlePaddle的版本和构建
 信息。安装完成的PaddlePaddle主体包括三个部分， :code:`paddle` 脚本， python的
diff --git a/doc_cn/build_and_install/install/ubuntu_install.rst b/doc_cn/build_and_install/install/ubuntu_install.rst
index 0fb59e25f6932214a3f1c67b12b426e388c3fc5d..4500d6e0b03be9280e3e6c25cddbf7fb389671b8 100644
--- a/doc_cn/build_and_install/install/ubuntu_install.rst
+++ b/doc_cn/build_and_install/install/ubuntu_install.rst
@@ -1,35 +1,42 @@
-使用deb包在Ubuntu上安装PaddlePaddle
+Ubuntu部署PaddlePaddle
 ===================================
 
-PaddlePaddle目前支持使用deb包安装。Paddle的 :code:`deb` 安装包在ubuntu 14.04中正确，但理论上支持其他的 debian 发行版。
+PaddlePaddle提供了ubuntu 14.04 deb安装包。
 
+安装
+------
 
-PaddlePaddle的ubuntu安装包分为四个版本，他们是 cpu、gpu、cpu-noavx、gpu-noavx 四个版本。其中 noavx 用于不支持AVX指令集的cpu。安装包的下载地址是\: https://github.com/baidu/Paddle/releases/
+安装包的下载地址是\: https://github.com/PaddlePaddle/Paddle/releases
 
+它包含四个版本\:
 
-用户需要先将PaddlePaddle安装包下载到本地，然后执行如下 :code:`gdebi` 命令即可完成安装。
+* cpu版本：支持主流x86处理器平台，使用了avx指令集。
 
-..  code-block:: shell
+* cpu-noavx版本：支持主流x86处理器平台，没有使用avx指令集。
+
+* gpu版本：支持主流x86处理器平台，支持nvidia cuda平台，使用了avx指令集。
 
-    gdebi paddle-*-cpu*.deb
+* gpu-noavx版本：支持主流x86处理器平台，支持nvidia cuda平台，没有使用avx指令集。
 
-如果 :code:`gdebi` 没有安装,则需要使用 :code:`sudo apt-get install gdebi`, 来安装 :code:`gdebi` 。
+下载完相关安装包后，执行:
 
+..  code-block:: shell
 
-或者使用下面一条命令安装.
+    sudo apt-get install gdebi
+    gdebi paddle-*-cpu.deb
+
+或者:
 
 ..  code-block:: shell
 
-    dpkg -i paddle-*-cpu*.deb
+    dpkg -i paddle-*-cpu.deb
     apt-get install -f
 
+
 在 :code:`dpkg -i` 的时候如果报一些依赖未找到的错误是正常的，
 在 :code:`apt-get install -f` 里会继续安装 PaddlePaddle。
 
-需要注意的是，如果使用GPU版本的PaddlePaddle，请安装CUDA 7.5 和CUDNN 5到本地环境中，
-并设置好对应的环境变量(LD_LIBRARY_PATH等等)。
-
-安装完成后,可以使用命令 :code:`paddle version` 查看安装后的paddle 版本。可能的输出为
+安装完成后，可以使用命令 :code:`paddle version` 查看安装后的paddle 版本:
 
 ..  literalinclude:: paddle_version.txt
 
@@ -39,45 +46,16 @@ PaddlePaddle的ubuntu安装包分为四个版本，他们是 cpu、gpu、cpu-noa
 libcudart.so/libcudnn.so找不到
 ++++++++++++++++++++++++++++++
 
-安装完成PaddlePaddle后，运行 :code:`paddle train` 报错\:
-
-..	code-block:: shell
-
-	0831 12:36:04.151525  1085 hl_dso_loader.cc:70] Check failed: nullptr != *dso_handle For Gpu version of PaddlePaddle, it couldn't find CUDA library: libcudart.so Please make sure you already specify its path.Note: for training data on Cpu using Gpu version of PaddlePaddle,you must specify libcudart.so via LD_LIBRARY_PATH.
-
-PaddlePaddle使用运行时动态连接CUDA的so，如果在 LD_LIBRARY_PATH里面找不到这些动态
-库的话，会报寻找不到这些动态库。
-
-解决方法很简单，就是将这些动态库加到环境变量里面。比较可能的命令如下。
+安装完成后，运行 :code:`paddle train` 报错\:
 
-..	code-block:: text
+.. 	code-block:: shell
 
-	export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+	  0831 12:36:04.151525  1085 hl_dso_loader.cc:70] Check failed: nullptr != *dso_handle For Gpu version of PaddlePaddle, it couldn't find CUDA library: libcudart.so Please make sure you already specify its path.Note: for training data on Cpu using Gpu version of PaddlePaddle,you must specify libcudart.so via LD_LIBRARY_PATH.
 
-CUDA Driver找不到
-+++++++++++++++++
+原因是未设置cuda运行时环境变量。 如果使用GPU版本的PaddlePaddle，请安装CUDA 7.5 和CUDNN 5到本地环境中，并设置：
 
-运行 :code:`paddle train` 报错\:
-
-..	code-block:: text
-
-	F0831 12:39:16.699000  1090 hl_cuda_device.cc:530] Check failed: cudaSuccess == cudaStat (0 vs. 35) Cuda Error: CUDA driver version is insufficient for CUDA runtime version
-
-PaddlePaddle运行时如果没有寻找到cuda的driver，变会报这个错误。解决办法是将cuda 
-driver添加到LD_LIBRARY_PATH中。比较可能的命令如下。
-
-..	code-block:: text
-
-	export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
-
-config文件找不到
-++++++++++++++++
-
-运行 :code:`paddle train` 得到结果\:
-
-..	code-block:: text
+..  code-block:: shell
 
-	F0831 20:53:07.525789  1302 TrainerMain.cpp:94] Check failed: config != nullptr no valid config
+    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib:$LD_LIBRARY_PATH
+    export PATH=/usr/local/cuda/bin:$PATH
 
-PaddlePaddle在运行时找不到对应的config文件，说明命令行参数 :code:`config` 没有设置。
-而这个一般说明PaddlePaddle已经安装完毕了。
\ No newline at end of file
diff --git a/doc_cn/demo/quick_start/index.md b/doc_cn/demo/quick_start/index.md
index 4d9b24ba851a7aaaeb0d79bfbeb0703b8878b77f..4a6e07ee1ffd94cf8f781af307b53a96a78e6b93 100644
--- a/doc_cn/demo/quick_start/index.md
+++ b/doc_cn/demo/quick_start/index.md
@@ -32,13 +32,11 @@
 
 ## 数据格式准备(Data Preparation)
 在本问题中，我们使用[Amazon电子产品评论数据](http://jmcauley.ucsd.edu/data/amazon/)，
-将评论分为好评(正样本)和差评(负样本)两类。[源码](https://github.com/baidu/Paddle)的`demo/quick_start`里提供了数据下载脚本
-和预处理脚本。
+将评论分为好评(正样本)和差评(负样本)两类。[源码](https://github.com/PaddlePaddle/Paddle)的`demo/quick_start`里提供了下载已经预处理数据的脚本（如果想从最原始的数据处理，可以使用脚本 `./demo/quick_start/data/proc_from_raw_data/get_data.sh`）。
 
 ```bash
 cd demo/quick_start
 ./data/get_data.sh
-./preprocess.sh
 ```
 
 ## 数据向模型传送(Transfer Data to Model)
@@ -143,7 +141,7 @@ PyDataProvider2</a>。
 
 我们将以基本的逻辑回归网络作为起点，并逐渐展示更加深入的功能。更详细的网络配置
 连接请参考<a href = "../../../doc/layer.html">Layer文档</a>。
-所有配置在[源码](https://github.com/baidu/Paddle)`demo/quick_start`目录，首先列举逻辑回归网络。
+所有配置在[源码](https://github.com/PaddlePaddle/Paddle)`demo/quick_start`目录，首先列举逻辑回归网络。
 
 ### 逻辑回归模型(Logistic Regression)
 
diff --git a/doc_cn/faq/index.rst b/doc_cn/faq/index.rst
index 3eb0e10ae2228740cd384270db5070e367f7007b..551430eb41765673700b7c6568e4b483641f2cac 100644
--- a/doc_cn/faq/index.rst
+++ b/doc_cn/faq/index.rst
@@ -4,22 +4,18 @@ PaddlePaddle常见问题
 
 ..  contents::
 
-1. 如何减少PaddlePaddle的内存占用
+1. 如何减少内存占用
 ---------------------------------
 
-神经网络的训练本身是一个非常消耗内存和显存的工作。经常会消耗数十G的内存和数G的显存。
+神经网络的训练本身是一个非常消耗内存和显存的工作，经常会消耗数十GB的内存和数GB的显存。
 PaddlePaddle的内存占用主要分为如下几个方面\:
 
-* DataProvider缓冲池内存 (只针对内存)
-* 神经元激活内存 （针对内存和显存）
-* 参数内存 (针对内存和显存)
+* DataProvider缓冲池内存（只针对内存）
+* 神经元激活内存（针对内存和显存）
+* 参数内存（针对内存和显存）
 * 其他内存杂项
 
-这其中，其他内存杂项是指PaddlePaddle本身所用的一些内存，包括字符串分配，临时变量等等，
-这些内存就不考虑如何缩减了。
-
-其他的内存的减少方法依次为
-
+其中，其他内存杂项是指PaddlePaddle本身所用的一些内存，包括字符串分配，临时变量等等，暂不考虑在内。
 
 减少DataProvider缓冲池内存
 ++++++++++++++++++++++++++
@@ -39,28 +35,28 @@ PyDataProvider使用的是异步加载，同时在内存里直接随即选取数
 
 ..  literalinclude:: reduce_min_pool_size.py
 
-这样做可以极大的减少内存占用，并且可能会加速训练过程。 详细文档参考 `这里
+这样做可以极大的减少内存占用，并且可能会加速训练过程，详细文档参考 `这里
 <../ui/data_provider/pydataprovider2.html#provider>`_ 。
 
 神经元激活内存
 ++++++++++++++
 
-神经网络在训练的时候，会对每一个激活暂存一些数据，包括激活，參差等等。
+神经网络在训练的时候，会对每一个激活暂存一些数据，如神经元激活值等。
 在反向传递的时候，这些数据会被用来更新参数。这些数据使用的内存主要和两个参数有关系，
 一是batch size，另一个是每条序列(Sequence)长度。所以，其实也是和每个mini-batch中包含
 的时间步信息成正比。
 
-所以，做法可以有两种。他们是
+所以做法可以有两种：
 
 * 减小batch size。 即在网络配置中 :code:`settings(batch_size=1000)` 设置成一个小一些的值。但是batch size本身是神经网络的超参数，减小batch size可能会对训练结果产生影响。
 * 减小序列的长度，或者直接扔掉非常长的序列。比如，一个数据集大部分序列长度是100-200,
-  但是突然有一个10000长的序列，就很容易导致内存超限。特别是在LSTM等RNN中。
+  但是突然有一个10000长的序列，就很容易导致内存超限，特别是在LSTM等RNN中。
 
 参数内存
 ++++++++
 
 PaddlePaddle支持非常多的优化算法(Optimizer)，不同的优化算法需要使用不同大小的内存。
-例如如果使用 :code:`adadelta` 算法，则需要使用参数规模大约5倍的内存。 如果参数保存下来的
+例如使用 :code:`adadelta` 算法，则需要使用大约为权重参数规模5倍的内存。举例，如果参数保存下来的模型目录
 文件为 :code:`100M`， 那么该优化算法至少需要 :code:`500M` 的内存。
 
 可以考虑使用一些优化算法，例如 :code:`momentum`。
@@ -68,11 +64,11 @@ PaddlePaddle支持非常多的优化算法(Optimizer)，不同的优化算法需
 2. 如何加速PaddlePaddle的训练速度
 ---------------------------------
 
-PaddlePaddle是神经网络训练平台，加速PaddlePaddle训练有如下几个方面\：
+加速PaddlePaddle训练可以考虑从以下几个方面入手\：
 
 * 减少数据载入的耗时
 * 加速训练速度
-* 利用更多的计算资源
+* 利用分布式训练驾驭更多的计算资源
 
 减少数据载入的耗时
 ++++++++++++++++++
@@ -108,25 +104,20 @@ PaddlePaddle支持Sparse的训练，sparse训练需要训练特征是 :code:`spa
 利用更多的计算资源可以分为一下几个方式来进行\:
 
 * 单机CPU训练
-  * 使用多线程训练。设置命令行参数 :code:`trainer_count`，即可以设置参与训练的线程数量。使用方法为 :code:`paddle train --trainer_count=4`
+  * 使用多线程训练。设置命令行参数 :code:`trainer_count`。
+
 * 单机GPU训练
-  * 使用显卡训练。设置命令行参数 :code:`use_gpu`。 使用方法为 :code:`paddle train --use_gpu=true`
-  * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count`。使用 :code:`--use_gpu=True` 开启GPU训练，使用 :code:`trainer_count` 指定显卡数量。使用方法为 :code:`paddle train --use_gpu=true --trainer_count=4`
+  * 使用显卡训练。设置命令行参数 :code:`use_gpu`。
+  * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count` 。
+
 * 多机训练
-  * 使用多机训练的方法也比较简单，需要先在每个节点启动 :code:`paddle pserver`，在使用 :code:`paddle train --pservers=192.168.100.1,192.168.100.2` 来指定每个pserver的ip地址
-  * 具体的多机训练方法参考 `多机训练 <TBD>`_ 文档。
+  * 具体的多机训练方法参考  `多机训练文档 <../ui/data_provider/pydataprovider2.html#provider>`_ 。
 
 
 3. 遇到“非法指令”或者是“illegal instruction” 
 --------------------------------------------
 
-paddle在进行计算的时候为了提升计算性能，使用了avx指令。部分老的cpu型号无法支持这样的指令。通常来说执行下grep avx /proc/cpuinfo看看是否有输出即可知道是否支持。（另：用此方法部分虚拟机可能检测到支持avx指令但是实际运行会挂掉，请当成是不支持，看下面的解决方案）
-
-解决办法是\:
-
-* 使用 NO_AVX的 `安装包 <../build_and_install/index.html>`_ 或者 `Docker image <../build_and_install/install/docker_install.html>`_
-* 或者，使用 :code:`-DWITH_AVX=OFF` 重新编译PaddlePaddle。
-
+PaddlePaddle使用avx SIMD指令提高cpu执行效率，因此在不支持avx指令集的cpu上运行使用了avx指令集的二进制发行版会导致这种错误，请选择与cpu匹配的版本（例如noavx版本）。
 
 4. 如何选择SGD算法的学习率
 --------------------------
@@ -158,7 +149,7 @@ paddle在进行计算的时候为了提升计算性能，使用了avx指令。
 6. 如何共享参数
 ---------------
 
-PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字的参数，会共享参数。设置参数的名字，可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式，是想要共享的参数使用同样的 :code:`ParamAttr` 对象。
+PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字的参数，会共享参数。设置参数的名字，可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式，是使得要共享的参数使用同样的 :code:`ParamAttr` 对象。
 
 简单的全连接网络，参数共享的配置示例为\:
 
@@ -208,9 +199,6 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
     paddle package is already in your PYTHONPATH. But unittest need a clean environment.
     Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
     
-解决办法是：卸载paddle包 :code:`pip uninstall paddle`。
-
-原因是：单元测试使用了一个旧版本的python包，而没有测试到代码中实际修改的python包。即单元测试需要一个干净的环境：
+解决办法是：
 
-* 如果paddle包已经在python的site-packages里面了，那么单元测试时使用的paddle包，就是site-packages里面的python包，而不是源码目录里 :code:`/python` 目录下的python包。
-* 即便设置了 :code:`PYTHONPATH` 到 :code:`/python` 也没用，因为python的搜索路径是优先已经安装的python包。
\ No newline at end of file
+* 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包，使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面，单元测试会引用site-packages里面的python包，而不是源码目录里 :code:`/python` 目录下的python包。同时，即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用，因为python的搜索路径是优先已经安装的python包。
diff --git a/doc_cn/introduction/index.md b/doc_cn/introduction/index.md
deleted file mode 100644
index 164cb7d4943dfbfcc00a2df7329ae2a877b2d703..0000000000000000000000000000000000000000
--- a/doc_cn/introduction/index.md
+++ /dev/null
@@ -1,105 +0,0 @@
-# 简介
-
-PaddlePaddle 是起源于百度的开源深度学习平台。它是简单易用的：你可以通过简单的十数行配置搭建经典的神经网络模型；它也是高效强大的：PaddlePaddle可以支撑复杂集群环境下超大模型的训练，令你受益于深度学习的前沿成果。在百度内部，已经有大量产品线使用了基于PaddlePaddle的深度学习技术。
-
-这份简短的介绍将像你展示如何利用PaddlePaddle解决一个经典的学习问题。
-
-## 1. 一个经典的任务
-
-让我们从一个基础问题开始：<a href="https://www.baidu.com/s?wd=单变量线性回归">单变量的线性回归</a>。问题假定观测到了一批二维空间上的点`(x, y) `，并且已知 `x` 和 `y` 之间存在着某种线性关系，我们的目标是通过观测数据还原这个线性关系。作为一个简单基础的模型，线性回归却有着广泛的应用场景。比如可以想象一个资产定价的简化场景，其中 `x` 对应于房屋的大小，`y` 对应于房屋价格。我们可以通过观察市场上房屋的情况获得二者之间的关系，从而为新房屋的定价提供参考。
-
-
-## 2. 准备数据
-
-假设变量 `X` 和 `Y` 的真实关系为： `Y = 2X + 0.3`，这里展示如何使用观测数据还原这一线性关系。如下Python代码将随机产生2000个观测点，它们将被用作PaddlePaddle的输入。产生PaddlePaddle的输入数据和写一段普通的Python脚本几乎一样，你唯一需要增加的就是定义输入数据的类型。
-
-```python
-# -*- coding:utf-8 -*-
-# dataprovider.py
-from paddle.trainer.PyDataProvider2 import *
-import random
-
-# 定义输入数据的类型: 2个浮点数
-@provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
-def process(settings, input_file):
-    for i in xrange(2000):
-        x = random.random()
-        yield [x], [2*x+0.3]
-```
-
-## 3. 训练模型
-
-为了还原 `Y = 2X + 0.3`，我们先从一条随机的直线 `Y' = wX + b` 开始，然后利用观测数据调整 `w` 和 `b` 使得 `Y'` 和 `Y` 的差距不断减小，最终趋于相同。这个过程就是模型的训练过程，而 `w` 和 `b` 就是模型的参数，即我们的训练目标。
-
-在PaddlePaddle里，该模型的网络配置如下。
-
-```python
-# -*- coding:utf-8 -*-
-# trainer_config.py
-from paddle.trainer_config_helpers import *
-
-# 1. 定义数据来源，调用上面的process函数获得观测数据
-data_file = 'empty.list'
-with open(data_file, 'w') as f: f.writelines(' ')
-define_py_data_sources2(train_list=data_file, test_list=None, 
-        module='dataprovider', obj='process',args={})
-
-# 2. 学习算法。控制如何改变模型参数 w 和 b
-settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
-
-# 3. 神经网络配置
-x = data_layer(name='x', size=1)
-y = data_layer(name='y', size=1)
-# 线性计算单元: y_predict = wx + b
-y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
-# 损失计算，度量 y_predict 和真实 y 之间的差距
-cost = regression_cost(input=y_predict, label=y)
-outputs(cost)
-```
-这段简短的配置展示了PaddlePaddle的基本用法：
-
-- 首先，第一部分定义了数据输入。一般情况下，PaddlePaddle先从一个文件列表里获得数据文件地址，然后交给用户自定义的函数（例如上面的`process`函数）进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的不需要读输入文件，所以放一个空列表（`empty.list`）即可。
-
-- 第二部分主要是选择学习算法，它定义了模型参数如何改变。PaddlePaddle提供了很多优秀的学习算法，但这里使用一个简单的基于momentum的算法就足够了，它每次读取12个数据进行计算和模型更新。
-
-- 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络单元（Layer），所以很多时候你需要做的只是声明正确的网络单元并把它们拼接起来。这里使用了三种网络单元：
-	- **数据层**：数据层 `data_layer` 是神经网络的入口，它读入数据并将它们传输到下游的其它单元。这里数据层有两个，分别对应于变量 `X` 和 `Y`。
-	- **全连接层**：全连接层 `fc_layer` 是基础的计算单元，这里利用它建模变量之间的线性关系。计算单元是神经网络的核心，PaddlePaddle支持大量的计算单元和任意深度的网络连接，从而可以挖掘复杂的数据关系。
-	- **回归损失层**：回归损失层 `regression_cost`是众多损失函数层的一种，它们在训练过程作为网络的出口，用来计算模型的表现，并指导模型参数的改变。
-
-这样定义了网络结构并保存为`trainer_config.py`之后，运行训练命令即可：
- ```
- paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
- ```
-
-PaddlePaddle将在观测数据集上迭代训练30轮，并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到，随着轮数增加损失函数的输出在不断的减小，这意味着模型在不断的改进，直到逼近真实解：` Y = 2X + 0.3 `
-
-## 4. 模型检验
-
-训练完成后，我们希望能够检验模型的好坏。一种常用的做法是用模型对另外一组数据进行预测，然后评价预测的效果。但在这个例子中，由于已经知道了真实答案，我们可以直接观察模型的参数是否符合预期来进行检验。
-
-PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件，所以可以利用如下方法读取模型的参数。
-
-```python
-import numpy as np
-import os
-
-def load(file_name):
-    with open(file_name, 'rb') as f:
-        f.read(16) # skip header for float type.
-        return np.fromfile(f, dtype=np.float32)
-        
-print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
-# w=1.999743, b=0.300137
-```
-<center> ![](./parameters.png) </center>
-
-从图中可以看到，虽然 `w` 和 `b` 都使用随机值初始化，但在起初的几轮训练中它们都在快速逼近真实值，并且后续仍在不断改进，使得最终得到的模型几乎与真实模型重合。
-
-这样，我们就完成了对单变量线性回归问题的解决：将数据输入PaddlePaddle，训练模型，最后验证结果。
-
-## 5. 推荐后续阅读
-
-- <a href="../build_and_install/index.html">安装/编译</a>：PaddlePaddle的安装与编译文档。
-- <a href="../demo/quick_start/index.html">快速入门 </a>：使用商品评论分类任务，系统性的介绍如何一步步改进，最终得到产品级的深度模型。
-- <a href="../demo/index.html">示例</a>：各种实用案例，涵盖图像、文本、推荐等多个领域。
diff --git a/doc_cn/introduction/index.rst b/doc_cn/introduction/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f6eb5456c007ca03ea6002109b1f27b8a99faa0f
--- /dev/null
+++ b/doc_cn/introduction/index.rst
@@ -0,0 +1,102 @@
+# 简介
+
+PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍将向你展示如何利用PaddlePaddle来解决一个经典的线性回归问题。
+
+## 1. 一个经典的任务
+
+我们展示如何用PaddlePaddle解决<a href="https://www.baidu.com/s?wd=单变量线性回归">单变量的线性回归</a>问题。线性回归的输入是一批点`(x, y) `，其中 `y = wx + b + ε`， 而 ε 是一个符合高斯分布的随机变量。线性回归的输出是从这批点估计出来的参数 w 和 b。
+
+一个例子是房产估值。我们假设房产的价格（y）是其大小（x）的一个线性函数，那么我们可以通过收集市场上房子的大小和价格，用来估计线性函数的参数w 和 b。
+
+## 2. 准备数据
+
+假设变量 `x` 和 `y` 的真实关系为： `y = 2x + 0.3 + ε`，这里展示如何使用观测数据来拟合这一线性关系。首先，Python代码将随机产生2000个观测点，作为线性回归的输入。下面脚本符合PaddlePaddle期待的读取数据的Python程序的模式。
+
+```python
+# dataprovider.py
+from paddle.trainer.PyDataProvider2 import *
+import random
+
+# 定义输入数据的类型: 2个浮点数
+@provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
+def process(settings, input_file):
+    for i in xrange(2000):
+        x = random.random()
+        yield [x], [2*x+0.3]
+```
+
+## 3. 训练模型
+
+为了还原 `y = 2x + 0.3`，我们先从一条随机的直线 `y' = wx + b` 开始，然后利用观测数据调整 `w` 和 `b` 使得 `y'` 和 `y` 的差距不断减小，最终趋于一致。这个过程就是模型的训练过程，而 `w` 和 `b` 就是模型的参数，即我们的训练目标。
+
+在PaddlePaddle里，该模型的网络配置如下。
+
+```python
+# trainer_config.py
+from paddle.trainer_config_helpers import *
+
+# 1. 定义数据来源，调用上面的process函数获得观测数据
+data_file = 'empty.list'
+with open(data_file, 'w') as f: f.writelines(' ')
+define_py_data_sources2(train_list=data_file, test_list=None, 
+        module='dataprovider', obj='process',args={})
+
+# 2. 学习算法。控制如何改变模型参数 w 和 b
+settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
+
+# 3. 神经网络配置
+x = data_layer(name='x', size=1)
+y = data_layer(name='y', size=1)
+# 线性计算网络层: y_predict = wx + b
+y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
+# 计算误差函数，即 y_predict 和真实 y 之间的距离
+cost = regression_cost(input=y_predict, label=y)
+outputs(cost)
+```
+这段简短的配置展示了PaddlePaddle的基本用法：
+
+- 第一部分定义了数据输入。一般情况下，PaddlePaddle先从一个文件列表里获得数据文件地址，然后交给用户自定义的函数（例如上面的`process`函数）进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的不需要读输入文件，所以放一个空列表（`empty.list`）即可。
+
+- 第二部分主要是选择学习算法，它定义了模型参数改变的规则。PaddlePaddle提供了很多优秀的学习算法，这里使用一个基于momentum的随机梯度下降(SGD)算法，该算法每批量(batch)读取12个采样数据进行随机梯度计算来更新模型参数。
+
+- 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络层，所以很多时候你需要做的只是定义正确的网络层并把它们连接起来。这里使用了三种网络层：
+	- **数据层**：数据层 `data_layer` 是神经网络的入口，它读入数据并将它们传输到接下来的网络层。这里数据层有两个，分别对应于变量 `x` 和 `y`。
+	- **全连接层**：全连接层 `fc_layer` 是基础的计算单元，这里利用它建模变量之间的线性关系。计算单元是神经网络的核心，PaddlePaddle支持大量的计算单元和任意深度的网络连接，从而可以拟合任意的函数来学习复杂的数据关系。
+	- **回归误差代价层**：回归误差代价层 `regression_cost`是众多误差代价函数层的一种，它们在训练过程作为网络的出口，用来计算模型的误差，是模型参数优化的目标函数。
+
+定义了网络结构并保存为`trainer_config.py`之后，运行以下训练命令：
+ ```
+ paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
+ ```
+
+PaddlePaddle将在观测数据集上迭代训练30轮，并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到，随着轮数增加误差代价函数的输出在不断的减小，这意味着模型在训练数据上不断的改进，直到逼近真实解：` y = 2x + 0.3 `
+
+## 4. 模型检验
+
+训练完成后，我们希望能够检验模型的好坏。一种常用的做法是用学习的模型对另外一组测试数据进行预测，评价预测的效果。在这个例子中，由于已经知道了真实答案，我们可以直接观察模型的参数是否符合预期来进行检验。
+
+PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件，所以可以利用如下方法读取模型的参数。
+
+```python
+import numpy as np
+import os
+
+def load(file_name):
+    with open(file_name, 'rb') as f:
+        f.read(16) # skip header for float type.
+        return np.fromfile(f, dtype=np.float32)
+        
+print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
+# w=1.999743, b=0.300137
+```
+<center> ![](./parameters.png) </center>
+
+从图中可以看到，虽然 `w` 和 `b` 都使用随机值初始化，但在起初的几轮训练中它们都在快速逼近真实值，并且后续仍在不断改进，使得最终得到的模型几乎与真实模型一致。
+
+这样，我们用PaddlePaddle解决了单变量线性回归问题， 包括数据输入，模型训练和最后的结果验证。
+
+## 5. 推荐后续阅读
+
+- <a href="../build_and_install/index.html">安装/编译</a>：PaddlePaddle的安装与编译文档。
+- <a href="../demo/quick_start/index.html">快速入门 </a>：使用商品评论分类任务，系统性的介绍如何一步步改进，最终得到产品级的深度模型。
+- <a href="../demo/index.html">示例</a>：各种实用案例，涵盖图像、文本、推荐等多个领域。
diff --git a/doc_cn/ui/cmd/dump_config.rst b/doc_cn/ui/cmd/dump_config.rst
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/doc_cn/ui/cmd/index.rst b/doc_cn/ui/cmd/index.rst
index 6d62180a6a5e3f2490cccd2a90213050aa3c172e..31a8b8a79f4a87101bd6030eb4e779fd11d65811 100644
--- a/doc_cn/ui/cmd/index.rst
+++ b/doc_cn/ui/cmd/index.rst
@@ -1,24 +1,20 @@
-命令行参数
-==========
+命令
+====
 
-安装好的PaddlePaddle脚本包括多条命令，他们是
+安装好PaddlePaddle后，在命令行直接敲击 ``paddle`` 或 ``paddle --help`` 会显示如下一些命令。
 
-* paddle train即为PaddlePaddle的训练进程。可以使用paddle train完成单机多显卡多线程的训
-  练。也可以和paddle pserver组合使用，完成多机训练。
-* paddle pserver为PaddlePaddle的parameter server进程。负责多机训练中的参数聚合工作。
-* paddle version可以打印出PaddlePaddle的版本和编译时信息。
-* merge_model 可以将PaddlePaddle的模型和配置打包成一个文件。方便部署分发。
-* dump_config 可以将PaddlePaddle的训练模型以proto string的格式打印出来
-* make_diagram 可以使用graphviz对PaddlePaddle的网络模型进行绘制，方便调试使用。
+* ``train`` Start a paddle_trainer
+    启动一个PaddlePaddle训练进程。 ``paddle train`` 可以通过命令行参数 ``-local=true`` 启动一个单机的训练进程；也可以和 ``paddle pserver`` 一起使用启动多机的分布式训练进程。
+* ``pserver`` Start a paddle_pserver_main
+    在多机分布式训练下启动PaddlePaddle的parameter server进程。
+* ``version`` Print paddle version
+    用于打印当前PaddlePaddle的版本和编译选项相关信息。常见的输出格式如下：1）第一行说明了PaddlePaddle的版本信息；2）第二行开始说明了一些主要的编译选项，具体意义可以参考 `编译参数选项文件 <../../build_and_install/cmake/compile_options.html>`_ 。
 
-更详细的介绍请参考各个命令的命令行参数文档。
+    ..  literalinclude:: paddle_version.txt
 
-..  toctree::
-    :glob:
-
-    paddle_train.rst
-    paddle_pserver.rst
-    paddle_version.rst
-    merge_model.rst
-    dump_config.rst
-    make_diagram.rst
+* ``merge_model`` Start a paddle_merge_model
+    用于将PaddlePaddle的模型参数文件和模型配置文件打包成一个文件，方便做部署分发。
+* ``dump_config`` Dump the trainer config as proto string
+    用于将PaddlePaddle的模型配置文件以proto string的格式打印出来。
+* ``make_diagram``
+    使用graphviz对PaddlePaddle的模型配置文件进行绘制。
\ No newline at end of file
diff --git a/doc_cn/ui/cmd/make_diagram.rst b/doc_cn/ui/cmd/make_diagram.rst
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/doc_cn/ui/cmd/merge_model.rst b/doc_cn/ui/cmd/merge_model.rst
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/doc_cn/ui/cmd/paddle_pserver.rst b/doc_cn/ui/cmd/paddle_pserver.rst
deleted file mode 100644
index 891975c34af5c34dddc754b79bd3e1adda9d9671..0000000000000000000000000000000000000000
--- a/doc_cn/ui/cmd/paddle_pserver.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-paddle pserver的命令行参数
-==========================
diff --git a/doc_cn/ui/cmd/paddle_train.rst b/doc_cn/ui/cmd/paddle_train.rst
deleted file mode 100644
index 87b84f5cbdbbe016d9bcdbda2cb30d93d2ad8022..0000000000000000000000000000000000000000
--- a/doc_cn/ui/cmd/paddle_train.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-paddle train的命令行参数
-========================
diff --git a/doc_cn/ui/cmd/paddle_version.rst b/doc_cn/ui/cmd/paddle_version.rst
deleted file mode 100644
index 0a4f8dd472a6009ef6832df75be043c24bb32ba0..0000000000000000000000000000000000000000
--- a/doc_cn/ui/cmd/paddle_version.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-paddle version的命令行参数
-==========================
-
-paddle version可以打印出paddle的版本信息和编译的选项。常见的输出格式为
-
-..  literalinclude:: paddle_version.txt
-
-其第一行说明了paddle的版本，后面跟着一系列编译参数。这里可以参考paddle的
-`编译参数选项文件 <../../build/cmake/compile_options.html>`_
diff --git a/doc_cn/ui/index.rst b/doc_cn/ui/index.rst
index 5aba272c627204110a56337f0f120f3f2cd37ae9..d871ad805ff7cd37fb83f24024003e54bce77f42 100644
--- a/doc_cn/ui/index.rst
+++ b/doc_cn/ui/index.rst
@@ -1,8 +1,9 @@
+########
 用户接口
-========
+########
 
 数据提供
-''''''''
+========
 
 ..  toctree::
     :maxdepth: 1
@@ -10,16 +11,23 @@
     data_provider/index.rst
 
 
-命令行参数
-''''''''''
-* `Use Case <../../doc/ui/cmd_argument/use_case.html>`_
-* `Argument Outline <../../doc/ui/cmd_argument/argument_outline.html>`_
-* `Detail Description <../../doc/ui/cmd_argument/detail_introduction.html>`_
+命令及命令行参数
+================
+
+..  toctree::
+    :maxdepth: 1
+
+    cmd/index.rst
+
+* `参数用例 <../../doc/ui/cmd_argument/use_case.html>`_
+* `参数分类 <../../doc/ui/cmd_argument/argument_outline.html>`_
+* `参数描述 <../../doc/ui/cmd_argument/detail_introduction.html>`_
 
 
 预测
-''''
+====
 
 ..  toctree::
+    :maxdepth: 1
 
     predict/swig_py_paddle.rst
diff --git a/doc_cn/ui/predict/swig_py_paddle.rst b/doc_cn/ui/predict/swig_py_paddle.rst
index 012ac4ff6e66a022fa7d8af798236f55b62011ec..89031dd72f5065b6919d873f5611a5e94e8b62e3 100644
--- a/doc_cn/ui/predict/swig_py_paddle.rst
+++ b/doc_cn/ui/predict/swig_py_paddle.rst
@@ -1,38 +1,36 @@
-PaddlePaddle的Python预测接口
-==================================
+基于Python的预测
+================
 
-PaddlePaddle目前使用Swig对其常用的预测接口进行了封装，使在Python环境下的预测接口更加简单。
-在Python环境下预测结果，主要分为以下几个步骤。
+预测流程
+--------
 
-* 读入解析训练配置
-* 构造GradientMachine
-* 准备数据
-* 预测
+PaddlePaddle使用swig对常用的预测接口进行了封装，通过编译会生成py_paddle软件包，安装该软件包就可以在python环境下实现模型预测。可以使用python的 ``help()`` 函数查询软件包相关API说明。
 
-典型的预测代码如下，使用mnist手写识别作为样例, 完整代码见
-:code:`src_root/doc/ui/predict/predict_sample.py` 。
+基于Python的模型预测，主要包括以下五个步骤。
+
+1. 初始化PaddlePaddle环境
+    在程序开始阶段，通过调用 ``swig_paddle.initPaddle()`` 并传入相应的命令行参数初始化PaddlePaddle。
+2. 解析模型配置文件
+    初始化之后，可以通过调用 ``parse_config()`` 解析训练模型时用的配置文件。注意预测数据通常不包含label, 同时预测网络通常直接输出最后一层的结果而不是像训练网络一样再接一层cost layer，所以一般需要对训练用的模型配置文件稍作相应修改才能在预测时使用。
+3. 构造paddle.GradientMachine
+    通过调用 ``swig_paddle.GradientMachine.createFromConfigproto()`` 传入上一步解析出来的模型配置就可以创建一个 ``GradientMachine``。
+4. 准备预测数据
+    swig_paddle中的预测接口的参数是自定义的C++数据类型，py_paddle里面提供了一个工具类 ``DataProviderConverter`` 可以用于接收和PyDataProvider2一样的输入数据并转换成预测接口所需的数据类型。
+5. 模型预测
+    通过调用 ``forwardTest()`` 传入预测数据，直接返回计算结果。
+
+
+预测Demo
+--------
+
+如下是一段使用mnist model来实现手写识别的预测代码。完整的代码见 ``src_root/doc/ui/predict/predict_sample.py`` 。mnist model可以通过 ``src_root/demo/mnist`` 目录下的demo训练出来。
 
 ..  literalinclude:: ../../../doc/ui/predict/predict_sample.py
     :language: python
-    :lines: 15-18,90-100,101-104
-
-主要的软件包为py_paddle.swig_paddle，这个软件包文档相对完善。可以使用python的
-:code:`help()` 函数查询文档。主要步骤为:
-
-* 在程序开始阶段，使用 :code:`swig_paddle.initPaddle()` 传入命令行参数初始化
-  PaddlePaddle。详细的命令行参数请参考
-  `命令行参数 <../cmd_argument/detail_introduction.html>`_ 。
-* 接下来使用 :code:`parse_config()` 解析训练时的配置文件。这里要注意预测数据通常
-  不包含label, 而且预测网络通常直接输出最后一层的结果而不是像训练时一样以cost
-  layer作为输出，所以用于预测的配置文件要做相应的修改。
-* 使用 :code:`swig_paddle.GradientMachine.createFromConfigproto()` 根据上一步解
-  析好的配置创建神经网络。
-* 创建一个 :code:`DataProviderConverter` 对象converter。
-    - swig_paddle接受的原始数据是C++的Matrix，也就是直接写内存的float数组。
-      这个接口并不用户友好。所以，我们提供了一个工具类DataProviderConverter。
-      这个工具类接收和PyDataProvider2一样的输入数据，详情请参考
-      `PyDataProvider2文档 <../../../doc/ui/data_provider/pydataprovider2.html>`_ 。
-* 最后使用 :code:`forwardTest()` 直接提取出神经网络Output层的输出结果。典型的输出结果为\:
+    :lines: 15-18,121-136
+
+
+Demo预测输出如下，其中value即为softmax层的输出。由于TEST_DATA包含两条预测数据，所以输出的value包含两个向量 。
 
 ..  code-block:: text
 
@@ -45,4 +43,4 @@ PaddlePaddle目前使用Swig对其常用的预测接口进行了封装，使在P
           2.70634608e-08,   3.48565123e-08,   5.25639710e-09,
           4.48684503e-08]], dtype=float32)}]
 
-其中，value即为softmax层的输出。由于数据是两条，所以输出的value包含两个向量 。
+
diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
index 2f32b3fdd1a26c5b1bca43d0bd0ebb0896a012c4..a723ef7bc8329329fa82113f8e96a1bdbe750277 100644
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -1240,6 +1240,12 @@ void BaseMatrixT<T>::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) {
   }
 }
 
+DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp);
+template<class T>
+void BaseMatrixT<T>::deepSwap(BaseMatrixT& b) {
+  applyBinary(binary::DeepSwap<T>(), b);  // exchange values of *this and b
+}
+
 template<>
 void BaseMatrixT<real>::rowDotMul(size_t destCol,
                                   BaseMatrixT& b,
diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h
index d41dcee682cce15e94d45dafeb12bb0dce19b221..ea58c861a3d6a03642291c172af76795e90fcb92 100644
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -455,6 +455,17 @@ public:
    */
   void assign(T p);
 
+  /**
+   * @code
+   * swap(this, b)
+   * example: swap two Matrices
+   * MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
+   * MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
+   * cpuA->deepSwap(*cpuB);
+   * @endcode
+   */
+  void deepSwap(BaseMatrixT& b);
+
   /**
    * @code
    * this = this + p
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index ae5bc5a86a1790ce30a8d7f83c9564f52d7cf7ea..de540dad4c8eefe5084c7089d7960d8ca8cf9875 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -448,6 +448,24 @@ void testMatrixZeroAtOffset(int height, int width) {
   MatrixCheckEqual(*cpuA, *cpuTest);
 }
 
+void testMatrixDeepSwap(int height, int width) {  // checks deepSwap on CPU
+  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuCopyA = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuCopyB = std::make_shared<CpuMatrix>(height, width);
+
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+  cpuCopyA->copyFrom(*cpuA);  // snapshot of A taken before the swap
+  cpuCopyB->copyFrom(*cpuB);  // snapshot of B taken before the swap
+
+  // Exchange the contents of cpuA and cpuB.
+  cpuA->deepSwap(*cpuB);
+
+  MatrixCheckEqual(*cpuA, *cpuCopyB);  // cpuA must now equal the old B
+  MatrixCheckEqual(*cpuB, *cpuCopyA);  // cpuB must now equal the old A
+}
+
 void testMatrixBinaryAdd(int height, int width) {
   MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
   MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
@@ -480,6 +498,7 @@ void testMatrixAssign(int height, int width) {
   MatrixCheckEqual(*cpuA, *outputCheck);
 }
 
+
 void testMatrixAdd(int height, int width) {
   MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
   MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
@@ -798,6 +817,7 @@ TEST(Matrix, unary) {
       testMatrixBinaryAdd(height, width);
       testMatrixTanh(height, width);
       testMatrixTanhDerivative(height, width);
+      testMatrixDeepSwap(height, width);
 
       // applyTernary
       testMatrixTernarySub(height, width);
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index 461c73f14c2dc9377cc39ebb8f1273eee81730a3..ec68b53d440185f869566e2975a65d0c3fec5bc5 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -1,3 +1,12 @@
+execute_process(COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --version
+    OUTPUT_VARIABLE PROTOBUF_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE)
+string(REPLACE "libprotoc " "" PROTOBUF_VERSION "${PROTOBUF_VERSION}")
+
+set(PROTOBUF_3 OFF)  # stays OFF if protoc is older than 3.0.0 or printed nothing
+if ("${PROTOBUF_VERSION}" VERSION_GREATER "3.0.0" OR "${PROTOBUF_VERSION}" VERSION_EQUAL "3.0.0")
+    set(PROTOBUF_3 ON)
+endif()
+
 set(proto_filenames
     DataConfig.proto
     DataFormat.proto
@@ -11,8 +20,12 @@ set(real_proto_files)
 # TODO(yuyang18): Some internal proto will also be depended on.
 #                 Find a way to automatically calculate all depends.
 foreach(filename ${proto_filenames})
+    set(PROTOBUF_3_FLAGS "")
+    if (PROTOBUF_3)
+        set(PROTOBUF_3_FLAGS "-Dproto3")
+    endif()
     add_custom_command(OUTPUT ${filename}
-        COMMAND ${M4_EXECUTABLE} -Dreal=${ACCURACY} -I '${INTERNAL_PROTO_PATH}'
+	COMMAND ${M4_EXECUTABLE} -Dreal=${ACCURACY} ${PROTOBUF_3_FLAGS} -I '${INTERNAL_PROTO_PATH}'
               ${PROJ_ROOT}/proto/${filename}.m4 > ${filename}
         DEPENDS ${PROJ_ROOT}/proto/${filename}.m4
         COMMENT "Generate ${filename}")
diff --git a/proto/DataConfig.proto.m4 b/proto/DataConfig.proto.m4
index 9862e4e7ef2ff96eafc91246e0b435c70fbe31d9..01d451ff7d5334f8f84d28973c2d7c4b4fac5885 100644
--- a/proto/DataConfig.proto.m4
+++ b/proto/DataConfig.proto.m4
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+ifdef(`proto3', `syntax = "proto2";')
 
 package paddle;
 
diff --git a/proto/DataFormat.proto.m4 b/proto/DataFormat.proto.m4
index 556eace5e194ef26991cc06d1f7794f14fbbdded..8a4a0be1b31a62cca35ca732a037ddc8b20786c4 100644
--- a/proto/DataFormat.proto.m4
+++ b/proto/DataFormat.proto.m4
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+ifdef(`proto3', `syntax = "proto2";')
 
 package paddle;
 
diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4
index c835cfd5221c8579b383c0a6f0b2f0f554eac6d2..68a5eb9dd2231b47cc8f83696ab18fdb907c44c0 100644
--- a/proto/ModelConfig.proto.m4
+++ b/proto/ModelConfig.proto.m4
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+ifdef(`proto3', `syntax = "proto2";')
 
 import "ParameterConfig.proto";
 
diff --git a/proto/ParameterConfig.proto.m4 b/proto/ParameterConfig.proto.m4
index e8d512445e5025f5663fbe3e20b4425cf1633a2b..26e7c3ef77b7377b8d6da4d947bcad27ae4edf72 100644
--- a/proto/ParameterConfig.proto.m4
+++ b/proto/ParameterConfig.proto.m4
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+ifdef(`proto3', `syntax = "proto2";')
 
 package paddle;
 
diff --git a/proto/ParameterService.proto.m4 b/proto/ParameterService.proto.m4
index 189dc1c9700bd821959bab80aef3721bd4940b5c..0b3f14a2ee5b3e1771f724bd9d271a3ecfd15038 100644
--- a/proto/ParameterService.proto.m4
+++ b/proto/ParameterService.proto.m4
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+ifdef(`proto3', `syntax = "proto2";')
 
 import "ParameterConfig.proto";
 import "TrainerConfig.proto";
@@ -20,7 +21,6 @@ package paddle;
 /**
  * Various structs for communicating with parameter server
  */
-
 enum ParameterUpdateMode {
   // Set parameter
    PSERVER_UPDATE_MODE_SET_PARAM = 0;//use local param
diff --git a/proto/TrainerConfig.proto.m4 b/proto/TrainerConfig.proto.m4
index 3b0e24f90bed8cdf0e102c12d2a4a041c17a8447..965c9cd39353970dd547f2a595eb99531f3693c6 100644
--- a/proto/TrainerConfig.proto.m4
+++ b/proto/TrainerConfig.proto.m4
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+ifdef(`proto3', `syntax = "proto2";')
 
 import "DataConfig.proto";
 import "ModelConfig.proto";