diff --git a/CMakeLists.txt b/CMakeLists.txt
index af193c27ae7d802a8724fdc1e23b4b5b583e9f7c..7d685587a7a7f388167f79cc8874003ab445f433 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8)
 project(paddle CXX C)
 set(PADDLE_MAJOR_VERSION 0)
 set(PADDLE_MINOR_VERSION 9)
-set(PADDLE_PATCH_VERSION 0a0)
+set(PADDLE_PATCH_VERSION 0)
 set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
diff --git a/RELEASE.md b/RELEASE.md
new file mode 100644
index 0000000000000000000000000000000000000000..a8a245ab442ba0fc63d1f1fda932e7590a6fe4ca
--- /dev/null
+++ b/RELEASE.md
@@ -0,0 +1,69 @@
+# Release v0.9.0
+
+## New Features:
+
+* New Layers
+  * bilinear interpolation layer.
+  * spatial pyramid-pool layer.
+  * de-convolution layer.
+  * maxout layer.
+* Support rectangle padding, stride, window and input for Pooling Operation.
+* Add `--job=time` in trainer, which can be used to print timing info without the compile option `-DWITH_TIMER=ON`.
+* Expose cost_weight/nce_layer in `trainer_config_helpers`
+* Add FAQ, concepts, h-rnn docs.
+* Add Bidi-LSTM and DB-LSTM to quick start demo @alvations
+* Add usage track scripts.
+
+## Improvements
+
+* Add Travis-CI for Mac OS X. Enable swig unittest in Travis-CI. Skip Travis-CI when only docs are changed.
+* Add code coverage tools.
+* Refine convolution layer to speed it up and reduce GPU memory usage.
+* Speed up PyDataProvider2
+* Add ubuntu deb package build scripts.
+* Make Paddle use git-flow branching model.
+* PServer supports no parameter blocks.
+
+## Bug Fixes
+
+* add zlib link to py_paddle
+* add input sparse data check for sparse layer at runtime
+* Bug fix for sparse matrix multiplication
+* Fix floating-point overflow problem of tanh
+* Fix some nvcc compile options
+* Fix a bug in yield dictionary in DataProvider
+* Fix SRL hanging on exit.
+
+# Release v0.8.0beta.1
+New features:
+
+* Mac OSX is supported by source code. #138
+   * Both GPU and CPU versions of PaddlePaddle are supported.
+
+* Support CUDA 8.0
+
+* Enhance `PyDataProvider2`
+   * Add dictionary yield format. `PyDataProvider2` can yield a dictionary whose keys are data_layer names and whose values are features.
+   * Add `min_pool_size` to control memory pool in provider.
+
+* Add `deb` install package & docker image for no_avx machines.
+   * Especially for cloud computing and virtual machines
+
+* Automatically disable `avx` instructions in cmake when the machine's CPU doesn't support `avx` instructions.
+
+* Add Parallel NN api in trainer_config_helpers.
+
+* Add `travis ci` for Github
+
+Bug fixes:
+
+* Several bugs in trainer_config_helpers. Also complete the unittest for trainer_config_helpers
+* Check if PaddlePaddle is installed when running unit tests.
+* Fix bugs in GTX series GPU
+* Fix bug in MultinomialSampler
+
+Also more documentation was written since last release.
+
+# Release v0.8.0beta.0
+
+PaddlePaddle v0.8.0beta.0 release. The install package is not stable yet and it's a pre-release version.
diff --git a/demo/quick_start/data/README.md b/demo/quick_start/data/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..63abcf7ebf31903213e44cf492b93e09f61db14e
--- /dev/null
+++ b/demo/quick_start/data/README.md
@@ -0,0 +1,9 @@
+This dataset consists of electronics product reviews associated with
+binary labels (positive/negative) for sentiment classification.
+
+The preprocessed data can be downloaded by script `get_data.sh`.
+The data was derived from reviews_Electronics_5.json.gz at
+
+http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+
+If you want to process the raw data, you can use the script `proc_from_raw_data/get_data.sh`.
diff --git a/demo/quick_start/data/get_data.sh b/demo/quick_start/data/get_data.sh
index f355d63225b28ab495b34e72dd3be8d237ae08f4..952de3f3c8f52a7a6f84412f9b38f16ac2503ac2 100755
--- a/demo/quick_start/data/get_data.sh
+++ b/demo/quick_start/data/get_data.sh
@@ -17,14 +17,11 @@ set -e
 DIR="$( cd "$(dirname "$0")" ; pwd -P )"
 cd $DIR
 
-echo "Downloading Amazon Electronics reviews data..."
-# http://jmcauley.ucsd.edu/data/amazon/
-wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+# Download the preprocessed data
+wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz
 
-echo "Downloading mosesdecoder..."
-#https://github.com/moses-smt/mosesdecoder
-wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
+# Extract package
+tar zxvf preprocessed_data.tar.gz
 
-unzip master.zip
-rm master.zip
-echo "Done."
+# Remove compressed package
+rm preprocessed_data.tar.gz
diff --git a/demo/quick_start/data/pred.list b/demo/quick_start/data/pred.list
deleted file mode 100644
index d88b2b63851101a8b40e706b32d8c17b5fabb201..0000000000000000000000000000000000000000
--- a/demo/quick_start/data/pred.list
+++ /dev/null
@@ -1 +0,0 @@
-./data/pred.txt
diff --git a/demo/quick_start/data/pred.txt b/demo/quick_start/data/pred.txt
deleted file mode 100644
index 6ed5f738ddaff6645448d5e606dcef1baf01b282..0000000000000000000000000000000000000000
--- a/demo/quick_start/data/pred.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-the device is cute , but that &apos;s just about all that &apos;s good. the specs are what you &apos;d expect : it &apos;s a wifi mic , with some noise filter options. the app has the option to upload your baby &apos;s name and photo , which is a cutesy touch. but the app is otherwise unstable and useless unless you upgrade for $ 60 / year.set up involves downloading the app , turning on the mic , switching your phone to the wifi network of the mic , telling the app your wifi settings , switching your wifi back to your home router. the app is then directly connected to your mic.the app is adware ! the main screen says &quot; cry notifications on / off : upgrade to evoz premium and receive a text message of email when your baby is crying &quot; .but the adware points out an important limitation , this monitor is only intended to be used from your home network. if you want to access it remotely , get a webcam. this app would make a lot more sense of the premium features were included with the hardware .
-don &apos;t be fooled by my one star rating. if there was a zero , i would have selected it. this product was a waste of my money.it has never worked like the company said it supposed to. i only have one device , an iphone 4gs. after charging the the iphone mid way , the i.sound portable power max 16,000 mah is completely drained. the led light no longer lit up. when plugging the isound portable power max into a wall outlet to charge , it would charge for about 20-30 minutes and then all four battery led indicator lit up showing a full charge. i would leave it on to charge for the full 8 hours or more but each time with the same result upon using. don &apos;t buy this thing. put your money to good use elsewhere .
diff --git a/demo/quick_start/preprocess.sh b/demo/quick_start/data/proc_from_raw_data/get_data.sh
similarity index 65%
rename from demo/quick_start/preprocess.sh
rename to demo/quick_start/data/proc_from_raw_data/get_data.sh
index c9190e2dd2ef754bf3c7287006322b52493dc3a0..cd85e26842dfccea78e4f26bdfee938887021f03 100755
--- a/demo/quick_start/preprocess.sh
+++ b/demo/quick_start/data/proc_from_raw_data/get_data.sh
@@ -16,10 +16,26 @@
 # 1. size of pos : neg = 1:1.
 # 2. size of testing set = min(25k, len(all_data) * 0.1), others is traning set.
 # 3. distinct train set and test set.
-# 4. build dict
 
 set -e
 
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+cd $DIR
+
+# Download data
+echo "Downloading Amazon Electronics reviews data..."
+# http://jmcauley.ucsd.edu/data/amazon/
+wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+echo "Downloading mosesdecoder..."
+# https://github.com/moses-smt/mosesdecoder
+wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
+
+unzip master.zip
+rm master.zip
+
+##################
+# Preprocess data 
+echo "Preprocess data..."
 export LC_ALL=C
 UNAME_STR=`uname`
 
@@ -29,11 +45,11 @@ else
   SHUF_PROG='gshuf'
 fi
 
-mkdir -p data/tmp
-python preprocess.py -i data/reviews_Electronics_5.json.gz
+mkdir -p tmp
+python preprocess.py -i reviews_Electronics_5.json.gz
 # uniq and shuffle
-cd data/tmp
-echo 'uniq and shuffle...'
+cd tmp
+echo 'Uniq and shuffle...'
 cat pos_*|sort|uniq|${SHUF_PROG}> pos.shuffed
 cat neg_*|sort|uniq|${SHUF_PROG}> neg.shuffed
 
@@ -53,11 +69,11 @@ cat train.pos train.neg | ${SHUF_PROG} >../train.txt
 cat test.pos test.neg | ${SHUF_PROG} >../test.txt
 
 cd -
-echo 'data/train.txt' > data/train.list
-echo 'data/test.txt' > data/test.list
+echo 'train.txt' > train.list
+echo 'test.txt' > test.list
 
 # use 30k dict
-rm -rf data/tmp
-mv data/dict.txt data/dict_all.txt
-cat data/dict_all.txt | head -n 30001 > data/dict.txt
-echo 'preprocess finished'
+rm -rf tmp
+mv dict.txt dict_all.txt
+cat dict_all.txt | head -n 30001 > dict.txt
+echo 'Done.'
diff --git a/demo/quick_start/preprocess.py b/demo/quick_start/data/proc_from_raw_data/preprocess.py
similarity index 95%
rename from demo/quick_start/preprocess.py
rename to demo/quick_start/data/proc_from_raw_data/preprocess.py
index d87fad632a7429f7d9682badabe4c72ca127354f..56c2c5f16ceb63ff88fa51ed78c2e77ea5b64592 100755
--- a/demo/quick_start/preprocess.py
+++ b/demo/quick_start/data/proc_from_raw_data/preprocess.py
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-1. (remove HTML before or not)tokensizing
+1. Tokenize the words and punctuation 
 2. pos sample : rating score 5; neg sample: rating score 1-2.
 
 Usage:
@@ -76,7 +76,11 @@ def tokenize(sentences):
     sentences : a list of input sentences.
     return: a list of processed text.
     """
-    dir = './data/mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
+    dir = './mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
+    if not os.path.exists(dir):
+        sys.exit(
+            "The ./mosesdecoder-master/scripts/tokenizer/tokenizer.perl does not exists."
+        )
     tokenizer_cmd = [dir, '-l', 'en', '-q', '-']
     assert isinstance(sentences, list)
     text = "\n".join(sentences)
@@ -104,7 +108,7 @@ def tokenize_batch(id):
         num_batch, instance, pre_fix = parse_queue.get()
         if num_batch == -1:  ### parse_queue finished
             tokenize_queue.put((-1, None, None))
-            sys.stderr.write("tokenize theread %s finish\n" % (id))
+            sys.stderr.write("Thread %s finish\n" % (id))
             break
         tokenize_instance = tokenize(instance)
         tokenize_queue.put((num_batch, tokenize_instance, pre_fix))
diff --git a/demo/semantic_role_labeling/data/get_data.sh b/demo/semantic_role_labeling/data/get_data.sh
index 55e33f4685627ed483aa6642c518a33558091531..99487e0d9a8c31d884c4a338386ad0ff8e5d9dc7 100644
--- a/demo/semantic_role_labeling/data/get_data.sh
+++ b/demo/semantic_role_labeling/data/get_data.sh
@@ -14,10 +14,10 @@
 # limitations under the License.
 set -e
 wget http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/verbDict.txt --no-check-certificate
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/targetDict.txt --no-check-certificate
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/wordDict.txt --no-check-certificate
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/emb --no-check-certificate
+wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt
+wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt 
+wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt 
+wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb
 tar -xzvf conll05st-tests.tar.gz
 rm conll05st-tests.tar.gz
 cp ./conll05st-release/test.wsj/words/test.wsj.words.gz  .
diff --git a/doc/about/index.rst b/doc/about/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8a372d2bc2b2c54b021ed63941482cbad8d8f719
--- /dev/null
+++ b/doc/about/index.rst
@@ -0,0 +1,14 @@
+ABOUT
+=======
+
+PaddlePaddle is an easy-to-use, efficient, flexible and scalable deep learning platform,
+which is originally developed by Baidu scientists and engineers for the purpose of applying deep learning to many products at Baidu.
+
+PaddlePaddle is now open source but far from complete; it is intended to be built upon, improved, scaled, and extended.
+We hope to build an active open source community whose members both provide feedback and actively contribute to the source code.
+
+
+Credits
+--------
+
+We owe many thanks to `all contributors and developers <https://github.com/PaddlePaddle/Paddle/blob/develop/authors>`_ of PaddlePaddle!
diff --git a/doc/algorithm/index.rst b/doc/algorithm/index.rst
deleted file mode 100644
index 6073add3c0cbb12529eabb0f8d8a051bcb84e628..0000000000000000000000000000000000000000
--- a/doc/algorithm/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Algorithm Tutorial
-==================
-
-..  toctree::
-  :maxdepth: 1
-
-  rnn/rnn.rst
diff --git a/doc/algorithm/rnn/bi_lstm.jpg b/doc/algorithm/rnn/bi_lstm.jpg
deleted file mode 120000
index a53296cf806f97f7f2520e1700c4fb93f6bfc9d8..0000000000000000000000000000000000000000
--- a/doc/algorithm/rnn/bi_lstm.jpg
+++ /dev/null
@@ -1 +0,0 @@
-../../demo/sentiment_analysis/bi_lstm.jpg
\ No newline at end of file
diff --git a/doc/algorithm/rnn/encoder-decoder-attention-model.png b/doc/algorithm/rnn/encoder-decoder-attention-model.png
deleted file mode 120000
index db71321a43a37b774e7de0af3765a60345033743..0000000000000000000000000000000000000000
--- a/doc/algorithm/rnn/encoder-decoder-attention-model.png
+++ /dev/null
@@ -1 +0,0 @@
-../../demo/text_generation/encoder-decoder-attention-model.png
\ No newline at end of file
diff --git a/doc/ui/data_provider/index.rst b/doc/api/data_provider/index.rst
similarity index 97%
rename from doc/ui/data_provider/index.rst
rename to doc/api/data_provider/index.rst
index 3db5b57376257b83fc2a27c518b0db663682136d..5e7a49d63236ffa854e64c53921441bacebc13ae 100644
--- a/doc/ui/data_provider/index.rst
+++ b/doc/api/data_provider/index.rst
@@ -1,5 +1,5 @@
-DataProvider Introduction
-=========================
+Introduction
+==============
 DataProvider is a module that loads training or testing data into cpu or gpu
 memory for the following triaining or testing process.
 
diff --git a/doc/ui/data_provider/pydataprovider2.rst b/doc/api/data_provider/pydataprovider2.rst
similarity index 99%
rename from doc/ui/data_provider/pydataprovider2.rst
rename to doc/api/data_provider/pydataprovider2.rst
index e105d3be308705d228c0b188e15742a0f7325ab6..b42cbca576e4b5d67d50d0156939a01faae4533d 100644
--- a/doc/ui/data_provider/pydataprovider2.rst
+++ b/doc/api/data_provider/pydataprovider2.rst
@@ -1,5 +1,5 @@
-How to use PyDataProvider2
-==========================
+PyDataProvider2
+=================
 
 We highly recommand users to use PyDataProvider2 to provide training or testing
 data to PaddlePaddle. The user only needs to focus on how to read a single
diff --git a/doc/api/index.rst b/doc/api/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ccee7a0f1f3e5290891dfa963ea24c7fdbd6275e
--- /dev/null
+++ b/doc/api/index.rst
@@ -0,0 +1,36 @@
+API
+====
+
+DataProvider API
+----------------
+
+..  toctree::
+  :maxdepth: 1
+
+  data_provider/index.rst
+  data_provider/pydataprovider2.rst
+
+Model Config API
+----------------
+
+..  toctree::
+  :maxdepth: 1
+
+  trainer_config_helpers/index.rst
+  trainer_config_helpers/optimizers.rst
+  trainer_config_helpers/data_sources.rst
+  trainer_config_helpers/layers.rst
+  trainer_config_helpers/activations.rst 
+  trainer_config_helpers/poolings.rst
+  trainer_config_helpers/networks.rst
+  trainer_config_helpers/evaluators.rst
+  trainer_config_helpers/attrs.rst
+
+
+Applications API
+----------------
+
+..  toctree::
+  :maxdepth: 1
+
+  predict/swig_py_paddle_en.rst
\ No newline at end of file
diff --git a/doc/ui/predict/predict_sample.py b/doc/api/predict/predict_sample.py
similarity index 100%
rename from doc/ui/predict/predict_sample.py
rename to doc/api/predict/predict_sample.py
diff --git a/doc/ui/predict/swig_py_paddle_en.rst b/doc/api/predict/swig_py_paddle_en.rst
similarity index 98%
rename from doc/ui/predict/swig_py_paddle_en.rst
rename to doc/api/predict/swig_py_paddle_en.rst
index b743fc456914664168e1be6c7f18a419c38afa62..9845cd1607b425dc0a4ddc665aab40d96fa2fbe4 100644
--- a/doc/ui/predict/swig_py_paddle_en.rst
+++ b/doc/api/predict/swig_py_paddle_en.rst
@@ -1,5 +1,5 @@
-Python Prediction API
-=====================
+Python Prediction
+==================
 
 PaddlePaddle offers a set of clean prediction interfaces for python with the help of
 SWIG. The main steps of predict values in python are:
diff --git a/doc/ui/api/trainer_config_helpers/activations.rst b/doc/api/trainer_config_helpers/activations.rst
similarity index 100%
rename from doc/ui/api/trainer_config_helpers/activations.rst
rename to doc/api/trainer_config_helpers/activations.rst
diff --git a/doc/api/trainer_config_helpers/attrs.rst b/doc/api/trainer_config_helpers/attrs.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ac63127bf7d9db6351365ab7b58f43db12347a8e
--- /dev/null
+++ b/doc/api/trainer_config_helpers/attrs.rst
@@ -0,0 +1,5 @@
+Parameter Attributes
+=======================
+
+..  automodule:: paddle.trainer_config_helpers.attrs
+    :members:
diff --git a/doc/ui/api/trainer_config_helpers/data_sources.rst b/doc/api/trainer_config_helpers/data_sources.rst
similarity index 100%
rename from doc/ui/api/trainer_config_helpers/data_sources.rst
rename to doc/api/trainer_config_helpers/data_sources.rst
diff --git a/doc/ui/api/trainer_config_helpers/evaluators.rst b/doc/api/trainer_config_helpers/evaluators.rst
similarity index 100%
rename from doc/ui/api/trainer_config_helpers/evaluators.rst
rename to doc/api/trainer_config_helpers/evaluators.rst
diff --git a/doc/ui/api/trainer_config_helpers/layers.rst b/doc/api/trainer_config_helpers/layers.rst
similarity index 100%
rename from doc/ui/api/trainer_config_helpers/layers.rst
rename to doc/api/trainer_config_helpers/layers.rst
diff --git a/doc/ui/api/trainer_config_helpers/networks.rst b/doc/api/trainer_config_helpers/networks.rst
similarity index 100%
rename from doc/ui/api/trainer_config_helpers/networks.rst
rename to doc/api/trainer_config_helpers/networks.rst
diff --git a/doc/ui/api/trainer_config_helpers/optimizers.rst b/doc/api/trainer_config_helpers/optimizers.rst
similarity index 100%
rename from doc/ui/api/trainer_config_helpers/optimizers.rst
rename to doc/api/trainer_config_helpers/optimizers.rst
diff --git a/doc/ui/api/trainer_config_helpers/poolings.rst b/doc/api/trainer_config_helpers/poolings.rst
similarity index 100%
rename from doc/ui/api/trainer_config_helpers/poolings.rst
rename to doc/api/trainer_config_helpers/poolings.rst
diff --git a/doc/cluster/index.rst b/doc/cluster/index.rst
deleted file mode 100644
index 9062f85f98d2981b5c8dcf8149e32c2ccdac77f4..0000000000000000000000000000000000000000
--- a/doc/cluster/index.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-Cluster Train
-====================
-
-.. toctree::
-  :glob:
-
-  opensource/cluster_train.md
-  internal/index.md
diff --git a/doc/dev/index.rst b/doc/dev/index.rst
deleted file mode 100644
index 0468dd492b6246cfe0771a05c3597ddee95b3ddd..0000000000000000000000000000000000000000
--- a/doc/dev/index.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-Development Guide
-=================
-
-..  toctree::
-  :maxdepth: 1
-
-  layer.md
-  new_layer/new_layer.rst
-  ../source/index.md
diff --git a/doc/dev/layer.md b/doc/dev/layer.md
deleted file mode 100644
index 930fb0de1ac074b15d06197ed0e732f92288b411..0000000000000000000000000000000000000000
--- a/doc/dev/layer.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# Layer Documents
-
-* [Layer Source Code Document](../source/gserver/layers/index.rst)
-* [Layer Python API Document](../ui/api/trainer_config_helpers/index.rst)
diff --git a/doc/getstarted/basic_usage/basic_usage.rst b/doc/getstarted/basic_usage/basic_usage.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dca7a6b1f4f017b302148c611122806f112564a9
--- /dev/null
+++ b/doc/getstarted/basic_usage/basic_usage.rst
@@ -0,0 +1,109 @@
+Basic Usage
+=============
+
+PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image caption and so on.
+
+1. A Classic Problem
+---------------------
+
+Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - `simple linear regression <https://en.wikipedia.org/wiki/Simple_linear_regression>`_: you have observed a set of two-dimensional data points of ``X`` and ``Y``, where ``X`` is an explanatory variable and ``Y`` is corresponding dependent variable, and you want to recover the underlying correlation between ``X`` and ``Y``. Linear regression can be used in many practical scenarios. For example, ``X`` can be a variable about house size, and ``Y`` a variable about house price. You can build a model that captures relationship between them by observing real estate markets.
+
+2. Prepare the Data
+--------------------
+
+Suppose the true relationship can be characterized as ``Y = 2X + 0.3``; let's see how to recover this pattern only from observed data. Here is a piece of Python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory; the only extra thing you need to add for PaddlePaddle is a definition of input data types.
+
+    .. code-block:: python
+
+        # dataprovider.py
+        from paddle.trainer.PyDataProvider2 import *
+        import random
+
+        # define data types of input: 2 real numbers
+        @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
+        def process(settings, input_file):
+            for i in xrange(2000):
+                x = random.random()
+                yield [x], [2*x+0.3]
+
+3. Train a Neural Network
+--------------------------
+
+To recover this relationship between ``X`` and ``Y``, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with this terminology; it just means that we start from a random line ``Y' = wX + b`` and then gradually adapt ``w`` and ``b`` to minimize the difference between ``Y'`` and ``Y``. Here is what it looks like in PaddlePaddle:
+
+    .. code-block:: python
+
+        # trainer_config.py
+        from paddle.trainer_config_helpers import *
+
+        # 1. read data. Suppose you saved above python code as dataprovider.py
+        data_file = 'empty.list'
+        with open(data_file, 'w') as f: f.writelines(' ')
+        define_py_data_sources2(train_list=data_file, test_list=None, 
+                module='dataprovider', obj='process',args={})
+
+        # 2. learning algorithm
+        settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
+
+        # 3. Network configuration
+        x = data_layer(name='x', size=1)
+        y = data_layer(name='y', size=1)
+        y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
+        cost = regression_cost(input=y_predict, label=y)
+        outputs(cost)
+
+Some of the most fundamental usages of PaddlePaddle are demonstrated:
+
+-  The first part shows how to feed data into PaddlePaddle. In general, PaddlePaddle reads raw data from a list of files and then does some user-defined processing to get the real input. In this case, we only need to create a placeholder file since we are generating synthetic data on the fly.
+
+-  The second part describes learning algorithm. It defines in what ways adjustments are made to model parameters. PaddlePaddle provides a rich set of optimizers, but a simple momentum based optimizer will suffice here, and it processes 12 data points each time.
+
+-  Finally, the network configuration. It usually is as simple as "stacking" layers. Three kinds of layers are used in this configuration:
+	-  **Data Layer**: a network always starts with one or more data layers. They provide input data to the rest of the network. In this problem, two data layers are used respectively for ``X`` and ``Y``.
+	-  **FC Layer**: FC layer is short for Fully Connected Layer, which connects all the input units to current layer and does the actual computation specified as activation function. Computation layers like this are the fundamental building blocks of a deeper model.
+	-  **Cost Layer**: in the training phase, cost layers are usually the last layers of the network. They measure the performance of the current model, and provide guidance to adjust parameters.
+
+Now that everything is ready, you can train the network with a simple command line call:
+
+    .. code-block:: bash
+ 
+        paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
+ 
+
+This means that PaddlePaddle will train this network on the synthetic dataset for 30 passes, and save all the models under path ``./output``. You will see from the messages printed out during the training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess.
+
+
+4. Evaluate the Model
+-----------------------
+
+Usually, a different dataset that was left out during the training phase should be used to evaluate the models. However, we are lucky enough to know the real answer: ``w=2, b=0.3``, thus a better option is to check the model parameters directly.
+
+In PaddlePaddle, training is just to get a collection of model parameters, which are ``w`` and ``b`` in this case. Each parameter is saved in an individual file in the popular ``numpy`` array format. Here is the code that reads parameters from last pass.
+
+    .. code-block:: python
+
+        import numpy as np
+        import os
+
+        def load(file_name):
+            with open(file_name, 'rb') as f:
+                f.read(16) # skip header for float type.
+                return np.fromfile(f, dtype=np.float32)
+                
+        print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
+        # w=1.999743, b=0.300137
+
+    .. image:: parameters.png
+        :align: center
+
+Although training starts from a random guess, you can see that the value of ``w`` changes quickly towards 2 and ``b`` changes quickly towards 0.3. In the end, the predicted line is almost identical to the real answer.
+
+There, you have recovered the underlying pattern between ``X`` and ``Y`` only from observed data.
+
+
+5. Where to Go from Here
+-------------------------
+
+- `Install and Build <../build_and_install/index.html>`_
+- `Tutorials <../demo/quick_start/index_en.html>`_
+- `Example and Demo <../demo/index.html>`_
diff --git a/doc/getstarted/basic_usage/parameters.png b/doc/getstarted/basic_usage/parameters.png
new file mode 100644
index 0000000000000000000000000000000000000000..2ec67480951e21f0400bce1c34b3108dcd65c18c
Binary files /dev/null and b/doc/getstarted/basic_usage/parameters.png differ
diff --git a/doc/build/build_from_source.md b/doc/getstarted/build_and_install/build_from_source.md
similarity index 100%
rename from doc/build/build_from_source.md
rename to doc/getstarted/build_and_install/build_from_source.md
diff --git a/doc/build/cmake.png b/doc/getstarted/build_and_install/cmake.png
similarity index 100%
rename from doc/build/cmake.png
rename to doc/getstarted/build_and_install/cmake.png
diff --git a/doc/build/docker_install.rst b/doc/getstarted/build_and_install/docker_install.rst
similarity index 100%
rename from doc/build/docker_install.rst
rename to doc/getstarted/build_and_install/docker_install.rst
diff --git a/doc/build/index.rst b/doc/getstarted/build_and_install/index.rst
similarity index 80%
rename from doc/build/index.rst
rename to doc/getstarted/build_and_install/index.rst
index b4fe4596047c7d201fdf36bc76c26d5134611560..6187be9d7257b1690d223770f0f6a5b466cf2898 100644
--- a/doc/build/index.rst
+++ b/doc/getstarted/build_and_install/index.rst
@@ -8,8 +8,6 @@ Install PaddlePaddle
     :maxdepth: 1
     :glob:
 
-    install_*
-    internal/install_from_jumbo.md
     docker_install.rst
     ubuntu_install.rst
 
@@ -24,5 +22,4 @@ Build from Source
     :maxdepth: 1
     :glob:
 
-    build_from_source.md
-    contribute_to_paddle.md
+    build_from_source.md
\ No newline at end of file
diff --git a/doc/build/ubuntu_install.rst b/doc/getstarted/build_and_install/ubuntu_install.rst
similarity index 100%
rename from doc/build/ubuntu_install.rst
rename to doc/getstarted/build_and_install/ubuntu_install.rst
diff --git a/doc/getstarted/index.rst b/doc/getstarted/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5f2787066ea5cdb32a1eff2939ffd0585d7fbc63
--- /dev/null
+++ b/doc/getstarted/index.rst
@@ -0,0 +1,8 @@
+GET STARTED
+============
+
+..  toctree::
+  :maxdepth: 2
+
+  build_and_install/index.rst
+  basic_usage/basic_usage.rst
diff --git a/doc/cluster/opensource/cluster_train.md b/doc/howto/cluster/cluster_train.md
similarity index 99%
rename from doc/cluster/opensource/cluster_train.md
rename to doc/howto/cluster/cluster_train.md
index cb493a88f031850cb6a5eeed0ebe9e41bb7e01c3..1de34a6a99440bf45af8b1fec2c7a2361865fed3 100644
--- a/doc/cluster/opensource/cluster_train.md
+++ b/doc/howto/cluster/cluster_train.md
@@ -1,4 +1,4 @@
-# Distributed Training
+# How to Run Distributed Training
 
 In this article, we explain how to run distributed Paddle training jobs on clusters.  We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation).
 
@@ -9,7 +9,7 @@ In this article, we explain how to run distributed Paddle training jobs on clust
 1. Aforementioned scripts use a Python library [fabric](http://www.fabfile.org/) to run SSH commands.  We can use `pip` to install fabric:
 
    ```bash
-pip install fabric
+   pip install fabric
    ```
 
 1. We need to install PaddlePaddle on all nodes in the cluster.  To enable GPUs, we need to install CUDA in `/usr/local/cuda`; otherwise Paddle would report errors at runtime.
diff --git a/doc/ui/cmd_argument/argument_outline.md b/doc/howto/cmd_parameter/arguments.md
similarity index 100%
rename from doc/ui/cmd_argument/argument_outline.md
rename to doc/howto/cmd_parameter/arguments.md
diff --git a/doc/ui/cmd_argument/detail_introduction.md b/doc/howto/cmd_parameter/detail_introduction.md
similarity index 100%
rename from doc/ui/cmd_argument/detail_introduction.md
rename to doc/howto/cmd_parameter/detail_introduction.md
diff --git a/doc/howto/cmd_parameter/index.md b/doc/howto/cmd_parameter/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..48cf835de142f19f41a9d077786d312100859592
--- /dev/null
+++ b/doc/howto/cmd_parameter/index.md
@@ -0,0 +1,5 @@
+# How to Set Command-line Parameters
+
+* [Use Case](use_case.md)
+* [Arguments](arguments.md)
+* [Detailed Descriptions](detail_introduction.md)
diff --git a/doc/ui/cmd_argument/use_case.md b/doc/howto/cmd_parameter/use_case.md
similarity index 100%
rename from doc/ui/cmd_argument/use_case.md
rename to doc/howto/cmd_parameter/use_case.md
diff --git a/doc/build/contribute_to_paddle.md b/doc/howto/contribute_to_paddle.md
similarity index 99%
rename from doc/build/contribute_to_paddle.md
rename to doc/howto/contribute_to_paddle.md
index 1d03eb7362b1b6f2fcdac7b53f8b7f93fb75e49c..d1f12c6ab2fb9ddeed40b53c1b2c68a9ccb19105 100644
--- a/doc/build/contribute_to_paddle.md
+++ b/doc/howto/contribute_to_paddle.md
@@ -1,4 +1,4 @@
-# Contribute Code
+# How to Contribute Code
 
 We sincerely appreciate your contributions. You can use fork and pull request
 workflow to merge your code. 
diff --git a/doc/howto/deep_model/index.rst b/doc/howto/deep_model/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..06ef443f62f63723704d4f06ba5d321636c0b72b
--- /dev/null
+++ b/doc/howto/deep_model/index.rst
@@ -0,0 +1,7 @@
+How to Configure Deep Models
+============================
+
+..  toctree::
+  :maxdepth: 1
+
+  rnn/rnn.rst
diff --git a/doc/algorithm/rnn/rnn.rst b/doc/howto/deep_model/rnn/rnn.rst
similarity index 99%
rename from doc/algorithm/rnn/rnn.rst
rename to doc/howto/deep_model/rnn/rnn.rst
index 01d2caefb5cdf4e949511fd0f5bbafe0e604e881..da29b8efadd299fe4fc74a71392cbc9a56e32be3 100644
--- a/doc/algorithm/rnn/rnn.rst
+++ b/doc/howto/deep_model/rnn/rnn.rst
@@ -42,7 +42,7 @@ Simple Gated Recurrent Neural Network
 
 Recurrent neural network process a sequence at each time step sequentially. An example of the architecture of LSTM is listed below.
 
-.. image:: ./bi_lstm.jpg
+.. image:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg
 	 :align: center
 
 Generally speaking, a recurrent network perform the following operations from :math:`t=1` to :math:`t=T`, or reversely from :math:`t=T` to :math:`t=1`.
@@ -101,7 +101,7 @@ Sequence to Sequence Model with Attention
 -----------------------------------------
 We will use the sequence to sequence model with attention as an example to demonstrate how you can configure complex recurrent neural network models. An illustration of the sequence to sequence model with attention is shown in the following figure.
 
-.. image:: ./encoder-decoder-attention-model.png
+.. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
  	 :align: center
 
 In this model, the source sequence :math:`S = \{s_1, \dots, s_T\}` is encoded with a bidirectional gated recurrent neural networks. The hidden states of the bidirectional gated recurrent neural network :math:`H_S = \{H_1, \dots, H_T\}` is called *encoder vector* The decoder is a gated recurrent neural network. When decoding each token :math:`y_t`, the gated recurrent neural network generates a set of weights :math:`W_S^t = \{W_1^t, \dots, W_T^t\}`, which are used to compute a weighted sum of the encoder vector. The weighted sum of the encoder vector is utilized to condition the generation of the token :math:`y_t`.
diff --git a/doc/howto/index.rst b/doc/howto/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..41877a64a56e289c39ca7513eb05fa6a41129487
--- /dev/null
+++ b/doc/howto/index.rst
@@ -0,0 +1,29 @@
+HOW TO
+=======
+
+Usage
+-------
+
+..  toctree::
+  :maxdepth: 1
+
+  cmd_parameter/index.md
+  deep_model/index.rst
+  cluster/cluster_train.md
+
+Development
+------------
+
+..  toctree::
+  :maxdepth: 1
+
+  new_layer/index.rst
+  contribute_to_paddle.md
+
+Optimization
+-------------
+
+..  toctree::
+  :maxdepth: 1
+
+  optimization/index.rst
diff --git a/doc/dev/new_layer/FullyConnected.jpg b/doc/howto/new_layer/FullyConnected.jpg
similarity index 100%
rename from doc/dev/new_layer/FullyConnected.jpg
rename to doc/howto/new_layer/FullyConnected.jpg
diff --git a/doc/dev/new_layer/new_layer.rst b/doc/howto/new_layer/index.rst
similarity index 99%
rename from doc/dev/new_layer/new_layer.rst
rename to doc/howto/new_layer/index.rst
index af8b76a3075194ead9be40d2c943238b2cfadecc..922bda5b0d879b9041e3c0ca5d2518363a7cfa05 100644
--- a/doc/dev/new_layer/new_layer.rst
+++ b/doc/howto/new_layer/index.rst
@@ -1,6 +1,6 @@
-==================
-Writing New Layers
-==================
+=======================
+How to Write New Layers
+=======================
 
 This tutorial will guide you to write customized layers in PaddlePaddle. We will utilize fully connected layer as an example to guide you through the following steps for writing a new layer.
 
diff --git a/doc/optimization/gpu_profiling.rst b/doc/howto/optimization/gpu_profiling.rst
similarity index 100%
rename from doc/optimization/gpu_profiling.rst
rename to doc/howto/optimization/gpu_profiling.rst
diff --git a/doc/howto/optimization/index.rst b/doc/howto/optimization/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e2822a00982b386e8900420e8c200ed1f92a2d9b
--- /dev/null
+++ b/doc/howto/optimization/index.rst
@@ -0,0 +1,7 @@
+How to Tune GPU Performance
+===========================
+
+.. toctree::
+  :maxdepth: 3
+
+  gpu_profiling.rst
diff --git a/doc/optimization/nvvp1.png b/doc/howto/optimization/nvvp1.png
similarity index 100%
rename from doc/optimization/nvvp1.png
rename to doc/howto/optimization/nvvp1.png
diff --git a/doc/optimization/nvvp2.png b/doc/howto/optimization/nvvp2.png
similarity index 100%
rename from doc/optimization/nvvp2.png
rename to doc/howto/optimization/nvvp2.png
diff --git a/doc/optimization/nvvp3.png b/doc/howto/optimization/nvvp3.png
similarity index 100%
rename from doc/optimization/nvvp3.png
rename to doc/howto/optimization/nvvp3.png
diff --git a/doc/optimization/nvvp4.png b/doc/howto/optimization/nvvp4.png
similarity index 100%
rename from doc/optimization/nvvp4.png
rename to doc/howto/optimization/nvvp4.png
diff --git a/doc/source/api.rst b/doc/howto/source/api.rst
similarity index 100%
rename from doc/source/api.rst
rename to doc/howto/source/api.rst
diff --git a/doc/source/cuda/index.rst b/doc/howto/source/cuda/index.rst
similarity index 100%
rename from doc/source/cuda/index.rst
rename to doc/howto/source/cuda/index.rst
diff --git a/doc/source/cuda/matrix.rst b/doc/howto/source/cuda/matrix.rst
similarity index 100%
rename from doc/source/cuda/matrix.rst
rename to doc/howto/source/cuda/matrix.rst
diff --git a/doc/source/cuda/nn.rst b/doc/howto/source/cuda/nn.rst
similarity index 100%
rename from doc/source/cuda/nn.rst
rename to doc/howto/source/cuda/nn.rst
diff --git a/doc/source/cuda/utils.rst b/doc/howto/source/cuda/utils.rst
similarity index 100%
rename from doc/source/cuda/utils.rst
rename to doc/howto/source/cuda/utils.rst
diff --git a/doc/source/gserver/activations.rst b/doc/howto/source/gserver/activations.rst
similarity index 100%
rename from doc/source/gserver/activations.rst
rename to doc/howto/source/gserver/activations.rst
diff --git a/doc/source/gserver/dataproviders.rst b/doc/howto/source/gserver/dataproviders.rst
similarity index 100%
rename from doc/source/gserver/dataproviders.rst
rename to doc/howto/source/gserver/dataproviders.rst
diff --git a/doc/source/gserver/evaluators.rst b/doc/howto/source/gserver/evaluators.rst
similarity index 100%
rename from doc/source/gserver/evaluators.rst
rename to doc/howto/source/gserver/evaluators.rst
diff --git a/doc/source/gserver/gradientmachines.rst b/doc/howto/source/gserver/gradientmachines.rst
similarity index 100%
rename from doc/source/gserver/gradientmachines.rst
rename to doc/howto/source/gserver/gradientmachines.rst
diff --git a/doc/source/gserver/index.rst b/doc/howto/source/gserver/index.rst
similarity index 100%
rename from doc/source/gserver/index.rst
rename to doc/howto/source/gserver/index.rst
diff --git a/doc/source/gserver/layers.rst b/doc/howto/source/gserver/layers.rst
similarity index 100%
rename from doc/source/gserver/layers.rst
rename to doc/howto/source/gserver/layers.rst
diff --git a/doc/source/gserver/neworks.rst b/doc/howto/source/gserver/neworks.rst
similarity index 100%
rename from doc/source/gserver/neworks.rst
rename to doc/howto/source/gserver/neworks.rst
diff --git a/doc/source/index.rst b/doc/howto/source/index.rst
similarity index 100%
rename from doc/source/index.rst
rename to doc/howto/source/index.rst
diff --git a/doc/source/math/functions.rst b/doc/howto/source/math/functions.rst
similarity index 100%
rename from doc/source/math/functions.rst
rename to doc/howto/source/math/functions.rst
diff --git a/doc/source/math/index.rst b/doc/howto/source/math/index.rst
similarity index 100%
rename from doc/source/math/index.rst
rename to doc/howto/source/math/index.rst
diff --git a/doc/source/math/matrix.rst b/doc/howto/source/math/matrix.rst
similarity index 100%
rename from doc/source/math/matrix.rst
rename to doc/howto/source/math/matrix.rst
diff --git a/doc/source/math/utils.rst b/doc/howto/source/math/utils.rst
similarity index 100%
rename from doc/source/math/utils.rst
rename to doc/howto/source/math/utils.rst
diff --git a/doc/source/math/vector.rst b/doc/howto/source/math/vector.rst
similarity index 100%
rename from doc/source/math/vector.rst
rename to doc/howto/source/math/vector.rst
diff --git a/doc/source/parameter/index.rst b/doc/howto/source/parameter/index.rst
similarity index 100%
rename from doc/source/parameter/index.rst
rename to doc/howto/source/parameter/index.rst
diff --git a/doc/source/parameter/optimizer.rst b/doc/howto/source/parameter/optimizer.rst
similarity index 100%
rename from doc/source/parameter/optimizer.rst
rename to doc/howto/source/parameter/optimizer.rst
diff --git a/doc/source/parameter/parameter.rst b/doc/howto/source/parameter/parameter.rst
similarity index 100%
rename from doc/source/parameter/parameter.rst
rename to doc/howto/source/parameter/parameter.rst
diff --git a/doc/source/parameter/updater.rst b/doc/howto/source/parameter/updater.rst
similarity index 100%
rename from doc/source/parameter/updater.rst
rename to doc/howto/source/parameter/updater.rst
diff --git a/doc/source/pserver/client.rst b/doc/howto/source/pserver/client.rst
similarity index 100%
rename from doc/source/pserver/client.rst
rename to doc/howto/source/pserver/client.rst
diff --git a/doc/source/pserver/index.rst b/doc/howto/source/pserver/index.rst
similarity index 100%
rename from doc/source/pserver/index.rst
rename to doc/howto/source/pserver/index.rst
diff --git a/doc/source/pserver/network.rst b/doc/howto/source/pserver/network.rst
similarity index 100%
rename from doc/source/pserver/network.rst
rename to doc/howto/source/pserver/network.rst
diff --git a/doc/source/pserver/server.rst b/doc/howto/source/pserver/server.rst
similarity index 100%
rename from doc/source/pserver/server.rst
rename to doc/howto/source/pserver/server.rst
diff --git a/doc/source/trainer.rst b/doc/howto/source/trainer.rst
similarity index 100%
rename from doc/source/trainer.rst
rename to doc/howto/source/trainer.rst
diff --git a/doc/source/utils/customStackTrace.rst b/doc/howto/source/utils/customStackTrace.rst
similarity index 100%
rename from doc/source/utils/customStackTrace.rst
rename to doc/howto/source/utils/customStackTrace.rst
diff --git a/doc/source/utils/enum.rst b/doc/howto/source/utils/enum.rst
similarity index 100%
rename from doc/source/utils/enum.rst
rename to doc/howto/source/utils/enum.rst
diff --git a/doc/source/utils/index.rst b/doc/howto/source/utils/index.rst
similarity index 100%
rename from doc/source/utils/index.rst
rename to doc/howto/source/utils/index.rst
diff --git a/doc/source/utils/lock.rst b/doc/howto/source/utils/lock.rst
similarity index 100%
rename from doc/source/utils/lock.rst
rename to doc/howto/source/utils/lock.rst
diff --git a/doc/source/utils/queue.rst b/doc/howto/source/utils/queue.rst
similarity index 100%
rename from doc/source/utils/queue.rst
rename to doc/howto/source/utils/queue.rst
diff --git a/doc/source/utils/thread.rst b/doc/howto/source/utils/thread.rst
similarity index 100%
rename from doc/source/utils/thread.rst
rename to doc/howto/source/utils/thread.rst
diff --git a/doc/index.rst b/doc/index.rst
index 76fb7a3ace8057d9cd34e03134c63ef0cd298cae..3555da1dfc81b29a89c7dfa6087d5fbb734a727b 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -4,8 +4,9 @@ PaddlePaddle Documentation
 ..  toctree::
   :maxdepth: 1
 
-  introduction/index.md
-  user_guide.rst
-  dev/index.rst
-  algorithm/index.rst
-  optimization/index.rst
+  getstarted/index.rst
+  tutorials/index.md
+  howto/index.rst
+  api/index.rst
+  about/index.rst 
+ 
\ No newline at end of file
diff --git a/doc/introduction/index.md b/doc/introduction/index.md
deleted file mode 100644
index 01f52031a1d0247cd0b885218c17001f23685239..0000000000000000000000000000000000000000
--- a/doc/introduction/index.md
+++ /dev/null
@@ -1,100 +0,0 @@
-# Introduction
-
-PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image caption and so on.
-
-## 1. A Classic Problem
-
-Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - <a href="https://en.wikipedia.org/wiki/Simple_linear_regression">**simple linear regression**</a> : you have observed a set of two-dimensional data points of `X` and `Y`, where `X` is an explanatory variable and `Y` is corresponding dependent variable, and you want to recover the underlying correlation between `X` and `Y`. Linear regression can be used in many practical scenarios. For example, `X` can be a variable about house size, and `Y` a variable about house price. You can build a model that captures relationship between them by observing real estate markets.
-
-## 2. Prepare the Data
-
-Suppose the true relationship can be characterized as `Y = 2X + 0.3`, let's see how to recover this pattern only from observed data. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory, the only extra thing you need to add for PaddlePaddle is a definition of input data types.
-
-```python
-# dataprovider.py
-from paddle.trainer.PyDataProvider2 import *
-import random
-
-# define data types of input: 2 real numbers
-@provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
-def process(settings, input_file):
-    for i in xrange(2000):
-        x = random.random()
-        yield [x], [2*x+0.3]
-```
-
-## 3. Train a NeuralNetwork in PaddlePaddle
-
-To recover this relationship between `X` and `Y`, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with these terminologies, it's just saying that we are starting from a random line `Y' = wX + b` , then we gradually adapt `w` and `b` to minimize the difference between `Y'` and `Y`. Here is what it looks like in PaddlePaddle:
-
-```python
-# trainer_config.py
-from paddle.trainer_config_helpers import *
-
-# 1. read data. Suppose you saved above python code as dataprovider.py
-data_file = 'empty.list'
-with open(data_file, 'w') as f: f.writelines(' ')
-define_py_data_sources2(train_list=data_file, test_list=None, 
-        module='dataprovider', obj='process',args={})
-
-# 2. learning algorithm
-settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
-
-# 3. Network configuration
-x = data_layer(name='x', size=1)
-y = data_layer(name='y', size=1)
-y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
-cost = regression_cost(input=y_predict, label=y)
-outputs(cost)
-```
-
-Some of the most fundamental usages of PaddlePaddle are demonstrated:
-
--  The first part shows how to feed data into PaddlePaddle. In general cases, PaddlePaddle reads raw data from a list of files, and then do some user-defined process to get real input. In this case, we only need to create a placeholder file since we are generating synthetic data on the fly.
-
--  The second part describes learning algorithm. It defines in what ways adjustments are made to model parameters. PaddlePaddle provides a rich set of optimizers, but a simple momentum based optimizer will suffice here, and it processes 12 data points each time.
-
--  Finally, the network configuration. It usually is as simple as "stacking" layers. Three kinds of layers are used in this configuration:
-	-  **Data Layer**: a network always starts with one or more data layers. They provide input data to the rest of the network. In this problem, two data layers are used respectively for `X` and `Y`.
-	-  **FC Layer**: FC layer is short for Fully Connected Layer, which connects all the input units to current layer and does the actual computation specified as activation function. Computation layers like this are the fundamental building blocks of a deeper model.
-	-  **Cost Layer**: in training phase, cost layers are usually the last layers of the network. They measure the performance of current model, and provide guidence to adjust parameters.
-
-Now that everything is ready, you can train the network with a simple command line call:
- ```
- paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
- ```
-
-This means that PaddlePaddle will train this network on the synthectic dataset for 30 passes, and save all the models under path `./output`. You will see from the messages printed out during training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess.
-
-
-## 4. Evaluate the Model
-
-Usually, a different dataset that left out during training phase should be used to evalute the models. However, we are lucky enough to know the real answer: `w=2, b=0.3`, thus a better option is to check out model parameters directly.
-
-In PaddlePaddle, training is just to get a collection of model parameters, which are `w` and `b` in this case. Each parameter is saved in an individual file in the popular `numpy` array format. Here is the code that reads parameters from last pass.
-
-```python
-import numpy as np
-import os
-
-def load(file_name):
-    with open(file_name, 'rb') as f:
-        f.read(16) # skip header for float type.
-        return np.fromfile(f, dtype=np.float32)
-        
-print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
-# w=1.999743, b=0.300137
-```
-
-<center> ![](./parameters.png) </center>
-
-Although starts from a random guess, you can see that value of `w` changes quickly towards 2 and `b` changes quickly towards 0.3. In the end, the predicted line is almost identical with real answer.
-
-There, you have recovered the underlying pattern between `X` and `Y` only from observed data.
-
-
-## 5. Where to Go from Here
-
-- <a href="../build/index.html"> Build and Installation </a>
-- <a href="../demo/quick_start/index_en.html">Quick Start</a>
-- <a href="../demo/index.html">Example and Demo</a>
diff --git a/doc/introduction/parameters.png b/doc/introduction/parameters.png
deleted file mode 120000
index f47e74c94fffabbd32f055febbadb1b18aa0c429..0000000000000000000000000000000000000000
--- a/doc/introduction/parameters.png
+++ /dev/null
@@ -1 +0,0 @@
-../../doc_cn/introduction/parameters.png
\ No newline at end of file
diff --git a/doc/optimization/index.rst b/doc/optimization/index.rst
deleted file mode 100644
index c9e87e0778dfe44fa3d1bb84d0ad340aa6f25d08..0000000000000000000000000000000000000000
--- a/doc/optimization/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Performance Tuning
-==================
-
-.. toctree::
-  :maxdepth: 3
-
-  gpu_profiling.rst
diff --git a/doc/demo/embedding_model/index.md b/doc/tutorials/embedding_model/index.md
similarity index 100%
rename from doc/demo/embedding_model/index.md
rename to doc/tutorials/embedding_model/index.md
diff --git a/doc/demo/embedding_model/neural-n-gram-model.png b/doc/tutorials/embedding_model/neural-n-gram-model.png
similarity index 100%
rename from doc/demo/embedding_model/neural-n-gram-model.png
rename to doc/tutorials/embedding_model/neural-n-gram-model.png
diff --git a/doc/demo/image_classification/cifar.png b/doc/tutorials/image_classification/cifar.png
similarity index 100%
rename from doc/demo/image_classification/cifar.png
rename to doc/tutorials/image_classification/cifar.png
diff --git a/doc/demo/image_classification/image_classification.md b/doc/tutorials/image_classification/image_classification.md
similarity index 100%
rename from doc/demo/image_classification/image_classification.md
rename to doc/tutorials/image_classification/image_classification.md
diff --git a/doc/demo/image_classification/image_classification.png b/doc/tutorials/image_classification/image_classification.png
similarity index 100%
rename from doc/demo/image_classification/image_classification.png
rename to doc/tutorials/image_classification/image_classification.png
diff --git a/doc/demo/image_classification/index.rst b/doc/tutorials/image_classification/index.rst
similarity index 100%
rename from doc/demo/image_classification/index.rst
rename to doc/tutorials/image_classification/index.rst
diff --git a/doc/demo/image_classification/lenet.png b/doc/tutorials/image_classification/lenet.png
similarity index 100%
rename from doc/demo/image_classification/lenet.png
rename to doc/tutorials/image_classification/lenet.png
diff --git a/doc/demo/image_classification/plot.png b/doc/tutorials/image_classification/plot.png
similarity index 100%
rename from doc/demo/image_classification/plot.png
rename to doc/tutorials/image_classification/plot.png
diff --git a/doc/demo/imagenet_model/resnet_block.jpg b/doc/tutorials/imagenet_model/resnet_block.jpg
similarity index 100%
rename from doc/demo/imagenet_model/resnet_block.jpg
rename to doc/tutorials/imagenet_model/resnet_block.jpg
diff --git a/doc/demo/imagenet_model/resnet_model.md b/doc/tutorials/imagenet_model/resnet_model.md
similarity index 100%
rename from doc/demo/imagenet_model/resnet_model.md
rename to doc/tutorials/imagenet_model/resnet_model.md
diff --git a/doc/demo/index.md b/doc/tutorials/index.md
similarity index 96%
rename from doc/demo/index.md
rename to doc/tutorials/index.md
index 289199d496eb3b527fa8c8261820bc8e4d301786..ebf5397391e65e096b265f44a0ad81942f0b9ec2 100644
--- a/doc/demo/index.md
+++ b/doc/tutorials/index.md
@@ -1,4 +1,4 @@
-# Examples and demos
+# TUTORIALS
 There are serveral examples and demos here.
 
 ## Image
diff --git a/doc/demo/quick_start/NetContinuous_en.png b/doc/tutorials/quick_start/NetContinuous_en.png
similarity index 100%
rename from doc/demo/quick_start/NetContinuous_en.png
rename to doc/tutorials/quick_start/NetContinuous_en.png
diff --git a/doc/demo/quick_start/NetConv_en.png b/doc/tutorials/quick_start/NetConv_en.png
similarity index 100%
rename from doc/demo/quick_start/NetConv_en.png
rename to doc/tutorials/quick_start/NetConv_en.png
diff --git a/doc/demo/quick_start/NetLR_en.png b/doc/tutorials/quick_start/NetLR_en.png
similarity index 100%
rename from doc/demo/quick_start/NetLR_en.png
rename to doc/tutorials/quick_start/NetLR_en.png
diff --git a/doc/demo/quick_start/NetRNN_en.png b/doc/tutorials/quick_start/NetRNN_en.png
similarity index 100%
rename from doc/demo/quick_start/NetRNN_en.png
rename to doc/tutorials/quick_start/NetRNN_en.png
diff --git a/doc/demo/quick_start/PipelineNetwork_en.jpg b/doc/tutorials/quick_start/PipelineNetwork_en.jpg
similarity index 100%
rename from doc/demo/quick_start/PipelineNetwork_en.jpg
rename to doc/tutorials/quick_start/PipelineNetwork_en.jpg
diff --git a/doc/demo/quick_start/PipelineTest_en.png b/doc/tutorials/quick_start/PipelineTest_en.png
similarity index 100%
rename from doc/demo/quick_start/PipelineTest_en.png
rename to doc/tutorials/quick_start/PipelineTest_en.png
diff --git a/doc/demo/quick_start/PipelineTrain_en.png b/doc/tutorials/quick_start/PipelineTrain_en.png
similarity index 100%
rename from doc/demo/quick_start/PipelineTrain_en.png
rename to doc/tutorials/quick_start/PipelineTrain_en.png
diff --git a/doc/demo/quick_start/Pipeline_en.jpg b/doc/tutorials/quick_start/Pipeline_en.jpg
similarity index 100%
rename from doc/demo/quick_start/Pipeline_en.jpg
rename to doc/tutorials/quick_start/Pipeline_en.jpg
diff --git a/doc/demo/quick_start/index_en.md b/doc/tutorials/quick_start/index_en.md
similarity index 98%
rename from doc/demo/quick_start/index_en.md
rename to doc/tutorials/quick_start/index_en.md
index 659485d9be1b6a3e9759a2fd040cb09d1f2a3005..ec548b5393d7b210d6409328c00917aeb679a451 100644
--- a/doc/demo/quick_start/index_en.md
+++ b/doc/tutorials/quick_start/index_en.md
@@ -59,12 +59,11 @@ To build your text classification system, your code will need to perform five st
 ## Preprocess data into standardized format
 In this example, you are going to use [Amazon electronic product review dataset](http://jmcauley.ucsd.edu/data/amazon/) to build a bunch of deep neural network models for text classification. Each text in this dataset is a product review. This dataset has two categories: “positive” and “negative”. Positive means the reviewer likes the product, while negative means the reviewer does not like the product.
 
-`demo/quick_start` in the [source code](https://github.com/baidu/Paddle) provides scripts for downloading data and preprocessing data as shown below. The data process takes several minutes (about 3 minutes in our machine).
+`demo/quick_start` in the [source code](https://github.com/PaddlePaddle/Paddle) provides a script for downloading the preprocessed data, as shown below. (If you want to process the raw data yourself, you can use the script `demo/quick_start/data/proc_from_raw_data/get_data.sh`.)
 
 ```bash
 cd demo/quick_start
 ./data/get_data.sh
-./preprocess.sh
 ```
 
 ## Transfer Data to Model
diff --git a/doc/demo/rec/ml_dataset.md b/doc/tutorials/rec/ml_dataset.md
similarity index 100%
rename from doc/demo/rec/ml_dataset.md
rename to doc/tutorials/rec/ml_dataset.md
diff --git a/doc/demo/rec/ml_regression.rst b/doc/tutorials/rec/ml_regression.rst
similarity index 100%
rename from doc/demo/rec/ml_regression.rst
rename to doc/tutorials/rec/ml_regression.rst
diff --git a/doc/demo/rec/rec_regression_network.png b/doc/tutorials/rec/rec_regression_network.png
similarity index 100%
rename from doc/demo/rec/rec_regression_network.png
rename to doc/tutorials/rec/rec_regression_network.png
diff --git a/doc/demo/semantic_role_labeling/curve.jpg b/doc/tutorials/semantic_role_labeling/curve.jpg
similarity index 100%
rename from doc/demo/semantic_role_labeling/curve.jpg
rename to doc/tutorials/semantic_role_labeling/curve.jpg
diff --git a/doc/demo/semantic_role_labeling/feature.jpg b/doc/tutorials/semantic_role_labeling/feature.jpg
similarity index 100%
rename from doc/demo/semantic_role_labeling/feature.jpg
rename to doc/tutorials/semantic_role_labeling/feature.jpg
diff --git a/doc/demo/semantic_role_labeling/index.rst b/doc/tutorials/semantic_role_labeling/index.rst
similarity index 100%
rename from doc/demo/semantic_role_labeling/index.rst
rename to doc/tutorials/semantic_role_labeling/index.rst
diff --git a/doc/demo/semantic_role_labeling/network_arch.png b/doc/tutorials/semantic_role_labeling/network_arch.png
similarity index 100%
rename from doc/demo/semantic_role_labeling/network_arch.png
rename to doc/tutorials/semantic_role_labeling/network_arch.png
diff --git a/doc/demo/semantic_role_labeling/semantic_role_labeling.md b/doc/tutorials/semantic_role_labeling/semantic_role_labeling.md
similarity index 97%
rename from doc/demo/semantic_role_labeling/semantic_role_labeling.md
rename to doc/tutorials/semantic_role_labeling/semantic_role_labeling.md
index e2793b2b3494160a7a80f07ec2127bd1f1a4f2e4..f5bdf64487aa189cefcd55d633cc6638912b9e31 100644
--- a/doc/demo/semantic_role_labeling/semantic_role_labeling.md
+++ b/doc/tutorials/semantic_role_labeling/semantic_role_labeling.md
@@ -1,200 +1,200 @@
-# Semantic Role labeling Tutorial #
-
-Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction. automatic document categorization and question answering.  An instance is as following [1]:
-
- [ <sub>A0</sub> He ] [ <sub>AM-MOD</sub> would ][ <sub>AM-NEG</sub> n’t ] [ <sub>V</sub> accept] [ <sub>A1</sub> anything of value ] from [<sub>A2</sub> those he was writing about ]. 
-
-- V: verb
-- A0: acceptor
-- A1: thing accepted
-- A2: accepted-from
-- A3: Attribute
-- AM-MOD: modal 
-- AM-NEG: negation
-
-Given the verb "accept", the chunks in sentence would play certain semantic roles. Here, the label scheme is from Penn Proposition Bank. 
-
-To this date, most of the successful SRL systems are built on top of some form of parsing results where pre-defined feature templates over the syntactic structure are used. This tutorial will present an end-to-end system using deep bidirectional long short-term memory (DB-LSTM)[2] for solving the SRL task, which largely outperforms the previous state-of-the-art systems. The system regards SRL task as the sequence labelling problem. 
-
-## Data Description
-The relevant paper[2] takes the data set in CoNLL-2005&2012 Shared Task for training and testing. Accordingto data license,  the demo adopts the test data set of CoNLL-2005, which can be reached on website.
-
-To download and process the original data, user just need to execute the following command:
-
-```bash
-cd data
-./get_data.sh
-```
-Several new files appear in the `data `directory as follows.
-```bash
-conll05st-release：the test data set of CoNll-2005 shared task 
-test.wsj.words：the Wall Street Journal data sentences
-test.wsj.props:  the propositional arguments
-feature: the extracted features from data set
-```
-
-## Training
-### DB-LSTM
-Please refer to the Sentiment Analysis demo to learn more about the long short-term memory unit. 
-
-Unlike Bidirectional-LSTM that used in Sentiment Analysis demo,  the DB-LSTM adopts another way to stack LSTM layer. First a standard LSTM processes the sequence in forward direction. The input and output of this LSTM layer are taken by the next LSTM layer as input, processed in reversed direction. These two standard LSTM layers compose a pair of LSTM. Then we stack LSTM layers pair after pair to obtain the deep LSTM model. 
-
-The following figure shows a temporal expanded 2-layer DB-LSTM network.
-<center>
-![pic](./network_arch.png)
-</center>
-
-### Features
-Two input features play an essential role in this pipeline: predicate (pred) and argument (argu). Two other features: predicate context (ctx-p) and region mark (mr) are also adopted. Because a single predicate word can not exactly describe the predicate information, especially when the same words appear more than one times in a sentence. With the predicate context, the ambiguity can be largely eliminated. Similarly, we use region mark m<sub>r</sub> = 1 to denote the argument position if it locates in the predicate context region, or m<sub>r</sub> = 0 if does not. These four simple features are all we need for our SRL system. Features of one sample with context size set to 1 is showed as following[2]:
-<center>
-![pic](./feature.jpg)
-</center>
-
-In this sample, the coresponding labelled sentence is:
-
-[ <sub>A1</sub> A record date ] has [ <sub>AM-NEG</sub> n't ] been [ <sub>V</sub> set ] . 
-
-In the demo, we adopt the feature template as above, consists of :  `argument`, `predicate`, `ctx-p (p=-1,0,1)`, `mark` and use `B/I/O` scheme to label each argument. These features and labels are stored in `feature` file, and separated by `\t`.
-
-### Data Provider
-
-`dataprovider.py` is the python file to wrap data. `hook()` function is to define the data slots for network. The  Six features and label are all IndexSlots.
-```
-def hook(settings, word_dict, label_dict, **kwargs):
-    settings.word_dict = word_dict
-    settings.label_dict = label_dict
-    #all inputs are integral and sequential type
-    settings.slots = [
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(predicate_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(2),
-        integer_value_sequence(len(label_dict))]
-```
-The corresponding data iterator is as following:
-```
-@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size,
-          can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_name):
-    with open(file_name, 'r') as fdata:
-        for line in fdata:
-            sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2,  mark, label = \
-                line.strip().split('\t')
-
-            words = sentence.split()
-            sen_len = len(words)
-            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
-
-            predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
-            ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
-            ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
-            ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
-            ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
-            ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
-
-            marks = mark.split()
-            mark_slot = [int(w) for w in marks]
-
-            label_list = label.split()
-            label_slot = [settings.label_dict.get(w) for w in label_list]
-            yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
-                  ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
-```
-The `process`function yield 9 lists which are 8 features and label.
- 
-### Neural Network Config
-`db_lstm.py` is the neural network config file to load the dictionaries and define the  data provider module and network architecture during the training procedure. 
-
-Nine `data_layer` load instances from data provider. Eight features are transformed into embedddings respectively, and mixed by `mixed_layer` .  Deep bidirectional LSTM layers extract features for the softmax layer. The objective function is cross entropy of labels.
-
-### Run Training 
-The script for training is `train.sh`, user just need to execute:
-```bash
-  ./train.sh
-```
-The content in `train.sh`:
-```
-paddle train \
-  --config=./db_lstm.py \
-  --use_gpu=0 \
-  --log_period=5000 \
-  --trainer_count=1 \
-  --show_parameter_stats_period=5000 \
-  --save_dir=./output \
-  --num_passes=10000 \
-  --average_test_period=10000000 \
-  --init_model_path=./data \
-  --load_missing_parameter_strategy=rand \
-  --test_all_data_in_one_period=1 \
-2>&1 | tee 'train.log'
-```
-
--  \--config=./db_lstm.py : network config file.
--  \--use_gpu=false: use CPU to train, set true, if you install GPU version of PaddlePaddle and want to use GPU to train, until now crf_layer do not support GPU
--  \--log_period=500: print log every 20 batches.
--  \--trainer_count=1: set thread number (or GPU count).
--  \--show_parameter_stats_period=5000: show parameter statistic every 100 batches.
--  \--save_dir=./output: output path to save models.
--  \--num_passes=10000: set pass number, one pass in PaddlePaddle means training all samples in dataset one time.
--  \--average_test_period=10000000:  do test on average parameter every average_test_period batches
--  \--init_model_path=./data: parameter initialization path 
--  \--load_missing_parameter_strategy=rand: random initialization unexisted parameters
--  \--test_all_data_in_one_period=1: test all data in one period
-
-
-After training, the models  will be saved in directory `output`. Our training curve is as following:
-<center>
-![pic](./curve.jpg)
-</center>
-
-### Run testing
-The script for testing is `test.sh`, user just need to execute:
-```bash
-  ./test.sh
-```
-The main part in `tesh.sh`
-```
-paddle train \
-  --config=./db_lstm.py \
-  --model_list=$model_list \
-  --job=test \
-  --config_args=is_test=1 \
-```
-
-  - \--config=./db_lstm.py: network config file
-  - \--model_list=$model_list.list: model list file
-  - \--job=test: indicate the test job
-  - \--config_args=is_test=1: flag to indicate test
-  - \--test_all_data_in_one_period=1: test all data in 1 period
-  
-
-### Run prediction
-The script for prediction is `predict.sh`, user just need to execute:
-```bash
-  ./predict.sh
-  
-```
-In `predict.sh`, user should offer the network config file, model path, label file, word dictionary file, feature file
-```
-python predict.py 
-     -c $config_file \
-     -w $best_model_path \
-     -l $label_file \
-     -p $predicate_dict_file  \
-     -d $dict_file \
-     -i $input_file \
-     -o $output_file
-```
-
-`predict.py` is the main executable python script, which includes functions: load model, load data, data prediction. The network model will output the probability distribution of labels. In the demo, we take the label with maximum probability as result. User can also implement the beam search or viterbi decoding upon the probability distribution matrix.
-
-After prediction,  the result is saved in `predict.res`.
-
-## Reference
-[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles , Computational Linguistics, 31(1), 2005. 
-
-[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
+# Semantic Role labeling Tutorial #
+
+Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction, automatic document categorization, and question answering. An example is as follows [1]:
+
+ [ <sub>A0</sub> He ] [ <sub>AM-MOD</sub> would ][ <sub>AM-NEG</sub> n’t ] [ <sub>V</sub> accept] [ <sub>A1</sub> anything of value ] from [<sub>A2</sub> those he was writing about ]. 
+
+- V: verb
+- A0: acceptor
+- A1: thing accepted
+- A2: accepted-from
+- A3: Attribute
+- AM-MOD: modal 
+- AM-NEG: negation
+
+Given the verb "accept", the chunks in sentence would play certain semantic roles. Here, the label scheme is from Penn Proposition Bank. 
+
+To this date, most of the successful SRL systems are built on top of some form of parsing results where pre-defined feature templates over the syntactic structure are used. This tutorial will present an end-to-end system using deep bidirectional long short-term memory (DB-LSTM)[2] for solving the SRL task, which largely outperforms the previous state-of-the-art systems. The system regards SRL task as the sequence labelling problem. 
+
+## Data Description
+The relevant paper [2] uses the data sets of the CoNLL-2005 and CoNLL-2012 Shared Tasks for training and testing. According to the data license, the demo adopts the test data set of CoNLL-2005, which can be obtained from its website.
+
+To download and process the original data, user just need to execute the following command:
+
+```bash
+cd data
+./get_data.sh
+```
+Several new files appear in the `data` directory as follows.
+```bash
+conll05st-release：the test data set of CoNll-2005 shared task 
+test.wsj.words：the Wall Street Journal data sentences
+test.wsj.props:  the propositional arguments
+feature: the extracted features from data set
+```
+
+## Training
+### DB-LSTM
+Please refer to the Sentiment Analysis demo to learn more about the long short-term memory unit. 
+
+Unlike the Bidirectional-LSTM used in the Sentiment Analysis demo, the DB-LSTM adopts another way to stack LSTM layers. First, a standard LSTM processes the sequence in the forward direction. The input and output of this LSTM layer are taken by the next LSTM layer as input and processed in the reverse direction. These two standard LSTM layers compose a pair of LSTMs. Then we stack LSTM layers pair after pair to obtain the deep LSTM model.
+
+The following figure shows a temporal expanded 2-layer DB-LSTM network.
+<center>
+![pic](./network_arch.png)
+</center>
+
+### Features
+Two input features play an essential role in this pipeline: predicate (pred) and argument (argu). Two other features, predicate context (ctx-p) and region mark (mr), are also adopted, because a single predicate word cannot exactly describe the predicate information, especially when the same word appears more than once in a sentence. With the predicate context, the ambiguity can be largely eliminated. Similarly, we use region mark m<sub>r</sub> = 1 to denote that the argument position is located in the predicate context region, or m<sub>r</sub> = 0 if it is not. These four simple features are all we need for our SRL system. The features of one sample with context size set to 1 are shown below [2]:
+<center>
+![pic](./feature.jpg)
+</center>
+
+In this sample, the corresponding labelled sentence is:
+
+[ <sub>A1</sub> A record date ] has [ <sub>AM-NEG</sub> n't ] been [ <sub>V</sub> set ] . 
+
+In the demo, we adopt the feature template above, which consists of `argument`, `predicate`, `ctx-p (p=-1,0,1)` and `mark`, and use the `B/I/O` scheme to label each argument. These features and labels are stored in the `feature` file, separated by `\t`.
+
+### Data Provider
+
+`dataprovider.py` is the Python file that wraps the data. The `hook()` function defines the data slots for the network. The eight features and the label are all IndexSlots.
+```
+def hook(settings, word_dict, label_dict, **kwargs):
+    settings.word_dict = word_dict
+    settings.label_dict = label_dict
+    #all inputs are integral and sequential type
+    settings.slots = [
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(len(predicate_dict)),
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(2),
+        integer_value_sequence(len(label_dict))]
+```
+The corresponding data iterator is as following:
+```
+@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size,
+          can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, file_name):
+    with open(file_name, 'r') as fdata:
+        for line in fdata:
+            sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2,  mark, label = \
+                line.strip().split('\t')
+
+            words = sentence.split()
+            sen_len = len(words)
+            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
+
+            predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
+            ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
+            ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
+            ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
+            ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+            ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
+
+            marks = mark.split()
+            mark_slot = [int(w) for w in marks]
+
+            label_list = label.split()
+            label_slot = [settings.label_dict.get(w) for w in label_list]
+            yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
+                  ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
+```
+The `process` function yields 9 lists: 8 features and the label.
+ 
+### Neural Network Config
+`db_lstm.py` is the neural network config file to load the dictionaries and define the  data provider module and network architecture during the training procedure. 
+
+Nine `data_layer`s load instances from the data provider. The eight features are transformed into embeddings respectively and mixed by `mixed_layer`. Deep bidirectional LSTM layers extract features for the softmax layer. The objective function is the cross entropy of the labels.
+
+### Run Training 
+The script for training is `train.sh`, user just need to execute:
+```bash
+  ./train.sh
+```
+The content in `train.sh`:
+```
+paddle train \
+  --config=./db_lstm.py \
+  --use_gpu=0 \
+  --log_period=5000 \
+  --trainer_count=1 \
+  --show_parameter_stats_period=5000 \
+  --save_dir=./output \
+  --num_passes=10000 \
+  --average_test_period=10000000 \
+  --init_model_path=./data \
+  --load_missing_parameter_strategy=rand \
+  --test_all_data_in_one_period=1 \
+2>&1 | tee 'train.log'
+```
+
+-  \--config=./db_lstm.py : network config file.
+-  \--use_gpu=false: use CPU to train; set it to true if you have installed the GPU version of PaddlePaddle and want to train on GPU. Note that crf_layer does not support GPU yet.
+-  \--log_period=5000: print log every 5000 batches.
+-  \--trainer_count=1: set thread number (or GPU count).
+-  \--show_parameter_stats_period=5000: show parameter statistics every 5000 batches.
+-  \--save_dir=./output: output path to save models.
+-  \--num_passes=10000: set pass number, one pass in PaddlePaddle means training all samples in dataset one time.
+-  \--average_test_period=10000000:  do test on average parameter every average_test_period batches
+-  \--init_model_path=./data: parameter initialization path 
+-  \--load_missing_parameter_strategy=rand: randomly initialize parameters that do not exist in the initial model
+-  \--test_all_data_in_one_period=1: test all data in one period
+
+
+After training, the models  will be saved in directory `output`. Our training curve is as following:
+<center>
+![pic](./curve.jpg)
+</center>
+
+### Run testing
+The script for testing is `test.sh`, user just need to execute:
+```bash
+  ./test.sh
+```
+The main part in `test.sh`:
+```
+paddle train \
+  --config=./db_lstm.py \
+  --model_list=$model_list \
+  --job=test \
+  --config_args=is_test=1 \
+```
+
+  - \--config=./db_lstm.py: network config file
+  - \--model_list=$model_list.list: model list file
+  - \--job=test: indicate the test job
+  - \--config_args=is_test=1: flag to indicate test
+  - \--test_all_data_in_one_period=1: test all data in 1 period
+  
+
+### Run prediction
+The script for prediction is `predict.sh`, user just need to execute:
+```bash
+  ./predict.sh
+  
+```
+In `predict.sh`, user should offer the network config file, model path, label file, word dictionary file, feature file
+```
+python predict.py \
+     -c $config_file \
+     -w $best_model_path \
+     -l $label_file \
+     -p $predicate_dict_file  \
+     -d $dict_file \
+     -i $input_file \
+     -o $output_file
+```
+
+`predict.py` is the main executable python script, which includes functions: load model, load data, data prediction. The network model will output the probability distribution of labels. In the demo, we take the label with maximum probability as result. User can also implement the beam search or viterbi decoding upon the probability distribution matrix.
+
+After prediction,  the result is saved in `predict.res`.
+
+## Reference
+[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles, Computational Linguistics, 31(1), 2005.
+
+[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
diff --git a/doc/demo/sentiment_analysis/bi_lstm.jpg b/doc/tutorials/sentiment_analysis/bi_lstm.jpg
similarity index 100%
rename from doc/demo/sentiment_analysis/bi_lstm.jpg
rename to doc/tutorials/sentiment_analysis/bi_lstm.jpg
diff --git a/doc/demo/sentiment_analysis/index.rst b/doc/tutorials/sentiment_analysis/index.rst
similarity index 100%
rename from doc/demo/sentiment_analysis/index.rst
rename to doc/tutorials/sentiment_analysis/index.rst
diff --git a/doc/demo/sentiment_analysis/lstm.png b/doc/tutorials/sentiment_analysis/lstm.png
similarity index 100%
rename from doc/demo/sentiment_analysis/lstm.png
rename to doc/tutorials/sentiment_analysis/lstm.png
diff --git a/doc/demo/sentiment_analysis/sentiment_analysis.md b/doc/tutorials/sentiment_analysis/sentiment_analysis.md
similarity index 100%
rename from doc/demo/sentiment_analysis/sentiment_analysis.md
rename to doc/tutorials/sentiment_analysis/sentiment_analysis.md
diff --git a/doc/demo/sentiment_analysis/stacked_lstm.jpg b/doc/tutorials/sentiment_analysis/stacked_lstm.jpg
similarity index 100%
rename from doc/demo/sentiment_analysis/stacked_lstm.jpg
rename to doc/tutorials/sentiment_analysis/stacked_lstm.jpg
diff --git a/doc/demo/text_generation/encoder-decoder-attention-model.png b/doc/tutorials/text_generation/encoder-decoder-attention-model.png
similarity index 100%
rename from doc/demo/text_generation/encoder-decoder-attention-model.png
rename to doc/tutorials/text_generation/encoder-decoder-attention-model.png
diff --git a/doc/demo/text_generation/index.rst b/doc/tutorials/text_generation/index.rst
similarity index 100%
rename from doc/demo/text_generation/index.rst
rename to doc/tutorials/text_generation/index.rst
diff --git a/doc/demo/text_generation/text_generation.md b/doc/tutorials/text_generation/text_generation.md
similarity index 100%
rename from doc/demo/text_generation/text_generation.md
rename to doc/tutorials/text_generation/text_generation.md
diff --git a/doc/ui/api/trainer_config_helpers/attrs.rst b/doc/ui/api/trainer_config_helpers/attrs.rst
deleted file mode 100644
index 44919aba90df0b9da7c311a62339052c16c44ad1..0000000000000000000000000000000000000000
--- a/doc/ui/api/trainer_config_helpers/attrs.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-Parameter and Extra Layer Attribute
-===================================
-
-..  automodule:: paddle.trainer_config_helpers.attrs
-    :members:
diff --git a/doc/ui/api/trainer_config_helpers/index.rst b/doc/ui/api/trainer_config_helpers/index.rst
deleted file mode 100644
index 8395eb75710b3e67ec0c5442f79c999bdacdff42..0000000000000000000000000000000000000000
--- a/doc/ui/api/trainer_config_helpers/index.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-Model Config Interface
-======================
-
-.. toctree::
-  :maxdepth: 1
-
-  optimizers.rst
-  data_sources.rst
-  layers.rst
-  activations.rst 
-  poolings.rst
-  networks.rst
-  evaluators.rst
-  attrs.rst
diff --git a/doc/ui/index.md b/doc/ui/index.md
deleted file mode 100644
index 9c1ba27bdc14fa9ab762ffb97424a8a5946808f9..0000000000000000000000000000000000000000
--- a/doc/ui/index.md
+++ /dev/null
@@ -1,20 +0,0 @@
-# User Interface
-
-## Data Provider
-
-* [Introduction](data_provider/index.rst)
-* [PyDataProvider2](data_provider/pydataprovider2.rst)
-
-## API Reference
-
-* [Model Config Interface](api/trainer_config_helpers/index.md)
-
-## Command Line Argument
-
-* [Use Case](cmd_argument/use_case.md)
-* [Argument Outline](cmd_argument/argument_outline.md)
-* [Detailed Descriptions](cmd_argument/detail_introduction.md)
-
-## Predict
-
-* [Python Prediction API](predict/swig_py_paddle_en.rst)
diff --git a/doc/user_guide.rst b/doc/user_guide.rst
deleted file mode 100644
index d4deb3ca5a4523b509ea5082f32be8a315570dea..0000000000000000000000000000000000000000
--- a/doc/user_guide.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-User Guide
-==========
-
-..  toctree::
-  :maxdepth: 1
-
-  demo/quick_start/index_en.md
-  build/index.rst
-  build/contribute_to_paddle.md
-  ui/index.md
-  ui/api/trainer_config_helpers/index.rst
-  demo/index.md
-  cluster/index.md
diff --git a/doc_cn/build_and_install/index.rst b/doc_cn/build_and_install/index.rst
index 2205e282248c4e7f6d1173be47aadf160554c6be..48163fb36e561fe5fd8f6907379687a8b5c97f68 100644
--- a/doc_cn/build_and_install/index.rst
+++ b/doc_cn/build_and_install/index.rst
@@ -8,9 +8,7 @@ PaddlePaddle提供数个预编译的二进制来进行安装，包括Docker镜
 
 .. toctree::
    :maxdepth: 1
-   :glob:
    
-   使用Jumbo安装(对内) <../build/internal/install_from_jumbo.rst>
    install/docker_install.rst 
    install/ubuntu_install.rst
 
@@ -25,8 +23,5 @@ PaddlePaddle提供数个预编译的二进制来进行安装，包括Docker镜
 
 .. toctree::
    :maxdepth: 1
-   :glob:
 
-   源码下载(对内) <../build/internal/download_paddle_source_zh_cn.rst>
-   从源码编译安装(对内)  <../build/internal/build_from_source_zh_cn.rst>
    cmake/index.rst
diff --git a/doc_cn/build_and_install/install/docker_install.rst b/doc_cn/build_and_install/install/docker_install.rst
index a5f5fb117e11e8ac1ae49e4271e826fa12d5e810..40339659be406ec72da8ad89b6d5dd38d72bb5ae 100644
--- a/doc_cn/build_and_install/install/docker_install.rst
+++ b/doc_cn/build_and_install/install/docker_install.rst
@@ -1,9 +1,7 @@
 安装PaddlePaddle的Docker镜像
 ============================
 
-PaddlePaddle提供了Docker的使用镜像。PaddlePaddle推荐使用Docker进行PaddlePaddle的部署和
-运行。Docker是一个基于容器的轻量级虚拟环境。具有和宿主机相近的运行效率，并提供
-了非常方便的二进制分发手段。
+PaddlePaddle项目提供官方 `Docker <https://www.docker.com/>`_ 镜像。Docker镜像是我们目前唯一官方支持的部署和运行方式。
 
 下述内容将分为如下几个类别描述。
 
@@ -41,7 +39,7 @@ PaddlePaddle提供的Docker镜像版本
 * CPU WITHOUT AVX: CPU版本，不支持AVX指令集的CPU也可以运行
 * GPU WITHOUT AVX: GPU版本，不需要AVX指令集的CPU也可以运行。
 
-用户可以选择对应版本的docker image。使用如下脚本可以确定本机的CPU知否支持 :code:`AVX` 指令集\:
+用户可以选择对应版本的docker image。使用如下脚本可以确定本机的CPU是否支持 :code:`AVX` 指令集\:
 
 ..  code-block:: bash
 
@@ -67,7 +65,7 @@ mac osx或者是windows机器，请参考
 
 ..  code-block:: bash
     
-    $ docker run -it paddledev/paddlepaddle:cpu-latest
+    $ docker run -it paddledev/paddle:cpu-latest
 
 即可启动和进入PaddlePaddle的container。如果运行GPU版本的PaddlePaddle，则需要先将
 cuda相关的Driver和设备映射进container中，脚本类似于
@@ -76,7 +74,7 @@ cuda相关的Driver和设备映射进container中，脚本类似于
 
     $ export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
     $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    $ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddlepaddle:latest-gpu
+    $ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest
 
 进入Docker container后，运行 :code:`paddle version` 即可打印出PaddlePaddle的版本和构建
 信息。安装完成的PaddlePaddle主体包括三个部分， :code:`paddle` 脚本， python的
diff --git a/doc_cn/build_and_install/install/ubuntu_install.rst b/doc_cn/build_and_install/install/ubuntu_install.rst
index 0fb59e25f6932214a3f1c67b12b426e388c3fc5d..4500d6e0b03be9280e3e6c25cddbf7fb389671b8 100644
--- a/doc_cn/build_and_install/install/ubuntu_install.rst
+++ b/doc_cn/build_and_install/install/ubuntu_install.rst
@@ -1,35 +1,42 @@
-使用deb包在Ubuntu上安装PaddlePaddle
+Ubuntu部署PaddlePaddle
 ===================================
 
-PaddlePaddle目前支持使用deb包安装。Paddle的 :code:`deb` 安装包在ubuntu 14.04中正确，但理论上支持其他的 debian 发行版。
+PaddlePaddle提供了ubuntu 14.04 deb安装包。
 
+安装
+------
 
-PaddlePaddle的ubuntu安装包分为四个版本，他们是 cpu、gpu、cpu-noavx、gpu-noavx 四个版本。其中 noavx 用于不支持AVX指令集的cpu。安装包的下载地址是\: https://github.com/baidu/Paddle/releases/
+安装包的下载地址是\: https://github.com/PaddlePaddle/Paddle/releases
 
+它包含四个版本\:
 
-用户需要先将PaddlePaddle安装包下载到本地，然后执行如下 :code:`gdebi` 命令即可完成安装。
+* cpu版本：支持主流x86处理器平台，使用了avx指令集。
 
-..  code-block:: shell
+* cpu-noavx版本：支持主流x86处理器平台，没有使用avx指令集。
+
+* gpu版本：支持主流x86处理器平台，支持nvidia cuda平台，使用了avx指令集。
 
-    gdebi paddle-*-cpu*.deb
+* gpu-noavx版本：支持主流x86处理器平台，支持nvidia cuda平台，没有使用avx指令集。
 
-如果 :code:`gdebi` 没有安装,则需要使用 :code:`sudo apt-get install gdebi`, 来安装 :code:`gdebi` 。
+下载完相关安装包后，执行:
 
+..  code-block:: shell
 
-或者使用下面一条命令安装.
+    sudo apt-get install gdebi
+    gdebi paddle-*-cpu.deb
+
+或者:
 
 ..  code-block:: shell
 
-    dpkg -i paddle-*-cpu*.deb
+    dpkg -i paddle-*-cpu.deb
     apt-get install -f
 
+
 在 :code:`dpkg -i` 的时候如果报一些依赖未找到的错误是正常的，
 在 :code:`apt-get install -f` 里会继续安装 PaddlePaddle。
 
-需要注意的是，如果使用GPU版本的PaddlePaddle，请安装CUDA 7.5 和CUDNN 5到本地环境中，
-并设置好对应的环境变量(LD_LIBRARY_PATH等等)。
-
-安装完成后,可以使用命令 :code:`paddle version` 查看安装后的paddle 版本。可能的输出为
+安装完成后，可以使用命令 :code:`paddle version` 查看安装后的paddle 版本:
 
 ..  literalinclude:: paddle_version.txt
 
@@ -39,45 +46,16 @@ PaddlePaddle的ubuntu安装包分为四个版本，他们是 cpu、gpu、cpu-noa
 libcudart.so/libcudnn.so找不到
 ++++++++++++++++++++++++++++++
 
-安装完成PaddlePaddle后，运行 :code:`paddle train` 报错\:
-
-..	code-block:: shell
-
-	0831 12:36:04.151525  1085 hl_dso_loader.cc:70] Check failed: nullptr != *dso_handle For Gpu version of PaddlePaddle, it couldn't find CUDA library: libcudart.so Please make sure you already specify its path.Note: for training data on Cpu using Gpu version of PaddlePaddle,you must specify libcudart.so via LD_LIBRARY_PATH.
-
-PaddlePaddle使用运行时动态连接CUDA的so，如果在 LD_LIBRARY_PATH里面找不到这些动态
-库的话，会报寻找不到这些动态库。
-
-解决方法很简单，就是将这些动态库加到环境变量里面。比较可能的命令如下。
+安装完成后，运行 :code:`paddle train` 报错\:
 
-..	code-block:: text
+.. 	code-block:: shell
 
-	export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+	  0831 12:36:04.151525  1085 hl_dso_loader.cc:70] Check failed: nullptr != *dso_handle For Gpu version of PaddlePaddle, it couldn't find CUDA library: libcudart.so Please make sure you already specify its path.Note: for training data on Cpu using Gpu version of PaddlePaddle,you must specify libcudart.so via LD_LIBRARY_PATH.
 
-CUDA Driver找不到
-+++++++++++++++++
+原因是未设置cuda运行时环境变量。 如果使用GPU版本的PaddlePaddle，请安装CUDA 7.5 和CUDNN 5到本地环境中，并设置：
 
-运行 :code:`paddle train` 报错\:
-
-..	code-block:: text
-
-	F0831 12:39:16.699000  1090 hl_cuda_device.cc:530] Check failed: cudaSuccess == cudaStat (0 vs. 35) Cuda Error: CUDA driver version is insufficient for CUDA runtime version
-
-PaddlePaddle运行时如果没有寻找到cuda的driver，变会报这个错误。解决办法是将cuda 
-driver添加到LD_LIBRARY_PATH中。比较可能的命令如下。
-
-..	code-block:: text
-
-	export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
-
-config文件找不到
-++++++++++++++++
-
-运行 :code:`paddle train` 得到结果\:
-
-..	code-block:: text
+..  code-block:: shell
 
-	F0831 20:53:07.525789  1302 TrainerMain.cpp:94] Check failed: config != nullptr no valid config
+    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib:$LD_LIBRARY_PATH
+    export PATH=/usr/local/cuda/bin:$PATH
 
-PaddlePaddle在运行时找不到对应的config文件，说明命令行参数 :code:`config` 没有设置。
-而这个一般说明PaddlePaddle已经安装完毕了。
\ No newline at end of file
diff --git a/doc_cn/demo/quick_start/index.md b/doc_cn/demo/quick_start/index.md
index 4d9b24ba851a7aaaeb0d79bfbeb0703b8878b77f..4a6e07ee1ffd94cf8f781af307b53a96a78e6b93 100644
--- a/doc_cn/demo/quick_start/index.md
+++ b/doc_cn/demo/quick_start/index.md
@@ -32,13 +32,11 @@
 
 ## 数据格式准备(Data Preparation)
 在本问题中，我们使用[Amazon电子产品评论数据](http://jmcauley.ucsd.edu/data/amazon/)，
-将评论分为好评(正样本)和差评(负样本)两类。[源码](https://github.com/baidu/Paddle)的`demo/quick_start`里提供了数据下载脚本
-和预处理脚本。
+将评论分为好评(正样本)和差评(负样本)两类。[源码](https://github.com/PaddlePaddle/Paddle)的`demo/quick_start`里提供了下载已经预处理数据的脚本（如果想从最原始的数据处理，可以使用脚本 `./demo/quick_start/data/proc_from_raw_data/get_data.sh`）。
 
 ```bash
 cd demo/quick_start
 ./data/get_data.sh
-./preprocess.sh
 ```
 
 ## 数据向模型传送(Transfer Data to Model)
@@ -143,7 +141,7 @@ PyDataProvider2</a>。
 
 我们将以基本的逻辑回归网络作为起点，并逐渐展示更加深入的功能。更详细的网络配置
 连接请参考<a href = "../../../doc/layer.html">Layer文档</a>。
-所有配置在[源码](https://github.com/baidu/Paddle)`demo/quick_start`目录，首先列举逻辑回归网络。
+所有配置在[源码](https://github.com/PaddlePaddle/Paddle)`demo/quick_start`目录，首先列举逻辑回归网络。
 
 ### 逻辑回归模型(Logistic Regression)
 
diff --git a/doc_cn/faq/index.rst b/doc_cn/faq/index.rst
index 3eb0e10ae2228740cd384270db5070e367f7007b..551430eb41765673700b7c6568e4b483641f2cac 100644
--- a/doc_cn/faq/index.rst
+++ b/doc_cn/faq/index.rst
@@ -4,22 +4,18 @@ PaddlePaddle常见问题
 
 ..  contents::
 
-1. 如何减少PaddlePaddle的内存占用
+1. 如何减少内存占用
 ---------------------------------
 
-神经网络的训练本身是一个非常消耗内存和显存的工作。经常会消耗数十G的内存和数G的显存。
+神经网络的训练本身是一个非常消耗内存和显存的工作，经常会消耗数十GB的内存和数GB的显存。
 PaddlePaddle的内存占用主要分为如下几个方面\:
 
-* DataProvider缓冲池内存 (只针对内存)
-* 神经元激活内存 （针对内存和显存）
-* 参数内存 (针对内存和显存)
+* DataProvider缓冲池内存（只针对内存）
+* 神经元激活内存（针对内存和显存）
+* 参数内存（针对内存和显存）
 * 其他内存杂项
 
-这其中，其他内存杂项是指PaddlePaddle本身所用的一些内存，包括字符串分配，临时变量等等，
-这些内存就不考虑如何缩减了。
-
-其他的内存的减少方法依次为
-
+其中，其他内存杂项是指PaddlePaddle本身所用的一些内存，包括字符串分配，临时变量等等，暂不考虑在内。
 
 减少DataProvider缓冲池内存
 ++++++++++++++++++++++++++
@@ -39,28 +35,28 @@ PyDataProvider使用的是异步加载，同时在内存里直接随即选取数
 
 ..  literalinclude:: reduce_min_pool_size.py
 
-这样做可以极大的减少内存占用，并且可能会加速训练过程。 详细文档参考 `这里
+这样做可以极大的减少内存占用，并且可能会加速训练过程，详细文档参考 `这里
 <../ui/data_provider/pydataprovider2.html#provider>`_ 。
 
 神经元激活内存
 ++++++++++++++
 
-神经网络在训练的时候，会对每一个激活暂存一些数据，包括激活，參差等等。
+神经网络在训练的时候，会对每一个激活暂存一些数据，如神经元激活值等。
 在反向传递的时候，这些数据会被用来更新参数。这些数据使用的内存主要和两个参数有关系，
 一是batch size，另一个是每条序列(Sequence)长度。所以，其实也是和每个mini-batch中包含
 的时间步信息成正比。
 
-所以，做法可以有两种。他们是
+所以做法可以有两种：
 
 * 减小batch size。 即在网络配置中 :code:`settings(batch_size=1000)` 设置成一个小一些的值。但是batch size本身是神经网络的超参数，减小batch size可能会对训练结果产生影响。
 * 减小序列的长度，或者直接扔掉非常长的序列。比如，一个数据集大部分序列长度是100-200,
-  但是突然有一个10000长的序列，就很容易导致内存超限。特别是在LSTM等RNN中。
+  但是突然有一个10000长的序列，就很容易导致内存超限，特别是在LSTM等RNN中。
 
 参数内存
 ++++++++
 
 PaddlePaddle支持非常多的优化算法(Optimizer)，不同的优化算法需要使用不同大小的内存。
-例如如果使用 :code:`adadelta` 算法，则需要使用参数规模大约5倍的内存。 如果参数保存下来的
+例如使用 :code:`adadelta` 算法，则需要使用大约为权重参数规模5倍的内存。举例，如果参数保存下来的模型目录
 文件为 :code:`100M`， 那么该优化算法至少需要 :code:`500M` 的内存。
 
 可以考虑使用一些优化算法，例如 :code:`momentum`。
@@ -68,11 +64,11 @@ PaddlePaddle支持非常多的优化算法(Optimizer)，不同的优化算法需
 2. 如何加速PaddlePaddle的训练速度
 ---------------------------------
 
-PaddlePaddle是神经网络训练平台，加速PaddlePaddle训练有如下几个方面\：
+加速PaddlePaddle训练可以考虑从以下几个方面入手\：
 
 * 减少数据载入的耗时
 * 加速训练速度
-* 利用更多的计算资源
+* 利用分布式训练驾驭更多的计算资源
 
 减少数据载入的耗时
 ++++++++++++++++++
@@ -108,25 +104,20 @@ PaddlePaddle支持Sparse的训练，sparse训练需要训练特征是 :code:`spa
 利用更多的计算资源可以分为一下几个方式来进行\:
 
 * 单机CPU训练
-  * 使用多线程训练。设置命令行参数 :code:`trainer_count`，即可以设置参与训练的线程数量。使用方法为 :code:`paddle train --trainer_count=4`
+  * 使用多线程训练。设置命令行参数 :code:`trainer_count`。
+
 * 单机GPU训练
-  * 使用显卡训练。设置命令行参数 :code:`use_gpu`。 使用方法为 :code:`paddle train --use_gpu=true`
-  * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count`。使用 :code:`--use_gpu=True` 开启GPU训练，使用 :code:`trainer_count` 指定显卡数量。使用方法为 :code:`paddle train --use_gpu=true --trainer_count=4`
+  * 使用显卡训练。设置命令行参数 :code:`use_gpu`。
+  * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count` 。
+
 * 多机训练
-  * 使用多机训练的方法也比较简单，需要先在每个节点启动 :code:`paddle pserver`，在使用 :code:`paddle train --pservers=192.168.100.1,192.168.100.2` 来指定每个pserver的ip地址
-  * 具体的多机训练方法参考 `多机训练 <TBD>`_ 文档。
+  * 具体的多机训练方法参考  `多机训练文档 <../ui/data_provider/pydataprovider2.html#provider>`_ 。
 
 
 3. 遇到“非法指令”或者是“illegal instruction” 
 --------------------------------------------
 
-paddle在进行计算的时候为了提升计算性能，使用了avx指令。部分老的cpu型号无法支持这样的指令。通常来说执行下grep avx /proc/cpuinfo看看是否有输出即可知道是否支持。（另：用此方法部分虚拟机可能检测到支持avx指令但是实际运行会挂掉，请当成是不支持，看下面的解决方案）
-
-解决办法是\:
-
-* 使用 NO_AVX的 `安装包 <../build_and_install/index.html>`_ 或者 `Docker image <../build_and_install/install/docker_install.html>`_
-* 或者，使用 :code:`-DWITH_AVX=OFF` 重新编译PaddlePaddle。
-
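+PaddlePaddle使用avx SIMD指令提高cpu执行效率，部分较老的cpu并不支持该指令集，因此在这类机器上使用带avx的二进制发行版会导致这种错误。可以执行 :code:`grep avx /proc/cpuinfo` 检查cpu是否支持avx；如果不支持，请选择NO_AVX的安装包，或者使用 :code:`-DWITH_AVX=OFF` 重新编译PaddlePaddle。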
 
 4. 如何选择SGD算法的学习率
 --------------------------
@@ -158,7 +149,7 @@ paddle在进行计算的时候为了提升计算性能，使用了avx指令。
 6. 如何共享参数
 ---------------
 
-PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字的参数，会共享参数。设置参数的名字，可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式，是想要共享的参数使用同样的 :code:`ParamAttr` 对象。
+PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字的参数，会共享参数。设置参数的名字，可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式，是使得要共享的参数使用同样的 :code:`ParamAttr` 对象。
 
 简单的全连接网络，参数共享的配置示例为\:
 
@@ -208,9 +199,6 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
     paddle package is already in your PYTHONPATH. But unittest need a clean environment.
     Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
     
-解决办法是：卸载paddle包 :code:`pip uninstall paddle`。
-
-原因是：单元测试使用了一个旧版本的python包，而没有测试到代码中实际修改的python包。即单元测试需要一个干净的环境：
+解决办法是：
 
-* 如果paddle包已经在python的site-packages里面了，那么单元测试时使用的paddle包，就是site-packages里面的python包，而不是源码目录里 :code:`/python` 目录下的python包。
-* 即便设置了 :code:`PYTHONPATH` 到 :code:`/python` 也没用，因为python的搜索路径是优先已经安装的python包。
\ No newline at end of file
+* 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包，使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面，单元测试会引用site-packages里面的python包，而不是源码目录里 :code:`/python` 目录下的python包。同时，即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用，因为python的搜索路径是优先已经安装的python包。
diff --git a/doc_cn/introduction/index.md b/doc_cn/introduction/index.md
deleted file mode 100644
index 164cb7d4943dfbfcc00a2df7329ae2a877b2d703..0000000000000000000000000000000000000000
--- a/doc_cn/introduction/index.md
+++ /dev/null
@@ -1,105 +0,0 @@
-# 简介
-
-PaddlePaddle 是起源于百度的开源深度学习平台。它是简单易用的：你可以通过简单的十数行配置搭建经典的神经网络模型；它也是高效强大的：PaddlePaddle可以支撑复杂集群环境下超大模型的训练，令你受益于深度学习的前沿成果。在百度内部，已经有大量产品线使用了基于PaddlePaddle的深度学习技术。
-
-这份简短的介绍将像你展示如何利用PaddlePaddle解决一个经典的学习问题。
-
-## 1. 一个经典的任务
-
-让我们从一个基础问题开始：<a href="https://www.baidu.com/s?wd=单变量线性回归">单变量的线性回归</a>。问题假定观测到了一批二维空间上的点`(x, y) `，并且已知 `x` 和 `y` 之间存在着某种线性关系，我们的目标是通过观测数据还原这个线性关系。作为一个简单基础的模型，线性回归却有着广泛的应用场景。比如可以想象一个资产定价的简化场景，其中 `x` 对应于房屋的大小，`y` 对应于房屋价格。我们可以通过观察市场上房屋的情况获得二者之间的关系，从而为新房屋的定价提供参考。
-
-
-## 2. 准备数据
-
-假设变量 `X` 和 `Y` 的真实关系为： `Y = 2X + 0.3`，这里展示如何使用观测数据还原这一线性关系。如下Python代码将随机产生2000个观测点，它们将被用作PaddlePaddle的输入。产生PaddlePaddle的输入数据和写一段普通的Python脚本几乎一样，你唯一需要增加的就是定义输入数据的类型。
-
-```python
-# -*- coding:utf-8 -*-
-# dataprovider.py
-from paddle.trainer.PyDataProvider2 import *
-import random
-
-# 定义输入数据的类型: 2个浮点数
-@provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
-def process(settings, input_file):
-    for i in xrange(2000):
-        x = random.random()
-        yield [x], [2*x+0.3]
-```
-
-## 3. 训练模型
-
-为了还原 `Y = 2X + 0.3`，我们先从一条随机的直线 `Y' = wX + b` 开始，然后利用观测数据调整 `w` 和 `b` 使得 `Y'` 和 `Y` 的差距不断减小，最终趋于相同。这个过程就是模型的训练过程，而 `w` 和 `b` 就是模型的参数，即我们的训练目标。
-
-在PaddlePaddle里，该模型的网络配置如下。
-
-```python
-# -*- coding:utf-8 -*-
-# trainer_config.py
-from paddle.trainer_config_helpers import *
-
-# 1. 定义数据来源，调用上面的process函数获得观测数据
-data_file = 'empty.list'
-with open(data_file, 'w') as f: f.writelines(' ')
-define_py_data_sources2(train_list=data_file, test_list=None, 
-        module='dataprovider', obj='process',args={})
-
-# 2. 学习算法。控制如何改变模型参数 w 和 b
-settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
-
-# 3. 神经网络配置
-x = data_layer(name='x', size=1)
-y = data_layer(name='y', size=1)
-# 线性计算单元: y_predict = wx + b
-y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
-# 损失计算，度量 y_predict 和真实 y 之间的差距
-cost = regression_cost(input=y_predict, label=y)
-outputs(cost)
-```
-这段简短的配置展示了PaddlePaddle的基本用法：
-
-- 首先，第一部分定义了数据输入。一般情况下，PaddlePaddle先从一个文件列表里获得数据文件地址，然后交给用户自定义的函数（例如上面的`process`函数）进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的不需要读输入文件，所以放一个空列表（`empty.list`）即可。
-
-- 第二部分主要是选择学习算法，它定义了模型参数如何改变。PaddlePaddle提供了很多优秀的学习算法，但这里使用一个简单的基于momentum的算法就足够了，它每次读取12个数据进行计算和模型更新。
-
-- 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络单元（Layer），所以很多时候你需要做的只是声明正确的网络单元并把它们拼接起来。这里使用了三种网络单元：
-	- **数据层**：数据层 `data_layer` 是神经网络的入口，它读入数据并将它们传输到下游的其它单元。这里数据层有两个，分别对应于变量 `X` 和 `Y`。
-	- **全连接层**：全连接层 `fc_layer` 是基础的计算单元，这里利用它建模变量之间的线性关系。计算单元是神经网络的核心，PaddlePaddle支持大量的计算单元和任意深度的网络连接，从而可以挖掘复杂的数据关系。
-	- **回归损失层**：回归损失层 `regression_cost`是众多损失函数层的一种，它们在训练过程作为网络的出口，用来计算模型的表现，并指导模型参数的改变。
-
-这样定义了网络结构并保存为`trainer_config.py`之后，运行训练命令即可：
- ```
- paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
- ```
-
-PaddlePaddle将在观测数据集上迭代训练30轮，并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到，随着轮数增加损失函数的输出在不断的减小，这意味着模型在不断的改进，直到逼近真实解：` Y = 2X + 0.3 `
-
-## 4. 模型检验
-
-训练完成后，我们希望能够检验模型的好坏。一种常用的做法是用模型对另外一组数据进行预测，然后评价预测的效果。但在这个例子中，由于已经知道了真实答案，我们可以直接观察模型的参数是否符合预期来进行检验。
-
-PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件，所以可以利用如下方法读取模型的参数。
-
-```python
-import numpy as np
-import os
-
-def load(file_name):
-    with open(file_name, 'rb') as f:
-        f.read(16) # skip header for float type.
-        return np.fromfile(f, dtype=np.float32)
-        
-print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
-# w=1.999743, b=0.300137
-```
-<center> ![](./parameters.png) </center>
-
-从图中可以看到，虽然 `w` 和 `b` 都使用随机值初始化，但在起初的几轮训练中它们都在快速逼近真实值，并且后续仍在不断改进，使得最终得到的模型几乎与真实模型重合。
-
-这样，我们就完成了对单变量线性回归问题的解决：将数据输入PaddlePaddle，训练模型，最后验证结果。
-
-## 5. 推荐后续阅读
-
-- <a href="../build_and_install/index.html">安装/编译</a>：PaddlePaddle的安装与编译文档。
-- <a href="../demo/quick_start/index.html">快速入门 </a>：使用商品评论分类任务，系统性的介绍如何一步步改进，最终得到产品级的深度模型。
-- <a href="../demo/index.html">示例</a>：各种实用案例，涵盖图像、文本、推荐等多个领域。
diff --git a/doc_cn/introduction/index.rst b/doc_cn/introduction/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c996f5f4acd07011c98c3e1086080e85ed7dd1b4
--- /dev/null
+++ b/doc_cn/introduction/index.rst
@@ -0,0 +1,114 @@
+简介
+====
+
+PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍将向你展示如何利用PaddlePaddle来解决一个经典的线性回归问题。
+
+1. 一个经典的任务
+-----------------
+
+我们展示如何用PaddlePaddle解决 `单变量的线性回归 <https://www.baidu.com/s?wd=单变量线性回归>`_ 问题。线性回归的输入是一批点 `(x, y)` ，其中 `y = wx + b + ε`， 而 ε 是一个符合高斯分布的随机变量。线性回归的输出是从这批点估计出来的参数 `w` 和 `b` 。
+
+一个例子是房产估值。我们假设房产的价格（y）是其大小（x）的一个线性函数，那么我们可以通过收集市场上房子的大小和价格，用来估计线性函数的参数w 和 b。
+
+2. 准备数据
+-----------
+
+假设变量 `x` 和 `y` 的真实关系为： `y = 2x + 0.3 + ε`，这里展示如何使用观测数据来拟合这一线性关系。首先，Python代码将随机产生2000个观测点，作为线性回归的输入。下面脚本符合PaddlePaddle期待的读取数据的Python程序的模式。
+
+.. code-block:: python
+
+    # dataprovider.py
+    from paddle.trainer.PyDataProvider2 import *
+    import random
+
+    # 定义输入数据的类型: 2个浮点数
+    @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
+    def process(settings, input_file):
+        for i in xrange(2000):
+            x = random.random()
+            yield [x], [2*x+0.3]
+
+3. 训练模型
+-----------
+
+为了还原 `y = 2x + 0.3`，我们先从一条随机的直线 `y' = wx + b` 开始，然后利用观测数据调整 `w` 和 `b` 使得 `y'` 和 `y` 的差距不断减小，最终趋于接近。这个过程就是模型的训练过程，而 `w` 和 `b` 就是模型的参数，即我们的训练目标。
+
+在PaddlePaddle里，该模型的网络配置如下。
+
+.. code-block:: python
+
+    # trainer_config.py
+    from paddle.trainer_config_helpers import *
+
+    # 1. 定义数据来源，调用上面的process函数获得观测数据
+    data_file = 'empty.list'
+    with open(data_file, 'w') as f: f.writelines(' ')
+    define_py_data_sources2(train_list=data_file, test_list=None, 
+                            module='dataprovider', obj='process',args={})
+
+    # 2. 学习算法。控制如何改变模型参数 w 和 b
+    settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
+
+    # 3. 神经网络配置
+    x = data_layer(name='x', size=1)
+    y = data_layer(name='y', size=1)
+    # 线性计算网络层: y_predict = wx + b
+    y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
+    # 计算误差函数，即 y_predict 和真实 y 之间的距离
+    cost = regression_cost(input=y_predict, label=y)
+    outputs(cost)
+
+这段简短的配置展示了PaddlePaddle的基本用法：
+
+- 第一部分定义了数据输入。一般情况下，PaddlePaddle先从一个文件列表里获得数据文件地址，然后交给用户自定义的函数（例如上面的 `process` 函数）进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的不需要读输入文件，所以放一个空列表（`empty.list`）即可。
+
+- 第二部分主要是选择学习算法，它定义了模型参数改变的规则。PaddlePaddle提供了很多优秀的学习算法，这里使用一个基于momentum的随机梯度下降(SGD)算法，该算法每批量(batch)读取12个采样数据进行随机梯度计算来更新模型参数。
+
+- 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络层，所以很多时候你需要做的只是定义正确的网络层并把它们连接起来。这里使用了三种网络单元：
+	
+	- **数据层**：数据层 `data_layer` 是神经网络的入口，它读入数据并将它们传输到接下来的网络层。这里数据层有两个，分别对应于变量 `x` 和 `y`。
+	- **全连接层**：全连接层 `fc_layer` 是基础的计算单元，这里利用它建模变量之间的线性关系。计算单元是神经网络的核心，PaddlePaddle支持大量的计算单元和任意深度的网络连接，从而可以拟合任意的函数来学习复杂的数据关系。
+	- **回归误差代价层**：回归误差代价层 `regression_cost` 是众多误差代价函数层的一种，它们在训练过程作为网络的出口，用来计算模型的误差，是模型参数优化的目标函数。
+
+定义了网络结构并保存为 `trainer_config.py` 之后，运行以下训练命令：
+
+.. code-block:: bash
+
+    paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
+
+PaddlePaddle将在观测数据集上迭代训练30轮，并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到，随着轮数增加误差代价函数的输出在不断的减小，这意味着模型在训练数据上不断的改进，直到逼近真实解：`y = 2x + 0.3` 。
+
+4. 模型检验
+-----------
+
+训练完成后，我们希望能够检验模型的好坏。一种常用的做法是用学习的模型对另外一组测试数据进行预测，评价预测的效果。在这个例子中，由于已经知道了真实答案，我们可以直接观察模型的参数是否符合预期来进行检验。
+
+PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件，所以可以利用如下方法读取模型的参数。
+
+.. code-block:: python
+
+    import numpy as np
+    import os
+
+    def load(file_name):
+        with open(file_name, 'rb') as f:
+            f.read(16) # skip header for float type.
+            return np.fromfile(f, dtype=np.float32)
+        
+    print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
+    # w=1.999743, b=0.300137
+
+.. image:: ./parameters.png
+	 :align: center
+	 :scale: 80 %
+
+从图中可以看到，虽然 `w` 和 `b` 都使用随机值初始化，但在起初的几轮训练中它们都在快速逼近真实值，并且后续仍在不断改进，使得最终得到的模型几乎与真实模型一致。
+
+这样，我们用PaddlePaddle解决了单变量线性回归问题， 包括数据输入、模型训练和最后的结果验证。
+
+5. 推荐后续阅读
+---------------
+
+- `安装/编译 <../build_and_install/index.html>`_ ：PaddlePaddle的安装与编译文档。
+- `快速入门 <../demo/quick_start/index.html>`_ ：使用商品评论分类任务，系统性的介绍如何一步步改进，最终得到产品级的深度模型。
+- `示例 <../demo/index.html>`_ ：各种实用案例，涵盖图像、文本、推荐等多个领域。
\ No newline at end of file
diff --git a/doc_cn/ui/cmd/dump_config.rst b/doc_cn/ui/cmd/dump_config.rst
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/doc_cn/ui/cmd/index.rst b/doc_cn/ui/cmd/index.rst
index f975d432c07f8f0cdc725af2b29c25b7bd6a0657..31a8b8a79f4a87101bd6030eb4e779fd11d65811 100644
--- a/doc_cn/ui/cmd/index.rst
+++ b/doc_cn/ui/cmd/index.rst
@@ -1,29 +1,20 @@
-PaddlePaddle的命令行参数
-========================
+命令
+====
 
-安装好PaddlePaddle后，在命令行直接敲击 ``paddle`` 或 ``paddle --help`` 会显示如下一些命令行参数。
+安装好PaddlePaddle后，在命令行直接敲击 ``paddle`` 或 ``paddle --help`` 会显示如下一些命令。
 
 * ``train`` Start a paddle_trainer
     启动一个PaddlePaddle训练进程。 ``paddle train`` 可以通过命令行参数 ``-local=true`` 启动一个单机的训练进程；也可以和 ``paddle pserver`` 一起使用启动多机的分布式训练进程。
 * ``pserver`` Start a paddle_pserver_main
     在多机分布式训练下启动PaddlePaddle的parameter server进程。
 * ``version`` Print paddle version
-    用于打印当前PaddlePaddle的版本和编译选项相关信息。
+    用于打印当前PaddlePaddle的版本和编译选项相关信息。常见的输出格式如下：1）第一行说明了PaddlePaddle的版本信息；2）第二行开始说明了一些主要的编译选项，具体意义可以参考 `编译参数选项文件 <../../build_and_install/cmake/compile_options.html>`_ 。
+
+    ..  literalinclude:: paddle_version.txt
+
 * ``merge_model`` Start a paddle_merge_model
     用于将PaddlePaddle的模型参数文件和模型配置文件打包成一个文件，方便做部署分发。
 * ``dump_config`` Dump the trainer config as proto string
     用于将PaddlePaddle的模型配置文件以proto string的格式打印出来。
 * ``make_diagram``
-    使用graphviz对PaddlePaddle的模型配置文件进行绘制。
-
-更详细的介绍请参考各命令行参数文档。
-
-..  toctree::
-    :glob:
-
-    paddle_train.rst
-    paddle_pserver.rst
-    paddle_version.rst
-    merge_model.rst
-    dump_config.rst
-    make_diagram.rst
+    使用graphviz对PaddlePaddle的模型配置文件进行绘制。
\ No newline at end of file
diff --git a/doc_cn/ui/cmd/make_diagram.rst b/doc_cn/ui/cmd/make_diagram.rst
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/doc_cn/ui/cmd/merge_model.rst b/doc_cn/ui/cmd/merge_model.rst
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/doc_cn/ui/cmd/paddle_pserver.rst b/doc_cn/ui/cmd/paddle_pserver.rst
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/doc_cn/ui/cmd/paddle_train.rst b/doc_cn/ui/cmd/paddle_train.rst
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/doc_cn/ui/cmd/paddle_version.rst b/doc_cn/ui/cmd/paddle_version.rst
deleted file mode 100644
index 537c23df75ea8eee5d17cc3f05bf17ed1bdfcb73..0000000000000000000000000000000000000000
--- a/doc_cn/ui/cmd/paddle_version.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddle version的命令行参数
-==========================
-
-paddle version用于打印当前的版本信息和相关编译选项。常见的输出格式如下。第一行说明了PaddlePaddle的版本信息，后面跟着一些主要的编译选项。编译选项的具体意义可以参考
-`编译参数选项文件 <../../build_and_install/cmake/compile_options.html>`_
-
-..  literalinclude:: paddle_version.txt
diff --git a/doc_cn/ui/index.rst b/doc_cn/ui/index.rst
index 8079bd9180cf02db944535829baca2dbaa1c4205..d871ad805ff7cd37fb83f24024003e54bce77f42 100644
--- a/doc_cn/ui/index.rst
+++ b/doc_cn/ui/index.rst
@@ -11,21 +11,23 @@
     data_provider/index.rst
 
 
-命令行参数
-==========
+命令及命令行参数
+================
 
 ..  toctree::
+    :maxdepth: 1
 
     cmd/index.rst
 
+* `参数用例 <../../doc/ui/cmd_argument/use_case.html>`_
 * `参数分类 <../../doc/ui/cmd_argument/argument_outline.html>`_
 * `参数描述 <../../doc/ui/cmd_argument/detail_introduction.html>`_
-* `参数用例 <../../doc/ui/cmd_argument/use_case.html>`_
 
 
 预测
 ====
 
 ..  toctree::
+    :maxdepth: 1
 
     predict/swig_py_paddle.rst
diff --git a/doc_cn/ui/predict/swig_py_paddle.rst b/doc_cn/ui/predict/swig_py_paddle.rst
index 012ac4ff6e66a022fa7d8af798236f55b62011ec..05f25345c5246687363dee1931310120b5723d0b 100644
--- a/doc_cn/ui/predict/swig_py_paddle.rst
+++ b/doc_cn/ui/predict/swig_py_paddle.rst
@@ -1,42 +1,50 @@
-PaddlePaddle的Python预测接口
-==================================
+基于Python的预测
+================
 
-PaddlePaddle目前使用Swig对其常用的预测接口进行了封装，使在Python环境下的预测接口更加简单。
-在Python环境下预测结果，主要分为以下几个步骤。
+预测流程
+--------
 
-* 读入解析训练配置
-* 构造GradientMachine
-* 准备数据
-* 预测
+PaddlePaddle使用swig对常用的预测接口进行了封装，通过编译会生成py_paddle软件包，安装该软件包就可以在python环境下实现模型预测。可以使用python的 ``help()`` 函数查询软件包相关API说明。
 
-典型的预测代码如下，使用mnist手写识别作为样例, 完整代码见
-:code:`src_root/doc/ui/predict/predict_sample.py` 。
+基于Python的模型预测，主要包括以下五个步骤。
+
+1. 初始化PaddlePaddle环境
+
+   在程序开始阶段，通过调用 ``swig_paddle.initPaddle()`` 并传入相应的命令行参数初始化PaddlePaddle。
+
+2. 解析模型配置文件
+   
+   初始化之后，可以通过调用 ``parse_config()`` 解析训练模型时用的配置文件。注意预测数据通常不包含label, 同时预测网络通常直接输出最后一层的结果而不是像训练网络一样再接一层cost layer，所以一般需要对训练用的模型配置文件稍作相应修改才能在预测时使用。
+
+3. 构造paddle.GradientMachine
+  
+   通过调用 ``swig_paddle.GradientMachine.createFromConfigproto()`` 传入上一步解析出来的模型配置就可以创建一个 ``GradientMachine``。
+
+4. 准备预测数据
+  
+   swig_paddle中的预测接口的参数是自定义的C++数据类型，py_paddle里面提供了一个工具类 ``DataProviderConverter`` 可以用于接收和PyDataProvider2一样的输入数据并转换成预测接口所需的数据类型。
+
+5. 模型预测
+  
+   通过调用 ``forwardTest()`` 传入预测数据，直接返回计算结果。
+
+
+预测Demo
+--------
+
+如下是一段使用mnist model来实现手写识别的预测代码。完整的代码见 ``src_root/doc/ui/predict/predict_sample.py`` 。mnist model可以通过 ``src_root/demo/mnist`` 目录下的demo训练出来。
 
 ..  literalinclude:: ../../../doc/ui/predict/predict_sample.py
     :language: python
-    :lines: 15-18,90-100,101-104
-
-主要的软件包为py_paddle.swig_paddle，这个软件包文档相对完善。可以使用python的
-:code:`help()` 函数查询文档。主要步骤为:
-
-* 在程序开始阶段，使用 :code:`swig_paddle.initPaddle()` 传入命令行参数初始化
-  PaddlePaddle。详细的命令行参数请参考
-  `命令行参数 <../cmd_argument/detail_introduction.html>`_ 。
-* 接下来使用 :code:`parse_config()` 解析训练时的配置文件。这里要注意预测数据通常
-  不包含label, 而且预测网络通常直接输出最后一层的结果而不是像训练时一样以cost
-  layer作为输出，所以用于预测的配置文件要做相应的修改。
-* 使用 :code:`swig_paddle.GradientMachine.createFromConfigproto()` 根据上一步解
-  析好的配置创建神经网络。
-* 创建一个 :code:`DataProviderConverter` 对象converter。
-    - swig_paddle接受的原始数据是C++的Matrix，也就是直接写内存的float数组。
-      这个接口并不用户友好。所以，我们提供了一个工具类DataProviderConverter。
-      这个工具类接收和PyDataProvider2一样的输入数据，详情请参考
-      `PyDataProvider2文档 <../../../doc/ui/data_provider/pydataprovider2.html>`_ 。
-* 最后使用 :code:`forwardTest()` 直接提取出神经网络Output层的输出结果。典型的输出结果为\:
+    :lines: 15-18,121-136
+
+
+Demo预测输出如下，其中value即为softmax层的输出。由于TEST_DATA包含两条预测数据，所以输出的value包含两个向量 。
 
 ..  code-block:: text
 
-    [{'id': None, 'value': array([[  5.53018653e-09,   1.12194102e-05,   1.96644767e-09,
+    [{'id': None, 'value': array(
+      [[  5.53018653e-09,   1.12194102e-05,   1.96644767e-09,
           1.43630644e-02,   1.51111044e-13,   9.85625684e-01,
           2.08823112e-10,   2.32777140e-08,   2.00186201e-09,
           1.15501715e-08],
@@ -45,4 +53,4 @@ PaddlePaddle目前使用Swig对其常用的预测接口进行了封装，使在P
           2.70634608e-08,   3.48565123e-08,   5.25639710e-09,
           4.48684503e-08]], dtype=float32)}]
 
-其中，value即为softmax层的输出。由于数据是两条，所以输出的value包含两个向量 。
+
diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
index 2f32b3fdd1a26c5b1bca43d0bd0ebb0896a012c4..a723ef7bc8329329fa82113f8e96a1bdbe750277 100644
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -1240,6 +1240,12 @@ void BaseMatrixT<T>::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) {
   }
 }
 
+DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp);  // per-element swap
+template<class T>
+void BaseMatrixT<T>::deepSwap(BaseMatrixT& b) {
+  applyBinary(binary::DeepSwap<T>(), b);
+}
+
 template<>
 void BaseMatrixT<real>::rowDotMul(size_t destCol,
                                   BaseMatrixT& b,
diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h
index d41dcee682cce15e94d45dafeb12bb0dce19b221..ea58c861a3d6a03642291c172af76795e90fcb92 100644
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -455,6 +455,17 @@ public:
    */
   void assign(T p);
 
+  /**
+   * @code
+   * swap(this, b): exchange the element values of this and b in place
+   * example: swap the contents of two matrices
+   * MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
+   * MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
+   * cpuA->deepSwap(*cpuB);
+   * @endcode
+   */
+  void deepSwap(BaseMatrixT& b);
+
   /**
    * @code
    * this = this + p
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index ae5bc5a86a1790ce30a8d7f83c9564f52d7cf7ea..de540dad4c8eefe5084c7089d7960d8ca8cf9875 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -448,6 +448,24 @@ void testMatrixZeroAtOffset(int height, int width) {
   MatrixCheckEqual(*cpuA, *cpuTest);
 }
 
+void testMatrixDeepSwap(int height, int width) {
+  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuCopyA = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuCopyB = std::make_shared<CpuMatrix>(height, width);
+
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+  cpuCopyA->copyFrom(*cpuA);
+  cpuCopyB->copyFrom(*cpuB);
+
+  // swap the contents of cpuA and cpuB; each must then equal the other's saved copy
+  cpuA->deepSwap(*cpuB);
+
+  MatrixCheckEqual(*cpuA, *cpuCopyB);
+  MatrixCheckEqual(*cpuB, *cpuCopyA);
+}
+
 void testMatrixBinaryAdd(int height, int width) {
   MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
   MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
@@ -480,6 +498,7 @@ void testMatrixAssign(int height, int width) {
   MatrixCheckEqual(*cpuA, *outputCheck);
 }
 
+
 void testMatrixAdd(int height, int width) {
   MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
   MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
@@ -798,6 +817,7 @@ TEST(Matrix, unary) {
       testMatrixBinaryAdd(height, width);
       testMatrixTanh(height, width);
       testMatrixTanhDerivative(height, width);
+      testMatrixDeepSwap(height, width);
 
       // applyTernary
       testMatrixTernarySub(height, width);
diff --git a/paddle/scripts/docker/Dockerfile.m4 b/paddle/scripts/docker/Dockerfile.m4
index e14493ed9e842351125ab458db53fcc3f38233f6..761aa975d693631556c162dc29ae288ad6bd980b 100644
--- a/paddle/scripts/docker/Dockerfile.m4
+++ b/paddle/scripts/docker/Dockerfile.m4
@@ -1,7 +1,7 @@
 FROM PADDLE_BASE_IMAGE
 MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
 COPY build.sh /root/
-ENV GIT_CHECKOUT=v0.9.0a0
+ENV GIT_CHECKOUT=v0.9.0
 ENV WITH_GPU=PADDLE_WITH_GPU
 ENV IS_DEVEL=PADDLE_IS_DEVEL
 ENV WITH_DEMO=PADDLE_WITH_DEMO
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 20ea2fedc4d464cdd5403af28bc917770c993b98..ace2c0dee972e338001a0e5a4045c32e64ff157e 100644
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -29,6 +29,7 @@ function version(){
 }
 
 function ver2num() {
+  set -e
   # convert version to number.
   if [ -z "$1" ]; then # empty argument
     printf "%03d%03d%03d%03d%03d" 0
@@ -41,6 +42,7 @@ function ver2num() {
       printf "%03d%03d%03d%03d%03d" $VERN
     fi
   fi
+  set +e
 }
 
 PADDLE_CONF_HOME="$HOME/.config/paddle"
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index 461c73f14c2dc9377cc39ebb8f1273eee81730a3..ec68b53d440185f869566e2975a65d0c3fec5bc5 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -1,3 +1,12 @@
+execute_process(COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --version
+    OUTPUT_VARIABLE PROTOBUF_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE)
+string(REPLACE "libprotoc " "" PROTOBUF_VERSION "${PROTOBUF_VERSION}")  # keep only the version number
+
+set(PROTOBUF_3 OFF)  # becomes ON when protoc is version 3.0.0 or newer
+if (NOT "${PROTOBUF_VERSION}" VERSION_LESS "3.0.0")  # quoted: stays valid if protoc printed nothing
+    set(PROTOBUF_3 ON)
+endif()
+
 set(proto_filenames
     DataConfig.proto
     DataFormat.proto
@@ -11,8 +20,12 @@ set(real_proto_files)
 # TODO(yuyang18): Some internal proto will also be depended on.
 #                 Find a way to automatically calculate all depends.
 foreach(filename ${proto_filenames})
+    set(PROTOBUF_3_FLAGS "")
+    if (PROTOBUF_3)
+        set(PROTOBUF_3_FLAGS "-Dproto3")
+    endif()
     add_custom_command(OUTPUT ${filename}
-        COMMAND ${M4_EXECUTABLE} -Dreal=${ACCURACY} -I '${INTERNAL_PROTO_PATH}'
+	COMMAND ${M4_EXECUTABLE} -Dreal=${ACCURACY} ${PROTOBUF_3_FLAGS} -I '${INTERNAL_PROTO_PATH}'
               ${PROJ_ROOT}/proto/${filename}.m4 > ${filename}
         DEPENDS ${PROJ_ROOT}/proto/${filename}.m4
         COMMENT "Generate ${filename}")
diff --git a/proto/DataConfig.proto.m4 b/proto/DataConfig.proto.m4
index 9862e4e7ef2ff96eafc91246e0b435c70fbe31d9..01d451ff7d5334f8f84d28973c2d7c4b4fac5885 100644
--- a/proto/DataConfig.proto.m4
+++ b/proto/DataConfig.proto.m4
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+ifdef(`proto3', `syntax = "proto2";')
 
 package paddle;
 
diff --git a/proto/DataFormat.proto.m4 b/proto/DataFormat.proto.m4
index 556eace5e194ef26991cc06d1f7794f14fbbdded..8a4a0be1b31a62cca35ca732a037ddc8b20786c4 100644
--- a/proto/DataFormat.proto.m4
+++ b/proto/DataFormat.proto.m4
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+ifdef(`proto3', `syntax = "proto2";')
 
 package paddle;
 
diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4
index ac32c9c5fb87eccb4dcb7fb95c071cdb78410fbd..4772f6b8d662bebf22cb781c9999af8bebbc7abe 100644
--- a/proto/ModelConfig.proto.m4
+++ b/proto/ModelConfig.proto.m4
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+ifdef(`proto3', `syntax = "proto2";')
 
 import "ParameterConfig.proto";
 
diff --git a/proto/ParameterConfig.proto.m4 b/proto/ParameterConfig.proto.m4
index e8d512445e5025f5663fbe3e20b4425cf1633a2b..26e7c3ef77b7377b8d6da4d947bcad27ae4edf72 100644
--- a/proto/ParameterConfig.proto.m4
+++ b/proto/ParameterConfig.proto.m4
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+ifdef(`proto3', `syntax = "proto2";')
 
 package paddle;
 
diff --git a/proto/ParameterService.proto.m4 b/proto/ParameterService.proto.m4
index 189dc1c9700bd821959bab80aef3721bd4940b5c..0b3f14a2ee5b3e1771f724bd9d271a3ecfd15038 100644
--- a/proto/ParameterService.proto.m4
+++ b/proto/ParameterService.proto.m4
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+ifdef(`proto3', `syntax = "proto2";')
 
 import "ParameterConfig.proto";
 import "TrainerConfig.proto";
@@ -20,7 +21,6 @@ package paddle;
 /**
  * Various structs for communicating with parameter server
  */
-
 enum ParameterUpdateMode {
   // Set parameter
    PSERVER_UPDATE_MODE_SET_PARAM = 0;//use local param
diff --git a/proto/TrainerConfig.proto.m4 b/proto/TrainerConfig.proto.m4
index 3b0e24f90bed8cdf0e102c12d2a4a041c17a8447..965c9cd39353970dd547f2a595eb99531f3693c6 100644
--- a/proto/TrainerConfig.proto.m4
+++ b/proto/TrainerConfig.proto.m4
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+ifdef(`proto3', `syntax = "proto2";')
 
 import "DataConfig.proto";
 import "ModelConfig.proto";