Merge branch 'develop' of https://github.com/baidu/Paddle into cmrnorm

bf324111 · hedaoyuan · 5fddd99e · 8a42a549 · bf324111 · 5fddd99e
117 changed file
--- a/.travis.yml
+++ b/.travis.yml
@@ -29,10 +29,6 @@ addons:
      - python-pip
      - python2.7-dev
      - m4
-      - libprotobuf-dev
-      - doxygen
-      - protobuf-compiler
-      - python-protobuf
      - python-numpy
      - python-wheel
      - libgoogle-glog-dev
@@ -43,6 +39,8 @@ addons:
      - graphviz
      - swig
      - clang-format-3.8
+      - automake
+      - libtool
 before_install:
  - |
    if [ ${JOB} == "BUILD_AND_TEST" ]; then

--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
-./doc/howto/contribute_to_paddle_en.md
\ No newline at end of file
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
+./doc/howto/dev/contribute_to_paddle_en.md
--- a/cmake/FindSphinx.cmake
+++ b/cmake/FindSphinx.cmake
@@ -72,7 +72,7 @@ function( Sphinx_add_target target_name builder conf cache source destination )
    ${source}
    ${destination}
    COMMENT "Generating sphinx documentation: ${builder}"
-    COMMAND ln -s ${destination}/index_*.html ${destination}/index.html
+    COMMAND ln -sf ${destination}/index_*.html ${destination}/index.html
    )

  set_property(

--- a/cmake/check_packages.cmake
+++ b/cmake/check_packages.cmake
@@ -24,7 +24,6 @@ endif()

 if(WITH_DOC)
  find_package(Sphinx REQUIRED)
-  find_package(Doxygen REQUIRED)
  find_python_module(recommonmark REQUIRED)
 endif()


--- a/demo/gan/data/download_cifar.sh
+++ b/demo/gan/data/download_cifar.sh
+#!/bin/bash
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

--- a/demo/gan/data/get_mnist_data.sh
+++ b/demo/gan/data/get_mnist_data.sh
--- a/demo/gan/gan_conf_image.py
+++ b/demo/gan/gan_conf_image.py
@@ -87,9 +87,9 @@ def conv_bn(input,
    print(imgSize, output_x, stride, filter_size, padding)

    if trans:
-        nameApx = "_conv"
-    else:
        nameApx = "_convt"
+    else:
+        nameApx = "_conv"

    if bn:
        conv = img_conv_layer(

--- a/demo/image_classification/data/download_cifar.sh
+++ b/demo/image_classification/data/download_cifar.sh
+#!/bin/bash
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

--- a/demo/image_classification/image_provider.py
+++ b/demo/image_classification/image_provider.py
@@ -21,7 +21,7 @@ from paddle.trainer.PyDataProvider2 import *

 #
 # {'img_size': 32,
-# 'settings': <paddle.trainer.PyDataProviderWrapper.Cls instance at 0x7fea27cb6050>,
+# 'settings': a global object,
 # 'color': True,
 # 'mean_img_size': 32,
 # 'meta': './data/cifar-out/batches/batches.meta',
@@ -50,10 +50,10 @@ def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg,

    settings.logger.info('Image size: %s', settings.img_size)
    settings.logger.info('Meta path: %s', settings.meta_path)
-    settings.input_types = [
-        dense_vector(settings.img_raw_size),  # image feature
-        integer_value(settings.num_classes)
-    ]  # labels
+    settings.input_types = {
+        'image': dense_vector(settings.img_raw_size),
+        'label': integer_value(settings.num_classes)
+    }

    settings.logger.info('DataProvider Initialization finished')

@@ -83,4 +83,7 @@ def processData(settings, file_list):
                        img, settings.img_mean, settings.img_size,
                        settings.is_train, settings.color)
                    label = data['labels'][i]
-                    yield img_feat.astype('float32'), int(label)
+                    yield {
+                        'image': img_feat.astype('float32'),
+                        'label': int(label)
+                    }
--- a/demo/introduction/.gitignore
+++ b/demo/introduction/.gitignore
+dataprovider.pyc
+empty.list
+train.log
+output
+train.list
--- a/demo/introduction/dataprovider.py
+++ b/demo/introduction/dataprovider.py
@@ -17,8 +17,10 @@ import random


 # define data types of input: 2 real numbers
-@provider(input_types=[dense_vector(1), dense_vector(1)], use_seq=False)
+@provider(
+    input_types={'x': dense_vector(1),
+                 'y': dense_vector(1)}, use_seq=False)
 def process(settings, input_file):
    for i in xrange(2000):
        x = random.random()
-        yield [x], [2 * x + 0.3]
+        yield {'x': [x], 'y': [2 * x + 0.3]}
--- a/demo/introduction/trainer_config.py
+++ b/demo/introduction/trainer_config.py
@@ -15,11 +15,8 @@
 from paddle.trainer_config_helpers import *

 # 1. read data. Suppose you saved above python code as dataprovider.py
-data_file = 'empty.list'
-with open(data_file, 'w') as f:
-    f.writelines(' ')
 define_py_data_sources2(
-    train_list=data_file,
+    train_list=['no_matter.txt'],
    test_list=None,
    module='dataprovider',
    obj='process',

--- a/demo/mnist/mnist_provider.py
+++ b/demo/mnist/mnist_provider.py
 from paddle.trainer.PyDataProvider2 import *
+import numpy


 # Define a py data provider
 @provider(
    input_types={'pixel': dense_vector(28 * 28),
-                 'label': integer_value(10)})
+                 'label': integer_value(10)},
+    cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, filename):  # settings is not used currently.
    imgf = filename + "-images-idx3-ubyte"
    labelf = filename + "-labels-idx1-ubyte"
@@ -20,12 +22,13 @@ def process(settings, filename):  # settings is not used currently.
    else:
        n = 10000

-    for i in range(n):
-        label = ord(l.read(1))
-        pixels = []
-        for j in range(28 * 28):
-            pixels.append(float(ord(f.read(1))) / 255.0)
-        yield {"pixel": pixels, 'label': label}
+    images = numpy.fromfile(
+        f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
+    images = images / 255.0 * 2.0 - 1.0
+    labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
+
+    for i in xrange(n):
+        yield {"pixel": images[i, :], 'label': labels[i]}

    f.close()
    l.close()
--- a/demo/quick_start/.gitignore
+++ b/demo/quick_start/.gitignore
@@ -8,6 +8,8 @@ data/test.list
 data/test.txt
 data/train.list
 data/train.txt
+data/pred.list
+data/pred.txt
 dataprovider_copy_1.py
 train.log
 output
--- a/demo/quick_start/api_predict.py
+++ b/demo/quick_start/api_predict.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os, sys
+import numpy as np
+from optparse import OptionParser
+from py_paddle import swig_paddle, DataProviderConverter
+from paddle.trainer.PyDataProvider2 import sparse_binary_vector
+from paddle.trainer.config_parser import parse_config
+"""
+Usage: run following command to show help message.
+  python api_predict.py -h
+"""
+
+
+class QuickStartPrediction():
+    def __init__(self, train_conf, dict_file, model_dir=None, label_file=None):
+        """
+        train_conf: trainer configure.
+        dict_file: word dictionary file name.
+        model_dir: directory of model.
+        """
+        self.train_conf = train_conf
+        self.dict_file = dict_file
+        self.word_dict = {}
+        self.dict_dim = self.load_dict()
+        self.model_dir = model_dir
+        if model_dir is None:
+            self.model_dir = os.path.dirname(train_conf)
+
+        self.label = None
+        if label_file is not None:
+            self.load_label(label_file)
+
+        conf = parse_config(train_conf, "is_predict=1")
+        self.network = swig_paddle.GradientMachine.createFromConfigProto(
+            conf.model_config)
+        self.network.loadParameters(self.model_dir)
+        input_types = [sparse_binary_vector(self.dict_dim)]
+        self.converter = DataProviderConverter(input_types)
+
+    def load_dict(self):
+        """
+        Load dictionary from self.dict_file.
+        """
+        for line_count, line in enumerate(open(self.dict_file, 'r')):
+            self.word_dict[line.strip().split('\t')[0]] = line_count
+        return len(self.word_dict)
+
+    def load_label(self, label_file):
+        """
+        Load label.
+        """
+        self.label = {}
+        for v in open(label_file, 'r'):
+            self.label[int(v.split('\t')[1])] = v.split('\t')[0]
+
+    def get_index(self, data):
+        """
+        transform word into integer index according to the dictionary.
+        """
+        words = data.strip().split()
+        word_slot = [self.word_dict[w] for w in words if w in self.word_dict]
+        return word_slot
+
+    def batch_predict(self, data_batch):
+        input = self.converter(data_batch)
+        output = self.network.forwardTest(input)
+        prob = output[0]["id"].tolist()
+        print("predicting labels is:")
+        print prob
+
+
+def option_parser():
+    usage = "python predict.py -n config -w model_dir -d dictionary -i input_file "
+    parser = OptionParser(usage="usage: %s [options]" % usage)
+    parser.add_option(
+        "-n",
+        "--tconf",
+        action="store",
+        dest="train_conf",
+        help="network config")
+    parser.add_option(
+        "-d",
+        "--dict",
+        action="store",
+        dest="dict_file",
+        help="dictionary file")
+    parser.add_option(
+        "-b",
+        "--label",
+        action="store",
+        dest="label",
+        default=None,
+        help="dictionary file")
+    parser.add_option(
+        "-c",
+        "--batch_size",
+        type="int",
+        action="store",
+        dest="batch_size",
+        default=1,
+        help="the batch size for prediction")
+    parser.add_option(
+        "-w",
+        "--model",
+        action="store",
+        dest="model_path",
+        default=None,
+        help="model path")
+    return parser.parse_args()
+
+
+def main():
+    options, args = option_parser()
+    train_conf = options.train_conf
+    batch_size = options.batch_size
+    dict_file = options.dict_file
+    model_path = options.model_path
+    label = options.label
+    swig_paddle.initPaddle("--use_gpu=0")
+    predict = QuickStartPrediction(train_conf, dict_file, model_path, label)
+
+    batch = []
+    labels = []
+    for line in sys.stdin:
+        [label, text] = line.split("\t")
+        labels.append(int(label))
+        batch.append([predict.get_index(text)])
+    print("labels is:")
+    print labels
+    predict.batch_predict(batch)
+
+
+if __name__ == '__main__':
+    main()
--- a/demo/quick_start/api_predict.sh
+++ b/demo/quick_start/api_predict.sh
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+# Note: the default model is output/pass-00001/; make sure that model path
+# exists, or change the model path below.
+# Only tested with trainer_config.lr.py.
+model=output/pass-00001/
+config=trainer_config.lr.py
+label=data/labels.list
+dict=data/dict.txt
+batch_size=20
+head -n$batch_size data/test.txt | python api_predict.py \
+     --tconf=$config\
+     --model=$model \
+     --label=$label \
+     --dict=$dict \
+     --batch_size=$batch_size
--- a/demo/quick_start/dataprovider_bow.py
+++ b/demo/quick_start/dataprovider_bow.py
@@ -31,16 +31,16 @@ def initializer(settings, dictionary, **kwargs):

    # setting.input_types specifies what the data types the data provider
    # generates.
-    settings.input_types = [
+    settings.input_types = {
        # The first input is a sparse_binary_vector,
        # which means each dimension of the vector is either 0 or 1. It is the
        # bag-of-words (BOW) representation of the texts.
-        sparse_binary_vector(len(dictionary)),
+        'word': sparse_binary_vector(len(dictionary)),
        # The second input is an integer. It represents the category id of the
        # sample. 2 means there are two labels in the dataset.
        # (1 for positive and 0 for negative)
-        integer_value(2)
-    ]
+        'label': integer_value(2)
+    }


 # Delaring a data provider. It has an initializer 'data_initialzer'.
@@ -67,12 +67,12 @@ def process(settings, file_name):
            # Return the features for the current comment. The first is a list
            # of ids representing a 0-1 binary sparse vector of the text,
            # the second is the integer id of the label.
-            yield word_vector, int(label)
+            yield {'word': word_vector, 'label': int(label)}


 def predict_initializer(settings, dictionary, **kwargs):
    settings.word_dict = dictionary
-    settings.input_types = [sparse_binary_vector(len(dictionary))]
+    settings.input_types = {'word': sparse_binary_vector(len(dictionary))}


 # Declaring a data provider for prediction. The difference with process
@@ -83,4 +83,4 @@ def process_predict(settings, file_name):
        for line in f:
            comment = line.strip().split()
            word_vector = [settings.word_dict.get(w, UNK_IDX) for w in comment]
-            yield word_vector
+            yield {'word': word_vector}
--- a/demo/quick_start/dataprovider_emb.py
+++ b/demo/quick_start/dataprovider_emb.py
@@ -19,13 +19,13 @@ UNK_IDX = 0

 def initializer(settings, dictionary, **kwargs):
    settings.word_dict = dictionary
-    settings.input_types = [
+    settings.input_types = {
        # Define the type of the first input as sequence of integer.
        # The value of the integers range from 0 to len(dictrionary)-1
-        integer_value_sequence(len(dictionary)),
+        'word': integer_value_sequence(len(dictionary)),
        # Define the second input for label id
-        integer_value(2)
-    ]
+        'label': integer_value(2)
+    }


 @provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
@@ -35,15 +35,12 @@ def process(settings, file_name):
            label, comment = line.strip().split('\t')
            words = comment.split()
            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
-            yield word_slot, int(label)
+            yield {'word': word_slot, 'label': int(label)}


 def predict_initializer(settings, dictionary, **kwargs):
    settings.word_dict = dictionary
-    settings.input_types = [
-        integer_value(
-            len(dictionary), seq_type=SequenceType.SEQUENCE)
-    ]
+    settings.input_types = {'word': integer_value_sequence(len(dictionary))}


 @provider(init_hook=predict_initializer, should_shuffle=False)
@@ -52,4 +49,4 @@ def process_predict(settings, file_name):
        for line in f:
            comment = line.strip().split()
            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in comment]
-            yield word_slot
+            yield {'word': word_slot}
--- a/demo/recommendation/common_utils.py
+++ b/demo/recommendation/common_utils.py
@@ -17,13 +17,14 @@ from paddle.trainer.PyDataProvider2 import *
 def meta_to_header(meta, name):
    metas = meta[name]['__meta__']['raw_meta']
    for each_meta in metas:
+        slot_name = each_meta.get('name', '%s_id' % name)
        if each_meta['type'] == 'id':
-            yield integer_value(each_meta['max'])
+            yield slot_name, integer_value(each_meta['max'])
        elif each_meta['type'] == 'embedding':
            is_seq = each_meta['seq'] == 'sequence'
-            yield integer_value(
+            yield slot_name, integer_value(
                len(each_meta['dict']),
                seq_type=SequenceType.SEQUENCE
                if is_seq else SequenceType.NO_SEQUENCE)
        elif each_meta['type'] == 'one_hot_dense':
-            yield dense_vector(len(each_meta['dict']))
+            yield slot_name, dense_vector(len(each_meta['dict']))
--- a/demo/recommendation/dataprovider.py
+++ b/demo/recommendation/dataprovider.py
@@ -16,6 +16,14 @@ from paddle.trainer.PyDataProvider2 import *
 import common_utils  # parse


+def __list_to_map__(lst):
+    ret_val = dict()
+    for each in lst:
+        k, v = each
+        ret_val[k] = v
+    return ret_val
+
+
 def hook(settings, meta, **kwargs):
    """
    Init hook is invoked before process data. It will set obj.slots and store
@@ -34,12 +42,16 @@ def hook(settings, meta, **kwargs):
    #    second part is user features.
    #    final part is rating score.
    # header is a list of [USE_SEQ_OR_NOT?, SlotType]
-    headers = list(common_utils.meta_to_header(meta, 'movie'))
-    headers.extend(list(common_utils.meta_to_header(meta, 'user')))
-    headers.append(dense_vector(1))  # Score
+    movie_headers = list(common_utils.meta_to_header(meta, 'movie'))
+    settings.movie_names = [h[0] for h in movie_headers]
+    headers = movie_headers
+    user_headers = list(common_utils.meta_to_header(meta, 'user'))
+    settings.user_names = [h[0] for h in user_headers]
+    headers.extend(user_headers)
+    headers.append(("rating", dense_vector(1)))  # Score

    # slot types.
-    settings.input_types = headers
+    settings.input_types = __list_to_map__(headers)
    settings.meta = meta


@@ -57,20 +69,20 @@ def process(settings, filename):
            movie_meta = settings.meta['movie'][movie_id]
            user_meta = settings.meta['user'][user_id]

-            outputs = [movie_id - 1]
+            outputs = [('movie_id', movie_id - 1)]

            # Then add movie features
-            for each_meta in movie_meta:
-                outputs.append(each_meta)
+            for i, each_meta in enumerate(movie_meta):
+                outputs.append((settings.movie_names[i + 1], each_meta))

            # Then add user id.
-            outputs.append(user_id - 1)
+            outputs.append(('user_id', user_id - 1))

            # Then add user features.
-            for each_meta in user_meta:
-                outputs.append(each_meta)
+            for i, each_meta in enumerate(user_meta):
+                outputs.append((settings.user_names[i + 1], each_meta))

            # Finally, add score
-            outputs.append([score])
+            outputs.append(('rating', [score]))
            # Return data to paddle
-            yield outputs
+            yield __list_to_map__(outputs)
--- a/demo/recommendation/prediction.py
+++ b/demo/recommendation/prediction.py
@@ -34,8 +34,8 @@ if __name__ == '__main__':
    network.loadParameters(model_path)
    with open('./data/meta.bin', 'rb') as f:
        meta = pickle.load(f)
-        headers = list(meta_to_header(meta, 'movie'))
-        headers.extend(list(meta_to_header(meta, 'user')))
+        headers = [h[1] for h in meta_to_header(meta, 'movie')]
+        headers.extend([h[1] for h in meta_to_header(meta, 'user')])
        cvt = DataProviderConverter(headers)
        while True:
            movie_id = int(raw_input("Input movie_id: "))

--- a/demo/recommendation/preprocess.sh
+++ b/demo/recommendation/preprocess.sh
@@ -14,6 +14,15 @@
 # limitations under the License.
 set -e

+UNAME_STR=`uname`
+
+if [[ ${UNAME_STR} == 'Linux' ]]; then
+	SHUF_PROG='shuf'
+else
+	SHUF_PROG='gshuf'
+fi
+
+
 cd "$(dirname "$0")"
 delimiter='::'
 dir=ml-1m
@@ -25,7 +34,7 @@ python meta_generator.py $dir meta.bin --config=meta_config.json
 echo 'split train/test file'
 python split.py $dir/ratings.dat --delimiter=${delimiter} --test_ratio=0.1
 echo 'shuffle train file'
-shuf $dir/ratings.dat.train > ratings.dat.train
+${SHUF_PROG} $dir/ratings.dat.train > ratings.dat.train
 cp $dir/ratings.dat.test .
 echo "./data/ratings.dat.train" > train.list
 echo "./data/ratings.dat.test" > test.list
--- a/demo/semantic_role_labeling/.gitignore
+++ b/demo/semantic_role_labeling/.gitignore
@@ -8,3 +8,7 @@ data/test.wsj.seq_pair
 data/test.wsj.words
 data/tgt.dict
 output
+data/emb
+data/targetDict.txt
+data/verbDict.txt
+data/wordDict.txt
--- a/demo/semantic_role_labeling/data/get_data.sh
+++ b/demo/semantic_role_labeling/data/get_data.sh
--- a/demo/sequence_tagging/linear_crf.py
+++ b/demo/sequence_tagging/linear_crf.py
@@ -74,7 +74,8 @@ sum_evaluator(

 chunk_evaluator(
    name="chunk_f1",
-    input=[crf_decoding, chunk],
+    input=crf_decoding,
+    label=chunk,
    chunk_scheme="IOB",
    num_chunk_types=11, )


--- a/demo/sequence_tagging/rnn_crf.py
+++ b/demo/sequence_tagging/rnn_crf.py
@@ -112,7 +112,8 @@ sum_evaluator(

 chunk_evaluator(
    name="chunk_f1",
-    input=[crf_decoding, chunk],
+    input=crf_decoding,
+    label=chunk,
    chunk_scheme="IOB",
    num_chunk_types=11, )


--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -16,7 +16,7 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")

 configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/conf.py.en.in"
+    "${CMAKE_CURRENT_SOURCE_DIR}/templates/conf.py.en.in"
    "${BINARY_BUILD_DIR_EN}/conf.py"
    @ONLY)

@@ -41,7 +41,7 @@ set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
 set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")

 configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/conf.py.cn.in"
+    "${CMAKE_CURRENT_SOURCE_DIR}/templates/conf.py.cn.in"
    "${BINARY_BUILD_DIR_CN}/conf.py"
    @ONLY)


--- a/doc/about/index_cn.md
+++ b/doc/about/index_cn.md
+关于PaddlePaddle
+================
+
+PaddlePaddle是一个最早由百度科学家和工程师共同研发的并行分布式深度学习平台，兼备易用性、高效性、灵活性和可扩展性，目前已被百度内部多个产品线广泛使用。
+PaddlePaddle目前已经开放源码，但是远未完善，我们希望能在这个基础上不断地改进、扩展和延伸。
+同时我们希望广大开发者积极提供反馈和贡献源代码，建立一个活跃的开源社区。
+
+致谢
+--------
+
+在此，特别感谢PaddlePaddle的[所有贡献者](https://github.com/PaddlePaddle/Paddle/graphs/contributors)。
--- a/doc/about/index_en.rst
+++ b/doc/about/index_en.rst
@@ -11,4 +11,4 @@ We hope to build an active open source community both by providing feedback and
 Credits
 --------

-We owe many thanks to `all contributors and developers <https://github.com/PaddlePaddle/Paddle/blob/develop/authors>`_ of PaddlePaddle!
+We owe many thanks to `all contributors and developers <https://github.com/PaddlePaddle/Paddle/graphs/contributors>`_ of PaddlePaddle!
--- a/doc/api/data_provider/dataprovider_cn.rst
+++ b/doc/api/data_provider/dataprovider_cn.rst
+.. _api_dataprovider:
+
 DataProvider的介绍
 ==================

-DataProvider是PaddlePaddle负责提供数据的模块。其作用是将数据传入内存或显存，让神经网络可以进行训练或预测。用户可以通过简单使用Python接口 `PyDataProvider2 <pydataprovider2.html>`_ ，来自定义传数据的过程。如果有更复杂的使用，或者需要更高的效率，用户也可以在C++端自定义一个 ``DataProvider`` 。
+DataProvider是PaddlePaddle负责提供数据的模块。其作用是将数据传入内存或显存，让神经网络可以进行训练或预测。用户可以通过简单使用Python接口 :ref:`api_pydataprovider2` ，来自定义传数据的过程。如果有更复杂的使用，或者需要更高的效率，用户也可以在C++端自定义一个 ``DataProvider`` 。

 PaddlePaddle需要用户在网络配置（trainer_config.py）中定义使用哪种DataProvider，并且在DataProvider中实现如何访问训练文件列表（train.list）或测试文件列表（test.list）。


--- a/doc/api/data_provider/pydataprovider2_cn.rst
+++ b/doc/api/data_provider/pydataprovider2_cn.rst
+..  _api_pydataprovider2:
+
 PyDataProvider2的使用
 =====================


--- a/doc/api/index_cn.rst
+++ b/doc/api/index_cn.rst
-API
-===
+API中文手册
+============

 DataProvider API
 ----------------

--- a/doc/api/predict/swig_py_paddle_cn.rst
+++ b/doc/api/predict/swig_py_paddle_cn.rst
+.. _api_swig_py_paddle:
+
 基于Python的预测
 ================


--- a/doc/api/trainer_config_helpers/evaluators.rst
+++ b/doc/api/trainer_config_helpers/evaluators.rst
+..  _api_trainer_config_helpers_evaluators:
+
 ==========
 Evaluators
 ==========

--- a/doc/api/trainer_config_helpers/layers.rst
+++ b/doc/api/trainer_config_helpers/layers.rst
@@ -187,6 +187,8 @@ get_output_layer
 Mixed Layer
 ===========

+..  _api_trainer_config_helpers_layers_mixed_layer:
+
 mixed_layer
 -----------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -255,12 +257,16 @@ pooling_layer
    :members: pooling_layer
    :noindex:

+..  _api_trainer_config_helpers_layers_last_seq:
+
 last_seq
 --------
 ..  automodule:: paddle.trainer_config_helpers.layers
    :members: last_seq
    :noindex:

+..  _api_trainer_config_helpers_layers_first_seq:
+
 first_seq
 ---------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -282,6 +288,8 @@ block_expand_layer
    :members: block_expand_layer
    :noindex:

+..  _api_trainer_config_helpers_layers_expand_layer:
+
 expand_layer
 ------------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -374,6 +382,8 @@ sampling_id_layer
    :members: sampling_id_layer
    :noindex:

+..  _api_trainer_config_helpers_layers_cost_layers:
+
 Cost Layers
 ===========


--- a/doc/api/trainer_config_helpers/networks.rst
+++ b/doc/api/trainer_config_helpers/networks.rst
@@ -36,6 +36,8 @@ img_conv_group
    :members: img_conv_group
    :noindex:

+..  _api_trainer_config_helpers_network_simple_img_conv_pool:
+
 simple_img_conv_pool
 --------------------
 ..  automodule:: paddle.trainer_config_helpers.networks

--- a/doc/api/trainer_config_helpers/optimizers.rst
+++ b/doc/api/trainer_config_helpers/optimizers.rst
+..  _api_trainer_config_helpers_optimizers:
+
 ==========
 Optimizers
 ==========
@@ -50,6 +52,8 @@ RMSPropOptimizer
    :members: RMSPropOptimizer
    :noindex:

+..  _api_trainer_config_helpers_optimizers_settings:
+
 settings
 ========
 ..  automodule:: paddle.trainer_config_helpers.optimizers

--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
@@ -35,7 +35,7 @@ PyDataProvider使用的是异步加载，同时在内存里直接随即选取数

 ..  literalinclude:: src/reduce_min_pool_size.py

-这样做可以极大的减少内存占用，并且可能会加速训练过程，详细文档参考 `这里 <../ui/data_provider/pydataprovider2.html#provider>`_ 。
+这样做可以极大的减少内存占用，并且可能会加速训练过程，详细文档参考 :ref:`api_pydataprovider2` 。

 神经元激活内存
 ++++++++++++++
@@ -95,7 +95,6 @@ PaddlePaddle支持Sparse的训练，sparse训练需要训练特征是 :code:`spa

 ..  literalinclude:: src/word2vec_config.py

-更多关于sparse训练的内容请参考 `sparse训练的文档 <TBD>`_

 利用更多的计算资源
 ++++++++++++++++++
@@ -103,17 +102,20 @@ PaddlePaddle支持Sparse的训练，sparse训练需要训练特征是 :code:`spa
 利用更多的计算资源可以分为一下几个方式来进行\:

 * 单机CPU训练
+
  * 使用多线程训练。设置命令行参数 :code:`trainer_count`。

 * 单机GPU训练
+
  * 使用显卡训练。设置命令行参数 :code:`use_gpu`。
  * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count` 。

 * 多机训练
-  * 具体的多机训练方法参考  `多机训练文档 <../ui/data_provider/pydataprovider2.html#provider>`_ 。
+
+  * 请参考 :ref:`cluster_train` 。


-3. 遇到“非法指令”或者是“illegal instruction” 
+3. 遇到“非法指令”或者是“illegal instruction”
 --------------------------------------------

 PaddlePaddle使用avx SIMD指令提高cpu执行效率，因此错误的使用二进制发行版可能会导致这种错误，请选择正确的版本。
@@ -140,7 +142,7 @@ PaddlePaddle使用avx SIMD指令提高cpu执行效率，因此错误的使用二

 ..  code-block:: python

-    hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0), 
+    hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0),
                      bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))

 上述代码将bias全部初始化为1.0, 同时将参数初始化为 :code:`[1.0, -1.0]` 的均匀分布。
@@ -156,8 +158,8 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字

 这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。

-7. *-cp27mu-linux_x86_64.whl is not a supported wheel on this platform.
---------------------------------------------------------------------------
+7. \*-cp27mu-linux_x86_64.whl is not a supported wheel on this platform.
+------------------------------------------------------------------------

 出现这个问题的主要原因是，系统编译wheel包的时候，使用的 :code:`wheel` 包是最新的，
 而系统中的 :code:`pip` 包比较老。具体的解决方法是，更新 :code:`pip` 包并重新编译PaddlePaddle。
@@ -190,14 +192,14 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
    41 - test_config_parser (Failed)
    42 - test_swig_api (Failed)
    43 - layers_test (Failed)
-    
+
 并且查询PaddlePaddle单元测试的日志，提示：

 ..  code-block:: bash
-    
+
    paddle package is already in your PYTHONPATH. But unittest need a clean environment.
    Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
-    
+
 解决办法是：

 * 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包，使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面，单元测试会引用site-packages里面的python包，而不是源码目录里 :code:`/python` 目录下的python包。同时，即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用，因为python的搜索路径是优先已经安装的python包。
@@ -225,7 +227,7 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 用户强制指定特定的Python版本，具体操作如下：

    ..  code-block:: bash
-        
+
        cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path>  -DPYTHON_INCLUDE_DIR=<inc_path>

 用户需要指定本机上Python的路径：``<exc_path>``, ``<lib_path>``, ``<inc_path>``
@@ -238,7 +240,7 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 ..  code-block:: bash

    [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes).  To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
-    F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr) 
+    F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)

 可能的原因是：传给dataprovider的某一个args过大，一般是由于直接传递大字典导致的。错误的define_py_data_sources2类似：

@@ -284,3 +286,22 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 ..      code-block:: bash

        paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
+
+12. 编译源码提示warp-ctc/include/ctc.h 找不到的情况
+---------------------------------------------------
+
+目前Paddle使用\ :code:`git submodule`\ 来引用一些第三方模块。简单的\
+:code:`git clone`\ 命令不能得到第三方模块的代码。需要使用\:
+
+..  code-block:: bash
+
+    git clone --recursive https://github.com/PaddlePaddle/Paddle.git
+
+来获取所有源码。对于已经clone的git版本库，可以在Paddle的源码目录中执行\:
+
+..  code-block:: bash
+
+    git submodule init
+    git submodule update
+
+来获得所有第三方模块。
--- a/doc/getstarted/basic_usage/index_cn.rst
+++ b/doc/getstarted/basic_usage/index_cn.rst
-简介
-====
+经典的线性回归任务
+==================

 PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍将向你展示如何利用PaddlePaddle来解决一个经典的线性回归问题。

-1. 一个经典的任务
-----------------
+任务简介
+--------

 我们展示如何用PaddlePaddle解决 `单变量的线性回归 <https://www.baidu.com/s?wd=单变量线性回归>`_ 问题。线性回归的输入是一批点 `(x, y)` ，其中 `y = wx + b + ε`， 而 ε 是一个符合高斯分布的随机变量。线性回归的输出是从这批点估计出来的参数 `w` 和 `b` 。

 一个例子是房产估值。我们假设房产的价格（y）是其大小（x）的一个线性函数，那么我们可以通过收集市场上房子的大小和价格，用来估计线性函数的参数w 和 b。

-2. 准备数据
+准备数据
 -----------

 假设变量 `x` 和 `y` 的真实关系为： `y = 2x + 0.3 + ε`，这里展示如何使用观测数据来拟合这一线性关系。首先，Python代码将随机产生2000个观测点，作为线性回归的输入。下面脚本符合PaddlePaddle期待的读取数据的Python程序的模式。
@@ -28,7 +28,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍
            x = random.random()
            yield [x], [2*x+0.3]

-3. 训练模型
+训练模型
 -----------

 为了还原 `y = 2x + 0.3`，我们先从一条随机的直线 `y' = wx + b` 开始，然后利用观测数据调整 `w` 和 `b` 使得 `y'` 和 `y` 的差距不断减小，最终趋于接近。这个过程就是模型的训练过程，而 `w` 和 `b` 就是模型的参数，即我们的训练目标。
@@ -79,7 +79,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍

 PaddlePaddle将在观测数据集上迭代训练30轮，并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到，随着轮数增加误差代价函数的输出在不断的减小，这意味着模型在训练数据上不断的改进，直到逼近真实解：` y = 2x + 0.3 `

-4. 模型检验
+模型检验
 -----------

 训练完成后，我们希望能够检验模型的好坏。一种常用的做法是用学习的模型对另外一组测试数据进行预测，评价预测的效果。在这个例子中，由于已经知道了真实答案，我们可以直接观察模型的参数是否符合预期来进行检验。
@@ -106,10 +106,3 @@ PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件
 从图中可以看到，虽然 `w` 和 `b` 都使用随机值初始化，但在起初的几轮训练中它们都在快速逼近真实值，并且后续仍在不断改进，使得最终得到的模型几乎与真实模型一致。

 这样，我们用PaddlePaddle解决了单变量线性回归问题， 包括数据输入、模型训练和最后的结果验证。
-
-5. 推荐后续阅读
---------------
-
- `安装/编译 <../build_and_install/index.html>`_ ：PaddlePaddle的安装与编译文档。
- `快速入门 <../demo/quick_start/index.html>`_ ：使用商品评论分类任务，系统性的介绍如何一步步改进，最终得到产品级的深度模型。
- `示例 <../demo/index.html>`_ ：各种实用案例，涵盖图像、文本、推荐等多个领域。
\ No newline at end of file
--- a/doc/getstarted/basic_usage/index_en.rst
+++ b/doc/getstarted/basic_usage/index_en.rst
-Basic Usage
-=============
+Simple Linear Regression
+========================

 PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image caption and so on.

-1. A Classic Problem
---------------------
+Problem Background
+------------------

 Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - `simple linear regression <https://en.wikipedia.org/wiki/Simple_linear_regression>`_: you have observed a set of two-dimensional data points of ``X`` and ``Y``, where ``X`` is an explanatory variable and ``Y`` is corresponding dependent variable, and you want to recover the underlying correlation between ``X`` and ``Y``. Linear regression can be used in many practical scenarios. For example, ``X`` can be a variable about house size, and ``Y`` a variable about house price. You can build a model that captures relationship between them by observing real estate markets.

-2. Prepare the Data
--------------------
+Prepare the Data
+-----------------

 Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's see how to recover this pattern only from observed data. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory, the only extra thing you need to add for PaddlePaddle is a definition of input data types.

@@ -26,8 +26,8 @@ Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's se
                x = random.random()
                yield [x], [2*x+0.3]

-3. Train a NeuralNetwork
-------------------------
+Train a Neural Network
+----------------------

 To recover this relationship between ``X`` and ``Y``, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with these terminologies, it's just saying that we are starting from a random line ``Y' = wX + b`` , then we gradually adapt ``w`` and ``b`` to minimize the difference between ``Y'`` and ``Y``. Here is what it looks like in PaddlePaddle:

@@ -73,8 +73,8 @@ Now that everything is ready, you can train the network with a simple command li
 This means that PaddlePaddle will train this network on the synthectic dataset for 30 passes, and save all the models under path ``./output``. You will see from the messages printed out during training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess.


-4. Evaluate the Model
-----------------------
+Evaluate the Model
+-------------------

 Usually, a different dataset that left out during training phase should be used to evalute the models. However, we are lucky enough to know the real answer: ``w=2, b=0.3``, thus a better option is to check out model parameters directly.


--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -9,6 +9,91 @@ Please be aware that you will need to change `Dockers settings
 of your hardware resource on Mac OS X and Windows.


+Development Using Docker
+------------------------
+
+Developers can work on PaddlePaddle using Docker.  This allows
+developers to work on different platforms -- Linux, Mac OS X, and
+Windows -- in a consistent way.
+
+The general development workflow with Docker and Bazel is as follows:
+
+1. Get the source code of Paddle:
+
+   .. code-block:: bash
+
+      git clone --recursive https://github.com/PaddlePaddle/Paddle.git
+
+   
+   Here **git clone --recursive is required** as we have a submodule `warp-ctc <https://github.com/baidu-research/warp-ctc>`_.
+
+   If you have used :code:`git clone https://github.com/PaddlePaddle/Paddle` and find that the directory :code:`warp-ctc` is
+   empty, please use the following command to get the submodule.
+
+   .. code-block:: bash
+
+      git submodule update --init --recursive
+
+
+2. Build a development Docker image :code:`paddle:dev` from the source
+   code.  This image contains all the development tools and
+   dependencies of PaddlePaddle.
+
+
+   .. code-block:: bash
+
+      cd Paddle
+      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
+
+
+3. Run the image as a container and mount the local source code
+   directory into the container.  This allows us to change the code on
+   the host and build it within the container.
+
+   .. code-block:: bash
+
+      docker run       \
+       -d              \
+       --name paddle   \
+       -p 2022:22      \
+       -v $PWD:/paddle \
+       -v $HOME/.cache/bazel:/root/.cache/bazel \
+       paddle:dev
+
+   where :code:`-d` makes the container run in the background,
+   :code:`--name paddle` allows us to run an Nginx container to serve
+   documents in this container, :code:`-p 2022:22` allows us to SSH
+   into this container, :code:`-v $PWD:/paddle` shares the source code
+   on the host with the container, :code:`-v
+   $HOME/.cache/bazel:/root/.cache/bazel` shares Bazel cache on the
+   host with the container.
+
+4. SSH into the container:
+
+   .. code-block:: bash
+
+      ssh root@localhost -p 2022
+
+5. We can edit the source code in the container or on this host.  Then
+   we can build using cmake
+
+   .. code-block:: bash
+
+      cd /paddle # where paddle source code has been mounted into the container
+      mkdir -p build
+      cd build
+      cmake -DWITH_TESTING=ON ..
+      make -j `nproc`
+      CTEST_OUTPUT_ON_FAILURE=1 ctest
+
+   or Bazel in the container:
+
+   .. code-block:: bash
+
+      cd /paddle
+      bazel test ...
+
+
 CPU-only and GPU Images
 -----------------------

@@ -77,7 +162,7 @@ source code:
 .. code-block:: bash

   cd ~
-   git clone github.com/PaddlePaddle/Paddle
+   git clone https://github.com/PaddlePaddle/Paddle.git
   cd Paddle
   git submodule update --init --recursive
   docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
@@ -93,7 +178,7 @@ generated using `woboq code browser
 for users to browse and understand the C++ source code.

 As long as we give the Paddle Docker container a name, we can run an
-additional nginx Docker container to serve the volume from the Paddle
+additional Nginx Docker container to serve the volume from the Paddle
 container:

 .. code-block:: bash
@@ -104,78 +189,3 @@ container:

 Then we can direct our Web browser to the HTML version of source code
 at http://localhost:8088/paddle/
-
-
-Development Using Docker
------------------------
-
-Develpers can work on PaddlePaddle using Docker.  This allows
-developers to work on different platforms -- Linux, Mac OS X, and
-Windows -- in a consistent way.
-
-The general development workflow with Docker and Bazel is as follows:
-
-1. Get the source code of Paddle:
-
-   .. code-block:: bash
-
-      git clone --recursive https://github.com/paddlepaddle/paddle
-
-
-2. Build a development Docker image :code:`paddle:dev` from the source
-   code.  This image contains all the development tools and
-   dependencies of PaddlePaddle.
-
-
-   .. code-block:: bash
-
-      cd paddle
-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
-
-
-3. Run the image as a container and mounting local source code
-   directory into the container.  This allows us to change the code on
-   the host and build it within the container.
-
-   .. code-block:: bash
-
-      docker run       \
-       -d              \
-       --name paddle   \
-       -p 2022:22      \
-       -v $PWD:/paddle \
-       -v $HOME/.cache/bazel:/root/.cache/bazel \
-       paddle:dev
-
-   where :code:`-d` makes the container running in background,
-   :code:`--name paddle` allows us to run a nginx container to serve
-   documents in this container, :code:`-p 2022:22` allows us to SSH
-   into this container, :code:`-v $PWD:/paddle` shares the source code
-   on the host with the container, :code:`-v
-   $HOME/.cache/bazel:/root/.cache/bazel` shares Bazel cache on the
-   host with the container.
-
-4. SSH into the container:
-
-   .. code-block:: bash
-
-      ssh root@localhost -p 2022
-
-5. We can edit the source code in the container or on this host.  Then
-   we can build using cmake
-
-   .. code-block:: bash
-
-      cd /paddle # where paddle source code has been mounted into the container
-      mkdir -p build
-      cd build
-      cmake -DWITH_TESTING=ON ..
-      make -j `nproc`
-      CTEST_OUTPUT_ON_FAILURE=1 ctest
-
-   or Bazel in the container:
-
-   .. code-block:: bash
-
-      cd /paddle
-      bazel test ...
--- a/doc/getstarted/build_and_install/index_cn.rst
+++ b/doc/getstarted/build_and_install/index_cn.rst
-编译与安装
-========================
+安装与编译
+==========

-安装
-++++
+.. _install_steps:
+
+安装流程
++++++++

 PaddlePaddle提供数个预编译的二进制来进行安装，包括Docker镜像，ubuntu的deb安装包等。我们推荐使用Docker镜像来部署环境，同时欢迎贡献更多的安装包。

@@ -14,14 +16,14 @@ PaddlePaddle提供数个预编译的二进制来进行安装，包括Docker镜



-编译
-++++
+编译流程
++++++++

 ..  warning::

-    编译选项主要推荐高级用户查看，普通用户请走安装流程。
+    编译流程主要推荐高级用户查看，普通用户请走安装流程。

 ..  toctree::
    :maxdepth: 1

-    cmake/build_from_source_cn.rst
\ No newline at end of file
+    cmake/build_from_source_cn.rst
--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
-GET STARTED
+新手入门
 ============

 ..  toctree::

--- a/doc/howto/concepts/nn_cn.rst
+++ b/doc/howto/concepts/nn_cn.rst
-TBD
-
-目前正在书写中。敬请期待。
\ No newline at end of file
--- a/doc/howto/concepts/program_concepts_cn.rst
+++ b/doc/howto/concepts/program_concepts_cn.rst
-TBD
-###
-
-目前正在书写中。敬请期待。
\ No newline at end of file
--- a/doc/howto/deep_model/index_cn.rst
+++ b/doc/howto/deep_model/index_cn.rst
-How to Configure Deep Models
-============================
-
-..  toctree::
-  :maxdepth: 1
-
-  rnn/recurrent_group_cn.md
-  rnn/hierarchical_layer_cn.rst
-  rnn/hrnn_rnn_api_compare_cn.rst
-  rnn/hrnn_demo_cn.rst
--- a/doc/howto/deep_model/index_en.rst
+++ b/doc/howto/deep_model/index_en.rst
-How to Configure Deep Models
-============================
-
-..  toctree::
-  :maxdepth: 1
-
-  rnn/rnn_en.rst
--- a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
+++ b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
@@ -22,7 +22,7 @@
 pooling_layer
 ==============

-pooling_layer 的使用示例如下，详细见 `pooling_layer`_ 配置API。
+pooling_layer 的使用示例如下，详细见 :ref:`api_trainer_config_helpers_layers_pooling_layer` 配置API。

 ..	code-block:: bash

@@ -47,7 +47,7 @@ pooling_layer 的使用示例如下，详细见 `pooling_layer`_ 配置API。
 last_seq 和 first_seq
 =====================

-last_seq 的使用示例如下（ `first_seq`_ 类似），详细见 `last_seq`_ 配置API。
+last_seq 的使用示例如下（ :ref:`api_trainer_config_helpers_layers_first_seq` 类似），详细见 :ref:`api_trainer_config_helpers_layers_last_seq` 配置API。

 ..	code-block:: bash

@@ -68,7 +68,7 @@ last_seq 的使用示例如下（ `first_seq`_ 类似），详细见 `last_seq`_
 expand_layer
 ============

-expand_layer 的使用示例如下，详细见 `expand_layer`_ 配置API。
+expand_layer 的使用示例如下，详细见 :ref:`api_trainer_config_helpers_layers_expand_layer` 配置API。

 ..	code-block:: bash

@@ -87,9 +87,3 @@ expand_layer 的使用示例如下，详细见 `expand_layer`_ 配置API。
  - 作用：一个单层序列经过运算扩展成一个双层序列
  - 输入：layer1必须是一个单层序列，是待扩展的数据；layer2 必须是一个双层序列，提供扩展的长度信息
  - 输出：一个双层序列，序列中含有元素的数目同 layer2 一致。要求单层序列含有元素的数目（0层序列）和双层序列含有subseq 的数目一致。单层序列第i个元素（0层序列），被扩展为一个单层序列，构成了输出双层序列的第i个 subseq 。
-
-
-.. _pooling_layer: ../../../doc/ui/api/trainer_config_helpers/layers.html#pooling-layer
-.. _last_seq: ../../../doc/ui/api/trainer_config_helpers/layers.html#last-seq
-.. _first_seq: ../../../doc/ui/api/trainer_config_helpers/layers.html#first-seq
-.. _expand_layer: ../../../doc/ui/api/trainer_config_helpers/layers.html#expand-layer
--- a/doc/howto/deep_model/rnn/hrnn_demo_cn.rst
+++ b/doc/howto/deep_model/rnn/hrnn_demo_cn.rst
-..	_algo_hrnn_demo:
-
-#################
-双层RNN的使用示例
-#################
-
-TBD
\ No newline at end of file
--- a/doc/howto/deep_model/rnn/index_cn.rst
+++ b/doc/howto/deep_model/rnn/index_cn.rst
+RNN相关模型
+===========
+
+..  toctree::
+  :maxdepth: 1
+
+  recurrent_group_cn.md
+  hierarchical_layer_cn.rst
+  hrnn_rnn_api_compare_cn.rst
--- a/doc/howto/deep_model/rnn/index_en.rst
+++ b/doc/howto/deep_model/rnn/index_en.rst
+RNN Models
+==========
+
+..  toctree::
+  :maxdepth: 1
+
+  rnn_config_en.rst
--- a/doc/howto/deep_model/rnn/recurrent_group_cn.md
+++ b/doc/howto/deep_model/rnn/recurrent_group_cn.md
@@ -12,7 +12,7 @@

 更进一步，`recurrent_group`同样可以扩展到双层序列的处理上。通过两个嵌套的`recurrent_group`分别定义子句级别和词语级别上需要完成的运算，最终实现一个层次化的复杂RNN。

-目前，在PaddlePaddle中，能够对双向序列进行处理的有`recurrent_group`和部分Layer，具体可参考文档：<a href = "hierarchical-layer.html">支持双层序列作为输入的Layer</a>。
+目前，在PaddlePaddle中，能够对双层序列进行处理的有`recurrent_group`和部分Layer，具体可参考文档：<a href = "hierarchical_layer_cn.html">支持双层序列作为输入的Layer</a>。
 
 ## 相关概念


--- a/doc/howto/deep_model/rnn/rnn_en.rst
+++ b/doc/howto/deep_model/rnn/rnn_en.rst
--- a/doc/howto/new_layer/FullyConnected.jpg
+++ b/doc/howto/new_layer/FullyConnected.jpg
--- a/doc/howto/dev/contribute_to_paddle_cn.md
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
+# 如何贡献代码
+
+我们真诚地感谢您的贡献，欢迎通过 GitHub 的 fork 和 pull request 流程来提交代码。
+ 
+## 代码要求
+- 你的代码必须完全遵守 [doxygen](http://www.stack.nl/~dimitri/doxygen/) 的样式。
+- 确保编译器选项 WITH\_STYLE\_CHECK 已打开，并且编译能通过代码样式检查。
+- 所有代码必须具有单元测试。
+- 通过所有单元测试。
+
+以下教程将指导您提交代码。
+ 
+## [Fork](https://help.github.com/articles/fork-a-repo/)
+ 
+跳转到[PaddlePaddle](https://github.com/PaddlePaddle/Paddle) GitHub首页，然后单击 `Fork` 按钮。
+
+## 克隆（Clone）
+
+Paddle 目前使用[git流分支模型](http://nvie.com/posts/a-successful-git-branching-model/)进行开发，测试，发行和维护。
+**develop** 是主分支，其他用户分支是特征分支（feature branches）。
+
+一旦你创建了一个fork，你可以使用你最喜欢的 git 客户端克隆你的仓库（repo）或只是直接在命令行输入：
+
+```shell
+# 克隆 fork 到本地
+git clone --branch develop https://github.com/USERNAME/Paddle.git
+```
+如果你的仓库不包含 **develop** 分支，你只需自己创建它。
+
+```shell
+git clone https://github.com/USERNAME/Paddle.git Paddle
+cd Paddle
+git checkout -b develop  # 创建 develop 分支
+git remote add upstream https://github.com/PaddlePaddle/Paddle.git  # 添加 upstream 到 PaddlePaddle/Paddle
+git pull upstream develop  # 更新 upstream
+git submodule update --init --recursive
+```
+
+然后你可以通过做一个本地开发分支开始开发
+
+```shell
+git checkout -b MY_COOL_STUFF_BRANCH
+```
+
+## 使用 `pre-commit` 钩子
+
+Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理git预提交钩子。 它可以帮助我们格式化源代码（cpp，python），在提交前检查一些基本事宜（每个文件只有一个 EOL，git 中不要添加大文件）。
+`pre-commit`测试是 Travis-CI 中单元测试的一部分，不满足钩子的 PR 不能提交代码到 Paddle。
+
+你可以通过 `pip install pre-commit` 安装 [pre-commit](http://pre-commit.com/)，
+目前 Paddle 使用 `clang-format` 来调整C/C++源代码格式。请确保 clang-format 版本在3.8以上。
+
+然后只需在 Paddle clone 目录中运行 `pre-commit install` 。当你
+提交你的代码时，pre-commit 钩子会检查本地代码是否存在
+不适合提交的东西，等等。
+
+## 提交（Commit）
+
+提交你的代码：
+
+```shell
+# 显示工作树状态
+git status
+# 添加修改过的文件
+git add xx
+env EDITOR=vim git commit  # 你可以用 vim/nano/emacs 写下你的注释
+```
+提交信息的第一行是标题，其他行可以添加一些细节（如果有必要的话）。
+
+## 保持 Fork 状态最新
+
+在发起拉取请求（pull request）之前，你应该从最新的 PaddlePaddle 同步代码。
+为此，你需要首先添加远程（remote）：
+
+```shell
+# 观察当前远程仓库配置
+git remote -v
+# 添加上游（upstream）仓库
+git remote add upstream https://github.com/PaddlePaddle/Paddle.git
+# 验证新的 upstream
+git remote -v
+```
+
+用最新的 upstream 更新你的 fork：
+
+```shell
+git pull --rebase upstream develop
+```
+如果本地没有提交，git 将简单地执行快进。但是，如果你一直在做一些改变（绝大多数情况下不应该），你可能要处理冲突。
+
+现在，你的本地主分支与上游修改的一致并是最新的。
+
+## 推送（Push）到 GitHub
+
+```shell
+# 在 GitHub 上 push 你的仓库
+git push -u origin MY_COOL_STUFF_BRANCH  # 创建远程分支 MY_COOL_STUFF_BRANCH 到 origin.
+```
+
+## 拉取请求（Pull Request）
+
+转到 GitHub上 你 fork 的页面，选择你的开发分支并单击 **pull request 按钮**。
+
+## 使用最新版本更新你的 pull 请求
+
+在代码审查（code review）期间，由于 PaddlePaddle/Paddle 中新的提交导致你的 pull 请求可能会失效。如果没有冲突，GitHub允许自动更新。 你可以点击 pull request 页面中的“更新分支（Update Branch）”按钮。 但是如果存在代码冲突，你需要手动进行更新。你需要在本地仓库执行如下命令：
+
+```shell
+git checkout MY_COOL_STUFF_BRANCH
+git pull upstream develop
+# 你可能需要根据git提示解决冲突
+# 创建并测试你的代码
+git push origin MY_COOL_STUFF_BRANCH
+```
+现在你的 Pull Request 是最新的了。
+
+## 修改你的 pull request
+
+当根据审阅者的意见修改 pull 请求时，请使用“git commit”而不是“git commit --amend”来提交更改，以便审阅者可以看到新的请求和旧的请求之间的区别。
+
+可能的命令是
+
+```shell
+git checkout MY_COOL_STUFF_BRANCH
+git pull upstream develop   # 将本地更新到最新的代码库
+# 可能会发生一些冲突
+# 开始开发吧！
+env EDITOR=vim git commit  # 添加修改日志
+git push origin MY_COOL_STUFF_BRANCH
+```
--- a/doc/howto/contribute_to_paddle_en.md
+++ b/doc/howto/contribute_to_paddle_en.md
-# How to Contribute Code
+# Contribute Code

 We sincerely appreciate your contributions. You can use fork and pull request
 workflow to merge your code.

--- a/doc/howto/new_layer/index_en.rst
+++ b/doc/howto/new_layer/index_en.rst
-=======================
-How to Write New Layers
-=======================
+================
+Write New Layers
+================

 This tutorial will guide you to write customized layers in PaddlePaddle. We will utilize fully connected layer as an example to guide you through the following steps for writing a new layer.


--- a/doc/howto/write_docs/index_cn.rst
+++ b/doc/howto/write_docs/index_cn.rst
-###############################
-如何贡献/修改PaddlePaddle的文档
-###############################
+##################
+如何贡献/修改文档
+##################

 PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成，生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。

@@ -51,4 +51,4 @@ TBD


 ..	_cmake: https://cmake.org/
-..	_sphinx: http://www.sphinx-doc.org/en/1.4.8/
\ No newline at end of file
+..	_sphinx: http://www.sphinx-doc.org/en/1.4.8/
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
-HOW TO
-=======
+进阶指南
+========

-Usage
-------
+使用说明
+--------

 ..  toctree::
  :maxdepth: 1

-  concepts/use_concepts_cn.rst
-  cluster/k8s/paddle_on_k8s_cn.md
-  cluster/k8s/distributed_training_on_k8s_cn.md
+  usage/concepts/use_concepts_cn.rst
+  usage/cluster/cluster_train_cn.md
+  usage/cluster/k8s/k8s_cn.md
+  usage/cluster/k8s/k8s_distributed_cn.md

-Development
------------
+开发标准
+--------

 ..  toctree::
  :maxdepth: 1

-  write_docs/index_cn.rst
-  deep_model/index_cn.rst
+  dev/write_docs_cn.rst
+  dev/contribute_to_paddle_cn.md

-Optimization
-------------
+模型配置
+--------

 ..  toctree::
  :maxdepth: 1
+
+  deep_model/rnn/index_cn.rst
+
+性能优化
+--------
+
+..  toctree::
+  :maxdepth: 1
+
+  optimization/gpu_profiling_cn.rst
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -7,9 +7,8 @@ Usage
 ..  toctree::
  :maxdepth: 1

-  cmd_parameter/index_en.md
-  deep_model/index_en.rst
-  cluster/cluster_train_en.md
+  usage/cmd_parameter/index_en.md
+  usage/cluster/cluster_train_en.md

 Development
 ------------
@@ -17,8 +16,16 @@ Development
 ..  toctree::
  :maxdepth: 1

-  new_layer/index_en.rst
-  contribute_to_paddle_en.md
+  dev/new_layer_en.rst
+  dev/contribute_to_paddle_en.md
+
+Configuration
+-------------
+
+..  toctree::
+  :maxdepth: 1
+
+  deep_model/rnn/index_en.rst

 Optimization
 -------------
@@ -26,4 +33,4 @@ Optimization
 ..  toctree::
  :maxdepth: 1

-  optimization/index_en.rst
+  optimization/gpu_profiling_en.rst
--- a/doc/howto/optimization/gpu_profiling_cn.rst
+++ b/doc/howto/optimization/gpu_profiling_cn.rst
+==================
+GPU性能分析与调优
+==================
+
+..  contents::
+
+此教程将向您分步介绍如何使用内置的定时工具、 **nvprof** 或 **nvvp** 来运行性能分析和调优。
+
+- 什么是性能分析？
+- 为什么需要性能分析？
+- 如何进行性能分析？
+- 性能分析工具介绍
+- 详细教程
+- 性能分析小技巧
+
+什么是性能分析？
+================
+在软件工程的范畴里，性能分析（Profiling）是一个动态程序分析的术语，它可以指测量一个程序的空间（内存）复杂度或时间复杂度，
+也可以说是某些特定指令的使用情况，或者是函数调用的频率和耗时等。通常情况下，分析得到的信息用于协助进行程序的优化。
+
+简单来说，性能分析工具是用于给应用程序的性能做定量分析的。如果想很好的理解程序的行为，那程序分析工具是必不可少的利器。简单的性能分析，可以告诉您某个操作到底花了多长时间？而更深入的分析，甚至能解释为什么某个操作花了很长时间？
+
+为什么需要性能分析？
+============================
+训练好一个深层神经网络通常要耗费非常长的时间，所以性能也就逐步变成了深度学习领域最重要的指标。
+而优化性能的首要任务，是需要了解哪些步骤拖慢了整体。
+如果某一块根本就不怎么耗时，那也就不需要急着优化性能啦！
+
+如何进行性能分析？
+========================
+为了达到性能最优，您可以采用下面五个步骤：
+
+- 对代码进行性能分析
+- 找到运行慢的部分
+- 找到运行慢的原因
+- 修改成更快的版本
+- 再次对代码进行性能分析
+
+Usually, a processor has two key performance limits: floating-point throughput and
+memory throughput. A GPU also needs more parallelism to fulfill its potential.
+This is why they can be so fast.
+
+通常情况下，处理器有两个关键性能限制：一个是浮点计算量，另一个是内存操作量。
+GPU则还需要高并行性，才能发挥其全部能力。这正是它们速度快的原因。
+
+性能分析工具介绍
+======================
+就通常的GPU性能分析来说，市面上已经有NVIDIA或第三方提供的众多工具。
+
+**nvprof** 是Nvidia性能分析工具， **nvvp** 则是带GUI的Nvidia可视化性能分析工具。
+在这个教程中，我们主要会介绍nvprof和nvvp。
+
+:code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate
+above profilers.
+
+:code:`paddle/math/tests` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。
+
+.. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+   :language: c++
+   :lines: 137-151
+   :linenos:
+
+上述的代码片段包含了两种方法，您可以任意使用一个或两个来对感兴趣的代码段做性能分析。
+
+1. :code:`REGISTER_TIMER_INFO` 是一个内置的定时器封装，可以用来计算CPU函数或cuda内核的时间消耗。
+
+2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid
+program crashes when CPU version of PaddlePaddle invokes them.
+
+3. :code:`REGISTER_GPU_PROFILER` 是一个封装对象，封装了 :code:`cudaProfilerStart` 和 :code:`cudaProfilerStop` 两个操作；同时其内部实现可以避免纯CPU版本PaddlePaddle在执行本语句时发生崩溃。
+
+您会在接下来的部分中获得更多的细节介绍。
+
+详细教程
+============
+
+内置定时器
+------------
+
+如果想要启用PaddlePaddle的内置定时器，您首先需要在相关代码段中加入 :code:`REGISTER_TIMER_INFO`。
+接下来就可以使用 :code:`printStatus` 或者 :code:`printAllStatus` 函数来将信息输出到界面中。
+下面举个简单的例子：
+
+1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数（如高亮部分）。
+
+    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 137-151
+        :emphasize-lines: 8-12,14
+        :linenos:
+
+2. cmake配置中将 **WITH_TIMER** 打开，重新编译PaddlePaddle。
+
+    .. code-block:: bash
+
+        cmake .. -DWITH_TIMER=ON
+        make
+
+3. 执行您的代码，并观察结果（如高亮部分）。
+
+    .. code-block:: bash
+        :emphasize-lines: 1,12-15
+
+        > ./paddle/math/tests/test_GpuProfiler
+        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler
+        I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
+        I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
+        [==========] Running 1 test from 1 test case.
+        [----------] Global test environment set-up.
+        [----------] 1 test from Profiler
+        [ RUN      ] Profiler.BilinearFwdBwd
+        I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
+        gSizeX = 64, imgSizeY = 64"
+        I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
+        I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
+        I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd     total=136.141    avg=136.141    max=136.141    min=136.141   count=1
+        I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
+        I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
+        [       OK ] Profiler.BilinearFwdBwd (136 ms)
+        [----------] 1 test from Profiler (136 ms total)
+
+        [----------] Global test environment tear-down
+        [==========] 1 test from 1 test case ran. (136 ms total)
+        [  PASSED  ] 1 test.
+
+nvprof 工具
+----------------
+
+要使用命令行分析工具 **nvprof**，您按如下步骤操作即可：
+
+1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中（参考强调部分）。
+
+    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 137-151
+        :emphasize-lines: 6-7
+        :linenos:
+
+2. cmake中将 **WITH_PROFILER** 配置打开，重新编译PaddlePaddle。
+
+    .. code-block:: bash
+
+        cmake .. -DWITH_PROFILER=ON
+        make
+
+3. 使用 **nvprof** 来分析执行文件。
+
+    .. code-block:: bash
+
+        nvprof  ./paddle/math/tests/test_GpuProfiler
+
+然后，您就能获得如下的分析结果：
+
+.. code-block:: bash
+
+    ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler
+    ==78544== Profiling result:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]
+    26.07%  9.0957ms         1  9.0957ms  9.0957ms  9.0957ms  KeBilinearInterpBw
+    23.78%  8.2977ms         1  8.2977ms  8.2977ms  8.2977ms  KeBilinearInterpFw
+    22.55%  7.8661ms         2  3.9330ms  1.5798ms  6.2863ms  [CUDA memcpy DtoH]
+
+    ==78544== API calls:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    46.85%  682.28ms         8  85.285ms  12.639us  682.03ms  cudaStreamCreateWithFlags
+    39.83%  580.00ms         4  145.00ms     302ns  550.27ms  cudaFree
+    9.82%   143.03ms         9  15.892ms  8.7090us  142.78ms  cudaStreamCreate
+    1.23%   17.983ms         7  2.5690ms  23.210us  6.4563ms  cudaMemcpy
+    1.23%   17.849ms         2  8.9247ms  8.4726ms  9.3768ms  cudaStreamSynchronize
+    0.66%   9.5969ms         7  1.3710ms  288.43us  2.4279ms  cudaHostAlloc
+    0.13%   1.9530ms        11  177.54us  7.6810us  591.06us  cudaMalloc
+    0.07%   1.0424ms         8  130.30us  1.6970us  453.72us  cudaGetDevice
+    0.04%   527.90us        40  13.197us     525ns  253.99us  cudaEventCreateWithFlags
+    0.03%   435.73us       348  1.2520us     124ns  42.704us  cuDeviceGetAttribute
+    0.03%   419.36us         1  419.36us  419.36us  419.36us  cudaGetDeviceCount
+    0.02%   260.75us         2  130.38us  129.32us  131.43us  cudaGetDeviceProperties
+    0.02%   222.32us         2  111.16us  106.94us  115.39us  cudaLaunch
+    0.01%   214.06us         4  53.514us  28.586us  77.655us  cuDeviceGetName
+    0.01%   115.45us         4  28.861us  9.8250us  44.526us  cuDeviceTotalMem
+    0.01%   83.988us         4  20.997us     578ns  77.760us  cudaSetDevice
+    0.00%   38.918us         1  38.918us  38.918us  38.918us  cudaEventCreate
+    0.00%   34.573us        31  1.1150us     279ns  12.784us  cudaDeviceGetAttribute
+    0.00%   17.767us         1  17.767us  17.767us  17.767us  cudaProfilerStart
+    0.00%   15.228us         2  7.6140us  3.5460us  11.682us  cudaConfigureCall
+    0.00%   14.536us         2  7.2680us  1.1490us  13.387us  cudaGetLastError
+    0.00%   8.6080us        26     331ns     173ns     783ns  cudaSetupArgument
+    0.00%   5.5470us         6     924ns     215ns  2.6780us  cuDeviceGet
+    0.00%   5.4090us         6     901ns     328ns  3.3320us  cuDeviceGetCount
+    0.00%   4.1770us         3  1.3920us  1.0630us  1.8300us  cuDriverGetVersion
+    0.00%   3.4650us         3  1.1550us  1.0810us  1.2680us  cuInit
+    0.00%      830ns         1     830ns     830ns     830ns  cudaRuntimeGetVersion
+
+
+nvvp 工具
+--------------
+
+如果想使用可视化的分析器 **nvvp**，您可以导入 :code:`nvprof -o ...` 的输出，或者从工具的界面里运行您的应用。
+
+**备注: nvvp 也支持CPU的性能分析** （需在nvvp界面中选上才能开启）
+
+..  image:: nvvp1.png
+    :align: center
+    :scale: 33%
+
+从内核函数的角度， **nvvp** 可以精确说明一个长耗时操作的具体原因。
+同时，如下图所示， **nvvp** 的内核block使用情况、寄存器使用情况和共享内存使用情况能让我们对GPU的整体使用有更好的理解。
+
+
+..  image:: nvvp2.png
+    :align: center
+    :scale: 33%
+
+而从应用的角度， **nvvp** 可以帮您提供一些定位性能瓶颈的建议。
+例如，下图中就展示了一些关于内存数据迁徙和计算资源利用率的建议，为您做性能调优提供了方向。
+
+..  image:: nvvp3.png
+    :align: center
+    :scale: 33%
+
+..  image:: nvvp4.png
+    :align: center
+    :scale: 33%
+
+性能分析小技巧
+==================
+
+- 开始阶段，从 **nvprof** 和 **nvvp** 的输出信息入手是个不错的选择。
+- 接下来可以考虑下时间线的分析。
+- 如果真想挖掘内核深处的某个秘密，您最好先确认：这一块的耗时比例真的太高，值得深入分析。
+- 可能的情况下，试着让输出的分析数据和理论值对应。
+
+    1) 例如，如果我知道内核花了10ms来移动1GB数据，那我会期望分析工具统计到速度是100GB/s。
+    2) 若有不一致之处，很有可能实际应用就是没有按照您的预期情况运行。
+- 了解您的硬件：如果您的GPU理论可以达到6 TFLOPs（6万亿次浮点运算每秒），而当前已经有5.5 TFLOPs了，那估计这里的潜力就没啥好挖的了……
+
+性能分析是性能优化的关键一步。有的时候简简单单的改变就能在性能上产生明显的优化效果！
+当然，具体情况因人而异。
+
+参考资料
+===========
+Jeremy Appleyard, `GPU Profiling for Deep Learning <http://www.robots.ox.ac.uk/~seminars/seminars/Extra/2015_10_08_JeremyAppleyard.pdf>`_, 2015
--- a/doc/howto/optimization/gpu_profiling_en.rst
+++ b/doc/howto/optimization/gpu_profiling_en.rst
-Profiling on PaddlePaddle
-=========================
+====================
+Tune GPU Performance 
+====================
+
+..  contents::

 This tutorial will guide you step-by-step through how to conduct profiling and performance tuning using built-in timer, **nvprof** and **nvvp**.

@@ -49,11 +52,11 @@ For general GPU profiling, a bunch of tools are provided from both NVIDIA and th
 In this tutorial, we will focus on nvprof and nvvp.

 :code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate
-above profilers. 
+above profilers.

 .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
   :language: c++
-   :lines: 111-124
+   :lines: 137-151
   :linenos:

 The above code snippet includes two methods, you can use any of them to profile the regions of interest.
@@ -79,8 +82,8 @@ As a simple example, consider the following:

    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
        :language: c++
-        :lines: 111-124
-        :emphasize-lines: 8-10,13
+        :lines: 137-151
+        :emphasize-lines: 8-12,14
        :linenos:

 2. Configure cmake with **WITH_TIMER** and recompile PaddlePaddle.
@@ -90,31 +93,31 @@ As a simple example, consider the following:
        cmake .. -DWITH_TIMER=ON
        make

-3. Execute your code and observe the results (see the emphasize-lines). 
+3. Execute your code and observe the results (see the emphasize-lines).

    .. code-block:: bash
        :emphasize-lines: 1,12-15

-        > ./paddle/math/tests/test_GpuProfiler                                                                             
-        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler                                             
-        I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions                                                                      
-        I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.                                                                   
-        [==========] Running 1 test from 1 test case.                                                                                                
-        [----------] Global test environment set-up.                                                                                                 
-        [----------] 1 test from Profiler                                                                                                            
-        [ RUN      ] Profiler.BilinearFwdBwd                                                                                                         
+        > ./paddle/math/tests/test_GpuProfiler
+        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler
+        I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
+        I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
+        [==========] Running 1 test from 1 test case.
+        [----------] Global test environment set-up.
+        [----------] 1 test from Profiler
+        [ RUN      ] Profiler.BilinearFwdBwd
        I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
-        gSizeX = 64, imgSizeY = 64"                                                                                                                  
-        I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751                                           
-        I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======                                               
-        I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd     total=136.141    avg=136.141    max=136.141    min=136.141   count=1                                                                                                                                  
-        I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======                                                          
-        I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------                                            
-        [       OK ] Profiler.BilinearFwdBwd (136 ms)                                                                                                
-        [----------] 1 test from Profiler (136 ms total)                                                                                             
-                                                                                                                                                    
-        [----------] Global test environment tear-down                                                                                               
-        [==========] 1 test from 1 test case ran. (136 ms total)                                                                                     
+        gSizeX = 64, imgSizeY = 64"
+        I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
+        I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
+        I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd     total=136.141    avg=136.141    max=136.141    min=136.141   count=1
+        I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
+        I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
+        [       OK ] Profiler.BilinearFwdBwd (136 ms)
+        [----------] 1 test from Profiler (136 ms total)
+
+        [----------] Global test environment tear-down
+        [==========] 1 test from 1 test case ran. (136 ms total)
        [  PASSED  ] 1 test.

 nvprof profiler
@@ -126,7 +129,7 @@ To use this command line profiler **nvprof**, you can simply issue the following

    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
        :language: c++
-        :lines: 111-124
+        :lines: 137-151
        :emphasize-lines: 6-7
        :linenos:

@@ -147,42 +150,42 @@ Then, you can get the following profiling result:

 .. code-block:: bash

-    ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler                                                                                                      
-    ==78544== Profiling result:                                                                                                                                                
-    Time(%)     Time     Calls       Avg       Min       Max  Name                                                                                                            
-    27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]                                                                                              
-    26.07%  9.0957ms         1  9.0957ms  9.0957ms  9.0957ms  KeBilinearInterpBw                                                                                            
-    23.78%  8.2977ms         1  8.2977ms  8.2977ms  8.2977ms  KeBilinearInterpFw                                                                                        
-    22.55%  7.8661ms         2  3.9330ms  1.5798ms  6.2863ms  [CUDA memcpy DtoH]                                                                                              
-                                                                                                                                                                            
-    ==78544== API calls:                                                                                                                                                       
-    Time(%)     Time     Calls       Avg       Min       Max  Name                                                                                                            
-    46.85%  682.28ms         8  85.285ms  12.639us  682.03ms  cudaStreamCreateWithFlags                                                                                       
-    39.83%  580.00ms         4  145.00ms     302ns  550.27ms  cudaFree                                                                                                        
-    9.82%   143.03ms         9  15.892ms  8.7090us  142.78ms  cudaStreamCreate                                                                                                
-    1.23%   17.983ms         7  2.5690ms  23.210us  6.4563ms  cudaMemcpy                                                                                                      
-    1.23%   17.849ms         2  8.9247ms  8.4726ms  9.3768ms  cudaStreamSynchronize                                                                                           
-    0.66%   9.5969ms         7  1.3710ms  288.43us  2.4279ms  cudaHostAlloc                                                                                                   
-    0.13%   1.9530ms        11  177.54us  7.6810us  591.06us  cudaMalloc                                                                                                      
-    0.07%   1.0424ms         8  130.30us  1.6970us  453.72us  cudaGetDevice                                                                                                   
-    0.04%   527.90us        40  13.197us     525ns  253.99us  cudaEventCreateWithFlags                                                                                        
-    0.03%   435.73us       348  1.2520us     124ns  42.704us  cuDeviceGetAttribute                                                                                            
-    0.03%   419.36us         1  419.36us  419.36us  419.36us  cudaGetDeviceCount                                                                                              
-    0.02%   260.75us         2  130.38us  129.32us  131.43us  cudaGetDeviceProperties                                                                                         
-    0.02%   222.32us         2  111.16us  106.94us  115.39us  cudaLaunch                                                                                                      
-    0.01%   214.06us         4  53.514us  28.586us  77.655us  cuDeviceGetName                                                                                                 
-    0.01%   115.45us         4  28.861us  9.8250us  44.526us  cuDeviceTotalMem                                                                                                
-    0.01%   83.988us         4  20.997us     578ns  77.760us  cudaSetDevice                                                                                                   
-    0.00%   38.918us         1  38.918us  38.918us  38.918us  cudaEventCreate                                                                                                 
-    0.00%   34.573us        31  1.1150us     279ns  12.784us  cudaDeviceGetAttribute                                                                                          
-    0.00%   17.767us         1  17.767us  17.767us  17.767us  cudaProfilerStart                                                                                               
-    0.00%   15.228us         2  7.6140us  3.5460us  11.682us  cudaConfigureCall                                                                                               
-    0.00%   14.536us         2  7.2680us  1.1490us  13.387us  cudaGetLastError                                                                                                
-    0.00%   8.6080us        26     331ns     173ns     783ns  cudaSetupArgument                                                                                               
-    0.00%   5.5470us         6     924ns     215ns  2.6780us  cuDeviceGet                                                                                                     
-    0.00%   5.4090us         6     901ns     328ns  3.3320us  cuDeviceGetCount                                                                                                
-    0.00%   4.1770us         3  1.3920us  1.0630us  1.8300us  cuDriverGetVersion                                                                                              
-    0.00%   3.4650us         3  1.1550us  1.0810us  1.2680us  cuInit                                                                                                          
+    ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler
+    ==78544== Profiling result:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]
+    26.07%  9.0957ms         1  9.0957ms  9.0957ms  9.0957ms  KeBilinearInterpBw
+    23.78%  8.2977ms         1  8.2977ms  8.2977ms  8.2977ms  KeBilinearInterpFw
+    22.55%  7.8661ms         2  3.9330ms  1.5798ms  6.2863ms  [CUDA memcpy DtoH]
+
+    ==78544== API calls:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    46.85%  682.28ms         8  85.285ms  12.639us  682.03ms  cudaStreamCreateWithFlags
+    39.83%  580.00ms         4  145.00ms     302ns  550.27ms  cudaFree
+    9.82%   143.03ms         9  15.892ms  8.7090us  142.78ms  cudaStreamCreate
+    1.23%   17.983ms         7  2.5690ms  23.210us  6.4563ms  cudaMemcpy
+    1.23%   17.849ms         2  8.9247ms  8.4726ms  9.3768ms  cudaStreamSynchronize
+    0.66%   9.5969ms         7  1.3710ms  288.43us  2.4279ms  cudaHostAlloc
+    0.13%   1.9530ms        11  177.54us  7.6810us  591.06us  cudaMalloc
+    0.07%   1.0424ms         8  130.30us  1.6970us  453.72us  cudaGetDevice
+    0.04%   527.90us        40  13.197us     525ns  253.99us  cudaEventCreateWithFlags
+    0.03%   435.73us       348  1.2520us     124ns  42.704us  cuDeviceGetAttribute
+    0.03%   419.36us         1  419.36us  419.36us  419.36us  cudaGetDeviceCount
+    0.02%   260.75us         2  130.38us  129.32us  131.43us  cudaGetDeviceProperties
+    0.02%   222.32us         2  111.16us  106.94us  115.39us  cudaLaunch
+    0.01%   214.06us         4  53.514us  28.586us  77.655us  cuDeviceGetName
+    0.01%   115.45us         4  28.861us  9.8250us  44.526us  cuDeviceTotalMem
+    0.01%   83.988us         4  20.997us     578ns  77.760us  cudaSetDevice
+    0.00%   38.918us         1  38.918us  38.918us  38.918us  cudaEventCreate
+    0.00%   34.573us        31  1.1150us     279ns  12.784us  cudaDeviceGetAttribute
+    0.00%   17.767us         1  17.767us  17.767us  17.767us  cudaProfilerStart
+    0.00%   15.228us         2  7.6140us  3.5460us  11.682us  cudaConfigureCall
+    0.00%   14.536us         2  7.2680us  1.1490us  13.387us  cudaGetLastError
+    0.00%   8.6080us        26     331ns     173ns     783ns  cudaSetupArgument
+    0.00%   5.5470us         6     924ns     215ns  2.6780us  cuDeviceGet
+    0.00%   5.4090us         6     901ns     328ns  3.3320us  cuDeviceGetCount
+    0.00%   4.1770us         3  1.3920us  1.0630us  1.8300us  cuDriverGetVersion
+    0.00%   3.4650us         3  1.1550us  1.0810us  1.2680us  cuInit
    0.00%      830ns         1     830ns     830ns     830ns  cudaRuntimeGetVersion



--- a/doc/howto/optimization/index_en.rst
+++ b/doc/howto/optimization/index_en.rst
-How to Tune GPU Performance
-===========================
-
-.. toctree::
-  :maxdepth: 3
-
-  gpu_profiling_en.rst
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
+```eval_rst
+.. _cluster_train:
+```
+
+# 运行分布式训练
+
+在本文中，我们将阐释如何在集群上运行分布式 Paddle 训练作业。我们将以[推荐系统](https://github.com/baidu/Paddle/tree/develop/demo/recommendation)为例，创建单进程训练示例的分布式版本。
+
+在本文中使用的[脚本](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train)通过 SSH 运行分布式作业。 它们还可以供那些运行更复杂的集群管理系统（如 MPI 和 [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/k8s) ）的用户参考。
+
+## 前提条件
+
+1. 上述脚本使用 Python 库 [fabric](http://www.fabfile.org/) 来运行 SSH 命令。 我们使用 `pip` 来安装 fabric:
+
+   ```bash
+   pip install fabric
+   ```
+
+2. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，需要在 `/usr/local/cuda` 中安装 CUDA; 否则 Paddle 将在运行时报错。
+
+3. 在 `cluster_train/conf.py` 中设置 `ROOT_DIR`，该 ROOT_DIR 要在所有节点上存在。为了方便起见，我们通常在所有节点上创建一个 Unix 用户 `paddle`，并设置 `ROOT_DIR=/home/paddle`。这样，我们可以将 SSH 公钥写入 `/home/paddle/.ssh/authorized_keys`，以便用户 `paddle` 可以 SSH 到所有节点而不用密码。
+
+## 准备工作空间
+
+我们将放置依赖库、配置等文件的目录视为 *工作空间（workspace）*。
+
+这些 `train/test` 数据应该在启动集群作业之前准备好。 为了满足训练/测试数据放置在与工作空间不同的目录中的要求，PADDLE 根据在模型配置文件中使用的名为 `train.list/test.list` 的索引文件引用训练/测试数据，所以训练/测试数据也包含 train.list/test.list 两个列表文件。所有本地训练 demo 已经提供了脚本来帮助您创建这两个文件，并且集群作业中的所有节点将在正常情况下处理具有相同逻辑代码的文件。
+
+通常，你可以使用本地训练中的相同模型文件进行集群训练。请记住，在模型文件的 `setting`函数中设置的 `batch_size` 表示在集群作业**每个**节点中的 batch 大小，而不是使用同步 SGD 的总 batch 大小。
+
+以下步骤基于 demo 目录中的 [demo/recommendation](https://github.com/PaddlePaddle/Paddle/tree/develop/demo/recommendation)。
+
+你只需完成 demo/recommendation 教程文档到 `Train` 的部分，之后你会得到训练/测试数据和模型配置文件。最后，只需使用 demo/recommendation 作为集群训练的工作空间。
+
+最后，你的工作空间应如下所示：
+```
+.
+|-- common_utils.py
+|-- data
+|   |-- config.json
+|   |-- config_generator.py
+|   |-- meta.bin
+|   |-- meta_config.json
+|   |-- meta_generator.py
+|   |-- ml-1m
+|   |-- ml_data.sh
+|   |-- ratings.dat.test
+|   |-- ratings.dat.train
+|   |-- split.py
+|   |-- test.list
+|   `-- train.list
+|-- dataprovider.py
+|-- evaluate.sh
+|-- prediction.py
+|-- preprocess.sh
+|-- requirements.txt
+|-- run.sh
+`-- trainer_config.py
+```
+虽然集群训练并不需要所有这些文件，但是也没有必要删除无用的文件。
+
+`trainer_config.py`
+表示模型配置文件。
+
+`train.list` 和 `test.list`
+文件索引。它存储当前节点所有训练/测试数据的所有相对或绝对文件路径。
+
+`dataprovider.py`
+用于读取训练/测试样本。这与本地训练相同。
+
+`data`
+数据目录中的所有文件被 train.list/test.list 引用。
+
+
+## 准备集群作业配置
+
+以下选项必须在 cluster_train/conf.py 中认真设置
+
+`HOSTS`  所有节点运行集群作业的主机名或 IP 。你还可以将用户和 ssh 端口附加到主机名上，例如 root@192.168.100.17:9090。
+
+`ROOT_DIR` 用于放置 JOB 工作空间目录的工作空间 ROOT 目录
+
+`PADDLE_NIC` 集群通信通道的 NIC(Network Interface Card, 网络接口卡) 接口名称，例如以太网的 eth0，infiniband 的 ib0。
+
+`PADDLE_PORT` 集群通信通道的端口号
+
+`PADDLE_PORTS_NUM` 用于集群通信通道的端口数。 如果集群节点数量少（少于5〜6个节点），建议将其设置为较大，如2〜8，以获得更好的网络性能。
+
+`PADDLE_PORTS_NUM_FOR_SPARSE` 用于 sparse remote updater 集群通信信道的端口数。如果使用 sparse remote update，则可以像 `PADDLE_PORTS_NUM` 一样设置。
+
+`LD_LIBRARY_PATH` 为集群作业设置额外的 LD_LIBRARY_PATH。你可以使用它来设置 CUDA 库的路径。
+
+默认配置如下：
+
+```python
+HOSTS = [
+        "root@192.168.100.17",
+        "root@192.168.100.18",
+        ]
+
+'''
+工作空间配置
+'''
+
+#工作空间根目录
+ROOT_DIR = "/home/paddle"
+
+'''
+网络配置
+'''
+#pserver NIC
+PADDLE_NIC = "eth0"
+#pserver 端口
+PADDLE_PORT = 7164
+#pserver 端口数
+PADDLE_PORTS_NUM = 2
+#pserver sparse ports num
+PADDLE_PORTS_NUM_FOR_SPARSE = 2
+
+#集群作业中所有进程的环境设置
+LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lib64"
+```
+
+### 启动集群作业
+`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下，所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。
+
+`paddle.py` 为方便作业启动提供了两个独特的命令选项。
+
+`job_dispatch_package`  设为本地 `workspace` 目录，它将被分发到 conf.py 中设置的所有节点。  它可以为频繁修改和访问工作空间文件的用户减轻负担，否则频繁的多节点工作空间部署可能会很麻烦。
+`job_workspace`  设为已部署的工作空间目录，`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。
+
+`cluster_train/run.sh` 提供了命令样例来运行 `demo/recommendation` 集群工作，只需用你定义的目录修改 `job_dispatch_package` 和 `job_workspace`，然后：
+```
+sh run.sh
+```
+
+集群作业将会在几秒后启动。
+
+### 终止集群作业
+`paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。
+
+### 检查集群训练结果
+详细信息请检查 $workspace/log 里的日志，每一个节点都有相同的日志结构。
+
+`paddle_trainer.INFO`
+提供几乎所有训练的内部输出日志，与本地训练相同。可以在这里检查运行时模型的收敛情况。
+
+`paddle_pserver2.INFO`
+提供 pserver 运行日志，有助于诊断分布式错误。
+
+`server.log`
+提供 pserver 进程的 stderr 和 stdout。训练失败时可以检查错误日志。
+
+`train.log`
+提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。
+
+### 检查模型输出
+运行完成后，模型文件将被写入节点 0 的 `output` 目录中。
+工作空间中的 `nodefile` 表示当前集群作业的节点 ID。
--- a/doc/howto/cluster/cluster_train_en.md
+++ b/doc/howto/cluster/cluster_train_en.md
-# How to Run Distributed Training
+# Run Distributed Training

 In this article, we explain how to run distributed Paddle training jobs on clusters.  We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation).

-[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH.  They also work as a reference for users running more sophisticated cluster management systems like MPI and Kubernetes.
+[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH.  They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/k8s).

 ## Prerequisite

@@ -20,13 +20,13 @@ In this article, we explain how to run distributed Paddle training jobs on clust

 We refer to the directory where we put dependent libraries, config files, etc., as *workspace*.

-These ```train/test``` data should be prepared before launching cluster job. To  satisfy the requirement that train/test data are placed in different directory from workspace, PADDLE refers train/test data according to index file named as ```train.list/test.list``` which are used in model config file. So the train/test data also contains train.list/test.list two list file. All local training demo already provides scripts to help you create these two files,  and all nodes in cluster job will handle files with same logical code in normal condition.
+These `train/test` data should be prepared before launching the cluster job. To satisfy the requirement that train/test data are placed in a different directory from the workspace, PADDLE refers to train/test data through the index files named `train.list/test.list`, which are used in the model config file. So the train/test data also includes the two list files train.list/test.list. All local training demos already provide scripts to help you create these two files, and all nodes in the cluster job will handle files with the same logical code in normal conditions.

-Generally, you can use same model file from local training for cluster training. What you should have in mind that, the ```batch_size``` set in ```setting``` function in model file means batch size in ```each``` node of cluster job instead of total batch size if synchronization SGD was used.
+Generally, you can use the same model file from local training for cluster training. Keep in mind that the `batch_size` set in the `setting` function in the model file means the batch size on **each** node of the cluster job, not the total batch size, if synchronous SGD is used.

-Following steps are based on demo/recommendation demo in demo directory.
+Following steps are based on [demo/recommendation](https://github.com/PaddlePaddle/Paddle/tree/develop/demo/recommendation) demo in demo directory.

-You just go through demo/recommendation tutorial doc until ```Train``` section, and at last you will get train/test data and model configuration file. Finaly, just use demo/recommendation as workspace for cluster training.
+You just go through the demo/recommendation tutorial doc until the `Train` section, and at last you will get train/test data and the model configuration file. Finally, just use demo/recommendation as the workspace for cluster training.

 At last your workspace should look like as follow:
 ```
@@ -55,16 +55,16 @@ At last your workspace should look like as follow:
 ```
 Not all of these files are needed for cluster training, but it's not necessary to remove useless files.

-```trainer_config.py```
+`trainer_config.py`
 Indicates the model config file.

-```train.list``` and ```test.list```
+`train.list` and `test.list`
 File index. It stores all relative or absolute file paths of all train/test data at current node.

-```dataprovider.py```
+`dataprovider.py`
 used to read train/test samples. It's same as local training.

-```data```
+`data`
 all files in data directory are refered by train.list/test.list which are refered by data provider.


@@ -72,19 +72,19 @@ all files in data directory are refered by train.list/test.list which are refere

 The options below must be carefully set in cluster_train/conf.py

-```HOSTS```  all nodes hostname or ip that will run cluster job. You can also append user and ssh port with hostname, such as root@192.168.100.17:9090.
+`HOSTS`  all nodes hostname or ip that will run cluster job. You can also append user and ssh port with hostname, such as root@192.168.100.17:9090.

-```ROOT_DIR``` workspace ROOT directory for placing JOB workspace directory
+`ROOT_DIR` workspace ROOT directory for placing JOB workspace directory

-```PADDLE_NIC``` the NIC(Network Interface Card) interface name for cluster communication channel, such as eth0 for ethternet, ib0 for infiniband.
+`PADDLE_NIC` the NIC(Network Interface Card) interface name for cluster communication channel, such as eth0 for ethernet, ib0 for infiniband.

-```PADDLE_PORT``` port number for cluster commnunication channel
+`PADDLE_PORT` port number for cluster communication channel

-```PADDLE_PORTS_NUM``` the number of port used for cluster communication channle. if the number of cluster nodes is small(less than 5~6nodes), recommend you set it to larger, such as 2 ~ 8, for better network performance.
+`PADDLE_PORTS_NUM` the number of ports used for the cluster communication channel. If the number of cluster nodes is small (fewer than 5~6 nodes), it is recommended to set it to a larger value, such as 2 ~ 8, for better network performance.

-```PADDLE_PORTS_NUM_FOR_SPARSE``` the number of port used for sparse updater cluster commnunication channel. if sparse remote update is used, set it like ```PADDLE_PORTS_NUM```
+`PADDLE_PORTS_NUM_FOR_SPARSE` the number of ports used for the sparse updater cluster communication channel. If sparse remote update is used, set it like `PADDLE_PORTS_NUM`.

-```LD_LIBRARY_PATH``` set addtional LD_LIBRARY_PATH for cluster job. You can use it to set CUDA libraries path.
+`LD_LIBRARY_PATH` set additional LD_LIBRARY_PATH for cluster job. You can use it to set CUDA libraries path.

 Default Configuration as follow:

@@ -118,15 +118,15 @@ LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lib64"
 ```

 ### Launching Cluster Job
-```paddle.py``` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can set as ```paddle.py``` command options and ```paddle.py``` will transparently and automatically set these options to PaddlePaddle lower level processes.
+`paddle.py` provides automated scripts to start all PaddlePaddle cluster processes on different nodes. By default, all command line options can be set as `paddle.py` command options, and `paddle.py` will transparently and automatically pass these options to the lower-level PaddlePaddle processes.

-```paddle.py```provides two distinguished command option for easy job launching.
+`paddle.py` provides two distinct command options for easy job launching.

-```job_dispatch_package```  set it with local ```workspace```directory, it will be dispatched to all nodes set in conf.py. It could be helpful for frequent hacking workspace files, otherwise frequent mulit-nodes workspace deployment could make your crazy.
-```job_workspace```  set it with already deployed workspace directory, ```paddle.py``` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy
+`job_dispatch_package`  set it to the local `workspace` directory, which will be dispatched to all nodes set in conf.py. It is helpful when you frequently modify workspace files; otherwise, frequent multi-node workspace deployment could drive you crazy.
+`job_workspace`  set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy
 dispatch latency.

-```cluster_train/run.sh``` provides command line sample to run ```demo/recommendation``` cluster job, just modify ```job_dispatch_package``` and ```job_workspace``` with your defined directory, then:
+`cluster_train/run.sh` provides command line sample to run `demo/recommendation` cluster job, just modify `job_dispatch_package` and `job_workspace` with your defined directory, then:
 ```
 sh run.sh
 ```
@@ -134,23 +134,23 @@ sh run.sh
 The cluster Job will start in several seconds.

 ### Kill Cluster Job
-```paddle.py``` can capture ```Ctrl + C``` SIGINT signal to automatically kill all processes launched by it. So just stop ```paddle.py``` to kill cluster job. You should mannally kill job if program crashed.
+`paddle.py` can capture the `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill the cluster job. You should manually kill the job if the program crashes.

 ### Check Cluster Training Result
 Check log in $workspace/log for details, each node owns same log structure.

-```paddle_trainer.INFO```
+`paddle_trainer.INFO`
 It provides almost all interal output log for training,  same as local training. Check runtime model convergence here.

-```paddle_pserver2.INFO```
+`paddle_pserver2.INFO`
 It provides pserver running log, which could help to diagnose distributed error.

-```server.log```
+`server.log`
 It provides stderr and stdout of pserver process. Check error log if training crashs.

-```train.log```
+`train.log`
 It provides stderr and stdout of trainer process. Check error log if training crashs.

 ### Check Model Output
-After one pass finished, model files will be writed in ```output``` directory in node 0.
-```nodefile``` in workspace indicates the node id of current cluster job.
+After one pass finishes, model files will be written to the `output` directory on node 0.
+`nodefile` in workspace indicates the node id of current cluster job.
--- a/doc/howto/cluster/k8s/Dockerfile
+++ b/doc/howto/cluster/k8s/Dockerfile
--- a/doc/howto/cluster/k8s/job.yaml
+++ b/doc/howto/cluster/k8s/job.yaml
--- a/doc/howto/cluster/k8s/k8s-paddle-arch.png
+++ b/doc/howto/cluster/k8s/k8s-paddle-arch.png
--- a/doc/howto/cluster/k8s/paddle_on_k8s_cn.md
+++ b/doc/howto/cluster/k8s/paddle_on_k8s_cn.md
-# Paddle On Kubernetes：单机训练
+# Kubernetes 单机训练

 在这篇文档里，我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的Paddle训练作业。在下一篇中，我们将介绍如何启动分布式训练作业。


--- a/doc/howto/cluster/k8s/distributed_training_on_k8s_cn.md
+++ b/doc/howto/cluster/k8s/distributed_training_on_k8s_cn.md
-
-# PaddlePaddle on Kubernetes：分布式训练
+# Kubernetes 分布式训练

 前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。

@@ -83,7 +82,7 @@ COPY start_paddle.py /root/
 CMD ["bash"," -c","/root/start.sh"]
 ```

-[`start.sh`](start.sh)文件拷贝训练文件到容器内，然后执行[`start_paddle.py`](start_paddle.py)脚本启动训练，前文提到的获取其他节点IP地址，分配`trainer_id`等都在`start_paddle.py`脚本中完成。
+[start.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/start.sh)文件拷贝训练文件到容器内，然后执行[start_paddle.py](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/start_paddle.py)脚本启动训练，前文提到的获取其他节点IP地址，分配`trainer_id`等都在`start_paddle.py`脚本中完成。

 `start_paddle.py`脚本开始时，会先进行参数的初始化与解析。


--- a/doc/howto/cluster/k8s/start.sh
+++ b/doc/howto/cluster/k8s/start.sh
--- a/doc/howto/cluster/k8s/start_paddle.py
+++ b/doc/howto/cluster/k8s/start_paddle.py
--- a/doc/howto/cmd_parameter/arguments_en.md
+++ b/doc/howto/cmd_parameter/arguments_en.md
--- a/doc/howto/cmd_parameter/detail_introduction_en.md
+++ b/doc/howto/cmd_parameter/detail_introduction_en.md
--- a/doc/howto/cmd_parameter/index_en.md
+++ b/doc/howto/cmd_parameter/index_en.md
 ```eval_rst
 ..  _cmd_line_index:
 ```
-# How to Set Command-line Parameters
+# Set Command-line Parameters

 * [Use Case](use_case_en.md)
 * [Arguments](arguments_en.md)

--- a/doc/howto/cmd_parameter/use_case_en.md
+++ b/doc/howto/cmd_parameter/use_case_en.md
--- a/doc/howto/concepts/src/pserver_topology.dot
+++ b/doc/howto/concepts/src/pserver_topology.dot
--- a/doc/howto/concepts/src/trainer_config.py
+++ b/doc/howto/concepts/src/trainer_config.py
--- a/doc/howto/concepts/use_concepts_cn.rst
+++ b/doc/howto/concepts/use_concepts_cn.rst
-#########################
-PaddlePaddle 基本使用概念
-#########################
+############
+基本使用概念
+############

 PaddlePaddle是一个深度学习框架，支持单机模式和多机模式。

@@ -37,7 +37,7 @@ PaddlePaddle是一个深度学习框架，支持单机模式和多机模式。

 DataProvider是PaddlePaddle系统的数据提供器，将用户的原始数据转换成系统可以识别的数据类型。每当系统需要新的数据训练时, trainer进程会调用DataProvider函数返回数据。当所有数据读取完一轮后，DataProvider返回空数据，通知系统一轮数据读取结束，并且系统每一轮训练开始时会重置DataProvider。需要注意的是，DataProvider是被系统调用，而不是新数据驱动系统，一些随机化噪声添加都应该在DataProvider中完成。

-在不同的应用里，训练数据的格式往往各不相同。因此，为了用户能够灵活的处理数据，我们提供了Python处理数据的接口，称为 `PyDataProvider`_ 。在 ``PyDataProvider`` 中，系统C++模块接管了shuffle、处理batch、GPU和CPU通信、双缓冲、异步读取等问题，一些情况下(如：``min_pool_size=0``)需要Python接口里处理shuffle，可以参考 `PyDataProvider`_ 的相关文档继续深入了解。
+在不同的应用里，训练数据的格式往往各不相同。因此，为了用户能够灵活的处理数据，我们提供了Python处理数据的接口，称为 ``PyDataProvider`` 。在 ``PyDataProvider`` 中，系统C++模块接管了shuffle、处理batch、GPU和CPU通信、双缓冲、异步读取等问题，一些情况下(如：``min_pool_size=0``)需要Python接口里处理shuffle，可以参考 :ref:`api_pydataprovider2` 继续深入了解。


 训练配置文件
@@ -50,21 +50,21 @@ DataProvider是PaddlePaddle系统的数据提供器，将用户的原始数据
 ..  literalinclude:: src/trainer_config.py
    :linenos:

-文件开头 ``from paddle.trainer_config_helpers import *`` ，是因为PaddlePaddle配置文件与C++模块通信的最基础协议是protobuf，为了避免用户直接写复杂的protobuf string，我们为用户定以Python接口来配置网络，该Python代码可以生成protobuf包，这就是`trainer_config_helpers`_的作用。因此，在文件的开始，需要import这些函数。 这个包里面包含了模型配置需要的各个模块。
+文件开头 ``from paddle.trainer_config_helpers import *`` ，是因为PaddlePaddle配置文件与C++模块通信的最基础协议是protobuf，为了避免用户直接写复杂的protobuf string，我们为用户定义Python接口来配置网络，该Python代码可以生成protobuf包，这就是 :ref:`api_trainer_config` 的作用。因此，在文件的开始，需要import这些函数。 这个包里面包含了模型配置需要的各个模块。

 下面分别介绍数据源配置、优化算法配置、网络结构配置这三部分该概念。

 数据源配置
 ----------

-使用 `PyDataProvider`_ 的函数 ``define_py_data_sources2`` 配置数据源。``define_py_data_sources2`` 里通过train_list和test_list指定是训练文件列表和测试文件列表。 如果传入字符串的话，是指一个数据列表文件。这个数据列表文件中包含的是每一个训练或者测试文件的路径。如果传入一个list的话，则会默认生成一个list文件，再传入给train.list或者test.list。
+使用 ``PyDataProvider2`` 的函数 ``define_py_data_sources2`` 配置数据源。``define_py_data_sources2`` 里通过train_list和test_list指定是训练文件列表和测试文件列表。 如果传入字符串的话，是指一个数据列表文件。这个数据列表文件中包含的是每一个训练或者测试文件的路径。如果传入一个list的话，则会默认生成一个list文件，再传入给train.list或者test.list。

-``module`` 和 ``obj`` 指定了DataProvider的文件名和返回数据的函数名。更详细的使用，请参考 `PyDataProvider`_ 。
+``module`` 和 ``obj`` 指定了DataProvider的文件名和返回数据的函数名。更详细的使用，请参考 :ref:`api_pydataprovider2` 。

 优化算法配置
 ------------

-通过 `settings`_ 接口设置神经网络所使用的训练参数和 `优化算法`_ ，包括学习率、batch_size、优化算法、正则方法等，具体的使用方法请参考 `settings`_ 文档。
+通过 :ref:`api_trainer_config_helpers_optimizers_settings` 接口设置神经网络所使用的训练参数和 :ref:`api_trainer_config_helpers_optimizers` ，包括学习率、batch_size、优化算法、正则方法等，具体的使用方法请参考 :ref:`api_trainer_config_helpers_optimizers_settings` 文档。

 网络结构配置
 ------------
@@ -82,14 +82,13 @@ DataProvider是PaddlePaddle系统的数据提供器，将用户的原始数据
 
  这个配置文件网络由 ``data_layer`` 、 ``simple_img_conv_pool`` 、 ``fc_layer`` 组成。

-  - `data_layer`_  ： 通常每个配置文件都会包括 ``data_layer`` ，定义输入数据大小。
-  - `simple_img_conv_pool`_ ：是一个组合层，包括了图像的卷积 (convolution)和池化(pooling)。
-  - `fc_layer`_ ：全连接层，激活函数为Softmax，这里也可叫分类层。
+  - :ref:`api_trainer_config_helpers_layers_data_layer`  ： 通常每个配置文件都会包括 ``data_layer`` ，定义输入数据大小。
+  - :ref:`api_trainer_config_helpers_network_simple_img_conv_pool` ：是一个组合层，包括了图像的卷积 (convolution)和池化(pooling)。
+  - :ref:`api_trainer_config_helpers_layers_fc_layer` ：全连接层，激活函数为Softmax，这里也可叫分类层。

-  
 - 损失函数和评估器：损失函数即为网络的优化目标，评估器可以评价模型结果。

-  PaddlePaddle包括很多损失函数和评估起，详细可以参考 `损失函数层`_ 和 `评估器`_ 。这里 ``classification_cost`` 默认使用多类交叉熵损失函数和分类错误率统计评估器。
+  PaddlePaddle包括很多损失函数和评估器，详细可以参考 :ref:`api_trainer_config_helpers_layers_cost_layers` 和 :ref:`api_trainer_config_helpers_evaluators` 。这里 ``classification_cost`` 默认使用多类交叉熵损失函数和分类错误率统计评估器。
  
 - ``outputs``: 标记网络输出的函数为 ``outputs`` 。

@@ -106,7 +105,7 @@ DataProvider是PaddlePaddle系统的数据提供器，将用户的原始数据
       with mixed_layer(size=200) as out:
           out += full_matrix_projection(input=data)

-PaddlePaddle 可以使用 ``mixed layer`` 配置出非常复杂的网络，甚至可以直接配置一个完整的LSTM。用户可以参考 `mixed_layer`_ 的相关文档进行配置。
+PaddlePaddle 可以使用 ``mixed layer`` 配置出非常复杂的网络，甚至可以直接配置一个完整的LSTM。用户可以参考 :ref:`api_trainer_config_helpers_layers_mixed_layer` 的相关文档进行配置。


 分布式训练
@@ -138,18 +137,3 @@ PaddlePaddle多机采用经典的 Parameter Server 架构对多个节点的 trai
 * --ports_num_for_sparse\: 一个pserver进程共绑定多少端口用来做稀疏更新，默认是0。

 使用手工指定端口数量，是因为Paddle的网络通信中，使用了 int32 作为消息长度，比较容易在大模型下溢出。所以，在 pserver 进程中可以启动多个子线程去接受 trainer 的数据，这样单个子线程的长度就不会溢出了。但是这个值不可以调的过大，因为增加这个值，对性能尤其是内存占用有一定的开销，另外稀疏更新的端口如果太大的话，很容易导致某一个参数服务器没有分配到任何参数。
-
-详细的说明可以参考，使用 `集群训练Paddle`_ 。
-
-
-..  _PyDataProvider: ../ui/data_provider/pydataprovider2.html
-.. _settings: ../../doc/ui/api/trainer_config_helpers/optimizers.html#settings
-.. _优化算法: ../../doc/ui/api/trainer_config_helpers/optimizers.html#optimizers
-.. _trainer_config_helper: ../../doc/ui/api/trainer_config_helpers/index.html
-.. _data_layer: ../../doc/ui/api/trainer_config_helpers/layers.html#data-layer
-.. _simple_img_conv_pool: ../../doc/ui/api/trainer_config_helpers/networks.html#simple-img-conv-pool
-.. _fc_layer: ../../doc/ui/api/trainer_config_helpers/layers.html#fc-layer
-.. _损失函数层: ../../doc/ui/api/trainer_config_helpers/layers.html#cost-layers
-.. _评估器: ../../doc/ui/api/trainer_config_helpers/evaluators.html
-.. _mixed_layer: ../../doc/ui/api/trainer_config_helpers/layers.html#mixed-layer
-..  _集群训练Paddle: ../cluster/index.html
--- a/doc/conf.py.cn.in
+++ b/doc/conf.py.cn.in
--- a/doc/conf.py.en.in
+++ b/doc/conf.py.en.in
--- a/doc/tutorials/index_cn.md
+++ b/doc/tutorials/index_cn.md
-# TUTORIALS
-There are several examples and demos here.
+# 完整教程

-## Quick Start
+* [快速入门](quick_start/index_cn.rst)
+* [个性化推荐](rec/ml_regression_cn.rst)
+* [情感分析](sentiment_analysis/index_cn.md)
+* [语义角色标注](semantic_role_labeling/index_cn.md)
+* [机器翻译](text_generation/index_cn.md)

-* [Quick Start](quick_start/index_cn.rst)
+## 常用模型

-## Image
-
-* TBD
-
-## NLP
-
-* [Sentiment Analysis](sentiment_analysis/index_cn.md)
-* [Semantic Role Labeling](semantic_role_labeling/index_cn.rst)
-
-## Recommendation
-
-* TBD
-
-## Model Zoo
-
-* TBD
+* [ResNet模型](imagenet_model/resnet_model_cn.md)
--- a/doc/tutorials/index_en.md
+++ b/doc/tutorials/index_en.md
 # TUTORIALS
 There are several examples and demos here.

-## Quick Start
-
 * [Quick Start](quick_start/index_en.md)
-
-## Image
-
+* [MovieLens Regression](rec/ml_regression_en.rst)
 * [Image Classification](image_classification/index_en.md)
-
-## NLP
-
 * [Sentiment Analysis](sentiment_analysis/index_en.md)
-* [Text Generation](text_generation/index_en.md)
 * [Semantic Role Labeling](semantic_role_labeling/index_en.md)
-
-## Recommendation
-
-* [MovieLens Dataset](rec/ml_dataset_en.md)
-* [MovieLens Regression](rec/ml_regression_en.rst)
+* [Text Generation](text_generation/index_en.md)

 ## Model Zoo
 * [ImageNet: ResNet](imagenet_model/resnet_model_en.md)

--- a/doc/tutorials/quick_start/index_cn.rst
+++ b/doc/tutorials/quick_start/index_cn.rst
-PaddlePaddle快速入门教程
-========================
+=============
+快速入门教程
+=============

 我们将以 `文本分类问题 <https://en.wikipedia.org/wiki/Document_classification>`_ 为例,
 介绍PaddlePaddle的基本使用方法。
@@ -7,7 +8,7 @@ PaddlePaddle快速入门教程
 安装
 ====

-请参考 `安装教程 <../../build_and_install/index.html>`_ 安装PaddlePaddle。
+请参考 :ref:`install_steps` 安装PaddlePaddle。

 使用概述
 ========
@@ -59,7 +60,7 @@ PaddlePaddle快速入门教程
 Python脚本读取数据
 ------------------

-`DataProvider <../../ui/data_provider/index.html>`_ 是PaddlePaddle负责提供数据的模块。``DataProvider`` 主要职责在于将训练数据传入内存或者显存，让模型能够得到训练更新，其包括两个函数：
+``DataProvider`` 是PaddlePaddle负责提供数据的模块，主要职责在于将训练数据传入内存或者显存，让模型能够得到训练更新，其包括两个函数：

 * initializer：PaddlePaddle会在调用读取数据的Python脚本之前，先调用initializer函数。在下面例子里，我们在initializer函数里初始化词表，并且在随后的读取数据过程中填充词表。
 * process：PaddlePaddle调用process函数来读取数据。每次读取一条数据后，process函数会用yield语句输出这条数据，从而能够被PaddlePaddle 捕获 (harvest)。
@@ -72,6 +73,7 @@ Python脚本读取数据
     :linenos:
     :emphasize-lines: 8,33

+详细内容请参见 :ref:`api_dataprovider` 。

 配置中的数据加载定义
 --------------------
@@ -92,7 +94,7 @@ Python脚本读取数据
 - obj="process": 指定生成数据的函数
 - args={"dictionary": word_dict}: 额外的参数，这里指定词典

-更详细数据格式和用例请参考 `PyDataProvider2 <../../ui/data_provider/pydataprovider2.html>`_ 。
+更详细数据格式和用例请参考 :ref:`api_pydataprovider2` 。

 模型网络结构
 ============
@@ -104,7 +106,7 @@ Python脚本读取数据
        :scale: 80%


-我们将以最基本的逻辑回归网络作为起点，并逐渐展示更加深入的功能。更详细的网络配置连接请参考 `Layer文档 <../../../doc/layer.html>`_ 。
+我们将以最基本的逻辑回归网络作为起点，并逐渐展示更加深入的功能。更详细的网络配置连接请参考 :ref:`api_trainer_config_helpers_layers` 。
 所有配置都能在 `源代码 <https://github.com/PaddlePaddle/Paddle>`_ 的 ``demo/quick_start`` 目录下找到。

 逻辑回归模型
@@ -305,7 +307,7 @@ Momentum, RMSProp，AdaDelta，AdaGrad，ADAM，Adamax等，这里采用Adam优
        --num_passes=15 \
        --use_gpu=false

-这里只简单介绍了单机训练，如何进行分布式训练，可以参考教程 `分布式训练 <../../cluster/index.html>`_ 。
+这里只简单介绍了单机训练，如何进行分布式训练，请参考 :ref:`cluster_train` 。

 预测
 =====
@@ -317,7 +319,7 @@ Momentum, RMSProp，AdaDelta，AdaGrad，ADAM，Adamax等，这里采用Adam优
    :scale: 80%

 之前配置文件中 ``test.list`` 指定的数据将会被测试，这里直接通过预测脚本 ``predict.sh`` 进行预测,
-更详细的说明，可以参考 `Python API预测 <../../ui/predict/swig_py_paddle.html>`_ 教程。
+更详细的说明，请参考 :ref:`api_swig_py_paddle` 。

    .. code-block:: bash

@@ -372,7 +374,7 @@ Momentum, RMSProp，AdaDelta，AdaGrad，ADAM，Adamax等，这里采用Adam优

 默认一个pass保存一次模型，也可以通过saving_period_by_batches设置每隔多少batch保存一次模型。
 可以通过show_parameter_stats_period设置打印参数信息等。
-其他参数请参考 `命令行参数文档 <../../ui/index.html#command-line-argument>`_ 。
+其他参数请参考 命令行参数文档（链接待补充）。

 输出日志
 ---------

--- a/doc/tutorials/quick_start/index_en.md
+++ b/doc/tutorials/quick_start/index_en.md
@@ -159,7 +159,7 @@ define_py_data_sources2(train_list='data/train.list',
 You can refer to the following link for more detailed examples and data formats: <a href = "../../api/data_provider/pydataprovider2_en.html">PyDataProvider2</a>.

 ## Network Architecture
-You will describe four kinds of network architectures in this section.
+We will describe four kinds of network architectures in this section.
 <center> ![](./src/PipelineNetwork_en.jpg) </center>

 First, you will build a logistic regression model. Later, you will also get chance to build other more powerful network architectures.
@@ -391,7 +391,7 @@ paddle train \
 --use_gpu=false
 ```

-We do not provide examples on how to train on clusters here. If you want to train on clusters, please follow the <a href = "../../howto/cluster/cluster_train_en.html">distributed training</a> documentation or other demos for more details.
+We do not provide examples on how to train on clusters here. If you want to train on clusters, please follow the <a href = "../../howto/usage/cluster/cluster_train_en.html">distributed training</a> documentation or other demos for more details.

 ## Inference
 You can use the trained model to perform prediction on the dataset with no labels. You can also evaluate the model on dataset with labels to obtain its test accuracy.
@@ -509,7 +509,7 @@ The scripts of data downloading, network configurations, and training scrips are
 * \--config_args：Other configuration arguments.
 * \--init_model_path：The path of the initial model parameter.

-By default, the trainer will save model every pass. You can also specify `saving_period_by_batches` to set the frequency of batch saving. You can use `show_parameter_stats_period` to print the statistics of the parameters, which are very useful for tuning parameters. Other command line arguments can be found in <a href = "../../howto/cmd_parameter/index_en.html">command line argument documentation</a>。
+By default, the trainer will save the model every pass. You can also specify `saving_period_by_batches` to set the frequency of batch saving. You can use `show_parameter_stats_period` to print the statistics of the parameters, which are very useful for tuning parameters. Other command line arguments can be found in <a href = "../../howto/usage/cmd_parameter/index_en.html">command line argument documentation</a>.

 ### Log


--- a/doc/tutorials/rec/ml_dataset_cn.md
+++ b/doc/tutorials/rec/ml_dataset_cn.md
+```eval_rst
+.. _demo_ml_dataset:
+
+```
+
+# MovieLens数据集
+
+[MovieLens 数据集](http://grouplens.org/datasets/movielens/)由GroupLens Research实验室搜集整理。
+该数据集包含一些用户信息、电影信息以及电影评分\[1-5\]。根据数据量规模，该数据集有很多不同的版本。
+我们用[MovieLens 百万数据集](http://files.grouplens.org/datasets/movielens/ml-1m.zip)作为示例数据
+集，其中包含6,000位用户对4,000部电影的1,000,000条评价。该数据集于2003年2月发布。
+
+## 数据集特征
+
+在[ml-1m 数据集](http://files.grouplens.org/datasets/movielens/ml-1m.zip)中有许多的特征。
+在[ml-1m 数据集](http://files.grouplens.org/datasets/movielens/ml-1m.zip)中的这些数据文件(含有".dat"的后缀)实际上是CSV文件，
+分隔符为"::"。以下我们翻译数据集网站中README文件的描述:
+
+### 评分文件描述(ratings.dat)
+
+
+所有的评分数据都包含在"ratings.dat"文件中，遵循如下的格式:
+
+用户ID::电影ID::评分::时间戳
+
+- 用户ID范围从1到6040
+- 电影ID范围从1到3952
+- 评分被调整为5星的规模(只允许整数的星级)
+- 时间戳表示为从1970-01-01(UTC)来的秒数，与time(2)的返回值一致
+- 每位用户至少有20条评分
+
+### 用户文件描述(users.dat)
+
+所有的用户信息都包含在"users.dat"文件中，遵循如下的格式:
+
+用户ID::性别::年龄::职业::邮编
+
+所有的人口统计学信息由用户自愿提供，没有进行正确性的检查。只有含有人
+口统计学信息的用户才被包含在数据集中。
+
+- 性别，用"M"表示男性，"F"表示女性
+- 年龄从下列列表范围中选取:
+
+	*   1:	"18岁以下"
+	*  18:	"18-24岁"
+	*  25:	"25-34岁"
+	*  35:	"35-44岁"
+	*  45:	"45-49岁"
+	*  50:	"50-55岁"
+	*  56:	"56+"
+
+- 职业从下面所列中选择:
+
+	*   0:  "其他"或不确定
+	*   1:  "学术/教育工作者"
+	*   2:  "艺术家"
+	*   3:  "文书工作/管理员"
+	*   4:  "大学生/研究生"
+	*   5:  "客户服务"
+	*   6:  "医生/医疗保健"
+	*   7:  "行政工作/管理人员"
+	*   8:  "农民"
+	*   9:  "操持家务者"
+	*  10:  "高中毕业生"
+	*  11:  "律师"
+	*  12:  "程序员"
+	*  13:  "退休人员"
+	*  14:  "销售/市场"
+	*  15:  "科学家"
+	*  16:  "自由职业者"
+	*  17:  "技术员/工程师"
+	*  18:  "推销员/手工艺者"
+	*  19:  "无业人士"
+	*  20:  "作家"
+
+### 电影文件描述(movies.dat)
+
+所有的电影信息都包含在"movies.dat"文件中，遵循如下的格式:
+
+电影ID::电影名称::电影类型
+
+- 电影名称（包括发行时间）与IMDB网站提供的一致
+- 电影类型如符合多种用管道符号|分割，选自下列类型:
+
+	*	动作片
+	*	冒险片
+	*	动画片
+	*	儿童片
+	*	喜剧片
+	*	犯罪片
+	*	纪录片
+	*	戏剧
+	*	奇幻片
+	*	黑色电影
+	*	恐怖片
+	*	音乐剧
+	*	悬疑片
+	*	浪漫片
+	*	科幻片
+	*	惊险电影
+	*	战争片
+	*	西部片
+
+- 由于意外的副本记录和测试记录，有些电影ID可能与实际电影不相符合
+- 电影大部分是手工输入数据，因此可能会有一些错误和不一致发生
--- a/doc/tutorials/rec/ml_regression_cn.rst
+++ b/doc/tutorials/rec/ml_regression_cn.rst
+MovieLens数据集评分回归模型
+===========================
+
+这里我们在MovieLens数据集上描述一种 **余弦相似度回归** 任务。
+该示例将展示paddle如何进行词向量嵌入，处理相似度回归，针对文本
+的单词级别的卷积神经网络，以及paddle如何处理多种类型的输入。
+需要注意的是，该模型网络只是用于进行demo展示paddle如何工作，而
+没有进行结构的微调。
+
+
+**我们非常欢迎您用PADDLEPADDLE构建更好的示例，如果您有好的建议来
+让这个示例变得更好，希望能让我们知晓。**
+
+数据准备
+`````````
+下载并解压数据集
+'''''''''''''''''
+这里我们使用 :ref:`demo_ml_dataset` 。
+要下载和解压数据集，只需要简单的运行下面的命令即可。
+
+.. code-block:: bash
+
+	cd demo/recommendation/data
+	./ml_data.sh
+
+:code:`demo/recommendation/data/ml-1m` 的目录结构为:
+
+.. code-block:: text
+
+	+--ml-1m
+		+--- movies.dat 	# 电影特征
+		+--- ratings.dat 	# 评分
+		+--- users.dat 		# 用户特征
+		+--- README 		# 数据集描述
+
+字段配置文件
+'''''''''''''
+**字段配置文件** 用来具体说明数据集的字段和文件格式，
+例如，说明每个特征文件具体字段是 **什么** 类型。
+
+ml-1m的字段配置文件在目录 :code:`demo/recommendation/data/config.json` 中。
+其具体说明了字段类型和文件名称:
+
+1) 用户文件中有四种类型的字段\: 编号，性别，年龄和职业；
+
+2) 文件名称为"users.dat"，文件的分隔符为"::"。
+
+.. include:: ../../../demo/recommendation/data/config.json
+   :code: json
+   :literal:
+
+准备数据
+`````````
+你需要安装python的第三方库。
+**强烈推荐使用VIRTUALENV来创造一个干净的python环境。**
+
+.. code-block:: bash
+
+	pip install -r requirements.txt
+
+预处理数据一般的命令为:
+
+.. code-block:: bash
+
+	cd demo/recommendation
+	./preprocess.sh
+
+下面介绍预处理过程具体的步骤。
+
+提取电影或用户的特征并生成python对象
+'''''''''''''''''''''''''''''''''''''
+
+在movielens 1m数据集中，电影和用户有许多的特征。
+评分文件的每一行仅仅提供电影或用户的编号来代表相应的电影或用户。
+我们首先处理电影或用户的特征文件，然后用pickle命令将特征( **Meta** )对象存储为文件。
+
+Meta配置文件
+.............
+
+**Meta配置文件** 用来具体描述 **如何** 解析数据集中的每一个字段。
+该文件可以从字段配置文件生成，或是手动编辑生成。文件的格式可以
+为json或yaml格式。解析器能通过文件的扩展名自动识别文件的格式。
+
+要将字段配置文件转化为meta配置文件，只需要运行：
+
+.. code-block:: bash
+
+	cd demo/recommendation/data
+	python config_generator.py config.json > meta_config.json
+
+生成的meta配置文件如下所示：
+
+.. include:: ../../../demo/recommendation/data/meta_config.json
+	:code: json
+	:literal:
+
+在meta文件中有两种特征\: 电影和用户。
+
+* 在电影文件movies.dat中
+	* 我们仅用"::"来分隔每一行
+	* pos 0 代表编号
+	* pos 1 特征：
+		* name是电影名
+		* 利用正则表达式来解析该特征
+		* 基于字母的词嵌入特征
+		* 是序列
+	* pos 2 特征：
+		* name是体裁
+		* type是one hot稠密向量
+		* dictionary由解析自动生成，每一个key由'|'分隔
+* 在用户文件users.dat中
+	* 我们仅用"::"来分隔每一行
+	* pos 0 代表编号
+	* pos 1 特征：
+		* name是性别
+		* 简单的基于字母的词嵌入
+	* pos 2 特征：
+		* name是年龄
+		* 是整个的词嵌入
+		* 嵌入编号会根据单词排序
+	* pos 3 特征：
+		* name是职业
+		* 简单的整个词嵌入
+
+
+Meta文件
+''''''''
+
+有了meta配置文件之后，我们可以生成 **Meta文件** ，该文件是python的pickle对象，
+存储着电影或用户信息。可以运行下面的命令来生成。
+
+.. code-block:: bash
+
+	python meta_generator.py ml-1m meta.bin --config=meta_config.json
+
+meta文件 :code:`meta.bin` 的结构如下：
+
+.. code-block:: text
+
+    +--+ movie
+    |      +--+ __meta__
+    |      |       +--+ raw_meta  # 每个特征的meta配置。列表
+    |      |       |       +
+    |      |       |       |     # 编号字段，我们用编号作为key 
+    |      |       |       +--+ {'count': 3883, 'max': 3952, 'is_key': True, 'type': 'id', 'min': 1}
+    |      |       |       |
+    |      |       |       |     # 电影名字段，嵌入特征字典
+    |      |       |       +--+ {'dict': [ ... ], 'type': 'embedding', 'name': 'title', 'seq': 'sequence'}
+    |      |       |       |
+    |      |       |       |     # 体裁字段，体裁字典
+    |      |       |       +--+ {'dict': [ ... ], 'type': 'one_hot_dense', 'name': 'genres'}
+    |      |       |
+    |      |       +--+ feature_map [1, 2] # a list for raw_meta index for feature field.
+    |      |                               # it means there are 2 features for each key.
+    |      |                               #    * 0 offset of feature is raw_meta[1], Title.
+    |      |                               #    * 1 offset of feature is raw_meta[2], Genres.
+    |      |
+    |      +--+ 1 # 电影1的特征
+    |      |    +
+    |      |    +---+ [[...], [...]] # title ids, genres dense vector
+    |      |
+    |      +--+ 2
+    |      |
+    |      +--+ ...
+    |
+    +--- user
+           +--+ __meta__
+           |       +
+           |       +--+ raw_meta
+           |       |       +
+           |       |       +--+ id field as user
+           |       |       |
+           |       |       +--+ {'dict': ['F', 'M'], 'type': 'embedding', 'name': 'gender', 'seq': 'no_sequence'}
+           |       |       |
+           |       |       +--+ {'dict': ['1', '18', '25', '35', '45', '50', '56'], 'type': 'embedding', 'name': 'age', 'seq': 'no_sequence'}
+           |       |       |
+           |       |       +--+ {'dict': [...], 'type': 'embedding', 'name': 'occupation', 'seq': 'no_sequence'}
+           |       |
+           |       +--+ feature_map [1, 2, 3]
+           |
+           +--+ 1 # 用户1的特征
+           |
+           +--+ 2
+           +--+ ...
+
+
+分割训练/测试文件
+''''''''''''''''''
+
+我们将 :code:`ml-1m/ratings.dat` 文件分割为训练和测试文件。分割文件的方法是：对于每位用户，我们将评分分成两部分。
+这样的话，测试文件中的每位用户在训练文件中也都会有一些评分信息。
+
+用 :code:`split.py` 来分离训练和测试文件。
+
+.. code-block:: bash
+
+	python split.py ml-1m/ratings.dat --delimiter="::" --test_ratio=0.1
+
+这样就会生成两个文件： :code:`ml-1m/ratings.dat.train` 和 :code:`ml-1m/ratings.dat.test` 。
+将他们移动到目录 :code:`data` ，然后进行随机打乱，再为paddle的训练过程提供文件列表。
+
+..  code-block:: bash
+
+    shuf ml-1m/ratings.dat.train > ratings.dat.train
+    cp ml-1m/ratings.dat.test .
+    echo "./data/ratings.dat.train" > train.list
+    echo "./data/ratings.dat.test" > test.list
+
+
+神经网络结构配置
+`````````````````
+
+训练器配置文件
+'''''''''''''''
+
+网络结构如下图所示：
+
+..  image:: rec_regression_network.png
+    :align: center
+    :alt: rec_regression_network
+
+该示例的神经网络配置文件 :code:`trainer_config.py` 如下所示：
+
+..  literalinclude:: ../../../demo/recommendation/trainer_config.py
+    :language: python
+    :lines: 15-
+
+在文件 :code:`trainer_config.py` 中，我们仅仅是将每个特征种类映射到一个特征向量中，以下
+展示了如何将每个特征映射到一个向量。
+
+* :code:`id` \: 仅仅是简单的嵌入，然后添加一个全连接层。
+* :code:`embedding` \:
+    - 如果是序列，则先做嵌入，然后再做一次文本卷积网络操作，
+      然后得到平均采样的结果。
+    - 如果不是序列，则先做嵌入，然后添加一个全连接层。
+* :code:`one_hot_dense` \:
+    - 仅仅是两个全连接层。
+
+然后我们利用多输入的 :code:`fc_layer` 全连接层将电影的每个特征结合成一个电影特征，
+并且对用户的特征做同样的操作，也得到一个用户特征。然后我们求这两个特征的余弦相似度。
+
+在这些网络中，我们用到了以下一些 :ref:`api_trainer_config` 中的接口。
+
+*  数据层， :ref:`api_trainer_config_helpers_layers_data_layer`
+*  全连接层， :ref:`api_trainer_config_helpers_layers_fc_layer`
+*  嵌入层， :ref:`api_trainer_config_helpers_layers_embedding_layer`
+*  文本投影层， :ref:`api_trainer_config_helpers_layers_context_projection`
+*  采样层， :ref:`api_trainer_config_helpers_layers_pooling_layer`
+*  余弦相似度层， :ref:`api_trainer_config_helpers_layers_cos_sim`
+*  文本卷积采样层， :ref:`api_trainer_config_helpers_network_text_conv_pool`
+*  声明Python数据源， :ref:`api_trainer_config_helpers_data_sources` 
+
+数据提供脚本
+'''''''''''''
+
+..  literalinclude:: ../../../demo/recommendation/dataprovider.py
+    :language: python
+    :lines: 15-
+
+数据提供脚本仅仅是读取meta.bin和评分文件，生成训练需要的样本。
+在脚本 :code:`dataprovider.py` 中，我们需要设置：
+
+* obj.slots\: 特征的类型和维度。
+* use_seq\: :code:`dataprovider.py` 中的数据是否为序列模式。
+* process\: 返回数据的每一条样本给 :code:`paddle` 。
+
+数据提供脚本的细节文档可以参考 :ref:`api_pydataprovider2` 。
+
+训练
+````
+
+准备好数据，配置了网络，编写好数据提供脚本后，现在我们可以开始paddle训练了。
+
+代码 :code:`run.sh` 如下：
+
+..  literalinclude:: ../../../demo/recommendation/run.sh
+    :language: bash
+    :lines: 16-
+
+该脚本仅仅是开始一个paddle训练过程，将日志写入文件 :code:`log.txt` ，然后
+打印在屏幕上。
+
+脚本 :code:`run.sh` 中的每一行命令，请参考页面 :ref:`cmd_line_index` 。
+这些参数的简短介绍如下：
+
+*  config\: 告诉paddle哪个文件是神经网络的配置文件。
+*  save_dir\: 告诉paddle将模型保存在 :code:`./output` 中。
+*  use_gpu\: 是否使用GPU，默认为不使用。
+*  trainer_count\: 一台机器上面的线程数量。
+*  test_all_data_in_one_period\: 每一个测试周期测试一次所有数据。否则，
+   每个测试周期测试 :code:`batch_size` 批次的数据。
+*  log_period\: 在训练了 :code:`log_period` 批次后打印日志。
+*  dot_period\: 在每训练 :code:`dot_period` 个批次后打印一个 :code:`.` 。
+*  num_passes\: 训练至多 :code:`num_passes` 轮。
+
+如果训练过程启动成功的话，输出应该类似如下：
+
+..  code-block:: text
+
+    I0601 08:07:22.832059 10549 TrainerInternal.cpp:157]  Batch=100 samples=160000 AvgCost=4.13494 CurrentCost=4.13494 Eval:  CurrentEval:
+
+    I0601 08:07:50.672627 10549 TrainerInternal.cpp:157]  Batch=200 samples=320000 AvgCost=3.80957 CurrentCost=3.48421 Eval:  CurrentEval:
+
+    I0601 08:08:18.877369 10549 TrainerInternal.cpp:157]  Batch=300 samples=480000 AvgCost=3.68145 CurrentCost=3.42519 Eval:  CurrentEval:
+
+    I0601 08:08:46.863963 10549 TrainerInternal.cpp:157]  Batch=400 samples=640000 AvgCost=3.6007 CurrentCost=3.35847 Eval:  CurrentEval:
+
+    I0601 08:09:15.413025 10549 TrainerInternal.cpp:157]  Batch=500 samples=800000 AvgCost=3.54811 CurrentCost=3.33773 Eval:  CurrentEval:
+    I0601 08:09:36.058670 10549 TrainerInternal.cpp:181]  Pass=0 Batch=565 samples=902826 AvgCost=3.52368 Eval:
+    I0601 08:09:46.215489 10549 Tester.cpp:101]  Test samples=97383 cost=3.32155 Eval:
+    I0601 08:09:46.215966 10549 GradientMachine.cpp:132] Saving parameters to ./output/model/pass-00000
+    I0601 08:09:46.233397 10549 ParamUtil.cpp:99] save dir ./output/model/pass-00000
+    I0601 08:09:46.233438 10549 Util.cpp:209] copy trainer_config.py to ./output/model/pass-00000
+    I0601 08:09:46.233541 10549 ParamUtil.cpp:147] fileName trainer_config.py
+
+模型被保存在 :code:`output/` 目录中。你可以在任何时候用 :code:`Ctrl-C` 来停止训练。
+
+模型评估和预测
+```````````````
+
+在训练了几个轮次以后，你可以对模型进行评估，得到最好轮次下的模型。运行下面命令即可：
+
+.. code-block:: bash
+
+    ./evaluate.sh 
+
+你将看到如下的信息：
+
+.. code-block:: text
+
+    Best pass is 00009,  error is 3.06949, which means predict get error as 0.875998002281
+    evaluating from pass output/pass-00009
+
+然后，你可以预测任何用户对于任何一部电影的评价，运行下面命令即可：
+
+..  code-block:: bash
+
+    python prediction.py 'output/pass-00009/'
+
+预测程序将读取用户的输入，然后输出预测分数。用户预测的命令行界面如下：
+
+..  code-block:: text
+
+    Input movie_id: 9
+    Input user_id: 4
+    Prediction Score is 2.56
+    Input movie_id: 8
+    Input user_id: 2
+    Prediction Score is 3.13
--- a/doc/tutorials/rec/ml_regression_en.rst
+++ b/doc/tutorials/rec/ml_regression_en.rst
@@ -36,7 +36,7 @@ And the directory structure of :code:`demo/recommendation/data/ml-1m` is:

 Field config file
 '''''''''''''''''
-**Field config file** is used to specific the fields dataset and file format,
+**Field config file** is used to specify the fields of the dataset and the file format,
 i.e., it specifies **WHAT** type each field is in each feature file.

 The field config file of ml-1m shows in :code:`demo/recommendation/data/config.json`.
@@ -188,7 +188,7 @@ Split Training/Testing files
 We split :code:`ml-1m/ratings.dat` into a training and testing file. The way to split file is for each user, we split the
 rating by two parts. So each user in testing file will have some rating information in training file.

-Use separate.py to separate the training and testing file.
+Use :code:`split.py` to separate the training and testing file.

 ..  code-block:: bash

@@ -217,7 +217,7 @@ The network structure shows below.
    :align: center
    :alt: rec_regression_network

-The demo's neural network config file "trainer_config.py" show as below.
+The demo's neural network config file :code:`trainer_config.py` is shown below.

 ..  literalinclude:: ../../../demo/recommendation/trainer_config.py
    :language: python
@@ -239,7 +239,7 @@ Then we combine each features of movie into one movie feature by a
 get one user feature. Then we calculate the cosine similarity of these two
 features.

-In these network, we use several api in :ref:`api_trainer_config` . There are
+In these networks, we use several APIs in :ref:`api_trainer_config` . There are

 *  Data Layer, :ref:`api_trainer_config_helpers_layers_data_layer`
 *  Fully Connected Layer, :ref:`api_trainer_config_helpers_layers_fc_layer`
@@ -271,19 +271,19 @@ Train

 After preparing the data, configuring the network, and writing the data provider, we can now run paddle training.

-The run.sh is shown as follow:
+The :code:`run.sh` is shown as follows:

 ..  literalinclude:: ../../../demo/recommendation/run.sh
    :language: bash
    :lines: 16-

-It just start a paddle training process, write the log to `log.txt`,
+It just starts a paddle training process, writes the log to :code:`log.txt`,
 and then prints it on screen.

 Each command line argument in :code:`run.sh`, please refer to the :ref:`cmd_line_index` page. The short description of these arguments is shown as follow.

 *  config\: Tell paddle which file is neural network configuration.
-*  save_dir\: Tell paddle save model into './output'
+*  save_dir\: Tell paddle to save the model into :code:`./output`.
 *  use_gpu\: Use gpu or not. Default is false.
 *  trainer_count\: The compute thread in one machine.
 *  test_all_data_in_one_period\: Test All Data during one test period. Otherwise,

--- a/doc/tutorials/text_generation/index_cn.md
+++ b/doc/tutorials/text_generation/index_cn.md
+# 文本生成教程 #
+
+在语言生成领域中，“序列到序列”（sequence to sequence）的方法已被证明是一种强大的模型。它可以被应用于进行机器翻译（machine translation）、query改写（query rewriting）、图像描述（image captioning）等等。
+
+本篇教程将会指导你通过训练一个“序列到序列”的神经网络机器翻译（NMT）模型来将法语翻译成英语。
+
+我们遵循 [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473) 这篇文章，其中详细说明了模型架构，以及在WMT-14数据集上得到良好表现的训练过程。本篇教程在PaddlePaddle中重现了这一良好的训练结果。
+
+我们感谢@caoying的pull request，其中定义了模型架构和solver配置。
+
+## 数据准备 ##
+### 下载与解压缩 ###
+从该链接 [http://www-lium.univ-lemans.fr/~schwenk/cslm\_joint\_paper/](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/) 下载WMT-14数据集，然后解压，并将Develop和Test数据分别放入不同的文件夹。
+
+- **Train data**: [bitexts (选择过后的)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz)
+- **Develop and Test data**: [dev 与 test 数据](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz)
+
+在Linux下，只需要简单地运行以下命令。否则你需要自己下载、解压、拆分到不同文件夹、并且分别重命名文件后缀。
+
+```bash
+cd demo/seqToseq/data
+./wmt14_data.sh
+```
+
+我们会发现数据集 `wmt14` 中包含如下表所示的3个文件夹。
+<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
+<colgroup>
+<col  class="left" />
+<col  class="left" />
+<col  class="left" />
+<col  class="left" />
+</colgroup>
+
+<thead>
+<tr>
+<th scope="col" class="left">folder name</th>
+<th scope="col" class="left">French-English parallel corpora file</th>
+<th scope="col" class="left">number of total file</th>
+<th scope="col" class="left">size</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td class="left">train_data</td>
+<td class="left">ccb2_pc30.src, ccb2_pc30.trg, etc</td>
+<td class="left">12</td>
+<td class="left">3.55G</td>
+</tr>
+
+<tr>
+<td class="left">test_data</td>
+<td class="left">ntst1213.src, ntst1213.trg</td>
+<td class="left">2</td>
+<td class="left">1636k</td>
+</tr>
+
+<tr>
+<td class="left">gen_data</td>
+<td class="left">ntst14.src, ntst14.trg</td>
+<td class="left">2</td>
+<td class="left">864k</td>
+</tr>
+</tbody>
+</table>
+<br/>
+
+- 每个文件夹都包含法语到英语的平行语料库
+- **XXX.src** 是原始法语文件；**XXX.trg** 是目标英语文件
+- **XXX.src** 和 **XXX.trg** 的行数应该一致
+- 每行都是一个法语或者英语的句子
+- **XXX.src** 和 **XXX.trg** 中任意第i行的句子之间都有着一一对应的关系
+
+### 用户自定义数据集 ###
+
+如果你想进行诸如语义转述（Paraphrasing）等其他“序列到序列”的任务，你只需要按照如下方式组织数据，并将它们放在`demo/seqToseq/data`目录下：
+
+    dataset
+      train
+        file1.src file1.trg
+        file2.src file2.trg
+        ......
+      test
+        file1.src file1.trg
+        file2.src file2.trg
+        ......
+      gen
+        file1.src file1.trg
+        file2.src file2.trg
+        ......
+  
+- 一级目录：数据集文件夹名称
+- 二级目录：train、test和gen这三个文件夹是固定的
+- 三级目录：源语言到目标语言的平行语料库文件
+  - **XXX.src** 是源语言的文件，**XXX.trg** 是目标语言的文件
+  - 文件中的每行都必须是一个句子
+  - **XXX.src** 和 **XXX.trg** 中任意第i行的句子之间都必须有着一一对应的关系
+
+## 数据预处理 ##
+### 预处理工作流程 ###
+- 将每个源语言到目标语言的平行语料库文件合并为一个文件：
+  - 合并每个 **XXX.src** 和 **XXX.trg** 文件为 **XXX**
+  - **XXX** 中的第i行 = **XXX.src** 中的第i行 + '\t' + **XXX.trg**中的第i行
+- 创建训练数据的“源字典”和“目标字典”，每个字典都有DICTSIZE个单词，包括：
+  - 词频最高的（DICTSIZE - 3）个单词
+  - 3个特殊符号
+    - `<s>`：序列的开始
+    - `<e>`：序列的结束
+    - `<unk>`：未包含在字典中的单词
+
+### 预处理命令和结果
+对数据集进行预处理的基本命令是：
+
+```bash
+cd demo/seqToseq/
+python preprocess.py -i INPUT [-d DICTSIZE] [-m]
+```
+
+- `-i INPUT`：输入的原始数据集路径
+- `-d DICTSIZE`：指定的字典单词数，如果没有设置，字典会包含输入数据集中的所有单词
+- `-m --mergeDict`：合并 “源字典”和“目标字典”，使得两个字典有相同的上下文
+
+你将会看到如下消息：
+
+    concat parallel corpora for dataset
+    build source dictionary for train data
+    build target dictionary for train data
+    dictionary size is XXX
+
+然后你只需要运行以下命令：
+
+```bash
+python preprocess.py -i data/wmt14 -d 30000
+```
+
+这将花费数分钟的时间，并且将预处理好的数据集存放在`demo/seqToseq/data/pre-wmt14`目录下。目录结构如下：
+
+    train test gen train.list test.list gen.list src.dict trg.dict
+
+- **train, test, gen**：分别包含了法语到英语的平行语料库的训练数据、测试数据和生成数据。文件夹中的每个文件的每一行包含两部分，首先是法语序列，然后是对应的英语序列。
+- **train.list, test.list, gen.list**：分别为train，test，gen文件夹中的文件列表
+- **src.dict, trg.dict**：源（法语）/目标（英语）字典，每个字典包含总共30000个单词：29997个最高频单词和3个特殊符号
+
+## 模型训练 ##
+### 简介 ###
+
+神经网络机器翻译（NMT）旨在建立一个可以被协同调至最优翻译效果的单神经元网络。近期提出的NMT模型通常都属于编解码模型（encoder–decoder models）的一种。编解码模型将一个源语句编码为一个定长的向量，然后解码器通过这个向量生成一个目标语句。
+
+在这个任务中，我们使用了一个编解码模型的扩展，它同时学习排列(align)与翻译。每当模型在翻译过程中生成了一个单词，它就会在源语句中搜索出最相关信息的位置的集合。解码器根据上下文向量预测出一个目标单词，这个向量与源中搜索出的位置和所有之前生成的目标单词有关。如想了解更多详细的解释，可以参考 [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473)。
+
+这个模型对于编解码模型来说，最不同的特色是它并没有将输入语句编码为一个单独的定长向量。相反，它将输入语句编码为向量的序列，其中每个向量对应输入语句中的一个元素。然后在解码被翻译的语句时，会自适应地从这些向量中选择一个子集出来。这使得NMT模型得以解放出来，不必再将任意长度源语句中的所有信息压缩至一个定长的向量中。该模型在长语句翻译的场景下效果提升更加明显，在任意长度语句翻译的场景下都可以观察到其效果的提升。
+<center>![](./encoder-decoder-attention-model.png)</center>
+<center>Figure 1. Encoder-Decoder-Attention-Model</center>
+
+### 使用PaddlePaddle训练模型 ###
+我们在训练之前需要创建一个模型配置文件，这里是一个例子`demo/seqToseq/translation/train.conf`。前三行import了定义network，job_mode和attention_mode的python函数。
+
+```python
+from seqToseq_net import *
+is_generating = False
+
+### Data Definiation
+train_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14",
+                             is_generating = is_generating)
+
+### Algorithm Configuration
+settings(
+    learning_method = AdamOptimizer(),
+    batch_size = 50,
+    learning_rate = 5e-4)
+
+### Network Architecture
+gru_encoder_decoder(train_conf, is_generating)
+```
+
+1. **Data Definiation**：在示例中我们定义了一个序列到序列的训练和测试数据。它返回train_conf作为配置，其输入参数如下：
+  - data_dir：训练数据和测试数据的目录
+  - is_generating：这个配置是否用来生成，这里设置为False
+2. **Algorithm Configuration**：在示例中我们使用SGD训练算法（默认），和ADAM学习方法，指定batch_size为50，learning_rate为5e-4
+3. **Network Architecture**：在示例中我们使用attention版本的GRU编解码网络。它包括了一个双向的GRU作为编码器和解码器，它模拟了解码翻译过程中在源语句中的搜索。
+
+### 训练模型的命令与结果 ###
+写完模型配置之后，我们可以通过以下命令来训练模型：
+
+```bash
+cd demo/seqToseq/translation
+./train.sh
+```
+
+`train.sh` 的内容如下所示：
+
+```bash
+paddle train \
+--config='translation/train.conf' \
+--save_dir='translation/model' \
+--use_gpu=false \
+--num_passes=16 \
+--show_parameter_stats_period=100 \
+--trainer_count=4 \
+--log_period=10 \
+--dot_period=5 \
+2>&1 | tee 'translation/train.log'
+```
+- config: 设置神经网络的配置文件
+- save_dir: 设置保存模型的输出路径
+- use_gpu: 是否使用GPU训练，这里设置为使用CPU
+- num_passes: 设置passes的数量。paddle中的一个pass表示将训练数据集中所有的样本遍历一次
+- show_parameter_stats_period: 这里每隔100个batch显示一次参数统计信息
+- trainer_count: 设置CPU线程数或者GPU设备数
+- log_period: 这里每隔10个batch打印一次日志
+- dot_period: 这里每隔5个batch打印一个点"."
+
+训练的损失函数默认每隔10个batch打印一次，你将会看到如下消息：
+
+    I0719 19:16:45.952062 15563 TrainerInternal.cpp:160]  Batch=10 samples=500 AvgCost=198.475 CurrentCost=198.475 Eval: classification_error_evaluator=0.737155  CurrentEval: classification_error_evaluator=0.737155
+    I0719 19:17:56.707319 15563 TrainerInternal.cpp:160]  Batch=20 samples=1000 AvgCost=157.479 CurrentCost=116.483 Eval: classification_error_evaluator=0.698392  CurrentEval: classification_error_evaluator=0.659065
+    .....
+- AvgCost：从第0个batch到当前batch的平均cost
+- CurrentCost：当前batch的cost
+- classification\_error\_evaluator(Eval)：从第0个评估到当前评估中，每个单词的预测错误率
+- classification\_error\_evaluator(CurrentEval)：当前评估中，每个单词的预测错误率
+
+当classification\_error\_evaluator的值低于0.35时，模型就训练成功了。
+
+## 文本生成 ##
+### 简介 ###
+
+一般而言，NMT模型受制于源语句的编码，并且通过给出当前目标单词来预测下一个目标单词。在训练过程中，当前单词在相比之下总是被当作真值（ground truth）。在生成过程中，当前单词是解码器最后一步的输出，这来自于PaddlePaddle的内存中。
+
+而且，我们使用集束搜索（Beam Search）来生成序列。集束搜索使用广度优先搜索来构建搜索树。对于树的每一层，生成当前层的所有后继状态，并将它们按照启发代价（heuristic cost）升序排列。但是这种方法在每层只保存预设数量的最优状态（这个数量称为beam size）。
+
+### 预训练的模型 ###
+我们在拥有50个节点的集群中训练模型，每个节点有两个6核CPU。我们在5天里训练了16个pass，其中每条pass花费了7个小时。model_dir中有16个子目录，每个里面都包含202MB的全部的模型参数。然后我们发现pass-00012的模型有着最高的BLEU值27.77（参考文献[BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf)）。要下载解压这个模型，只需在linux下运行如下命令：
+
+```bash
+cd demo/seqToseq/data
+./wmt14_model.sh
+```
+
+### 使用PaddlePaddle生成模型 ###
+在翻译法语句子之前，我们需要创建模型配置文件。这里是一个例子`demo/seqToseq/translation/gen.conf`。前三行import了定义network，job_mode和attention_mode的python函数。
+
+```python
+from seqToseq_net import *
+is_generating = True
+
+################## Data Definiation #####################
+gen_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14",
+                           is_generating = is_generating,
+                           gen_result = "./translation/gen_result")
+
+############## Algorithm Configuration ##################
+settings(
+  learning_method = AdamOptimizer(),
+  batch_size = 1,
+  learning_rate = 0)
+
+################# Network configure #####################
+gru_encoder_decoder(gen_conf, is_generating)
+```
+
+1. **Data Definiation**：在示例中我们定义了一个序列到序列的生成数据。它返回gen_conf作为配置，其输入参数如下：
+  - data_dir：生成数据的目录
+  - is_generating：这个配置是否用来生成，这里设置为True
+  - gen_result：保存生成结果的文件
+2. **Algorithm Configuration**：在生成过程中我们使用SGD训练算法，并指定batch_size为1（每次生成1个序列），learning_rate为0
+3. **Network Architecture**：本质上与训练模型一样
+
+### 生成模型的命令与结果 ###
+写完模型配置之后，我们可以通过以下命令来进行从法语到英语的文本翻译：
+
+```bash
+cd demo/seqToseq/translation
+./gen.sh
+```
+
+ `gen.sh` 的内容如下所示。与训练模型不同的是，这里有一些不同的参数需要指定：
+
+```bash
+paddle train \
+--job=test \
+--config='translation/gen.conf' \
+--save_dir='data/wmt14_model' \
+--use_gpu=true \
+--num_passes=13 \
+--test_pass=12 \
+--trainer_count=1 \
+2>&1 | tee 'translation/gen.log'
+```
+- job：设置任务的模式为测试
+- save_dir：存储模型的路径
+- num_passes and test_pass：从test_pass到（num_passes - 1）加载模型参数，这里只加载 `data/wmt14_model/pass-00012`
+
+你将会看到这样的消息：
+
+    I0706 14:48:31.178915 31441 GradientMachine.cpp:143] Loading parameters from data/wmt14_model/pass-00012
+    I0706 14:48:40.012039 31441 Tester.cpp:125]  Batch=100 samples=100 AvgCost=0
+    I0706 14:48:48.898632 31441 Tester.cpp:125]  Batch=200 samples=200 AvgCost=0
+    ...
+
+然后在`demo/seqToseq/translation/gen_result`中的生成结果如下所示：
+
+    0
+    0       -11.1314         The <unk> <unk> about the width of the seats while large controls are at stake <e>
+    1       -11.1519         The <unk> <unk> on the width of the seats while large controls are at stake <e>
+    2       -11.5988         The <unk> <unk> about the width of the seats while large controls are at stake . <e>
+
+    1
+    0       -24.4149         The dispute is between the major aircraft manufacturers about the width of the tourist seats on the <unk> flights , paving the way for a <unk> confrontation during the month of the Dubai <unk> . <e>
+    1       -26.9524         The dispute is between the major aircraft manufacturers about the width of the tourist seats on the <unk> flights , paving the way for a <unk> confrontation during the month of Dubai &apos; s <unk> . <e>
+    2       -27.9574         The dispute is between the major aircraft manufacturers about the width of the tourist seats on the <unk> flights , paving the way for a <unk> confrontation during the month of Dubai &apos; s Dubai <unk> . <e>
+    ...
+
+- 这是集束搜索的结果，其中beam size是3
+- 第一行的“0”和第6行的“1”表示生成数据的序列id
+- 其他六行列出了集束搜索的结果
+  - 第二列是集束搜索的得分（从大到小）
+  - 第三列是生成的英语序列
+- 有两个特殊标识：
+  - `<e>`：序列的结尾
+  - `<unk>`：不包含在字典中的单词
+
+### BLEU评估 ###
+对机器翻译的人工评估工作很广泛但也很昂贵。一篇论文 [BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf) 展示了一种方法，当需要快速或者频繁的评估时，使用自动的替补来替代经验丰富的人工评判。[Moses](http://www.statmt.org/moses/) 是一个统计学的机器翻译系统，我们使用其中的 [multi-bleu.perl](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/multi-bleu.perl) 来做BLEU评估。运行以下命令来下载这个脚本：
+
+```bash
+cd demo/seqToseq/translation
+./moses_bleu.sh
+```
+
+由于标准的翻译结果已经下载到这里`data/wmt14/gen/ntst14.trg`，我们可以运行以下命令来做BLEU评估。
+
+```bash
+cd demo/seqToseq/translation
+./eval_bleu.sh FILE BEAMSIZE
+```
+
+- FILE：生成的结果文件
+- BEAMSIZE：集束搜索中的扩展广度
--- a/doc/tutorials/text_generation/index_en.md
+++ b/doc/tutorials/text_generation/index_en.md
@@ -260,8 +260,8 @@ gru_encoder_decoder(gen_conf, is_generating)

 1. **Data Definition**: We define SeqToSeq gen data in our example. It returns gen_conf as the configuration; its input arguments are as follows:
   - data\_dir: directory of gen data
-   - is\_generating: whether this config is used for generating, here is false
-   - gen\_result: file to store the generation result
+   - is\_generating: whether this config is used for generating, here is true
+   - gen\_result: file to store the generation result
 2. **Algorithm Configuration**: We use the SGD training algorithm in generation, and specify batch_size as 1 (one sequence is generated each time) and the learning rate as 0.
 3. **Network Architecture**: Essentially the same as the training model.


--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -175,11 +175,15 @@ void hl_cudnn_init(cudnnHandle_t* cudnn_handle, cudaStream_t stream) {
      << "PaddlePaddle Requirement: "
      << "(header v[2-3] with libcudnn v[2-3]) Or "
      << "(header v4 with libcudnn v4) Or "
-      << "(header v5 with libcudnn v5).";
+      << "(header v5 with libcudnn v5) Or "
+      << "(header v6 with libcudnn v6).";

-  CHECK(!(CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050))
+  CHECK(!(CUDNN_VERSION < 6000 && CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050))
      << "cudnn v5 requires cuda version >= 7.5";

+  CHECK(!(CUDNN_VERSION >= 6000 && CUDA_VERSION < 8000))
+      << "cudnn v6 requires cuda version >= 8.0";
+
  CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle));
  CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream));

@@ -610,6 +614,23 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
  CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));

  cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
+
+#if CUDNN_VERSION >= 6000
+#ifndef PADDLE_TYPE_DOUBLE
+  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+#else
+  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+#endif
+  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
+                                                       padding_height,
+                                                       padding_width,
+                                                       stride_height,
+                                                       stride_width,
+                                                       1,
+                                                       1,
+                                                       mode,
+                                                       data_type));
+#else
  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
                                                       padding_height,
                                                       padding_width,
@@ -618,6 +639,7 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                                       1,
                                                       1,
                                                       mode));
+#endif

  hl_conv->input_image = image;
  hl_conv->filter = filter;
@@ -645,6 +667,23 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,

  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
  cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
+
+#if CUDNN_VERSION >= 6000
+#ifndef PADDLE_TYPE_DOUBLE
+  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+#else
+  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+#endif
+  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc,
+                                                       padding_height,
+                                                       padding_width,
+                                                       stride_height,
+                                                       stride_width,
+                                                       1,
+                                                       1,
+                                                       mode,
+                                                       data_type));
+#else
  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc,
                                                       padding_height,
                                                       padding_width,
@@ -653,6 +692,7 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                                       1,
                                                       1,
                                                       mode));
+#endif

  cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
  hl_conv->input_image = image;

--- a/paddle/gserver/dataproviders/PyDataProvider2.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp
@@ -252,19 +252,9 @@ private:
    // only for instance will make python reference-count error.
    //
    // So here, we increase reference count manually.
-    if (gModuleClsPtrs_.find((uintptr_t)module.get()) !=
-        gModuleClsPtrs_.end()) {
-      // Multi instance use same module
-      Py_XINCREF(module.get());
-      Py_XINCREF(moduleDict.get());
-    } else {
-      gModuleClsPtrs_.insert((uintptr_t)module.get());
-    }
-    if (gModuleClsPtrs_.find((uintptr_t)cls.get()) != gModuleClsPtrs_.end()) {
-      Py_XINCREF(cls.get());
-    } else {
-      gModuleClsPtrs_.insert((uintptr_t)cls.get());
-    }
+    Py_XINCREF(module.get());
+    Py_XINCREF(moduleDict.get());
+    Py_XINCREF(cls.get());

    PyObjectPtr fileListInPy = loadPyFileLists(fileListName);
    PyDict_SetItemString(kwargs.get(), "file_list", fileListInPy.get());
@@ -471,7 +461,6 @@ private:
  std::vector<std::string> fileLists_;
  std::vector<SlotHeader> headers_;
  static PyObjectPtr zeroTuple_;
-  static std::unordered_set<uintptr_t> gModuleClsPtrs_;

  class PositionRandom {
  public:
@@ -671,7 +660,6 @@ public:
  }
 };

-std::unordered_set<uintptr_t> PyDataProvider2::gModuleClsPtrs_;
 PyObjectPtr PyDataProvider2::zeroTuple_(PyTuple_New(0));

 REGISTER_DATA_PROVIDER_EX(py2, PyDataProvider2);

--- a/paddle/gserver/layers/BatchNormalizationLayer.cpp
+++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp
@@ -59,24 +59,14 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {

 void BatchNormalizationLayer::calMovingMeanAndVar() {
  // calculating and saving moving mean and variance
-  MatrixPtr movingMean = movingMean_->getW();
-  MatrixPtr movingVar = movingVar_->getW();
-
-  if (!useGpu_ && FLAGS_trainer_count > 1) {
-    auto mvMean = std::dynamic_pointer_cast<SharedCpuMatrix>(movingMean);
-    auto mvVar = std::dynamic_pointer_cast<SharedCpuMatrix>(movingVar);
-    CHECK(mvMean && mvVar);
-
-    mvMean->add(*savedMean_, movingAvgFraction_, 1.0 - movingAvgFraction_);
-    mvVar->add(*savedInvVar_, movingAvgFraction_, 1.0 - movingAvgFraction_);
-  } else {
-    // movingMean =  movingMean * movingAvgFraction_
-    //            + savedMean_ * (1 - movingAvgFraction_)
-    movingMean->add(*savedMean_, movingAvgFraction_, 1.0 - movingAvgFraction_);
-    // movingVar =  movingVar * movingAvgFraction_
-    //           + savedInvVar_ * (1 - movingAvgFraction_)
-    movingVar->add(*savedInvVar_, movingAvgFraction_, 1.0 - movingAvgFraction_);
-  }
+  auto& movingMean = movingMean_->getW();
+  auto& movingVar = movingVar_->getW();
+  // movingMean =  movingMean * movingAvgFraction_
+  //            + savedMean_ * (1 - movingAvgFraction_)
+  movingMean->add(*savedMean_, movingAvgFraction_, 1.0 - movingAvgFraction_);
+  // movingVar =  movingVar * movingAvgFraction_
+  //           + savedInvVar_ * (1 - movingAvgFraction_)
+  movingVar->add(*savedInvVar_, movingAvgFraction_, 1.0 - movingAvgFraction_);
 }

 void BatchNormalizationLayer::setMeanAndStd() {

--- a/paddle/gserver/layers/ConvProjection.cpp
+++ b/paddle/gserver/layers/ConvProjection.cpp
@@ -130,6 +130,10 @@ void ConvProjection::reshapeTensorDesc(int batchSize) {
 void ConvProjection::reshape(int batchSize) {
  size_t width = calOutputSize();
  CHECK_EQ(width, out_->value->getWidth());
+  CHECK_EQ(channels_ * imageH_ * imageW_, in_->value->getWidth())
+      << "Wrong input size for convolution"
+      << " channels=" << channels_ << " imageH=" << imageH_
+      << " imageW=" << imageW_ << " inputSize=" << in_->value->getWidth();

  isSelectAlgo_ = (batchSize == batchNum_);
  batchNum_ = batchSize;

--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -155,13 +155,14 @@ TEST(RecurrentGradientMachine, rnn_multi_unequalength_input) {
 }

 int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+
  if (paddle::version::isWithPyDataProvider()) {
    if (!paddle::version::isWithGpu()) {
      FLAGS_use_gpu = false;
    }
    initMain(argc, argv);
    initPython(argc, argv);
-    testing::InitGoogleTest(&argc, argv);
    return RUN_ALL_TESTS();
  } else {
    return 0;

--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -1908,8 +1908,8 @@ public:

 public:
  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-  void add(Matrix& b, real p1, real p2);
-  void add(real p1, real p2);
+  virtual void add(Matrix& b, real p1, real p2);
+  virtual void add(real p1, real p2);

 private:
  using Matrix::mul;

--- a/paddle/scripts/travis/before_install.linux.sh
+++ b/paddle/scripts/travis/before_install.linux.sh
 #!/bin/bash
 set -e
+pip install protobuf
+cd /tmp
+wget https://github.com/google/protobuf/archive/v3.0.2.tar.gz -O protobuf.tar.gz
+tar xf protobuf.tar.gz
+cd protobuf*
+./autogen.sh
+./configure --prefix=/usr/
+make -j 2 install
+cd ..
+rm -rf protobuf*
+
 pushd /usr/src/gtest
 cmake .
 make

--- a/paddle/scripts/travis/before_install.osx.sh
+++ b/paddle/scripts/travis/before_install.osx.sh
@@ -2,9 +2,8 @@
 brew update
 brew tap homebrew/science
 brew install python
-sudo pip install --upgrade protobuf==2.6.0
-brew install homebrew/versions/protobuf260 --without-python
-brew install cmake python glog gflags openblas wget md5sha1sum
+sudo pip install --upgrade protobuf
+brew install cmake python glog gflags openblas wget md5sha1sum protobuf

 wget https://github.com/google/googletest/archive/release-1.8.0.tar.gz -O gtest.tar.gz
 tar xf gtest.tar.gz

--- a/paddle/scripts/travis/precommit.sh
+++ b/paddle/scripts/travis/precommit.sh
@@ -12,6 +12,9 @@ cd ..
 export PATH=/usr/bin:$PATH
 pre-commit install
 clang-format --version
-pre-commit run -a
+
+if ! pre-commit run -a ; then
+  git diff  --exit-code
+fi

 trap : 0
--- a/paddle/trainer/Tester.cpp
+++ b/paddle/trainer/Tester.cpp
--- a/paddle/trainer/ThreadParameterUpdater.cpp
+++ b/paddle/trainer/ThreadParameterUpdater.cpp
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
--- a/paddle/trainer/tests/fake_file_list.list
+++ b/paddle/trainer/tests/fake_file_list.list
--- a/paddle/trainer/tests/simple_sparse_neural_network.py
+++ b/paddle/trainer/tests/simple_sparse_neural_network.py
--- a/paddle/trainer/tests/simple_sparse_neural_network_dp.py
+++ b/paddle/trainer/tests/simple_sparse_neural_network_dp.py
--- a/paddle/trainer/tests/test_TrainerOnePass.cpp
+++ b/paddle/trainer/tests/test_TrainerOnePass.cpp
--- a/paddle/utils/CompilerMacros.h
+++ b/paddle/utils/CompilerMacros.h
--- a/python/paddle/trainer/PyDataProvider2.py
+++ b/python/paddle/trainer/PyDataProvider2.py
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
--- a/python/paddle/trainer_config_helpers/__init__.py
+++ b/python/paddle/trainer_config_helpers/__init__.py
--- a/python/paddle/trainer_config_helpers/data_sources.py
+++ b/python/paddle/trainer_config_helpers/data_sources.py
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ b/python/paddle/trainer_config_helpers/optimizers.py
--- a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr