Merge branch 'develop' into link

52f6c9a6 · Luo Tao · 3f9f2223 · 0fd44c61 · 52f6c9a6 · 52f6c9a6
275 changed files
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,10 +8,13 @@ os:
 env:
  - JOB=DOCS
  - JOB=BUILD_AND_TEST
+  - JOB=PRE_COMMIT
 matrix:
  exclude:
    - os: osx
-      env: JOB=DOCS  # Only generate documentation in linux
+      env: JOB=DOCS  # Only generate documentation in linux.
+    - os: osx
+      env: JOB=PRE_COMMIT # Only check pre-commit hook in linux

 addons:
  apt:
@@ -39,18 +42,23 @@ addons:
      - lcov
      - graphviz
      - swig
+      - clang-format-3.8
 before_install:
  - |
    if [ ${JOB} == "BUILD_AND_TEST" ]; then
-      if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)'
-      then
-        echo "Only markdown docs were updated, stopping build process."
-        exit
+      change_list="$(git diff --name-only $TRAVIS_COMMIT_RANGE)"  # plain assignment keeps git's exit status in $?
+      if [ $? -eq 0 ]; then  # If git diff failed, fall through and run the build and unit tests.
+        if ! echo "${change_list}" | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)'  # quoted: one file name per line
+        then
+          echo "Only markdown docs were updated, stopping build process."
+          exit
+        fi
       fi
    fi
  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
  - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
-  - pip install wheel protobuf sphinx recommonmark virtualenv numpy sphinx_rtd_theme
+  - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
+  - pip install wheel protobuf sphinx recommonmark virtualenv numpy sphinx_rtd_theme pre-commit
 script:
  - paddle/scripts/travis/main.sh
 notifications:

--- a/WORKSPACE
+++ b/WORKSPACE
 # External dependency to Google protobuf.
 http_archive(
-    name = "protobuf",
-    url = "http://github.com/google/protobuf/archive/v3.1.0.tar.gz",
-    sha256 = "0a0ae63cbffc274efb573bdde9a253e3f32e458c41261df51c5dbc5ad541e8f7",
-    strip_prefix = "protobuf-3.1.0",
-)
+    name="protobuf",
+    url="http://github.com/google/protobuf/archive/v3.1.0.tar.gz",
+    sha256="0a0ae63cbffc274efb573bdde9a253e3f32e458c41261df51c5dbc5ad541e8f7",
+    strip_prefix="protobuf-3.1.0", )

 # External dependency to gtest 1.7.0.  This method comes from
 # https://www.bazel.io/versions/master/docs/tutorial/cpp.html.
 new_http_archive(
-    name = "gtest",
-    url = "https://github.com/google/googletest/archive/release-1.7.0.zip",
-    sha256 = "b58cb7547a28b2c718d1e38aee18a3659c9e3ff52440297e965f5edffe34b6d0",
-    build_file = "third_party/gtest.BUILD",
-    strip_prefix = "googletest-release-1.7.0",
-)
+    name="gtest",
+    url="https://github.com/google/googletest/archive/release-1.7.0.zip",
+    sha256="b58cb7547a28b2c718d1e38aee18a3659c9e3ff52440297e965f5edffe34b6d0",
+    build_file="third_party/gtest.BUILD",
+    strip_prefix="googletest-release-1.7.0", )
--- a/benchmark/tensorflow/rnn/run_multi.sh
+++ b/benchmark/tensorflow/rnn/run_multi.sh
@@ -25,4 +25,3 @@ test 4 2 256 512
 test 4 2 512 128 
 test 4 2 512 256 
 test 4 2 512 512 
-
--- a/demo/gan/README.md
+++ b/demo/gan/README.md
@@ -10,4 +10,4 @@ Then you can run the command below. The flag -d specifies the training data (cif
 $python gan_trainer.py -d cifar --use_gpu 1

 The generated images will be stored in ./cifar_samples/
-The corresponding models will be stored in ./cifar_params/
\ No newline at end of file
+The corresponding models will be stored in ./cifar_params/
--- a/demo/gan/data/download_cifar.sh
+++ b/demo/gan/data/download_cifar.sh
@@ -15,4 +15,3 @@ set -e
 wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
 tar zxf cifar-10-python.tar.gz
 rm cifar-10-python.tar.gz
-
--- a/demo/gan/data/get_mnist_data.sh
+++ b/demo/gan/data/get_mnist_data.sh
@@ -15,5 +15,3 @@ do
        gunzip ${fname}.gz
    fi
 done
-
-
--- a/demo/gan/gan_conf.py
+++ b/demo/gan/gan_conf.py
@@ -14,10 +14,9 @@
 from paddle.trainer_config_helpers import *

 mode = get_config_arg("mode", str, "generator")
-assert mode in set(["generator",
-                    "discriminator",
-                    "generator_training",
-                    "discriminator_training"])
+assert mode in set([
+    "generator", "discriminator", "generator_training", "discriminator_training"
+])

 is_generator_training = mode == "generator_training"
 is_discriminator_training = mode == "discriminator_training"
@@ -38,8 +37,8 @@ sample_dim = 2
 settings(
    batch_size=128,
    learning_rate=1e-4,
-    learning_method=AdamOptimizer(beta1=0.5)
-)
+    learning_method=AdamOptimizer(beta1=0.5))
+

 def discriminator(sample):
    """
@@ -50,70 +49,87 @@ def discriminator(sample):
    of the sample is from real data.
    """
    param_attr = ParamAttr(is_static=is_generator_training)
-    bias_attr = ParamAttr(is_static=is_generator_training,
-                          initial_mean=1.0,
-                          initial_std=0)
-
-    hidden = fc_layer(input=sample, name="dis_hidden", size=hidden_dim,
-                    bias_attr=bias_attr,
-                    param_attr=param_attr,
-                    act=ReluActivation())
-
-    hidden2 = fc_layer(input=hidden, name="dis_hidden2", size=hidden_dim,
-                    bias_attr=bias_attr,
-                    param_attr=param_attr,
-                    act=LinearActivation())
-    
-    hidden_bn = batch_norm_layer(hidden2, 
-                     act=ReluActivation(), 
-                     name="dis_hidden_bn", 
-                     bias_attr=bias_attr, 
-                     param_attr=ParamAttr(is_static=is_generator_training,
-                           initial_mean=1.0,
-                           initial_std=0.02),
-                     use_global_stats=False)
-    
-    return fc_layer(input=hidden_bn, name="dis_prob", size=2,
-                    bias_attr=bias_attr,
-                    param_attr=param_attr,
-                    act=SoftmaxActivation())
+    bias_attr = ParamAttr(
+        is_static=is_generator_training, initial_mean=1.0, initial_std=0)
+
+    hidden = fc_layer(
+        input=sample,
+        name="dis_hidden",
+        size=hidden_dim,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=ReluActivation())
+
+    hidden2 = fc_layer(
+        input=hidden,
+        name="dis_hidden2",
+        size=hidden_dim,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=LinearActivation())
+
+    hidden_bn = batch_norm_layer(
+        hidden2,
+        act=ReluActivation(),
+        name="dis_hidden_bn",
+        bias_attr=bias_attr,
+        param_attr=ParamAttr(
+            is_static=is_generator_training, initial_mean=1.0,
+            initial_std=0.02),
+        use_global_stats=False)
+
+    return fc_layer(
+        input=hidden_bn,
+        name="dis_prob",
+        size=2,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=SoftmaxActivation())
+

 def generator(noise):
    """
    generator generates a sample given noise
    """
    param_attr = ParamAttr(is_static=is_discriminator_training)
-    bias_attr = ParamAttr(is_static=is_discriminator_training,
-                           initial_mean=1.0,
-                           initial_std=0)
-    
-    hidden = fc_layer(input=noise,
-                    name="gen_layer_hidden",
-                    size=hidden_dim,
-                    bias_attr=bias_attr,
-                    param_attr=param_attr,
-                    act=ReluActivation())
-
-    hidden2 = fc_layer(input=hidden, name="gen_hidden2", size=hidden_dim,
-                    bias_attr=bias_attr,
-                    param_attr=param_attr,
-                    act=LinearActivation())
-    
-    hidden_bn = batch_norm_layer(hidden2, 
-                     act=ReluActivation(), 
-                     name="gen_layer_hidden_bn", 
-                     bias_attr=bias_attr, 
-                     param_attr=ParamAttr(is_static=is_discriminator_training,
-                           initial_mean=1.0,
-                           initial_std=0.02),
-                     use_global_stats=False)
-    
-    return fc_layer(input=hidden_bn,
-                    name="gen_layer1",
-                    size=sample_dim,
-                    bias_attr=bias_attr,
-                    param_attr=param_attr,
-                    act=LinearActivation())
+    bias_attr = ParamAttr(
+        is_static=is_discriminator_training, initial_mean=1.0, initial_std=0)
+
+    hidden = fc_layer(
+        input=noise,
+        name="gen_layer_hidden",
+        size=hidden_dim,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=ReluActivation())
+
+    hidden2 = fc_layer(
+        input=hidden,
+        name="gen_hidden2",
+        size=hidden_dim,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=LinearActivation())
+
+    hidden_bn = batch_norm_layer(
+        hidden2,
+        act=ReluActivation(),
+        name="gen_layer_hidden_bn",
+        bias_attr=bias_attr,
+        param_attr=ParamAttr(
+            is_static=is_discriminator_training,
+            initial_mean=1.0,
+            initial_std=0.02),
+        use_global_stats=False)
+
+    return fc_layer(
+        input=hidden_bn,
+        name="gen_layer1",
+        size=sample_dim,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=LinearActivation())
+

 if is_generator_training:
    noise = data_layer(name="noise", size=noise_dim)
@@ -126,7 +142,8 @@ if is_generator_training or is_discriminator_training:
    label = data_layer(name="label", size=1)
    prob = discriminator(sample)
    cost = cross_entropy(input=prob, label=label)
-    classification_error_evaluator(input=prob, label=label, name=mode+'_error')
+    classification_error_evaluator(
+        input=prob, label=label, name=mode + '_error')
    outputs(cost)

 if is_generator:

--- a/demo/gan/gan_conf_image.py
+++ b/demo/gan/gan_conf_image.py
@@ -15,10 +15,9 @@ from paddle.trainer_config_helpers import *

 mode = get_config_arg("mode", str, "generator")
 dataSource = get_config_arg("data", str, "mnist")
-assert mode in set(["generator",
-                    "discriminator",
-                    "generator_training",
-                    "discriminator_training"])
+assert mode in set([
+    "generator", "discriminator", "generator_training", "discriminator_training"
+])

 is_generator_training = mode == "generator_training"
 is_discriminator_training = mode == "discriminator_training"
@@ -36,24 +35,33 @@ noise_dim = 100
 gf_dim = 64
 df_dim = 64
 if dataSource == "mnist":
-    sample_dim = 28 # image dim
-    c_dim = 1 # image color
+    sample_dim = 28  # image dim
+    c_dim = 1  # image color
 else:
    sample_dim = 32
    c_dim = 3
-s2, s4 = int(sample_dim/2), int(sample_dim/4), 
-s8, s16 = int(sample_dim/8), int(sample_dim/16)
+s2, s4 = int(sample_dim / 2), int(sample_dim / 4),
+s8, s16 = int(sample_dim / 8), int(sample_dim / 16)

 settings(
    batch_size=128,
    learning_rate=2e-4,
-    learning_method=AdamOptimizer(beta1=0.5)
-)
+    learning_method=AdamOptimizer(beta1=0.5))

-def conv_bn(input, channels, imgSize, num_filters, output_x, stride, name, 
-                 param_attr, bias_attr, param_attr_bn, bn, trans=False, 
-                 act=ReluActivation()):
-    
+
+def conv_bn(input,
+            channels,
+            imgSize,
+            num_filters,
+            output_x,
+            stride,
+            name,
+            param_attr,
+            bias_attr,
+            param_attr_bn,
+            bn,
+            trans=False,
+            act=ReluActivation()):
    """
    conv_bn is a utility function that constructs a convolution/deconv layer 
    with an optional batch_norm layer
@@ -63,10 +71,10 @@ def conv_bn(input, channels, imgSize, num_filters, output_x, stride, name,
    :param trans: whether to use conv (False) or deconv (True)
    :type trans: bool
    """
-    
+
    # calculate the filter_size and padding size based on the given
    # imgSize and output size
-    tmp =  imgSize - (output_x - 1) * stride
+    tmp = imgSize - (output_x - 1) * stride
    if tmp <= 1 or tmp > 5:
        raise ValueError("conv input-output dimension does not fit")
    elif tmp <= 3:
@@ -76,111 +84,134 @@ def conv_bn(input, channels, imgSize, num_filters, output_x, stride, name,
        filter_size = tmp
        padding = 0

-    print (imgSize, output_x, stride, filter_size, padding)
-    
+    print(imgSize, output_x, stride, filter_size, padding)
+
    if trans:
        nameApx = "_conv"
    else:
        nameApx = "_convt"
-    
+
    if bn:
-        conv = img_conv_layer(input, filter_size=filter_size, 
-                   num_filters=num_filters,
-                   name=name + nameApx, num_channels=channels,
-                   act=LinearActivation(), groups=1, stride=stride, 
-                   padding=padding, bias_attr=bias_attr,
-                   param_attr=param_attr, shared_biases=True, layer_attr=None,
-                   filter_size_y=None, stride_y=None, padding_y=None, 
-                   trans=trans)
-        
-        conv_bn = batch_norm_layer(conv, 
-                         act=act, 
-                         name=name + nameApx + "_bn", 
-                         bias_attr=bias_attr, 
-                         param_attr=param_attr_bn,
-                         use_global_stats=False)
-        
+        conv = img_conv_layer(
+            input,
+            filter_size=filter_size,
+            num_filters=num_filters,
+            name=name + nameApx,
+            num_channels=channels,
+            act=LinearActivation(),
+            groups=1,
+            stride=stride,
+            padding=padding,
+            bias_attr=bias_attr,
+            param_attr=param_attr,
+            shared_biases=True,
+            layer_attr=None,
+            filter_size_y=None,
+            stride_y=None,
+            padding_y=None,
+            trans=trans)
+
+        conv_bn = batch_norm_layer(
+            conv,
+            act=act,
+            name=name + nameApx + "_bn",
+            bias_attr=bias_attr,
+            param_attr=param_attr_bn,
+            use_global_stats=False)
+
        return conv_bn
    else:
-        conv = img_conv_layer(input, filter_size=filter_size, 
-                   num_filters=num_filters,
-                   name=name + nameApx, num_channels=channels,
-                   act=act, groups=1, stride=stride, 
-                   padding=padding, bias_attr=bias_attr,
-                   param_attr=param_attr, shared_biases=True, layer_attr=None,
-                   filter_size_y=None, stride_y=None, padding_y=None,
-                   trans=trans)
+        conv = img_conv_layer(
+            input,
+            filter_size=filter_size,
+            num_filters=num_filters,
+            name=name + nameApx,
+            num_channels=channels,
+            act=act,
+            groups=1,
+            stride=stride,
+            padding=padding,
+            bias_attr=bias_attr,
+            param_attr=param_attr,
+            shared_biases=True,
+            layer_attr=None,
+            filter_size_y=None,
+            stride_y=None,
+            padding_y=None,
+            trans=trans)
        return conv
-    
+
+
 def generator(noise):
    """
    generator generates a sample given noise
    """
-    param_attr = ParamAttr(is_static=is_discriminator_training,
-                           initial_mean=0.0,
-                           initial_std=0.02)
-    bias_attr = ParamAttr(is_static=is_discriminator_training,
-                           initial_mean=0.0,
-                           initial_std=0.0)
-    
-    param_attr_bn=ParamAttr(is_static=is_discriminator_training,
-                           initial_mean=1.0,
-                           initial_std=0.02)
-    
-    h1 = fc_layer(input=noise,
-                    name="gen_layer_h1",
-                    size=s8 * s8 * gf_dim * 4,
-                    bias_attr=bias_attr,
-                    param_attr=param_attr,
-                    act=LinearActivation())
-    
-    h1_bn = batch_norm_layer(h1, 
-                     act=ReluActivation(), 
-                     name="gen_layer_h1_bn", 
-                     bias_attr=bias_attr, 
-                     param_attr=param_attr_bn,
-                     use_global_stats=False)
-    
-    h2_bn = conv_bn(h1_bn, 
-                    channels=gf_dim*4, 
-                    output_x=s8,
-                    num_filters=gf_dim*2, 
-                    imgSize=s4,
-                    stride=2,
-                    name="gen_layer_h2", 
-                    param_attr=param_attr, 
-                    bias_attr=bias_attr, 
-                    param_attr_bn=param_attr_bn,
-                    bn=True,
-                    trans=True)
-    
-    h3_bn = conv_bn(h2_bn, 
-                    channels=gf_dim*2, 
-                    output_x=s4,
-                    num_filters=gf_dim, 
-                    imgSize=s2,
-                    stride=2,
-                    name="gen_layer_h3", 
-                    param_attr=param_attr, 
-                    bias_attr=bias_attr, 
-                    param_attr_bn=param_attr_bn,
-                    bn=True,
-                    trans=True)
-     
-    
-    return conv_bn(h3_bn,
-                   channels=gf_dim, 
-                   output_x=s2,
-                   num_filters=c_dim, 
-                   imgSize=sample_dim,
-                   stride=2,
-                   name="gen_layer_h4", 
-                   param_attr=param_attr, 
-                   bias_attr=bias_attr, 
-                   param_attr_bn=param_attr_bn,
-                   bn=False,
-                   trans=True,
-                   act=TanhActivation())
+    param_attr = ParamAttr(
+        is_static=is_discriminator_training, initial_mean=0.0, initial_std=0.02)
+    bias_attr = ParamAttr(
+        is_static=is_discriminator_training, initial_mean=0.0, initial_std=0.0)
+
+    param_attr_bn = ParamAttr(
+        is_static=is_discriminator_training, initial_mean=1.0, initial_std=0.02)
+
+    h1 = fc_layer(
+        input=noise,
+        name="gen_layer_h1",
+        size=s8 * s8 * gf_dim * 4,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=LinearActivation())
+
+    h1_bn = batch_norm_layer(
+        h1,
+        act=ReluActivation(),
+        name="gen_layer_h1_bn",
+        bias_attr=bias_attr,
+        param_attr=param_attr_bn,
+        use_global_stats=False)
+
+    h2_bn = conv_bn(
+        h1_bn,
+        channels=gf_dim * 4,
+        output_x=s8,
+        num_filters=gf_dim * 2,
+        imgSize=s4,
+        stride=2,
+        name="gen_layer_h2",
+        param_attr=param_attr,
+        bias_attr=bias_attr,
+        param_attr_bn=param_attr_bn,
+        bn=True,
+        trans=True)
+
+    h3_bn = conv_bn(
+        h2_bn,
+        channels=gf_dim * 2,
+        output_x=s4,
+        num_filters=gf_dim,
+        imgSize=s2,
+        stride=2,
+        name="gen_layer_h3",
+        param_attr=param_attr,
+        bias_attr=bias_attr,
+        param_attr_bn=param_attr_bn,
+        bn=True,
+        trans=True)
+
+    return conv_bn(
+        h3_bn,
+        channels=gf_dim,
+        output_x=s2,
+        num_filters=c_dim,
+        imgSize=sample_dim,
+        stride=2,
+        name="gen_layer_h4",
+        param_attr=param_attr,
+        bias_attr=bias_attr,
+        param_attr_bn=param_attr_bn,
+        bn=False,
+        trans=True,
+        act=TanhActivation())


 def discriminator(sample):
@@ -191,58 +222,60 @@ def discriminator(sample):
    of the sample is from generator and dimension 1 is the probability
    of the sample is from real data.
    """
-    param_attr = ParamAttr(is_static=is_generator_training,
-                           initial_mean=0.0,
-                           initial_std=0.02)
-    bias_attr = ParamAttr(is_static=is_generator_training,
-                          initial_mean=0.0,
-                          initial_std=0.0)
-    
-    param_attr_bn=ParamAttr(is_static=is_generator_training,
-                           initial_mean=1.0,
-                           initial_std=0.02)
-    
-    h0 = conv_bn(sample, 
-                 channels=c_dim, 
-                 imgSize=sample_dim,
-                 num_filters=df_dim, 
-                 output_x=s2, 
-                 stride=2, 
-                 name="dis_h0", 
-                 param_attr=param_attr, 
-                 bias_attr=bias_attr, 
-                 param_attr_bn=param_attr_bn, 
-                 bn=False)
-    
-    h1_bn = conv_bn(h0, 
-                 channels=df_dim,
-                 imgSize=s2,
-                 num_filters=df_dim*2, 
-                 output_x=s4, 
-                 stride=2, 
-                 name="dis_h1", 
-                 param_attr=param_attr, 
-                 bias_attr=bias_attr, 
-                 param_attr_bn=param_attr_bn, 
-                 bn=True)
-
-    h2_bn = conv_bn(h1_bn, 
-                 channels=df_dim*2,
-                 imgSize=s4,
-                 num_filters=df_dim*4, 
-                 output_x=s8, 
-                 stride=2, 
-                 name="dis_h2", 
-                 param_attr=param_attr, 
-                 bias_attr=bias_attr, 
-                 param_attr_bn=param_attr_bn, 
-                 bn=True)
-        
-    return fc_layer(input=h2_bn, name="dis_prob", size=2,
-                    bias_attr=bias_attr,
-                    param_attr=param_attr,
-                    act=SoftmaxActivation())
+    param_attr = ParamAttr(
+        is_static=is_generator_training, initial_mean=0.0, initial_std=0.02)
+    bias_attr = ParamAttr(
+        is_static=is_generator_training, initial_mean=0.0, initial_std=0.0)
+
+    param_attr_bn = ParamAttr(
+        is_static=is_generator_training, initial_mean=1.0, initial_std=0.02)
+
+    h0 = conv_bn(
+        sample,
+        channels=c_dim,
+        imgSize=sample_dim,
+        num_filters=df_dim,
+        output_x=s2,
+        stride=2,
+        name="dis_h0",
+        param_attr=param_attr,
+        bias_attr=bias_attr,
+        param_attr_bn=param_attr_bn,
+        bn=False)
+
+    h1_bn = conv_bn(
+        h0,
+        channels=df_dim,
+        imgSize=s2,
+        num_filters=df_dim * 2,
+        output_x=s4,
+        stride=2,
+        name="dis_h1",
+        param_attr=param_attr,
+        bias_attr=bias_attr,
+        param_attr_bn=param_attr_bn,
+        bn=True)
+
+    h2_bn = conv_bn(
+        h1_bn,
+        channels=df_dim * 2,
+        imgSize=s4,
+        num_filters=df_dim * 4,
+        output_x=s8,
+        stride=2,
+        name="dis_h2",
+        param_attr=param_attr,
+        bias_attr=bias_attr,
+        param_attr_bn=param_attr_bn,
+        bn=True)

+    return fc_layer(
+        input=h2_bn,
+        name="dis_prob",
+        size=2,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=SoftmaxActivation())


 if is_generator_training:
@@ -250,13 +283,14 @@ if is_generator_training:
    sample = generator(noise)

 if is_discriminator_training:
-    sample = data_layer(name="sample", size=sample_dim * sample_dim*c_dim)
+    sample = data_layer(name="sample", size=sample_dim * sample_dim * c_dim)

 if is_generator_training or is_discriminator_training:
    label = data_layer(name="label", size=1)
    prob = discriminator(sample)
    cost = cross_entropy(input=prob, label=label)
-    classification_error_evaluator(input=prob, label=label, name=mode+'_error')
+    classification_error_evaluator(
+        input=prob, label=label, name=mode + '_error')
    outputs(cost)

 if is_generator:

--- a/demo/gan/gan_trainer.py
+++ b/demo/gan/gan_trainer.py
@@ -16,7 +16,7 @@ import argparse
 import random
 import numpy
 import cPickle
-import sys,os
+import sys, os
 from PIL import Image

 from paddle.trainer.config_parser import parse_config
@@ -24,6 +24,7 @@ from paddle.trainer.config_parser import logger
 import py_paddle.swig_paddle as api
 import matplotlib.pyplot as plt

+
 def plot2DScatter(data, outputfile):
    '''
    Plot the data as a 2D scatter plot and save to outputfile
@@ -41,9 +42,11 @@ def plot2DScatter(data, outputfile):
    plt.scatter(x, y)
    plt.savefig(outputfile, bbox_inches='tight')

+
 def CHECK_EQ(a, b):
    assert a == b, "a=%s, b=%s" % (a, b)

+
 def copy_shared_parameters(src, dst):
    '''
    copy the parameters from src to dst
@@ -52,11 +55,9 @@ def copy_shared_parameters(src, dst):
    :param dst: the destination of the parameters
    :type dst: GradientMachine
    '''
-    src_params = [src.getParameter(i)
-               for i in xrange(src.getParameterSize())]
+    src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())]
    src_params = dict([(p.getName(), p) for p in src_params])

-
    for i in xrange(dst.getParameterSize()):
        dst_param = dst.getParameter(i)
        src_param = src_params.get(dst_param.getName(), None)
@@ -67,15 +68,17 @@ def copy_shared_parameters(src, dst):
        CHECK_EQ(len(src_value), len(dst_value))
        dst_value.copyFrom(src_value)
        dst_param.setValueUpdated()
-        
+
+
 def print_parameters(src):
-    src_params = [src.getParameter(i)
-               for i in xrange(src.getParameterSize())]
+    src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())]

    print "***************"
    for p in src_params:
        print "Name is %s" % p.getName()
-        print "value is %s \n" % p.getBuf(api.PARAMETER_VALUE).copyToNumpyArray()
+        print "value is %s \n" % p.getBuf(api.PARAMETER_VALUE).copyToNumpyArray(
+        )
+

 def load_mnist_data(imageFile):
    f = open(imageFile, "rb")
@@ -86,33 +89,36 @@ def load_mnist_data(imageFile):
        n = 60000
    else:
        n = 10000
-    
-    data = numpy.fromfile(f, 'ubyte', count=n*28*28).reshape((n, 28*28))
+
+    data = numpy.fromfile(f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28))
    data = data / 255.0 * 2.0 - 1.0

    f.close()
    return data.astype('float32')

+
 def load_cifar_data(cifar_path):
    batch_size = 10000
-    data = numpy.zeros((5*batch_size, 32*32*3), dtype = "float32")
+    data = numpy.zeros((5 * batch_size, 32 * 32 * 3), dtype="float32")
    for i in range(1, 6):
        file = cifar_path + "/data_batch_" + str(i)
        fo = open(file, 'rb')
        dict = cPickle.load(fo)
        fo.close()
-        data[(i - 1)*batch_size:(i*batch_size), :] = dict["data"]
-    
+        data[(i - 1) * batch_size:(i * batch_size), :] = dict["data"]
+
    data = data / 255.0 * 2.0 - 1.0
    return data

+
 # synthesize 2-D uniform data
 def load_uniform_data():
    data = numpy.random.rand(1000000, 2).astype('float32')
    return data

+
 def merge(images, size):
-    if images.shape[1] == 28*28:
+    if images.shape[1] == 28 * 28:
        h, w, c = 28, 28, 1
    else:
        h, w, c = 32, 32, 3
@@ -124,6 +130,7 @@ def merge(images, size):
          ((images[idx, :].reshape((h, w, c), order="F").transpose(1, 0, 2) + 1.0) / 2.0 * 255.0)
    return img.astype('uint8')

+
 def save_images(images, path):
    merged_img = merge(images, [8, 8])
    if merged_img.shape[2] == 1:
@@ -131,14 +138,17 @@ def save_images(images, path):
    else:
        im = Image.fromarray(merged_img, mode="RGB")
    im.save(path)
-    
+
+
 def get_real_samples(batch_size, data_np):
-    return data_np[numpy.random.choice(data_np.shape[0], batch_size, 
-                                       replace=False),:]
-    
+    return data_np[numpy.random.choice(
+        data_np.shape[0], batch_size, replace=False), :]
+
+
 def get_noise(batch_size, noise_dim):
    return numpy.random.normal(size=(batch_size, noise_dim)).astype('float32')

+
 def get_fake_samples(generator_machine, batch_size, noise):
    gen_inputs = api.Arguments.createArguments(1)
    gen_inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise))
@@ -147,12 +157,14 @@ def get_fake_samples(generator_machine, batch_size, noise):
    fake_samples = gen_outputs.getSlotValue(0).copyToNumpyMat()
    return fake_samples

+
 def get_training_loss(training_machine, inputs):
    outputs = api.Arguments.createArguments(0)
    training_machine.forward(inputs, outputs, api.PASS_TEST)
    loss = outputs.getSlotValue(0).copyToNumpyMat()
    return numpy.mean(loss)

+
 def prepare_discriminator_data_batch_pos(batch_size, data_np):
    real_samples = get_real_samples(batch_size, data_np)
    labels = numpy.ones(batch_size, dtype='int32')
@@ -161,6 +173,7 @@ def prepare_discriminator_data_batch_pos(batch_size, data_np):
    inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(labels))
    return inputs

+
 def prepare_discriminator_data_batch_neg(generator_machine, batch_size, noise):
    fake_samples = get_fake_samples(generator_machine, batch_size, noise)
    labels = numpy.zeros(batch_size, dtype='int32')
@@ -169,6 +182,7 @@ def prepare_discriminator_data_batch_neg(generator_machine, batch_size, noise):
    inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(labels))
    return inputs

+
 def prepare_generator_data_batch(batch_size, noise):
    label = numpy.ones(batch_size, dtype='int32')
    inputs = api.Arguments.createArguments(2)
@@ -193,10 +207,9 @@ def get_layer_size(model_conf, layer_name):
 def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--data_source", help="mnist or cifar or uniform")
-    parser.add_argument("--use_gpu", default="1", 
-                        help="1 means use gpu for training")
-    parser.add_argument("--gpu_id", default="0", 
-                        help="the gpu_id parameter")
+    parser.add_argument(
+        "--use_gpu", default="1", help="1 means use gpu for training")
+    parser.add_argument("--gpu_id", default="0", help="the gpu_id parameter")
    args = parser.parse_args()
    data_source = args.data_source
    use_gpu = args.use_gpu
@@ -208,30 +221,32 @@ def main():

    if not os.path.exists("./%s_params/" % data_source):
        os.makedirs("./%s_params/" % data_source)
-        
-    api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10', '--log_period=100', 
-                   '--gpu_id=' + args.gpu_id, '--save_dir=' + "./%s_params/" % data_source)
-    
+
+    api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10',
+                   '--log_period=100', '--gpu_id=' + args.gpu_id,
+                   '--save_dir=' + "./%s_params/" % data_source)
+
    if data_source == "uniform":
        conf = "gan_conf.py"
        num_iter = 10000
    else:
        conf = "gan_conf_image.py"
        num_iter = 1000
-        
+
    gen_conf = parse_config(conf, "mode=generator_training,data=" + data_source)
-    dis_conf = parse_config(conf, "mode=discriminator_training,data=" + data_source)
+    dis_conf = parse_config(conf,
+                            "mode=discriminator_training,data=" + data_source)
    generator_conf = parse_config(conf, "mode=generator,data=" + data_source)
    batch_size = dis_conf.opt_config.batch_size
    noise_dim = get_layer_size(gen_conf.model_config, "noise")
-    
+
    if data_source == "mnist":
        data_np = load_mnist_data("./data/mnist_data/train-images-idx3-ubyte")
    elif data_source == "cifar":
        data_np = load_cifar_data("./data/cifar-10-batches-py/")
    else:
        data_np = load_uniform_data()
-    
+
    # this creates a gradient machine for discriminator
    dis_training_machine = api.GradientMachine.createFromConfigProto(
        dis_conf.model_config)
@@ -244,26 +259,24 @@ def main():
    logger.info(str(generator_conf.model_config))
    generator_machine = api.GradientMachine.createFromConfigProto(
        generator_conf.model_config)
-    
-    dis_trainer = api.Trainer.create(
-        dis_conf, dis_training_machine)

-    gen_trainer = api.Trainer.create(
-        gen_conf, gen_training_machine)
-    
+    dis_trainer = api.Trainer.create(dis_conf, dis_training_machine)
+
+    gen_trainer = api.Trainer.create(gen_conf, gen_training_machine)
+
    dis_trainer.startTrain()
    gen_trainer.startTrain()
-    
+
    # Sync parameters between networks (GradientMachine) at the beginning
    copy_shared_parameters(gen_training_machine, dis_training_machine)
    copy_shared_parameters(gen_training_machine, generator_machine)
-    
+
    # constrain that either discriminator or generator can not be trained
    # consecutively more than MAX_strike times
    curr_train = "dis"
    curr_strike = 0
    MAX_strike = 5
-     
+
    for train_pass in xrange(100):
        dis_trainer.startTrainPass()
        gen_trainer.startTrainPass()
@@ -272,23 +285,25 @@ def main():
            noise = get_noise(batch_size, noise_dim)
            data_batch_dis_pos = prepare_discriminator_data_batch_pos(
                batch_size, data_np)
-            dis_loss_pos = get_training_loss(dis_training_machine, data_batch_dis_pos)
-            
+            dis_loss_pos = get_training_loss(dis_training_machine,
+                                             data_batch_dis_pos)
+
            data_batch_dis_neg = prepare_discriminator_data_batch_neg(
                generator_machine, batch_size, noise)
-            dis_loss_neg = get_training_loss(dis_training_machine, data_batch_dis_neg)            
-                         
+            dis_loss_neg = get_training_loss(dis_training_machine,
+                                             data_batch_dis_neg)
+
            dis_loss = (dis_loss_pos + dis_loss_neg) / 2.0
-            
+
            # Do forward pass in generator to get the gen_loss
-            data_batch_gen = prepare_generator_data_batch(
-                    batch_size, noise)
+            data_batch_gen = prepare_generator_data_batch(batch_size, noise)
            gen_loss = get_training_loss(gen_training_machine, data_batch_gen)
-             
+
            if i % 100 == 0:
-                print "d_pos_loss is %s     d_neg_loss is %s" % (dis_loss_pos, dis_loss_neg) 
+                print "d_pos_loss is %s     d_neg_loss is %s" % (dis_loss_pos,
+                                                                 dis_loss_neg)
                print "d_loss is %s    g_loss is %s" % (dis_loss, gen_loss)
-            
+
            # Decide which network to train based on the training history
            # And the relative size of the loss        
            if (not (curr_train == "dis" and curr_strike == MAX_strike)) and \
@@ -297,11 +312,12 @@ def main():
                    curr_strike += 1
                else:
                    curr_train = "dis"
-                    curr_strike = 1                
+                    curr_strike = 1
                dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_neg)
-                dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_pos)               
-                copy_shared_parameters(dis_training_machine, gen_training_machine)
- 
+                dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_pos)
+                copy_shared_parameters(dis_training_machine,
+                                       gen_training_machine)
+
            else:
                if curr_train == "gen":
                    curr_strike += 1
@@ -311,19 +327,23 @@ def main():
                gen_trainer.trainOneDataBatch(batch_size, data_batch_gen)
                # TODO: add API for paddle to allow true parameter sharing between different GradientMachines 
                # so that we do not need to copy shared parameters. 
-                copy_shared_parameters(gen_training_machine, dis_training_machine)
+                copy_shared_parameters(gen_training_machine,
+                                       dis_training_machine)
                copy_shared_parameters(gen_training_machine, generator_machine)
- 
+
        dis_trainer.finishTrainPass()
        gen_trainer.finishTrainPass()
        # At the end of each pass, save the generated samples/images
        fake_samples = get_fake_samples(generator_machine, batch_size, noise)
        if data_source == "uniform":
-            plot2DScatter(fake_samples, "./%s_samples/train_pass%s.png" % (data_source, train_pass))
+            plot2DScatter(fake_samples, "./%s_samples/train_pass%s.png" %
+                          (data_source, train_pass))
        else:
-            save_images(fake_samples, "./%s_samples/train_pass%s.png" % (data_source, train_pass))
+            save_images(fake_samples, "./%s_samples/train_pass%s.png" %
+                        (data_source, train_pass))
    dis_trainer.finishTrain()
    gen_trainer.finishTrain()

+
 if __name__ == '__main__':
    main()
--- a/demo/quick_start/trainer_config.resnet-lstm.py
+++ b/demo/quick_start/trainer_config.resnet-lstm.py
@@ -13,7 +13,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """
 This configuration is a demonstration of how to implement the stacked LSTM
 with residual connections, i.e. an LSTM layer takes the sum of the hidden states
@@ -46,11 +45,12 @@ is_predict = get_config_arg('is_predict', bool, False)
 trn = 'data/train.list' if not is_predict else None
 tst = 'data/test.list' if not is_predict else 'data/pred.list'
 process = 'process' if not is_predict else 'process_predict'
-define_py_data_sources2(train_list=trn,
-                        test_list=tst,
-                        module="dataprovider_emb",
-                        obj=process,
-                        args={"dictionary": word_dict})
+define_py_data_sources2(
+    train_list=trn,
+    test_list=tst,
+    module="dataprovider_emb",
+    obj=process,
+    args={"dictionary": word_dict})

 batch_size = 128 if not is_predict else 1
 settings(
@@ -58,10 +58,9 @@ settings(
    learning_rate=2e-3,
    learning_method=AdamOptimizer(),
    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25
-)
+    gradient_clipping_threshold=25)

-bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
+bias_attr = ParamAttr(initial_std=0., l2_rate=0.)

 data = data_layer(name="word", size=len(word_dict))
 emb = embedding_layer(input=data, size=128)
@@ -73,17 +72,15 @@ for i in range(3):
    # The input to the current layer is the sum of the hidden state
    # and input of the previous layer.
    current_input = addto_layer(input=[previous_input, previous_hidden_state])
-    hidden_state = simple_lstm(input=current_input, size=128,
-                               lstm_cell_attr=ExtraAttr(drop_rate=0.1))
+    hidden_state = simple_lstm(
+        input=current_input, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.1))
    previous_input, previous_hidden_state = current_input, hidden_state

 lstm = previous_hidden_state

 lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
-output = fc_layer(input=lstm_last, size=2,
-                  bias_attr=bias_attr,
-                  act=SoftmaxActivation())
-
+output = fc_layer(
+    input=lstm_last, size=2, bias_attr=bias_attr, act=SoftmaxActivation())

 if is_predict:
    maxid = maxid_layer(output)

--- a/demo/semantic_role_labeling/data/extract_dict_feature.py
+++ b/demo/semantic_role_labeling/data/extract_dict_feature.py
@@ -33,7 +33,7 @@ def extract_dict_features(pair_file, feature_file):
                ctx_n1 = sentence_list[verb_index - 1]
            else:
                ctx_n1 = 'bos'
-            
+
            if verb_index > 1:
                mark[verb_index - 2] = 1
                ctx_n2 = sentence_list[verb_index - 2]
@@ -48,7 +48,7 @@ def extract_dict_features(pair_file, feature_file):
                ctx_p1 = sentence_list[verb_index + 1]
            else:
                ctx_p1 = 'eos'
-            
+
            if verb_index < len(labels_list) - 3:
                mark[verb_index + 2] = 1
                ctx_p2 = sentence_list[verb_index + 2]
@@ -69,7 +69,6 @@ def extract_dict_features(pair_file, feature_file):
            feature_out.write(feature_str + '\n')


-
 if __name__ == '__main__':

    usage = '-p pair_file -f feature_file'

--- a/demo/semantic_role_labeling/data/extract_pairs.py
+++ b/demo/semantic_role_labeling/data/extract_pairs.py
@@ -66,8 +66,8 @@ def transform_labels(sentences, labels):
        else:
            verb_list = []
            for x in labels[i][0]:
-                if x !='-':
-                   verb_list.append(x)
+                if x != '-':
+                    verb_list.append(x)

            for j in xrange(1, len(labels[i])):
                label_list = labels[i][j]
@@ -93,7 +93,7 @@ def transform_labels(sentences, labels):
                        is_in_bracket = True
                    else:
                        print 'error:', ll
-                sen_lab_pair.append((sentences[i], verb_list[j-1], label_seq))
+                sen_lab_pair.append((sentences[i], verb_list[j - 1], label_seq))
    return sen_lab_pair


@@ -103,7 +103,7 @@ def write_file(sen_lab_pair, output_file):
            sentence = x[0]
            label_seq = ' '.join(x[2])
            assert len(sentence.split()) == len(x[2])
-            fout.write(sentence + '\t' + x[1]+'\t' +label_seq + '\n')
+            fout.write(sentence + '\t' + x[1] + '\t' + label_seq + '\n')


 if __name__ == '__main__':

--- a/demo/semantic_role_labeling/dataprovider.py
+++ b/demo/semantic_role_labeling/dataprovider.py
@@ -21,7 +21,7 @@ def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
    settings.word_dict = word_dict
    settings.label_dict = label_dict
    settings.predicate_dict = predicate_dict
-   
+
    #all inputs are integral and sequential type
    settings.slots = [
        integer_value_sequence(len(word_dict)),
@@ -29,25 +29,28 @@ def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
        integer_value_sequence(len(word_dict)),
        integer_value_sequence(len(word_dict)),
        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)), 
-        integer_value_sequence(len(predicate_dict)),
-        integer_value_sequence(2),
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(len(predicate_dict)), integer_value_sequence(2),
        integer_value_sequence(len(label_dict))
    ]


 def get_batch_size(yeild_data):
    return len(yeild_data[0])
-    

-@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size, 
-          can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
+
+@provider(
+    init_hook=hook,
+    should_shuffle=True,
+    calc_batch_size=get_batch_size,
+    can_over_batch_size=False,
+    cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, file_name):
    with open(file_name, 'r') as fdata:
        for line in fdata:
            sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2,  mark, label = \
                line.strip().split('\t')
-           
+
            words = sentence.split()
            sen_len = len(words)
            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]

--- a/demo/semantic_role_labeling/db_lstm.py
+++ b/demo/semantic_role_labeling/db_lstm.py
@@ -20,7 +20,7 @@ from paddle.trainer_config_helpers import *
 #file paths
 word_dict_file = './data/wordDict.txt'
 label_dict_file = './data/targetDict.txt'
-predicate_file= './data/verbDict.txt'
+predicate_file = './data/verbDict.txt'
 train_list_file = './data/train.list'
 test_list_file = './data/test.list'

@@ -47,7 +47,6 @@ if not is_predict:
            w = line.strip()
            predicate_dict[w] = i

-
    if is_test:
        train_list_file = None

@@ -57,9 +56,11 @@ if not is_predict:
        test_list=test_list_file,
        module='dataprovider',
        obj='process',
-        args={'word_dict': word_dict,
-              'label_dict': label_dict,
-              'predicate_dict': predicate_dict })
+        args={
+            'word_dict': word_dict,
+            'label_dict': label_dict,
+            'predicate_dict': predicate_dict
+        })

    word_dict_len = len(word_dict)
    label_dict_len = len(label_dict)
@@ -77,24 +78,16 @@ mark_dim = 5
 hidden_dim = 512
 depth = 8

-
-
 ########################### Optimizer #######################################

-
 settings(
    batch_size=150,
    learning_method=MomentumOptimizer(momentum=0),
    learning_rate=2e-2,
    regularization=L2Regularization(8e-4),
    is_async=False,
-    model_average=ModelAverage(average_window=0.5,
-                               max_average_window=10000),
-                               
-)
-
-
-
+    model_average=ModelAverage(
+        average_window=0.5, max_average_window=10000), )

 ####################################### network ##############################
 #8 features and 1 target
@@ -108,22 +101,28 @@ ctx_p1 = data_layer(name='ctx_p1_data', size=word_dict_len)
 ctx_p2 = data_layer(name='ctx_p2_data', size=word_dict_len)
 mark = data_layer(name='mark_data', size=mark_dict_len)

-
 if not is_predict:
    target = data_layer(name='target', size=label_dict_len)

-
-default_std=1/math.sqrt(hidden_dim)/3.0
+default_std = 1 / math.sqrt(hidden_dim) / 3.0

 emb_para = ParameterAttribute(name='emb', initial_std=0., learning_rate=0.)
 std_0 = ParameterAttribute(initial_std=0.)
-std_default = ParameterAttribute(initial_std=default_std) 
-
-predicate_embedding = embedding_layer(size=word_dim, input=predicate, param_attr=ParameterAttribute(name='vemb',initial_std=default_std))
-mark_embedding = embedding_layer(name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0)
-
-word_input=[word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
-emb_layers = [embedding_layer(size=word_dim, input=x, param_attr=emb_para) for x in word_input]
+std_default = ParameterAttribute(initial_std=default_std)
+
+predicate_embedding = embedding_layer(
+    size=word_dim,
+    input=predicate,
+    param_attr=ParameterAttribute(
+        name='vemb', initial_std=default_std))
+mark_embedding = embedding_layer(
+    name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0)
+
+word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+emb_layers = [
+    embedding_layer(
+        size=word_dim, input=x, param_attr=emb_para) for x in word_input
+]
 emb_layers.append(predicate_embedding)
 emb_layers.append(mark_embedding)

@@ -131,84 +130,89 @@ hidden_0 = mixed_layer(
    name='hidden0',
    size=hidden_dim,
    bias_attr=std_default,
-    input=[ full_matrix_projection(input=emb, param_attr=std_default ) for emb in emb_layers ])
-
+    input=[
+        full_matrix_projection(
+            input=emb, param_attr=std_default) for emb in emb_layers
+    ])

 mix_hidden_lr = 1e-3
 lstm_para_attr = ParameterAttribute(initial_std=0.0, learning_rate=1.0)
-hidden_para_attr = ParameterAttribute(initial_std=default_std, learning_rate=mix_hidden_lr)
-
-lstm_0 = lstmemory(name='lstm0',
-                   input=hidden_0, 
-                   act=ReluActivation(),
-                   gate_act=SigmoidActivation(),
-                   state_act=SigmoidActivation(),
-                   bias_attr=std_0,
-                   param_attr=lstm_para_attr)
+hidden_para_attr = ParameterAttribute(
+    initial_std=default_std, learning_rate=mix_hidden_lr)
+
+lstm_0 = lstmemory(
+    name='lstm0',
+    input=hidden_0,
+    act=ReluActivation(),
+    gate_act=SigmoidActivation(),
+    state_act=SigmoidActivation(),
+    bias_attr=std_0,
+    param_attr=lstm_para_attr)

 #stack L-LSTM and R-LSTM with direct edges
 input_tmp = [hidden_0, lstm_0]

-
 for i in range(1, depth):

-    mix_hidden = mixed_layer(name='hidden'+str(i),
-                             size=hidden_dim, 
-                             bias_attr=std_default,
-                             input=[full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr),
-                                    full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr)
-                                   ]
-                             )
-
-    lstm = lstmemory(name='lstm'+str(i),
-                     input=mix_hidden,
-                     act=ReluActivation(),
-                     gate_act=SigmoidActivation(),
-                     state_act=SigmoidActivation(),
-                     reverse=((i % 2)==1),
-                     bias_attr=std_0,
-                     param_attr=lstm_para_attr)
+    mix_hidden = mixed_layer(
+        name='hidden' + str(i),
+        size=hidden_dim,
+        bias_attr=std_default,
+        input=[
+            full_matrix_projection(
+                input=input_tmp[0], param_attr=hidden_para_attr),
+            full_matrix_projection(
+                input=input_tmp[1], param_attr=lstm_para_attr)
+        ])
+
+    lstm = lstmemory(
+        name='lstm' + str(i),
+        input=mix_hidden,
+        act=ReluActivation(),
+        gate_act=SigmoidActivation(),
+        state_act=SigmoidActivation(),
+        reverse=((i % 2) == 1),
+        bias_attr=std_0,
+        param_attr=lstm_para_attr)

    input_tmp = [mix_hidden, lstm]

-feature_out = mixed_layer(name='output',
-                          size=label_dict_len,
-                          bias_attr=std_default, 
-                          input=[full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr),
-                                 full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr)
-                                ],
-                          )
-
-
+feature_out = mixed_layer(
+    name='output',
+    size=label_dict_len,
+    bias_attr=std_default,
+    input=[
+        full_matrix_projection(
+            input=input_tmp[0], param_attr=hidden_para_attr),
+        full_matrix_projection(
+            input=input_tmp[1], param_attr=lstm_para_attr)
+    ], )

 if not is_predict:
-    crf_l = crf_layer( name = 'crf',
-                       size = label_dict_len,
-                       input = feature_out, 
-                       label = target,
-                       param_attr=ParameterAttribute(name='crfw',initial_std=default_std, learning_rate=mix_hidden_lr)
-
-                      )
-
-    
-    crf_dec_l = crf_decoding_layer(name = 'crf_dec_l',
-                                   size = label_dict_len,
-                                   input = feature_out,
-                                   label = target,
-                                   param_attr=ParameterAttribute(name='crfw')
-                                       )
-
+    crf_l = crf_layer(
+        name='crf',
+        size=label_dict_len,
+        input=feature_out,
+        label=target,
+        param_attr=ParameterAttribute(
+            name='crfw', initial_std=default_std, learning_rate=mix_hidden_lr))
+
+    crf_dec_l = crf_decoding_layer(
+        name='crf_dec_l',
+        size=label_dict_len,
+        input=feature_out,
+        label=target,
+        param_attr=ParameterAttribute(name='crfw'))

    eval = sum_evaluator(input=crf_dec_l)
-        
+
    outputs(crf_l)

 else:
-    crf_dec_l = crf_decoding_layer(name = 'crf_dec_l',
-                                   size = label_dict_len,
-                                   input = feature_out,
-                                   param_attr=ParameterAttribute(name='crfw')
-                                       )
+    crf_dec_l = crf_decoding_layer(
+        name='crf_dec_l',
+        size=label_dict_len,
+        input=feature_out,
+        param_attr=ParameterAttribute(name='crfw'))

    outputs(crf_dec_l)
-
--- a/demo/semantic_role_labeling/predict.py
+++ b/demo/semantic_role_labeling/predict.py
@@ -26,7 +26,8 @@ UNK_IDX = 0


 class Prediction():
-    def __init__(self, train_conf, dict_file, model_dir, label_file, predicate_dict_file):
+    def __init__(self, train_conf, dict_file, model_dir, label_file,
+                 predicate_dict_file):
        """
        train_conf: trainer configure.
        dict_file: word dictionary file name.
@@ -35,7 +36,7 @@ class Prediction():

        self.dict = {}
        self.labels = {}
-        self.predicate_dict={}
+        self.predicate_dict = {}
        self.labels_reverse = {}
        self.load_dict_label(dict_file, label_file, predicate_dict_file)

@@ -44,25 +45,18 @@ class Prediction():
        len_pred = len(self.predicate_dict)

        conf = parse_config(
-            train_conf,
-            'dict_len=' + str(len_dict) + 
-            ',label_len=' + str(len_label) +
-            ',pred_len=' + str(len_pred) +
-            ',is_predict=True')
+            train_conf, 'dict_len=' + str(len_dict) + ',label_len=' +
+            str(len_label) + ',pred_len=' + str(len_pred) + ',is_predict=True')
        self.network = swig_paddle.GradientMachine.createFromConfigProto(
            conf.model_config)
        self.network.loadParameters(model_dir)

        slots = [
-            integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict), 
-            integer_value_sequence(len_pred),
-            integer_value_sequence(2)
-            ]
+            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
+            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
+            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
+            integer_value_sequence(len_pred), integer_value_sequence(2)
+        ]
        self.converter = DataProviderConverter(slots)

    def load_dict_label(self, dict_file, label_file, predicate_dict_file):
@@ -78,6 +72,7 @@ class Prediction():

        for line_count, line in enumerate(open(predicate_dict_file, 'r')):
            self.predicate_dict[line.strip()] = line_count
+
    def get_data(self, data_file):
        """
        Get input data of paddle format.
@@ -88,9 +83,10 @@ class Prediction():
                ).split('\t')
                words = sentence.split()
                sen_len = len(words)
-                 
+
                word_slot = [self.dict.get(w, UNK_IDX) for w in words]
-                predicate_slot = [self.predicate_dict.get(predicate, UNK_IDX)] * sen_len
+                predicate_slot = [self.predicate_dict.get(predicate, UNK_IDX)
+                                  ] * sen_len
                ctx_n2_slot = [self.dict.get(ctx_n2, UNK_IDX)] * sen_len
                ctx_n1_slot = [self.dict.get(ctx_n1, UNK_IDX)] * sen_len
                ctx_0_slot = [self.dict.get(ctx_0, UNK_IDX)] * sen_len
@@ -99,7 +95,7 @@ class Prediction():

                marks = mark.split()
                mark_slot = [int(w) for w in marks]
-                
+
                yield word_slot, ctx_n2_slot, ctx_n1_slot, \
                      ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot

@@ -123,8 +119,9 @@ class Prediction():


 def option_parser():
-    usage = ("python predict.py -c config -w model_dir " 
-             "-d word dictionary -l label_file -i input_file  -p pred_dict_file")
+    usage = (
+        "python predict.py -c config -w model_dir "
+        "-d word dictionary -l label_file -i input_file  -p pred_dict_file")
    parser = OptionParser(usage="usage: %s [options]" % usage)
    parser.add_option(
        "-c",
@@ -187,8 +184,9 @@ def main():
    output_file = options.output_file

    swig_paddle.initPaddle("--use_gpu=0")
-    predict = Prediction(train_conf, dict_file, model_path, label_file, predict_dict_file)
-    predict.predict(data_file,output_file)
+    predict = Prediction(train_conf, dict_file, model_path, label_file,
+                         predict_dict_file)
+    predict.predict(data_file, output_file)


 if __name__ == '__main__':

--- a/demo/sentiment/predict.py
+++ b/demo/sentiment/predict.py
@@ -71,9 +71,7 @@ class SentimentPrediction():
        transform word into integer index according to the dictionary.
        """
        words = data.strip().split()
-        word_slot = [
-            self.word_dict[w] for w in words if w in self.word_dict
-        ]
+        word_slot = [self.word_dict[w] for w in words if w in self.word_dict]
        return word_slot

    def batch_predict(self, data_batch):
@@ -85,8 +83,8 @@ class SentimentPrediction():
            if self.label is None:
                print("predicting label is %d" % (lab[0]))
            else:
-                print("predicting label is %s" %
-                      (self.label[lab[0]]))
+                print("predicting label is %s" % (self.label[lab[0]]))
+

 def option_parser():
    usage = "python predict.py -n config -w model_dir -d dictionary -i input_file "
@@ -143,9 +141,10 @@ def main():
        batch.append([predict.get_index(line)])
        if len(batch) == batch_size:
            predict.batch_predict(batch)
-            batch=[]
+            batch = []
    if len(batch) > 0:
        predict.batch_predict(batch)

+
 if __name__ == '__main__':
    main()
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -14,6 +14,13 @@ cd paddle
 git submodule update --init --recursive
 ```

+If you already have a local PaddlePaddle repo and have not initialized the submodule, your local submodule folder will be empty. You can simply run the last command shown above in your PaddlePaddle home directory to initialize your submodule folder.
+
+If you have already initialized your submodule and you would like to sync with the upstream submodule repo, you can run the following command:
+```
+git submodule update --remote
+```
+
 ## <span id="requirements">Requirements</span>

 To compile the source code, your computer must be equipped with the following dependencies.

--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -122,9 +122,9 @@ The general development workflow with Docker and Bazel is as follows:
      git clone --recursive https://github.com/paddlepaddle/paddle


-2. Build a development Docker image `paddle:dev` from the source code.
-   This image contains all the development tools and dependencies of
-   PaddlePaddle.
+2. Build a development Docker image :code:`paddle:dev` from the source
+   code.  This image contains all the development tools and
+   dependencies of PaddlePaddle.


   .. code-block:: bash
@@ -139,14 +139,22 @@ The general development workflow with Docker and Bazel is as follows:

   .. code-block:: bash

-      docker run \
-       -d # run the container in background mode \
-       --name paddle # we can run a nginx container to serve documents \
-       -p 2022:22    # so we can SSH into this container \
-       -v $PWD:/paddle # mount the source code \
-       -v $HOME/.cache/bazel:/root/.cache/bazel # mount Bazel cache \
+      docker run       \
+       -d              \
+       --name paddle   \
+       -p 2022:22      \
+       -v $PWD:/paddle \
+       -v $HOME/.cache/bazel:/root/.cache/bazel \
       paddle:dev

+   where :code:`-d` makes the container run in the background,
+   :code:`--name paddle` allows us to run an nginx container to serve
+   documents in this container, :code:`-p 2022:22` allows us to SSH
+   into this container, :code:`-v $PWD:/paddle` shares the source code
+   on the host with the container, and :code:`-v
+   $HOME/.cache/bazel:/root/.cache/bazel` shares the Bazel cache on the
+   host with the container.
+
 4. SSH into the container:

   .. code-block:: bash

--- a/doc_cn/cluster/k8s/distributed_training_on_kubernetes.md
+++ b/doc_cn/cluster/k8s/distributed_training_on_kubernetes.md
@@ -306,4 +306,4 @@ I1116 09:10:18.019069    50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:
 I1116 09:10:18.019492    50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165
 I1116 09:10:18.019716    50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
 I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
-```
\ No newline at end of file
+```
--- a/doc_cn/cluster/k8s/job.yaml
+++ b/doc_cn/cluster/k8s/job.yaml
@@ -40,4 +40,4 @@ spec:
        - name: jobpath
          mountPath: /home/jobpath       
      restartPolicy: Never
-    
\ No newline at end of file
+    
--- a/doc_cn/cluster/k8s/start_paddle.py
+++ b/doc_cn/cluster/k8s/start_paddle.py
@@ -19,7 +19,6 @@ import socket
 import os
 import argparse

-
 # configuration for cluster
 API = "/api/v1/namespaces/"
 JOBSELECTOR = "labelSelector=job-name="
@@ -145,8 +144,8 @@ def startPaddle(idMap={}, train_args_dict=None):


 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(prog="start_paddle.py",
-                                     description='simple tool for k8s')
+    parser = argparse.ArgumentParser(
+        prog="start_paddle.py", description='simple tool for k8s')
    args, train_args_list = parser.parse_known_args()
    train_args = refine_unknown_args(train_args_list)
    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))

--- a/doc_cn/demo/sentiment_analysis/index.rst
+++ b/doc_cn/demo/sentiment_analysis/index.rst
-情感分析教程
-===========================
-
-.. toctree::
-    :maxdepth: 3
-    :glob:
-
+情感分析教程
+===========================
+
+.. toctree::
+    :maxdepth: 3
+    :glob:
+
    Training Locally <sentiment_analysis.md>
\ No newline at end of file
--- a/doc_theme/static/js/paddle_doc_init.js
+++ b/doc_theme/static/js/paddle_doc_init.js
@@ -28,4 +28,4 @@ $(document).ready(function(){
    $('.doc-menu-vertical').find('li.current').last().addClass('active');

    $('.doc-menu-vertical').perfectScrollbar();
-});
\ No newline at end of file
+});
--- a/paddle/api/GradientMachine.cpp
+++ b/paddle/api/GradientMachine.cpp
@@ -15,8 +15,8 @@ limitations under the License. */
 #include "PaddleAPI.h"
 #include "PaddleAPIPrivate.h"

-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 #include "Internal.h"
+#include "paddle/gserver/gradientmachines/NeuralNetwork.h"

 std::vector<int> GradientMachine::defaultParamTypes = {
    PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM};

--- a/paddle/api/Internal.h
+++ b/paddle/api/Internal.h
@@ -16,14 +16,13 @@ limitations under the License. */

 #include "PaddleAPI.h"

-#include <vector>
 #include <algorithm>
+#include <vector>

 template <typename T1, typename T2>
 void staticCastVector(std::vector<T2>* dest, const std::vector<T1>& src) {
  dest->resize(src.size());
-  std::transform(src.begin(),
-                 src.end(),
-                 dest->begin(),
-                 [](T1 t) { return static_cast<T2>(t); });
+  std::transform(src.begin(), src.end(), dest->begin(), [](T1 t) {
+    return static_cast<T2>(t);
+  });
 }
--- a/paddle/api/Matrix.cpp
+++ b/paddle/api/Matrix.cpp
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "PaddleAPI.h"
 #include "paddle/math/Matrix.h"
-#include "paddle/math/SparseMatrix.h"
-#include "paddle/math/CpuSparseMatrix.h"
-#include <iostream>
 #include <cstring>
+#include <iostream>
+#include "PaddleAPI.h"
+#include "paddle/math/CpuSparseMatrix.h"
+#include "paddle/math/SparseMatrix.h"

 struct MatrixPrivate {
  std::shared_ptr<paddle::Matrix> mat;

--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -16,8 +16,8 @@ limitations under the License. */

 #include <stddef.h>
 #include <stdint.h>
-#include <string>
 #include <stdexcept>
+#include <string>
 #include <vector>
 #include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/TypeDefs.h"

--- a/paddle/api/Parameter.cpp
+++ b/paddle/api/Parameter.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "PaddleAPI.h"
 #include "paddle/parameter/Parameter.h"
+#include "PaddleAPI.h"

 struct ParameterPrivate {
  std::shared_ptr<paddle::Parameter> sharedPtr;

--- a/paddle/api/ParameterOptimizer.cpp
+++ b/paddle/api/ParameterOptimizer.cpp
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
 #include "paddle/parameter/ParameterOptimizer.h"
-#include "Internal.h"
 #include <algorithm>
+#include "Internal.h"
+#include "PaddleAPI.h"
+#include "PaddleAPIPrivate.h"

 struct ParameterOptimizerPrivate {
  std::unique_ptr<paddle::ParameterOptimizer> optimizer;
@@ -36,16 +36,13 @@ struct ParameterTraverseCallbackPrivate {
             size_t sparseId) {
    std::vector<paddle::VectorPtr> real_vecs;
    real_vecs.resize(vecs.size());
-    std::transform(vecs.begin(),
-                   vecs.end(),
-                   real_vecs.begin(),
-                   [](Vector* v) {
-                     if (v) {
-                       return *(paddle::VectorPtr*)(v->getSharedPtr());
-                     } else {
-                       return paddle::VectorPtr();
-                     }
-                   });
+    std::transform(vecs.begin(), vecs.end(), real_vecs.begin(), [](Vector* v) {
+      if (v) {
+        return *(paddle::VectorPtr*)(v->getSharedPtr());
+      } else {
+        return paddle::VectorPtr();
+      }
+    });

    paddle::ParameterConfig& real_conf =
        *(paddle::ParameterConfig*)(const_cast<ParameterConfig&>(conf)

--- a/paddle/api/SequenceGenerator.cpp
+++ b/paddle/api/SequenceGenerator.cpp
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <algorithm>
+#include <iterator>
+#include <sstream>
+#include <vector>
 #include "PaddleAPI.h"
 #include "paddle/gserver/gradientmachines/GradientMachine.h"
 #include "paddle/parameter/Argument.h"
 #include "paddle/utils/Flags.h"
-#include <vector>
-#include <sstream>
-#include <algorithm>
-#include <iterator>

 // used to represent partial sequence
 struct Path {

--- a/paddle/api/Trainer.cpp
+++ b/paddle/api/Trainer.cpp
@@ -16,12 +16,12 @@ limitations under the License. */
 #include "PaddleAPIPrivate.h"

 #include <stdlib.h>
-#include <memory>
 #include <atomic>
+#include <memory>

+#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 #include "paddle/trainer/ParamUtil.h"
 #include "paddle/trainer/Trainer.h"
-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 #include "paddle/trainer/TrainerInternal.h"
 #include "paddle/utils/Flags.h"


--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -14,16 +14,16 @@ limitations under the License. */

 #include "PaddleAPI.h"

-#include "paddle/utils/Util.h"
-#include "paddle/utils/PythonUtil.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/Excepts.h"
 #include "paddle/parameter/Parameter.h"
+#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/PythonUtil.h"
+#include "paddle/utils/Util.h"

 #include <fenv.h>
+#include <algorithm>
 #include <iostream>
 #include <iterator>
-#include <algorithm>

 void initPaddle(int argc, char** argv) {
  paddle::initMain(argc, argv);

--- a/paddle/api/Vector.cpp
+++ b/paddle/api/Vector.cpp
@@ -282,7 +282,7 @@ FloatArray Vector::getData() const {
 }

 void Vector::copyFrom(Vector* src) throw(RangeError) {
-  if (src->m->vec->getSize() !=  m->vec->getSize()) {
+  if (src->m->vec->getSize() != m->vec->getSize()) {
    throw RangeError();
  }
  m->vec->copyFrom(*src->m->vec);

--- a/paddle/api/test/testMatrix.py
+++ b/paddle/api/test/testMatrix.py
@@ -100,11 +100,12 @@ class TestMatrix(unittest.TestCase):

            for a, e in zip(gpu_m.getData(), [1.0, 3.23, 3.0, 4.0, 5.0, 6.0]):
                self.assertAlmostEqual(a, e)
-    
+
    def test_numpy(self):
        numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
        m = swig_paddle.Matrix.createDenseFromNumpy(numpy_mat)
-        self.assertEqual((int(m.getHeight()), int(m.getWidth())), numpy_mat.shape)
+        self.assertEqual((int(m.getHeight()), int(m.getWidth())),
+                         numpy_mat.shape)
        self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
        for a, e in zip(m.getData(), [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]):
            self.assertAlmostEqual(a, e)

--- a/paddle/api/test/testVector.py
+++ b/paddle/api/test/testVector.py
@@ -26,17 +26,17 @@ class TestIVector(unittest.TestCase):
            self.assertEqual(m[i], 0)
            m[i] = i
            self.assertEqual(m[i], i)
-        
+
        m = swig_paddle.IVector.createZero(10)
        self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
-        self.assertEqual(m.getData(), [0]*10)
+        self.assertEqual(m.getData(), [0] * 10)

    def test_create(self):
        m = swig_paddle.IVector.create(range(10), False)
        self.assertIsNotNone(m)
        for i in xrange(10):
            self.assertEqual(m[i], i)
-        
+
        m = swig_paddle.IVector.create(range(10))
        self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
        self.assertEqual(m.getData(), range(10))
@@ -69,7 +69,7 @@ class TestIVector(unittest.TestCase):
            expect_vec = range(0, 10)
            expect_vec[4] = 7
            self.assertEqual(vec.getData(), expect_vec)
-    
+
    def test_numpy(self):
        vec = np.array([1, 3, 4, 65, 78, 1, 4], dtype="int32")
        iv = swig_paddle.IVector.createVectorFromNumpy(vec)
@@ -85,10 +85,10 @@ class TestVector(unittest.TestCase):
            self.assertTrue(util.doubleEqual(v[i], 0))
            v[i] = i
            self.assertTrue(util.doubleEqual(v[i], i))
-        
+
        v = swig_paddle.Vector.createZero(10)
        self.assertEqual(v.isGpu(), swig_paddle.isUsingGpu())
-        self.assertEqual(v.getData(), [0]*10)
+        self.assertEqual(v.getData(), [0] * 10)

    def testCreate(self):
        v = swig_paddle.Vector.create([x / 100.0 for x in xrange(100)], False)
@@ -96,14 +96,13 @@ class TestVector(unittest.TestCase):
        for i in xrange(len(v)):
            self.assertTrue(util.doubleEqual(v[i], i / 100.0))
        self.assertEqual(100, len(v))
-        
+
        v = swig_paddle.Vector.create([x / 100.0 for x in xrange(100)])
        self.assertEqual(v.isGpu(), swig_paddle.isUsingGpu())
        self.assertEqual(100, len(v))
        vdata = v.getData()
        for i in xrange(len(v)):
            self.assertTrue(util.doubleEqual(vdata[i], i / 100.0))
-        

    def testCpuNumpy(self):
        numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
@@ -128,7 +127,7 @@ class TestVector(unittest.TestCase):

        for i in xrange(1, len(numpy_3)):
            util.doubleEqual(numpy_3[i], vec[i])
-    
+
    def testNumpy(self):
        numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
        vec = swig_paddle.Vector.createVectorFromNumpy(numpy_arr)
@@ -136,7 +135,6 @@ class TestVector(unittest.TestCase):
        vecData = vec.getData()
        for n, v in zip(numpy_arr, vecData):
            self.assertTrue(util.doubleEqual(n, v))
-        

    def testCopyFromNumpy(self):
        vec = swig_paddle.Vector.createZero(1, False)

--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@@ -223,9 +223,9 @@ typedef struct {

 #ifdef __NVCC__

-#include "paddle/utils/Logging.h"
-#include "hl_cuda.h"
 #include "cuda_runtime.h"
+#include "hl_cuda.h"
+#include "paddle/utils/Logging.h"

 extern __thread bool g_sync_flag;
 extern __thread cudaStream_t default_stream;

--- a/paddle/cuda/include/hl_dso_loader.h
+++ b/paddle/cuda/include/hl_dso_loader.h
@@ -16,8 +16,8 @@ limitations under the License. */
 #define HL_DSO_LOADER_H_

 #include <dlfcn.h>
-#include <string>
 #include <memory>
+#include <string>
 #include "hl_base.h"

 /**

--- a/paddle/cuda/include/hl_gpu.h
+++ b/paddle/cuda/include/hl_gpu.h
@@ -15,28 +15,28 @@ limitations under the License. */
 #ifndef HL_GPU_H_
 #define HL_GPU_H_

+#include "hl_aggregate.h"
 #include "hl_base.h"
+#include "hl_cnn.h"
 #include "hl_cuda.h"
 #include "hl_cuda_cublas.h"
 #include "hl_cuda_cudnn.h"
-#include "hl_matrix.h"
-#include "hl_aggregate.h"
-#include "hl_cnn.h"
-#include "hl_sparse.h"
 #include "hl_lstm.h"
+#include "hl_matrix.h"
 #include "hl_sequence.h"
+#include "hl_sparse.h"
 #include "hl_warpctc_wrap.h"

 #ifdef HPPL_STUB_FUNC
-#include "stub/hl_cuda_stub.h"
-#include "stub/hl_cuda_cublas_stub.h"
-#include "stub/hl_cuda_cudnn_stub.h"
-#include "stub/hl_matrix_stub.h"
 #include "stub/hl_aggregate_stub.h"
 #include "stub/hl_cnn_stub.h"
-#include "stub/hl_sparse_stub.h"
+#include "stub/hl_cuda_cublas_stub.h"
+#include "stub/hl_cuda_cudnn_stub.h"
+#include "stub/hl_cuda_stub.h"
 #include "stub/hl_lstm_stub.h"
+#include "stub/hl_matrix_stub.h"
 #include "stub/hl_sequence_stub.h"
+#include "stub/hl_sparse_stub.h"
 #endif

 #endif /* HL_GPU_H_ */
--- a/paddle/cuda/include/hl_time.h
+++ b/paddle/cuda/include/hl_time.h
@@ -14,7 +14,7 @@ limitations under the License. */

 #ifndef HL_TIME_H_
 #define HL_TIME_H_
-
+#include <cstdint>
 /**
 * @brief   High resolution timer.
 *

--- a/paddle/cuda/src/hl_cuda_cublas.cc
+++ b/paddle/cuda/src/hl_cuda_cublas.cc
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "hl_cuda_cublas.h"
 #include <sys/time.h>
 #include <mutex>
 #include "hl_cuda.h"
-#include "hl_cuda_cublas.h"
-#include "hl_thread.ph"
 #include "hl_dso_loader.h"
+#include "hl_thread.ph"
 #include "paddle/utils/Logging.h"

 namespace dynload {

--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "hl_cuda_cudnn.h"
 #include <cudnn.h>
 #include <mutex>
-#include "hl_cuda_cudnn.h"
 #include "hl_cuda_cudnn.ph"
-#include "hl_thread.ph"
 #include "hl_dso_loader.h"
-#include "paddle/utils/Logging.h"
+#include "hl_thread.ph"
 #include "paddle/utils/CommandLineParser.h"
+#include "paddle/utils/Logging.h"

 P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb,
               4096,

--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "hl_cuda.h"
 #include <cuda_profiler_api.h>
 #include <string.h>
 #include <sys/syscall.h>
 #include <sys/time.h>
 #include <unistd.h>
 #include <mutex>
-#include "hl_cuda.h"
 #include "hl_cuda.ph"
 #include "hl_dso_loader.h"
 #include "hl_thread.ph"

--- a/paddle/cuda/src/hl_cudart_wrap.cc
+++ b/paddle/cuda/src/hl_cudart_wrap.cc
@@ -14,8 +14,8 @@ limitations under the License. */

 #ifdef PADDLE_USE_DSO

-#include <mutex>
 #include <cuda_runtime.h>
+#include <mutex>
 #include "hl_dso_loader.h"

 /**

--- a/paddle/cuda/src/hl_time.cc
+++ b/paddle/cuda/src/hl_time.cc
@@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <chrono>
+#include "hl_time.h"
 #include <stdlib.h>
+#include <chrono>
+#include <cstdint>
 #include <iostream>
-#include "hl_time.h"

 using std::chrono::high_resolution_clock;


--- a/paddle/cuda/src/hl_warpctc_wrap.cc
+++ b/paddle/cuda/src/hl_warpctc_wrap.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <mutex>
 #include "hl_warpctc_wrap.h"
+#include <mutex>
 #include "hl_dso_loader.h"
 #include "paddle/utils/Logging.h"


--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -15,13 +15,13 @@ limitations under the License. */
 #include "ActivationFunction.h"

 #include <algorithm>
-#include <memory>
 #include <iostream>
-#include <type_traits>
+#include <memory>
 #include <string>
 #include <thread>
-#include "paddle/utils/ClassRegistrar.h"
+#include <type_traits>
 #include "paddle/parameter/Argument.h"
+#include "paddle/utils/ClassRegistrar.h"

 #include "paddle/utils/Logging.h"


--- a/paddle/gserver/dataproviders/DataProvider.cpp
+++ b/paddle/gserver/dataproviders/DataProvider.cpp
@@ -14,12 +14,12 @@ limitations under the License. */

 #include "DataProvider.h"

-#include "paddle/utils/Util.h"
-#include "paddle/utils/StringUtil.h"
-#include "paddle/utils/Logging.h"
-#include <algorithm>
 #include <unistd.h>
+#include <algorithm>
 #include "ProtoDataProvider.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/StringUtil.h"
+#include "paddle/utils/Util.h"

 namespace paddle {


--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -14,28 +14,28 @@ limitations under the License. */

 #pragma once

-#include <vector>
-#include <memory>
-#include <mutex>
-#include <iostream>
-#include <fstream>
 #include <stdint.h>
-#include <string.h>
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <vector>

+#include "DataConfig.pb.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
+#include "paddle/math/Vector.h"
+#include "paddle/parameter/Argument.h"
+#include "paddle/utils/ClassRegistrar.h"
+#include "paddle/utils/Locks.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Queue.h"
-#include "paddle/utils/Locks.h"
 #include "paddle/utils/ThreadLocal.h"
 #include "paddle/utils/TypeDefs.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/SparseMatrix.h"
 #include "paddle/utils/Util.h"
-#include "paddle/math/Vector.h"
-#include "DataConfig.pb.h"
-#include "paddle/utils/ClassRegistrar.h"
-#include "paddle/parameter/Argument.h"

 namespace paddle {
 /**

--- a/paddle/gserver/dataproviders/MultiDataProvider.cpp
+++ b/paddle/gserver/dataproviders/MultiDataProvider.cpp
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/utils/Util.h"
 #include "MultiDataProvider.h"
-#include "paddle/utils/Logging.h"
 #include <algorithm>
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Util.h"

 namespace paddle {


--- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "ProtoDataProvider.h"
-#include "paddle/utils/Util.h"
-#include "paddle/utils/StringUtil.h"
 #include <algorithm>
 #include <fstream>
 #include <istream>
+#include "paddle/utils/StringUtil.h"
+#include "paddle/utils/Util.h"

-#include "paddle/utils/Logging.h"
 #include "DataProviderGroup.h"
+#include "paddle/utils/Logging.h"

 P_DEFINE_double(memory_threshold_on_load_data,
                1.0,
@@ -562,16 +562,16 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
        auto mat = cpuArguments[slot].value;
        mat->resize(size, dim);
        if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
-              ->copyFrom(dataPos.data(),
-                         slots_[slot].indices.data(),
-                         slots_[slot].sparseNonValueData.data(),
-                         HPPL_STREAM_1);
+          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
+              dataPos.data(),
+              slots_[slot].indices.data(),
+              slots_[slot].sparseNonValueData.data(),
+              HPPL_STREAM_1);
        } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
-              ->copyFrom(dataPos.data(),
-                         slots_[slot].indices.data(),
-                         slots_[slot].sparseNonValueData.data());
+          std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
+              dataPos.data(),
+              slots_[slot].indices.data(),
+              slots_[slot].sparseNonValueData.data());
        } else {
          LOG(FATAL) << "Not Supported";
        }
@@ -598,16 +598,16 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
        auto mat = cpuArguments[slot].value;
        mat->resize(size, dim);
        if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
-              ->copyFrom(dataPos.data(),
-                         slots_[slot].indices.data(),
-                         slots_[slot].sparseFloatValueData.data(),
-                         HPPL_STREAM_1);
+          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
+              dataPos.data(),
+              slots_[slot].indices.data(),
+              slots_[slot].sparseFloatValueData.data(),
+              HPPL_STREAM_1);
        } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
-              ->copyFrom(dataPos.data(),
-                         slots_[slot].indices.data(),
-                         slots_[slot].sparseFloatValueData.data());
+          std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
+              dataPos.data(),
+              slots_[slot].indices.data(),
+              slots_[slot].sparseFloatValueData.data());
        } else {
          LOG(FATAL) << "Not Supported";
        }

--- a/paddle/gserver/dataproviders/ProtoDataProvider.h
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.h
@@ -16,8 +16,8 @@ limitations under the License. */

 #include <vector>

-#include "paddle/utils/Stat.h"
 #include "DataFormat.pb.h"
+#include "paddle/utils/Stat.h"

 #include "DataProvider.h"
 #include "ProtoReader.h"

--- a/paddle/gserver/dataproviders/ProtoReader.h
+++ b/paddle/gserver/dataproviders/ProtoReader.h
@@ -16,10 +16,10 @@ limitations under the License. */

 #include <memory>

-#include <google/protobuf/message_lite.h>
 #include <google/protobuf/io/coded_stream.h>
-#include <google/protobuf/io/zero_copy_stream_impl.h>
 #include <google/protobuf/io/gzip_stream.h>
+#include <google/protobuf/io/zero_copy_stream_impl.h>
+#include <google/protobuf/message_lite.h>

 namespace paddle {


--- a/paddle/gserver/dataproviders/PyDataProvider.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider.cpp
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "PyDataProvider.h"
-#include "paddle/utils/PythonUtil.h"
 #include <fenv.h>
-#include "paddle/utils/Util.h"
 #include "paddle/utils/Excepts.h"
+#include "paddle/utils/PythonUtil.h"
+#include "paddle/utils/Util.h"

 namespace paddle {

@@ -316,16 +316,16 @@ void PyDataProvider::handleSparseNonValueSlot(
  auto mat = cpuArguments[slotIndex].value;
  mat->resize(slot.sampleNum, dim, slot.sampleNum, NO_VALUE, SPARSE_CSR);
  if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
-        ->copyFrom(slot.sampleSequenceIdVec.data(),
-                   slot.indices.data(),
-                   slot.sparseNonValueData.data(),
-                   HPPL_STREAM_1);
+    std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
+        slot.sampleSequenceIdVec.data(),
+        slot.indices.data(),
+        slot.sparseNonValueData.data(),
+        HPPL_STREAM_1);
  } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
-        ->copyFrom(slot.sampleSequenceIdVec.data(),
-                   slot.indices.data(),
-                   slot.sparseNonValueData.data());
+    std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
+        slot.sampleSequenceIdVec.data(),
+        slot.indices.data(),
+        slot.sparseNonValueData.data());
  } else {
    LOG(FATAL) << "Not Supported";
  }
@@ -347,16 +347,16 @@ void PyDataProvider::handleSparseValueSlot(
  auto mat = cpuArguments[slotIndex].value;
  mat->resize(slot.sampleNum, dim, slot.sampleNum, FLOAT_VALUE, SPARSE_CSR);
  if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
-        ->copyFrom(slot.sampleSequenceIdVec.data(),
-                   slot.indices.data(),
-                   slot.sparseFloatValueData.data(),
-                   HPPL_STREAM_DEFAULT);
+    std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
+        slot.sampleSequenceIdVec.data(),
+        slot.indices.data(),
+        slot.sparseFloatValueData.data(),
+        HPPL_STREAM_DEFAULT);
  } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
-        ->copyFrom(slot.sampleSequenceIdVec.data(),
-                   slot.indices.data(),
-                   slot.sparseFloatValueData.data());
+    std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
+        slot.sampleSequenceIdVec.data(),
+        slot.indices.data(),
+        slot.sparseFloatValueData.data());
  } else {
    LOG(FATAL) << "Not Supported";
  }

--- a/paddle/gserver/dataproviders/PyDataProvider2.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp
@@ -15,18 +15,18 @@ limitations under the License. */
 #ifndef PADDLE_NO_PYTHON

 #include <Python.h>
+#include <numpy/numpyconfig.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <unordered_set>
 #include <list>
-#include <numpy/numpyconfig.h>
+#include <unordered_set>
 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
 #include <numpy/ndarrayobject.h>

 #include "DataProvider.h"

-#include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Locks.h"
+#include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Stat.h"

 namespace paddle {
@@ -400,10 +400,9 @@ private:

      if (this->loadThread_) {  // wait poolActualSize < poolSize;
        std::unique_lock<std::mutex> l(mtx_);
-        pushCV_.wait(l,
-                     [this, additionalBatchSize] {
-                       return this->poolActualSize_ < poolSize_;
-                     });
+        pushCV_.wait(l, [this, additionalBatchSize] {
+          return this->poolActualSize_ < poolSize_;
+        });
      }

      {
@@ -529,12 +528,10 @@ public:
                        // but, loading from cache, cache object should ensure
                        // data pool ready.
      std::unique_lock<std::mutex> l(mtx_);
-      pullCV_.wait(l,
-                   [this, &size] {
-                     return this->poolActualSize_ >=
-                                std::max(size, this->minPoolSize_) ||
-                            callingContexts_.empty();
-                   });
+      pullCV_.wait(l, [this, &size] {
+        return this->poolActualSize_ >= std::max(size, this->minPoolSize_) ||
+               callingContexts_.empty();
+      });

      if (unittest::OnPoolFilled) {
        (*unittest::OnPoolFilled)(this->poolActualSize_);

--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ b/paddle/gserver/evaluators/Evaluator.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/utils/Stat.h"
 #include "paddle/gserver/evaluators/Evaluator.h"
+#include "paddle/utils/Stat.h"

 #include "paddle/gserver/gradientmachines/NeuralNetwork.h"

@@ -842,9 +842,9 @@ void PnpairEvaluator::calc(std::vector<PredictionResult>& predictArray) {
  auto start = predictArray.begin();
  while (start != predictArray.end()) {
    auto end = std::find_if(
-        start + 1,
-        predictArray.end(),
-        [=](const PredictionResult& x) { return x.queryid != start->queryid; });
+        start + 1, predictArray.end(), [=](const PredictionResult& x) {
+          return x.queryid != start->queryid;
+        });
    CHECK(end != start);
    stat(start - predictArray.begin(),
         end - predictArray.begin(),

--- a/paddle/gserver/evaluators/Evaluator.h
+++ b/paddle/gserver/evaluators/Evaluator.h
@@ -14,11 +14,11 @@ limitations under the License. */

 #pragma once

-#include "paddle/pserver/ParameterClient2.h"
-#include "paddle/utils/ClassRegistrar.h"
+#include <fstream>
 #include "ModelConfig.pb.h"
 #include "paddle/parameter/Argument.h"
-#include <fstream>
+#include "paddle/pserver/ParameterClient2.h"
+#include "paddle/utils/ClassRegistrar.h"

 namespace paddle {


--- a/paddle/gserver/gradientmachines/GradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/GradientMachine.cpp
@@ -14,16 +14,15 @@ limitations under the License. */

 #include "GradientMachine.h"

-#include "paddle/utils/Logging.h"
 #include <fstream>
+#include "paddle/utils/Logging.h"

-#include "hl_gpu.h"
-#include "NeuralNetwork.h"
-#include "ParallelNeuralNetwork.h"
+#include "GradientMachineMode.h"
 #include "MultiGradientMachine.h"
-#include "NeuralNetwork.h"
 #include "MultiNetwork.h"
-#include "GradientMachineMode.h"
+#include "NeuralNetwork.h"
+#include "ParallelNeuralNetwork.h"
+#include "hl_gpu.h"

 namespace paddle {


--- a/paddle/gserver/gradientmachines/GradientMachine.h
+++ b/paddle/gserver/gradientmachines/GradientMachine.h
@@ -17,15 +17,15 @@ limitations under the License. */
 #include <iostream>
 #include <vector>

-#include "paddle/math/Matrix.h"
-#include "paddle/parameter/Parameter.h"
-#include "paddle/parameter/ParameterUpdaterBase.h"
-#include "paddle/utils/Thread.h"
-#include "TrainerConfig.pb.h"
 #include "ModelConfig.pb.h"
+#include "TrainerConfig.pb.h"
 #include "paddle/gserver/dataproviders/DataProvider.h"
 #include "paddle/gserver/evaluators/Evaluator.h"
 #include "paddle/gserver/layers/Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/parameter/Parameter.h"
+#include "paddle/parameter/ParameterUpdaterBase.h"
+#include "paddle/utils/Thread.h"

 namespace paddle {
 /**

--- a/paddle/gserver/gradientmachines/MultiGradientMachine.h
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.h
@@ -18,9 +18,9 @@ limitations under the License. */

 #include "GradientMachine.h"

-#include "paddle/utils/Queue.h"
-#include "paddle/utils/Locks.h"
 #include "hl_gpu.h"
+#include "paddle/utils/Locks.h"
+#include "paddle/utils/Queue.h"

 namespace paddle {


--- a/paddle/gserver/gradientmachines/MultiNetwork.cpp
+++ b/paddle/gserver/gradientmachines/MultiNetwork.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <algorithm>
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
-#include <algorithm>

 #include "MultiNetwork.h"


--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -14,15 +14,15 @@ limitations under the License. */

 #include "paddle/utils/Util.h"

-#include "paddle/utils/Logging.h"
 #include "paddle/utils/CustomStackTrace.h"
+#include "paddle/utils/Logging.h"

-#include "paddle/utils/Stat.h"
-#include "hl_gpu.h"
+#include "MultiNetwork.h"
 #include "NeuralNetwork.h"
 #include "RecurrentGradientMachine.h"
-#include "MultiNetwork.h"
+#include "hl_gpu.h"
 #include "paddle/gserver/layers/AgentLayer.h"
+#include "paddle/utils/Stat.h"

 namespace paddle {
 void parameterInitNN(int paramId,

--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.h
@@ -14,18 +14,18 @@ limitations under the License. */

 #pragma once

-#include <memory>
-#include <map>
 #include <functional>
+#include <map>
+#include <memory>

-#include "paddle/utils/ClassRegistrar.h"
-#include "paddle/parameter/Parameter.h"
 #include "ModelConfig.pb.h"
+#include "paddle/gserver/dataproviders/DataProvider.h"
 #include "paddle/gserver/gradientmachines/GradientMachine.h"
 #include "paddle/gserver/layers/CostLayer.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/gserver/dataproviders/DataProvider.h"
 #include "paddle/gserver/layers/Layer.h"
+#include "paddle/parameter/Parameter.h"
+#include "paddle/utils/ClassRegistrar.h"

 namespace paddle {
 /*
@@ -57,14 +57,13 @@ void parameterInitNN(int paramId,

 class NeuralNetwork : public GradientMachine {
 public:
-  virtual void init(
-      const ModelConfig& config,
-      ParamInitCallback callback = nullptr,
-      const std::vector<ParameterType>&
-          parameterTypes = std::vector<ParameterType>{PARAMETER_VALUE,
-                                                      PARAMETER_GRADIENT,
-                                                      PARAMETER_MOMENTUM},
-      bool useGpu = FLAGS_use_gpu);
+  virtual void init(const ModelConfig& config,
+                    ParamInitCallback callback = nullptr,
+                    const std::vector<ParameterType>& parameterTypes =
+                        std::vector<ParameterType>{PARAMETER_VALUE,
+                                                   PARAMETER_GRADIENT,
+                                                   PARAMETER_MOMENTUM},
+                    bool useGpu = FLAGS_use_gpu);

  /**
   * Connect two submodels and

--- a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
@@ -37,14 +37,13 @@ public:
                        NeuralNetwork *rootNetwork = nullptr)
      : NeuralNetwork(subModelName, rootNetwork) {}

-  virtual void init(
-      const ModelConfig &config,
-      ParamInitCallback callback = nullptr,
-      const std::vector<ParameterType>
-          &parameterTypes = std::vector<ParameterType>{PARAMETER_VALUE,
-                                                       PARAMETER_GRADIENT,
-                                                       PARAMETER_MOMENTUM},
-      bool useGpu = FLAGS_use_gpu);
+  virtual void init(const ModelConfig &config,
+                    ParamInitCallback callback = nullptr,
+                    const std::vector<ParameterType> &parameterTypes =
+                        std::vector<ParameterType>{PARAMETER_VALUE,
+                                                   PARAMETER_GRADIENT,
+                                                   PARAMETER_MOMENTUM},
+                    bool useGpu = FLAGS_use_gpu);

  virtual void forward(const std::vector<Argument> &inArgs,
                       std::vector<Argument> *outArgs,

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/utils/Stat.h"
-#include "paddle/utils/Util.h"
-#include "paddle/utils/Flags.h"
+#include "RecurrentGradientMachine.h"
+#include <dlfcn.h>
 #include <algorithm>
+#include <cmath>
 #include <functional>
-#include <dlfcn.h>
 #include <limits>
-#include <cmath>
-#include "RecurrentGradientMachine.h"
 #include "NeuralNetwork.h"
 #include "paddle/gserver/layers/AgentLayer.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/Stat.h"
+#include "paddle/utils/Util.h"

 P_DEFINE_string(diy_beam_search_prob_so, "", "the diy beam search cost so");

@@ -78,20 +78,22 @@ static inline SymbolType loadDiySymbol(const char* symbolName) {
  return reinterpret_cast<SymbolType>(sym);
 }

-static InitFunction __init__diy_prob_method([] {
-  std::string soName = FLAGS_diy_beam_search_prob_so;
-  if (!soName.empty()) {
-    gDiyProbHandle = dlopen(soName.c_str(), RTLD_LAZY);
-    CHECK(gDiyProbHandle) << "Cannot Open DIY Prob So " << soName;
-    atexit(exit_diy_prob);
-    gDiyProbMethod =
-        loadDiySymbol<decltype(gDiyProbMethod)>(DIY_CALC_PROB_SYMBOL_NAME);
-    gDiyProbStart =
-        loadDiySymbol<decltype(gDiyProbStart)>(DIY_START_CALC_PROB_SYMBOL_NAME);
-    gDiyProbStop =
-        loadDiySymbol<decltype(gDiyProbStop)>(DIY_FINISH_CALC_PROB_SYMBOL_NAME);
-  }
-}, std::numeric_limits<int>::max());
+static InitFunction __init__diy_prob_method(
+    [] {
+      std::string soName = FLAGS_diy_beam_search_prob_so;
+      if (!soName.empty()) {
+        gDiyProbHandle = dlopen(soName.c_str(), RTLD_LAZY);
+        CHECK(gDiyProbHandle) << "Cannot Open DIY Prob So " << soName;
+        atexit(exit_diy_prob);
+        gDiyProbMethod =
+            loadDiySymbol<decltype(gDiyProbMethod)>(DIY_CALC_PROB_SYMBOL_NAME);
+        gDiyProbStart = loadDiySymbol<decltype(gDiyProbStart)>(
+            DIY_START_CALC_PROB_SYMBOL_NAME);
+        gDiyProbStop = loadDiySymbol<decltype(gDiyProbStop)>(
+            DIY_FINISH_CALC_PROB_SYMBOL_NAME);
+      }
+    },
+    std::numeric_limits<int>::max());

 class BeamSearchControlCallbacks {
 public:
@@ -1281,10 +1283,9 @@ void RecurrentGradientMachine::beamSearch(size_t batchSize) {
      std::vector<std::vector<int>*> prefixes;
      prefixes.resize(paths.size());
      std::transform(
-          paths.begin(),
-          paths.end(),
-          prefixes.begin(),
-          [](const Path& p) { return const_cast<std::vector<int>*>(&p.ids); });
+          paths.begin(), paths.end(), prefixes.begin(), [](const Path& p) {
+            return const_cast<std::vector<int>*>(&p.ids);
+          });
      beamSearchCtrlCallbacks_->beamSearchCandidateAdjust(
          prefixes, frames_[machineCur].get(), i);
    }

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -14,9 +14,9 @@ limitations under the License. */

 #pragma once

+#include <functional>
 #include "GradientMachine.h"
 #include "NeuralNetwork.h"
-#include <functional>

 #include "paddle/utils/Locks.h"


--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/utils/Stat.h"
-#include "Layer.h"
 #include "BatchNormBaseLayer.h"
 #include "BatchNormalizationLayer.h"
+#include "Layer.h"
+#include "paddle/utils/Stat.h"
 #ifndef PADDLE_ONLY_CPU
 #include "CudnnBatchNormLayer.h"
 #endif

--- a/paddle/gserver/layers/BatchNormBaseLayer.h
+++ b/paddle/gserver/layers/BatchNormBaseLayer.h
@@ -14,8 +14,8 @@ limitations under the License. */

 #pragma once

-#include "paddle/utils/Stat.h"
 #include "Layer.h"
+#include "paddle/utils/Stat.h"

 namespace paddle {


--- a/paddle/gserver/layers/BatchNormalizationLayer.h
+++ b/paddle/gserver/layers/BatchNormalizationLayer.h
@@ -14,8 +14,8 @@ limitations under the License. */

 #pragma once

-#include "Layer.h"
 #include "BatchNormBaseLayer.h"
+#include "Layer.h"

 namespace paddle {


--- a/paddle/gserver/layers/ConcatenateLayer.cpp
+++ b/paddle/gserver/layers/ConcatenateLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/utils/Stat.h"
 #include "Layer.h"
 #include "Projection.h"
+#include "paddle/utils/Stat.h"

 namespace paddle {


--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/utils/Stat.h"
 #include "ContextProjection.h"
+#include "paddle/utils/Stat.h"

 namespace paddle {


--- a/paddle/gserver/layers/ConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ConvBaseLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/utils/Logging.h"
 #include "ConvBaseLayer.h"
 #include "paddle/math/MathUtils.h"
+#include "paddle/utils/Logging.h"
 namespace paddle {

 bool ConvBaseLayer::init(const LayerMap& layerMap,

--- a/paddle/gserver/layers/ConvOperator.cpp
+++ b/paddle/gserver/layers/ConvOperator.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/math/Matrix.h"
-#include "paddle/math/MathUtils.h"
 #include "Operator.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"

 namespace paddle {


--- a/paddle/gserver/layers/ConvProjection.cpp
+++ b/paddle/gserver/layers/ConvProjection.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/utils/Stat.h"
 #include "ConvProjection.h"
+#include "paddle/utils/Stat.h"

 namespace paddle {


--- a/paddle/gserver/layers/ConvShiftLayer.cpp
+++ b/paddle/gserver/layers/ConvShiftLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"

 namespace paddle {

--- a/paddle/gserver/layers/ConvexCombinationLayer.cpp
+++ b/paddle/gserver/layers/ConvexCombinationLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"

 namespace paddle {

--- a/paddle/gserver/layers/CosSimVecMatLayer.cpp
+++ b/paddle/gserver/layers/CosSimVecMatLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"

 namespace paddle {

--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <memory>
+#include "CostLayer.h"
 #include <algorithm>
-#include "paddle/utils/Logging.h"
 #include <cmath>
-#include "CostLayer.h"
+#include <memory>
+#include "paddle/utils/Logging.h"

 #include "paddle/math/SparseMatrix.h"


--- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/utils/Stat.h"
-#include "Layer.h"
 #include "CudnnBatchNormLayer.h"
+#include "Layer.h"
+#include "paddle/utils/Stat.h"

 namespace paddle {


--- a/paddle/gserver/layers/CudnnBatchNormLayer.h
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.h
@@ -14,9 +14,9 @@ limitations under the License. */

 #pragma once

-#include "paddle/utils/Stat.h"
-#include "Layer.h"
 #include "BatchNormBaseLayer.h"
+#include "Layer.h"
+#include "paddle/utils/Stat.h"

 namespace paddle {


--- a/paddle/gserver/layers/CudnnConvLayer.cpp
+++ b/paddle/gserver/layers/CudnnConvLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "CudnnConvLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "CudnnConvLayer.h"

 namespace paddle {


--- a/paddle/gserver/layers/CudnnConvLayer.h
+++ b/paddle/gserver/layers/CudnnConvLayer.h
@@ -14,10 +14,10 @@ limitations under the License. */

 #pragma once

+#include <vector>
 #include "ConvBaseLayer.h"
-#include "paddle/math/Matrix.h"
 #include "Projection.h"
-#include <vector>
+#include "paddle/math/Matrix.h"

 namespace paddle {


--- a/paddle/gserver/layers/CudnnPoolLayer.cpp
+++ b/paddle/gserver/layers/CudnnPoolLayer.cpp
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "CudnnPoolLayer.h"
+#include "paddle/math/Matrix.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "paddle/math/Matrix.h"
-#include "CudnnPoolLayer.h"

 namespace paddle {


--- a/paddle/gserver/layers/EosIdCheckLayer.cpp
+++ b/paddle/gserver/layers/EosIdCheckLayer.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/utils/Logging.h"
 #include "Layer.h"
+#include "paddle/utils/Logging.h"

 namespace paddle {
 /**

--- a/paddle/gserver/layers/ExpandConvBaseLayer.h
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.h
@@ -14,9 +14,9 @@ limitations under the License. */

 #pragma once

+#include <vector>
 #include "ConvBaseLayer.h"
 #include "paddle/math/Matrix.h"
-#include <vector>

 namespace paddle {


--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "ExpandConvLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "ExpandConvLayer.h"

 namespace paddle {


--- a/paddle/gserver/layers/ExpandConvLayer.h
+++ b/paddle/gserver/layers/ExpandConvLayer.h
@@ -14,9 +14,9 @@ limitations under the License. */

 #pragma once

-#include "paddle/math/Matrix.h"
 #include <vector>
 #include "ExpandConvBaseLayer.h"
+#include "paddle/math/Matrix.h"

 namespace paddle {


--- a/paddle/gserver/layers/ExpandConvTransLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvTransLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "ExpandConvTransLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "ExpandConvTransLayer.h"

 /* The implementation of the convTransLayer is basically a swap of forward and
 * backward of the original convLayer.

--- a/paddle/gserver/layers/ExpandConvTransLayer.h
+++ b/paddle/gserver/layers/ExpandConvTransLayer.h
@@ -14,9 +14,9 @@ limitations under the License. */

 #pragma once

-#include "paddle/math/Matrix.h"
 #include <vector>
 #include "ExpandConvBaseLayer.h"
+#include "paddle/math/Matrix.h"

 namespace paddle {


--- a/paddle/gserver/layers/FullyConnectedLayer.cpp
+++ b/paddle/gserver/layers/FullyConnectedLayer.cpp
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "FullyConnectedLayer.h"
+#include <algorithm>
+#include <vector>
+#include "paddle/math/SparseMatrix.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "paddle/math/SparseMatrix.h"
-#include <vector>
-#include <algorithm>

 namespace paddle {


--- a/paddle/gserver/layers/GatedRecurrentLayer.cpp
+++ b/paddle/gserver/layers/GatedRecurrentLayer.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "Layer.h"
 #include "GatedRecurrentLayer.h"
+#include "Layer.h"
 #include "paddle/utils/Stat.h"

 namespace paddle {
@@ -386,8 +386,9 @@ void GatedRecurrentLayer::backwardBatch(int batchSize, MatrixPtr inputGrad) {
      {
        batchSize = outputGradTmp->getHeight();
        gruValue.prevOutValue =
-            (n == 0 ? nullptr : (batchValue_->getBatchValue(n - 1, batchSize))
-                                    ->getData());
+            (n == 0
+                 ? nullptr
+                 : (batchValue_->getBatchValue(n - 1, batchSize))->getData());
        gruGrad.prevOutGrad =
            (n == 0 ? nullptr
                    : (batchGrad_->getBatchValue(n - 1, batchSize))->getData());

--- a/paddle/gserver/layers/GatedRecurrentLayer.h
+++ b/paddle/gserver/layers/GatedRecurrentLayer.h
@@ -14,10 +14,10 @@ limitations under the License. */

 #pragma once

-#include "paddle/math/Matrix.h"
-#include "SequenceToBatch.h"
 #include "GruCompute.h"
 #include "Layer.h"
+#include "SequenceToBatch.h"
+#include "paddle/math/Matrix.h"

 namespace paddle {


--- a/paddle/gserver/layers/GruCompute.cpp
+++ b/paddle/gserver/layers/GruCompute.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/utils/Util.h"
 #include "GruCompute.h"
 #include "hl_recurrent_apply.cuh"
+#include "paddle/utils/Util.h"

 namespace paddle {


--- a/paddle/gserver/layers/GruCompute.h
+++ b/paddle/gserver/layers/GruCompute.h
@@ -14,9 +14,9 @@ limitations under the License. */

 #pragma once

-#include "paddle/utils/TypeDefs.h"
 #include "ModelConfig.pb.h"
 #include "hl_gpu.h"
+#include "paddle/utils/TypeDefs.h"

 namespace paddle {


--- a/paddle/gserver/layers/GruStepLayer.cpp
+++ b/paddle/gserver/layers/GruStepLayer.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "Layer.h"
 #include "GruCompute.h"
+#include "Layer.h"
 #include "paddle/utils/Stat.h"

 namespace paddle {

--- a/paddle/gserver/layers/IdentityProjection.cpp
+++ b/paddle/gserver/layers/IdentityProjection.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/utils/Stat.h"
 #include "Projection.h"
+#include "paddle/utils/Stat.h"

 namespace paddle {


--- a/paddle/gserver/layers/InterpolationLayer.cpp
+++ b/paddle/gserver/layers/InterpolationLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"

 namespace paddle {

--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -14,15 +14,15 @@ limitations under the License. */

 #include "paddle/utils/Util.h"

-#include "paddle/utils/Logging.h"
 #include "paddle/math/SparseMatrix.h"
+#include "paddle/utils/Logging.h"

 #include "AddtoLayer.h"
+#include "CRFLayer.h"
 #include "CosSimLayer.h"
 #include "CostLayer.h"
-#include "ExpandConvLayer.h"
-#include "CRFLayer.h"
 #include "DataLayer.h"
+#include "ExpandConvLayer.h"
 #include "FullyConnectedLayer.h"
 #include "HierarchicalSigmoidLayer.h"
 #include "MaxLayer.h"

--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -14,18 +14,18 @@ limitations under the License. */

 #pragma once

-#include <memory>
-#include <functional>
 #include <paddle/parameter/Argument.h>
-#include "paddle/utils/ClassRegistrar.h"
+#include <functional>
+#include <memory>
+#include "ModelConfig.pb.h"
 #include "paddle/math/CpuSparseMatrix.h"
 #include "paddle/parameter/Parameter.h"
+#include "paddle/utils/ClassRegistrar.h"
 #include "paddle/utils/Util.h"
-#include "ModelConfig.pb.h"

-#include "paddle/gserver/activations/ActivationFunction.h"
 #include <paddle/parameter/ParallelParameter.h>
 #include <paddle/parameter/Weight.h>
+#include "paddle/gserver/activations/ActivationFunction.h"

 /// Macro for registering a layer type.
 /// Example: REGISTER_LAYER(crf_error, CRFDecodingErrorLayer);

--- a/paddle/gserver/layers/LinearChainCRF.cpp
+++ b/paddle/gserver/layers/LinearChainCRF.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <algorithm>
 #include "LinearChainCRF.h"
+#include <algorithm>

 namespace paddle {


--- a/paddle/gserver/layers/LinearChainCTC.cpp
+++ b/paddle/gserver/layers/LinearChainCTC.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <math.h>
 #include "LinearChainCTC.h"
+#include <math.h>
 #include <limits>

 namespace paddle {

--- a/paddle/gserver/layers/LstmCompute.cpp
+++ b/paddle/gserver/layers/LstmCompute.cpp
--- a/paddle/gserver/layers/LstmCompute.h
+++ b/paddle/gserver/layers/LstmCompute.h
--- a/paddle/gserver/layers/LstmLayer.cpp
+++ b/paddle/gserver/layers/LstmLayer.cpp
--- a/paddle/gserver/layers/LstmLayer.h
+++ b/paddle/gserver/layers/LstmLayer.h
--- a/paddle/gserver/layers/MDLstmLayer.cpp
+++ b/paddle/gserver/layers/MDLstmLayer.cpp
--- a/paddle/gserver/layers/MaxOutLayer.cpp
+++ b/paddle/gserver/layers/MaxOutLayer.cpp
--- a/paddle/gserver/layers/MixedLayer.cpp
+++ b/paddle/gserver/layers/MixedLayer.cpp
--- a/paddle/gserver/layers/MixedLayer.h
+++ b/paddle/gserver/layers/MixedLayer.h
--- a/paddle/gserver/layers/MultiplexLayer.cpp
+++ b/paddle/gserver/layers/MultiplexLayer.cpp
--- a/paddle/gserver/layers/NormLayer.cpp
+++ b/paddle/gserver/layers/NormLayer.cpp
--- a/paddle/gserver/layers/NormLayer.h
+++ b/paddle/gserver/layers/NormLayer.h
--- a/paddle/gserver/layers/NormProjectionLayer.cpp
+++ b/paddle/gserver/layers/NormProjectionLayer.cpp
--- a/paddle/gserver/layers/NormProjectionLayer.h
+++ b/paddle/gserver/layers/NormProjectionLayer.h
--- a/paddle/gserver/layers/Operator.h
+++ b/paddle/gserver/layers/Operator.h
--- a/paddle/gserver/layers/OuterProdLayer.cpp
+++ b/paddle/gserver/layers/OuterProdLayer.cpp
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
--- a/paddle/gserver/layers/PoolLayer.h
+++ b/paddle/gserver/layers/PoolLayer.h
--- a/paddle/gserver/layers/PoolProjectionLayer.cpp
+++ b/paddle/gserver/layers/PoolProjectionLayer.cpp
--- a/paddle/gserver/layers/PowerLayer.cpp
+++ b/paddle/gserver/layers/PowerLayer.cpp
--- a/paddle/gserver/layers/RecurrentLayer.cpp
+++ b/paddle/gserver/layers/RecurrentLayer.cpp
--- a/paddle/gserver/layers/RecurrentLayerGroup.cpp
+++ b/paddle/gserver/layers/RecurrentLayerGroup.cpp
--- a/paddle/gserver/layers/ResizeLayer.cpp
+++ b/paddle/gserver/layers/ResizeLayer.cpp
--- a/paddle/gserver/layers/ScalingLayer.cpp
+++ b/paddle/gserver/layers/ScalingLayer.cpp
--- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
+++ b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
--- a/paddle/gserver/layers/SequenceConcatLayer.cpp
+++ b/paddle/gserver/layers/SequenceConcatLayer.cpp
--- a/paddle/gserver/layers/SequencePoolLayer.cpp
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
--- a/paddle/gserver/layers/SequenceReshapeLayer.cpp
+++ b/paddle/gserver/layers/SequenceReshapeLayer.cpp
--- a/paddle/gserver/layers/SequenceToBatch.cpp
+++ b/paddle/gserver/layers/SequenceToBatch.cpp
--- a/paddle/gserver/layers/SequenceToBatch.h
+++ b/paddle/gserver/layers/SequenceToBatch.h
--- a/paddle/gserver/layers/SlopeInterceptLayer.cpp
+++ b/paddle/gserver/layers/SlopeInterceptLayer.cpp
--- a/paddle/gserver/layers/SubSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubSequenceLayer.cpp
--- a/paddle/gserver/layers/SumToOneNormLayer.cpp
+++ b/paddle/gserver/layers/SumToOneNormLayer.cpp
--- a/paddle/gserver/layers/TransLayer.cpp
+++ b/paddle/gserver/layers/TransLayer.cpp
--- a/paddle/gserver/layers/TransLayer.h
+++ b/paddle/gserver/layers/TransLayer.h
--- a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp
+++ b/paddle/gserver/layers/TransposedFullMatrixProjection.cpp
--- a/paddle/gserver/layers/ValidationLayer.cpp
+++ b/paddle/gserver/layers/ValidationLayer.cpp
--- a/paddle/gserver/layers/ValidationLayer.h
+++ b/paddle/gserver/layers/ValidationLayer.h
--- a/paddle/gserver/tests/LayerGradUtil.h
+++ b/paddle/gserver/tests/LayerGradUtil.h
--- a/paddle/gserver/tests/TestUtil.cpp
+++ b/paddle/gserver/tests/TestUtil.cpp
--- a/paddle/gserver/tests/test_ActivationGrad.cpp
+++ b/paddle/gserver/tests/test_ActivationGrad.cpp
--- a/paddle/gserver/tests/test_BatchNorm.cpp
+++ b/paddle/gserver/tests/test_BatchNorm.cpp
--- a/paddle/gserver/tests/test_ConvTrans.cpp
+++ b/paddle/gserver/tests/test_ConvTrans.cpp
--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ b/paddle/gserver/tests/test_Evaluator.cpp
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
--- a/paddle/gserver/tests/test_MultinomialSampler.cpp
+++ b/paddle/gserver/tests/test_MultinomialSampler.cpp
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ b/paddle/gserver/tests/test_ProtoDataProvider.cpp
--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
--- a/paddle/gserver/tests/test_SelectiveFCLayer.cpp
+++ b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
--- a/paddle/math/Allocator.h
+++ b/paddle/math/Allocator.h
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
--- a/paddle/math/MathUtils.cpp
+++ b/paddle/math/MathUtils.cpp
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
--- a/paddle/math/MatrixBitCode.cpp
+++ b/paddle/math/MatrixBitCode.cpp
--- a/paddle/math/MemoryHandle.cpp
+++ b/paddle/math/MemoryHandle.cpp
--- a/paddle/math/PoolAllocator.h
+++ b/paddle/math/PoolAllocator.h
--- a/paddle/math/SparseMatrix.cpp
+++ b/paddle/math/SparseMatrix.cpp
--- a/paddle/math/SparseMatrix.h
+++ b/paddle/math/SparseMatrix.h
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
--- a/paddle/math/Storage.cpp
+++ b/paddle/math/Storage.cpp
--- a/paddle/math/Storage.h
+++ b/paddle/math/Storage.h
--- a/paddle/math/TensorEvaluate.h
+++ b/paddle/math/TensorEvaluate.h
--- a/paddle/math/TensorExpression.h
+++ b/paddle/math/TensorExpression.h
--- a/paddle/math/TrainingAlgorithmOp.h
+++ b/paddle/math/TrainingAlgorithmOp.h
--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
--- a/paddle/math/Vector.h
+++ b/paddle/math/Vector.h
--- a/paddle/math/tests/OriginalOptimizerApi.h
+++ b/paddle/math/tests/OriginalOptimizerApi.h
--- a/paddle/math/tests/TestUtils.h
+++ b/paddle/math/tests/TestUtils.h
--- a/paddle/math/tests/test_Allocator.cpp
+++ b/paddle/math/tests/test_Allocator.cpp
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
--- a/paddle/math/tests/test_CpuGpuVector.cpp
+++ b/paddle/math/tests/test_CpuGpuVector.cpp
--- a/paddle/math/tests/test_ExecViaCpu.cpp
+++ b/paddle/math/tests/test_ExecViaCpu.cpp
--- a/paddle/math/tests/test_GpuProfiler.cpp
+++ b/paddle/math/tests/test_GpuProfiler.cpp
--- a/paddle/math/tests/test_SIMDFunctions.cpp
+++ b/paddle/math/tests/test_SIMDFunctions.cpp
--- a/paddle/math/tests/test_TrainingAlgorithm.cpp
+++ b/paddle/math/tests/test_TrainingAlgorithm.cpp
--- a/paddle/math/tests/test_batchTranspose.cpp
+++ b/paddle/math/tests/test_batchTranspose.cpp
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
--- a/paddle/math/tests/test_perturbation.cpp
+++ b/paddle/math/tests/test_perturbation.cpp
--- a/paddle/math/tests/test_sparseMatrixCompare.cpp
+++ b/paddle/math/tests/test_sparseMatrixCompare.cpp
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
--- a/paddle/parameter/FirstOrderOptimizer.cpp
+++ b/paddle/parameter/FirstOrderOptimizer.cpp
--- a/paddle/parameter/ParallelParameter.cpp
+++ b/paddle/parameter/ParallelParameter.cpp
--- a/paddle/parameter/ParallelParameter.h
+++ b/paddle/parameter/ParallelParameter.h
--- a/paddle/parameter/Parameter.cpp
+++ b/paddle/parameter/Parameter.cpp
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
--- a/paddle/parameter/ParameterUpdateFunctions.h
+++ b/paddle/parameter/ParameterUpdateFunctions.h
--- a/paddle/parameter/ParameterUpdaterBase.cpp
+++ b/paddle/parameter/ParameterUpdaterBase.cpp
--- a/paddle/parameter/ParameterUpdaterHook.cpp
+++ b/paddle/parameter/ParameterUpdaterHook.cpp
--- a/paddle/parameter/Regularizer.cpp
+++ b/paddle/parameter/Regularizer.cpp
--- a/paddle/parameter/Weight.cpp
+++ b/paddle/parameter/Weight.cpp
--- a/paddle/parameter/tests/test_common.cpp
+++ b/paddle/parameter/tests/test_common.cpp
--- a/paddle/pserver/BaseClient.cpp
+++ b/paddle/pserver/BaseClient.cpp
--- a/paddle/pserver/BaseClient.h
+++ b/paddle/pserver/BaseClient.h
--- a/paddle/pserver/LightNetwork.cpp
+++ b/paddle/pserver/LightNetwork.cpp
--- a/paddle/pserver/LightNetwork.h
+++ b/paddle/pserver/LightNetwork.h
--- a/paddle/pserver/ParameterClient2.cpp
+++ b/paddle/pserver/ParameterClient2.cpp
--- a/paddle/pserver/ParameterClient2.h
+++ b/paddle/pserver/ParameterClient2.h
--- a/paddle/pserver/ParameterServer2.cpp
+++ b/paddle/pserver/ParameterServer2.cpp
--- a/paddle/pserver/ParameterServer2.h
+++ b/paddle/pserver/ParameterServer2.h
--- a/paddle/pserver/ParameterServer2Main.cpp
+++ b/paddle/pserver/ParameterServer2Main.cpp
--- a/paddle/pserver/ProtoServer.h
+++ b/paddle/pserver/ProtoServer.h
--- a/paddle/pserver/SocketChannel.cpp
+++ b/paddle/pserver/SocketChannel.cpp
--- a/paddle/pserver/SparseParameterDistribution.h
+++ b/paddle/pserver/SparseParameterDistribution.h
--- a/paddle/pserver/test/SocketTest.cpp
+++ b/paddle/pserver/test/SocketTest.cpp
--- a/paddle/pserver/test/test_ParameterServer2.cpp
+++ b/paddle/pserver/test/test_ParameterServer2.cpp
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
--- a/paddle/py_paddle/util.py
+++ b/paddle/py_paddle/util.py
--- a/paddle/scripts/travis/docs.sh
+++ b/paddle/scripts/travis/docs.sh
--- a/paddle/scripts/travis/main.sh
+++ b/paddle/scripts/travis/main.sh
--- a/paddle/scripts/travis/precommit.sh
+++ b/paddle/scripts/travis/precommit.sh
--- a/paddle/trainer/MergeModel.cpp
+++ b/paddle/trainer/MergeModel.cpp
--- a/paddle/trainer/ParamUtil.cpp
+++ b/paddle/trainer/ParamUtil.cpp
--- a/paddle/trainer/ParamUtil.h
+++ b/paddle/trainer/ParamUtil.h
--- a/paddle/trainer/ParameterUpdater.h
+++ b/paddle/trainer/ParameterUpdater.h
--- a/paddle/trainer/RemoteParameterUpdater.cpp
+++ b/paddle/trainer/RemoteParameterUpdater.cpp
--- a/paddle/trainer/RemoteParameterUpdater.h
+++ b/paddle/trainer/RemoteParameterUpdater.h
--- a/paddle/trainer/Tester.h
+++ b/paddle/trainer/Tester.h
--- a/paddle/trainer/TesterConfig.h
+++ b/paddle/trainer/TesterConfig.h
--- a/paddle/trainer/ThreadParameterUpdater.h
+++ b/paddle/trainer/ThreadParameterUpdater.h
--- a/paddle/trainer/Trainer.h
+++ b/paddle/trainer/Trainer.h
--- a/paddle/trainer/TrainerConfigHelper.h
+++ b/paddle/trainer/TrainerConfigHelper.h
--- a/paddle/trainer/TrainerInternal.cpp
+++ b/paddle/trainer/TrainerInternal.cpp
--- a/paddle/trainer/TrainerInternal.h
+++ b/paddle/trainer/TrainerInternal.h
--- a/paddle/trainer/TrainerInternalConfig.h
+++ b/paddle/trainer/TrainerInternalConfig.h
--- a/paddle/trainer/TrainerMain.cpp
+++ b/paddle/trainer/TrainerMain.cpp
--- a/paddle/trainer/tests/picojson.h
+++ b/paddle/trainer/tests/picojson.h
--- a/paddle/trainer/tests/test_Compare.cpp
+++ b/paddle/trainer/tests/test_Compare.cpp
--- a/paddle/trainer/tests/test_CompareTwoNets.cpp
+++ b/paddle/trainer/tests/test_CompareTwoNets.cpp
--- a/paddle/trainer/tests/test_CompareTwoOpts.cpp
+++ b/paddle/trainer/tests/test_CompareTwoOpts.cpp
--- a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
+++ b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
--- a/paddle/trainer/tests/test_TrainerOnePass.cpp
+++ b/paddle/trainer/tests/test_TrainerOnePass.cpp
--- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp
+++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
--- a/paddle/utils/BarrierStat.cpp
+++ b/paddle/utils/BarrierStat.cpp
--- a/paddle/utils/BarrierStat.h
+++ b/paddle/utils/BarrierStat.h
--- a/paddle/utils/CommandLineParser.cpp
+++ b/paddle/utils/CommandLineParser.cpp
--- a/paddle/utils/CommandLineParser.h
+++ b/paddle/utils/CommandLineParser.h
--- a/paddle/utils/CpuId.cpp
+++ b/paddle/utils/CpuId.cpp
--- a/paddle/utils/CpuId.h
+++ b/paddle/utils/CpuId.h
--- a/paddle/utils/CustomStackTrace.cpp
+++ b/paddle/utils/CustomStackTrace.cpp
--- a/paddle/utils/CustomStackTrace.h
+++ b/paddle/utils/CustomStackTrace.h
--- a/paddle/utils/Logging.cpp
+++ b/paddle/utils/Logging.cpp
--- a/paddle/utils/Logging.h
+++ b/paddle/utils/Logging.h
--- a/paddle/utils/PythonUtil.cpp
+++ b/paddle/utils/PythonUtil.cpp
--- a/paddle/utils/PythonUtil.h
+++ b/paddle/utils/PythonUtil.h
--- a/paddle/utils/Queue.h
+++ b/paddle/utils/Queue.h
--- a/paddle/utils/Stat.cpp
+++ b/paddle/utils/Stat.cpp
--- a/paddle/utils/StringUtil.h
+++ b/paddle/utils/StringUtil.h
--- a/paddle/utils/Thread.h
+++ b/paddle/utils/Thread.h
--- a/paddle/utils/ThreadLocal.cpp
+++ b/paddle/utils/ThreadLocal.cpp
--- a/paddle/utils/ThreadLocal.h
+++ b/paddle/utils/ThreadLocal.h
--- a/paddle/utils/Util.cpp
+++ b/paddle/utils/Util.cpp
--- a/paddle/utils/Util.h
+++ b/paddle/utils/Util.h
--- a/paddle/utils/Version.cpp
+++ b/paddle/utils/Version.cpp
--- a/paddle/utils/Version.h
+++ b/paddle/utils/Version.h
--- a/paddle/utils/arch/osx/Locks.cpp
+++ b/paddle/utils/arch/osx/Locks.cpp
--- a/paddle/utils/tests/test_CommandLineParser.cpp
+++ b/paddle/utils/tests/test_CommandLineParser.cpp
--- a/paddle/utils/tests/test_CustomStackTrace.cpp
+++ b/paddle/utils/tests/test_CustomStackTrace.cpp
--- a/paddle/utils/tests/test_CustomStackTracePrint.cpp
+++ b/paddle/utils/tests/test_CustomStackTracePrint.cpp
--- a/paddle/utils/tests/test_Logging.cpp
+++ b/paddle/utils/tests/test_Logging.cpp
--- a/paddle/utils/tests/test_SIMDFlags.cpp
+++ b/paddle/utils/tests/test_SIMDFlags.cpp
--- a/paddle/utils/tests/test_SpinLock.cpp
+++ b/paddle/utils/tests/test_SpinLock.cpp
--- a/paddle/utils/tests/test_Thread.cpp
+++ b/paddle/utils/tests/test_Thread.cpp
--- a/paddle/utils/tests/test_ThreadBarrier.cpp
+++ b/paddle/utils/tests/test_ThreadBarrier.cpp
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
--- a/python/paddle/trainer_config_helpers/default_decorators.py
+++ b/python/paddle/trainer_config_helpers/default_decorators.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py
--- a/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
+++ b/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
--- a/third_party/gtest.BUILD
+++ b/third_party/gtest.BUILD
--- a/third_party/protobuf_test/BUILD
+++ b/third_party/protobuf_test/BUILD
--- a/third_party/protobuf_test/example_lib.cc
+++ b/third_party/protobuf_test/example_lib.cc