diff --git a/demo/mnist/.gitignore b/demo/mnist/.gitignore index 810910fd5ca56f0cfd7051f3392a9f7ea010d7f0..8bd9837523ccf98e6e72d5b82934b7b104816217 100644 --- a/demo/mnist/.gitignore +++ b/demo/mnist/.gitignore @@ -4,3 +4,4 @@ mnist_vgg_model plot.png train.log *pyc +.ipynb_checkpoints diff --git a/demo/mnist/api_train.py b/demo/mnist/api_train.py new file mode 100644 index 0000000000000000000000000000000000000000..f301da382ff8a5bc16d9c18b956f78566ed4894f --- /dev/null +++ b/demo/mnist/api_train.py @@ -0,0 +1,205 @@ +""" +A very basic example for how to use current Raw SWIG API to train mnist network. + +Current implementation uses Raw SWIG, which means the API call is directly \ +passed to C++ side of Paddle. + +The user api could be simpler and carefully designed. +""" +import py_paddle.swig_paddle as api +from py_paddle import DataProviderConverter +import paddle.trainer.PyDataProvider2 as dp +import numpy as np +import random +from mnist_util import read_from_mnist +from paddle.trainer_config_helpers import * + + +def optimizer_config(): + settings( + learning_rate=1e-4, + learning_method=AdamOptimizer(), + batch_size=1000, + model_average=ModelAverage(average_window=0.5), + regularization=L2Regularization(rate=0.5)) + + +def network_config(): + imgs = data_layer(name='pixel', size=784) + hidden1 = fc_layer(input=imgs, size=200) + hidden2 = fc_layer(input=hidden1, size=200) + inference = fc_layer(input=hidden2, size=10, act=SoftmaxActivation()) + cost = classification_cost( + input=inference, label=data_layer( + name='label', size=10)) + outputs(cost) + + +def init_parameter(network): + assert isinstance(network, api.GradientMachine) + for each_param in network.getParameters(): + assert isinstance(each_param, api.Parameter) + array_size = len(each_param) + array = np.random.uniform(-1.0, 1.0, array_size).astype('float32') + each_param.getBuf(api.PARAMETER_VALUE).copyFromNumpyArray(array) + + +def generator_to_batch(generator, batch_size): + ret_val = list() + for each_item in generator: + ret_val.append(each_item) + if len(ret_val) == batch_size: + yield ret_val + ret_val = list() + if len(ret_val) != 0: + yield ret_val + + +class BatchPool(object): + def __init__(self, generator, batch_size): + self.data = list(generator) + self.batch_size = batch_size + + def __call__(self): + random.shuffle(self.data) + for offset in xrange(0, len(self.data), self.batch_size): + limit = min(offset + self.batch_size, len(self.data)) + yield self.data[offset:limit] + + +def input_order_converter(generator): + for each_item in generator: + yield each_item['pixel'], each_item['label'] + + +def main(): + api.initPaddle("-use_gpu=false", "-trainer_count=4") # use 4 cpu cores + + # get enable_types for each optimizer. + # enable_types = [value, gradient, momentum, etc] + # For each optimizer(SGD, Adam), GradientMachine should enable different + # buffers. + opt_config_proto = parse_optimizer_config(optimizer_config) + opt_config = api.OptimizationConfig.createFromProto(opt_config_proto) + _temp_optimizer_ = api.ParameterOptimizer.create(opt_config) + enable_types = _temp_optimizer_.getParameterTypes() + + # Create Simple Gradient Machine. + model_config = parse_network_config(network_config) + m = api.GradientMachine.createFromConfigProto( + model_config, api.CREATE_MODE_NORMAL, enable_types) + + # This type check is not useful. Only enable type hint in IDE. + # Such as PyCharm + assert isinstance(m, api.GradientMachine) + + # Initialize Parameter by numpy. 
+ init_parameter(network=m) + + # Create Local Updater. Local means not run in cluster. + # For a cluster training, here we can change to createRemoteUpdater + # in future. + updater = api.ParameterUpdater.createLocalUpdater(opt_config) + assert isinstance(updater, api.ParameterUpdater) + + # Initialize ParameterUpdater. + updater.init(m) + + # DataProvider Converter is a utility convert Python Object to Paddle C++ + # Input. The input format is as same as Paddle's DataProvider. + converter = DataProviderConverter( + input_types=[dp.dense_vector(784), dp.integer_value(10)]) + + train_file = './data/raw_data/train' + test_file = './data/raw_data/t10k' + + # start gradient machine. + # the gradient machine must be started before invoke forward/backward. + # not just for training, but also for inference. + m.start() + + # evaluator can print error rate, etc. It is a C++ class. + batch_evaluator = m.makeEvaluator() + test_evaluator = m.makeEvaluator() + + # Get Train Data. + # TrainData will stored in a data pool. Currently implementation is not care + # about memory, speed. Just a very naive implementation. + train_data_generator = input_order_converter(read_from_mnist(train_file)) + train_data = BatchPool(train_data_generator, 512) + + # outArgs is Neural Network forward result. Here is not useful, just passed + # to gradient_machine.forward + outArgs = api.Arguments.createArguments(0) + + for pass_id in xrange(2): # we train 2 passes. + updater.startPass() + + for batch_id, data_batch in enumerate(train_data()): + # data_batch is input images. + # here, for online learning, we could get data_batch from network. + + # Start update one batch. + pass_type = updater.startBatch(len(data_batch)) + + # Start BatchEvaluator. + # batch_evaluator can be used between start/finish. + batch_evaluator.start() + + # forwardBackward is a shortcut for forward and backward. + # It is sometimes faster than invoke forward/backward separately, + # because in GradientMachine, it may be async. + m.forwardBackward(converter(data_batch), outArgs, pass_type) + + for each_param in m.getParameters(): + updater.update(each_param) + + # Get cost. We use numpy to calculate total cost for this batch. + cost_vec = outArgs.getSlotValue(0) + cost_vec = cost_vec.copyToNumpyMat() + cost = cost_vec.sum() / len(data_batch) + + # Make evaluator works. + m.eval(batch_evaluator) + + # Print logs. + print 'Pass id', pass_id, 'Batch id', batch_id, 'with cost=', \ + cost, batch_evaluator + + batch_evaluator.finish() + # Finish batch. + # * will clear gradient. + # * ensure all values should be updated. + updater.finishBatch(cost) + + # testing stage. use test data set to test current network. + updater.apply() + test_evaluator.start() + test_data_generator = input_order_converter(read_from_mnist(test_file)) + for data_batch in generator_to_batch(test_data_generator, 512): + # in testing stage, only forward is needed. 
+ m.forward(converter(data_batch), outArgs, api.PASS_TEST) + m.eval(test_evaluator) + + # print error rate for test data set + print 'Pass', pass_id, ' test evaluator: ', test_evaluator + test_evaluator.finish() + updater.restore() + + updater.catchUpWith() + params = m.getParameters() + for each_param in params: + assert isinstance(each_param, api.Parameter) + value = each_param.getBuf(api.PARAMETER_VALUE) + value = value.copyToNumpyArray() + + # Here, we could save parameter to every where you want + print each_param.getName(), value + + updater.finishPass() + + m.finish() + + +if __name__ == '__main__': + main() diff --git a/demo/mnist/mnist_provider.py b/demo/mnist/mnist_provider.py index 4635833d36b9f21c992d96910f3ac9094ccefd2c..888cfef1e7e3e1b4f556756c003eeb23e741cabe 100644 --- a/demo/mnist/mnist_provider.py +++ b/demo/mnist/mnist_provider.py @@ -1,5 +1,5 @@ from paddle.trainer.PyDataProvider2 import * -import numpy +from mnist_util import read_from_mnist # Define a py data provider @@ -8,27 +8,5 @@ import numpy 'label': integer_value(10)}, cache=CacheType.CACHE_PASS_IN_MEM) def process(settings, filename): # settings is not used currently. - imgf = filename + "-images-idx3-ubyte" - labelf = filename + "-labels-idx1-ubyte" - f = open(imgf, "rb") - l = open(labelf, "rb") - - f.read(16) - l.read(8) - - # Define number of samples for train/test - if "train" in filename: - n = 60000 - else: - n = 10000 - - images = numpy.fromfile( - f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32') - images = images / 255.0 * 2.0 - 1.0 - labels = numpy.fromfile(l, 'ubyte', count=n).astype("int") - - for i in xrange(n): - yield {"pixel": images[i, :], 'label': labels[i]} - - f.close() - l.close() + for each in read_from_mnist(filename): + yield each diff --git a/demo/mnist/mnist_util.py b/demo/mnist/mnist_util.py new file mode 100644 index 0000000000000000000000000000000000000000..3fd88ae7edc821296ca0accbf6dedc083e411744 --- /dev/null +++ b/demo/mnist/mnist_util.py @@ -0,0 +1,30 @@ +import numpy + +__all__ = ['read_from_mnist'] + + +def read_from_mnist(filename): + imgf = filename + "-images-idx3-ubyte" + labelf = filename + "-labels-idx1-ubyte" + f = open(imgf, "rb") + l = open(labelf, "rb") + + f.read(16) + l.read(8) + + # Define number of samples for train/test + if "train" in filename: + n = 60000 + else: + n = 10000 + + images = numpy.fromfile( + f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32') + images = images / 255.0 * 2.0 - 1.0 + labels = numpy.fromfile(l, 'ubyte', count=n).astype("int") + + for i in xrange(n): + yield {"pixel": images[i, :], 'label': labels[i]} + + f.close() + l.close() diff --git a/demo/quick_start/api_predict.sh b/demo/quick_start/api_predict.sh index c90d3b70548b3ef2a7e0e423c74cd97f1886c0fc..4d9aa9e8854ed79446a47dbc593f419cdda077b4 100755 --- a/demo/quick_start/api_predict.sh +++ b/demo/quick_start/api_predict.sh @@ -17,7 +17,7 @@ set -e #Note the default model is pass-00002, you shold make sure the model path #exists or change the mode path. 
#only test on trainer_config.lr.py -model=output/pass-00001/ +model=output/model/pass-00001/ config=trainer_config.lr.py label=data/labels.list dict=data/dict.txt diff --git a/demo/quick_start/cluster/cluster_train.sh b/demo/quick_start/cluster/cluster_train.sh new file mode 100755 index 0000000000000000000000000000000000000000..aac9b89b14b98ac8e2db7def19e5f06c01682493 --- /dev/null +++ b/demo/quick_start/cluster/cluster_train.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +# Should run pserver.sh before run this script. +bin_dir=$(cd `dirname $0`; pwd) +home_dir=$(cd "${bin_dir}/.."; pwd) +source "$bin_dir/env.sh" + +model_dir="$bin_dir/output" +log_file="$bin_dir/train.log" + +pushd "$home_dir" +cfg=trainer_config.lr.py +paddle train \ + --config=$cfg \ + --save_dir=${model_dir} \ + --trainer_count=4 \ + --local=0 \ + --log_period=100 \ + --num_passes=15 \ + --use_gpu=false \ + --show_parameter_stats_period=100 \ + --test_all_data_in_one_period=1 \ + --num_gradient_servers=1 \ + --nics=`get_nics` \ + --port=7164 \ + --ports_num=1 \ + --pservers="127.0.0.1" \ + --comment="paddle_trainer" \ + 2>&1 | tee "$log_file" +popd diff --git a/demo/quick_start/cluster/env.sh b/demo/quick_start/cluster/env.sh new file mode 100644 index 0000000000000000000000000000000000000000..a404993835d0e479f65c89c5561855293b7b66f0 --- /dev/null +++ b/demo/quick_start/cluster/env.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +function get_nics() { + machine=`uname -s` + local nics="" + if [ "$machine" == "Linux" ]; then + nics="lo" + elif [ "$machine" == "Darwin" ]; then + nics="lo0" + else + nics="unsupport" + fi + echo $nics +} diff --git a/demo/quick_start/cluster/pserver.sh b/demo/quick_start/cluster/pserver.sh new file mode 100755 index 0000000000000000000000000000000000000000..b187c1d9b9108a607ed310253d54ecc096f0e792 --- /dev/null +++ b/demo/quick_start/cluster/pserver.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +bin_dir=$(cd `dirname $0`; pwd) +source "$bin_dir/env.sh" + +paddle pserver \ + --nics=`get_nics` \ + --port=7164 \ + --ports_num=1 \ + --ports_num_for_sparse=1 \ + --num_gradient_servers=1 \ + --comment="paddle_pserver" \ + 2>&1 | tee 'pserver.log' diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst index ea0ef25f00d4cb3232d5c6ee1f1e33abd2dadaee..7d425a05d46131d84ba895d0fefc3a592a9a36e1 100644 --- a/doc/faq/index_cn.rst +++ b/doc/faq/index_cn.rst @@ -72,7 +72,7 @@ PaddlePaddle支持非常多的优化算法(Optimizer),不同的优化算法需 减少数据载入的耗时 ++++++++++++++++++ -使用 :code:`pydataprovider`时,可以减少缓存池的大小,同时设置内存缓存功能,即可以极大的加速数据载入流程。 +使用\ :code:`pydataprovider`\ 时,可以减少缓存池的大小,同时设置内存缓存功能,即可以极大的加速数据载入流程。 :code:`DataProvider` 缓存池的减小,和之前减小通过减小缓存池来减小内存占用的原理一致。 .. literalinclude:: src/reduce_min_pool_size.py diff --git a/doc/tutorials/gan/gan.png b/doc/tutorials/gan/gan.png new file mode 100644 index 0000000000000000000000000000000000000000..001ed6cc19e8911f9b10f63211c9658160b3a06e Binary files /dev/null and b/doc/tutorials/gan/gan.png differ diff --git a/doc/tutorials/gan/index_en.md b/doc/tutorials/gan/index_en.md new file mode 100644 index 0000000000000000000000000000000000000000..99c8d730117a469c89abb218eeacf66103c0cbed --- /dev/null +++ b/doc/tutorials/gan/index_en.md @@ -0,0 +1,143 @@ +# Generative Adversarial Networks (GAN) + +This demo implements GAN training described in the original [GAN paper](https://arxiv.org/abs/1406.2661) and deep convolutional generative adversarial networks [DCGAN paper](https://arxiv.org/abs/1511.06434). + +The high-level structure of GAN is shown in Figure. 1 below. It is composed of two major parts: a generator and a discriminator, both of which are based on neural networks. The generator takes in some kind of noise with a known distribution and transforms it into an image. The discriminator takes in an image and determines whether it is artificially generated by the generator or a real image. So the generator and the discriminator are in a competitive game in which generator is trying to generate image to look as real as possible to fool the discriminator, while the discriminator is trying to distinguish between real and fake images. + +
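+This adversarial game can be written compactly as the minimax objective of the original GAN paper linked above (standard notation; the symbols G, D, p_data and p_z follow that paper and are not defined in this demo's code):
+
+$$\min_G \max_D \; \mathbb{E}_{x \sim p_{data}(x)}[\log D(x)] + \mathbb{E}_{z \sim p_z(z)}[\log(1 - D(G(z)))]$$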
+<p align="center">
+    <img src="./gan.png">
+</p>
+<p align="center">
+    Figure 1. GAN-Model-Structure (figure credit)
+</p>
+ +The generator and discriminator take turn to be trained using SGD. The objective function of the generator is for its generated images being classified as real by the discriminator, and the objective function of the discriminator is to correctly classify real and fake images. When the GAN model is trained to converge to the equilibrium state, the generator will transform the given noise distribution to the distribution of real images, and the discriminator will not be able to distinguish between real and fake images at all. + +## Implementation of GAN Model Structure +Since GAN model involves multiple neural networks, it requires to use paddle python API. So the code walk-through below can also partially serve as an introduction to the usage of Paddle Python API. + +There are three networks defined in gan_conf.py, namely **generator_training**, **discriminator_training** and **generator**. The relationship to the model structure we defined above is that **discriminator_training** is the discriminator, **generator** is the generator, and the **generator_training** combined the generator and discriminator since training generator would require the discriminator to provide loss function. This relationship is described in the following code: +```python +if is_generator_training: + noise = data_layer(name="noise", size=noise_dim) + sample = generator(noise) + +if is_discriminator_training: + sample = data_layer(name="sample", size=sample_dim) + +if is_generator_training or is_discriminator_training: + label = data_layer(name="label", size=1) + prob = discriminator(sample) + cost = cross_entropy(input=prob, label=label) + classification_error_evaluator( + input=prob, label=label, name=mode + '_error') + outputs(cost) + +if is_generator: + noise = data_layer(name="noise", size=noise_dim) + outputs(generator(noise)) +``` + +In order to train the networks defined in gan_conf.py, one first needs to initialize a Paddle environment, parse the config, create GradientMachine from the config and create trainer from GradientMachine as done in the code chunk below: +```python +import py_paddle.swig_paddle as api +# init paddle environment +api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10', + '--log_period=100', '--gpu_id=' + args.gpu_id, + '--save_dir=' + "./%s_params/" % data_source) + +# Parse config +gen_conf = parse_config(conf, "mode=generator_training,data=" + data_source) +dis_conf = parse_config(conf, "mode=discriminator_training,data=" + data_source) +generator_conf = parse_config(conf, "mode=generator,data=" + data_source) + +# Create GradientMachine +dis_training_machine = api.GradientMachine.createFromConfigProto( +dis_conf.model_config) +gen_training_machine = api.GradientMachine.createFromConfigProto( +gen_conf.model_config) +generator_machine = api.GradientMachine.createFromConfigProto( +generator_conf.model_config) + +# Create trainer +dis_trainer = api.Trainer.create(dis_conf, dis_training_machine) +gen_trainer = api.Trainer.create(gen_conf, gen_training_machine) +``` + +In order to balance the strength between generator and discriminator, we schedule to train whichever one is performing worse by comparing their loss function value. The loss function value can be calculated by a forward pass through the GradientMachine. 
+```python +def get_training_loss(training_machine, inputs): + outputs = api.Arguments.createArguments(0) + training_machine.forward(inputs, outputs, api.PASS_TEST) + loss = outputs.getSlotValue(0).copyToNumpyMat() + return numpy.mean(loss) +``` + +After training one network, one needs to sync the new parameters to the other networks. The code below demonstrates one example of such use case: +```python +# Train the gen_training +gen_trainer.trainOneDataBatch(batch_size, data_batch_gen) + +# Copy the parameters from gen_training to dis_training and generator +copy_shared_parameters(gen_training_machine, +dis_training_machine) +copy_shared_parameters(gen_training_machine, generator_machine) +``` + + +## A Toy Example +With the infrastructure explained above, we can now walk you through a toy example of generating two dimensional uniform distribution using 10 dimensional Gaussian noise. + +The Gaussian noises are generated using the code below: +```python +def get_noise(batch_size, noise_dim): + return numpy.random.normal(size=(batch_size, noise_dim)).astype('float32') +``` + +The real samples (2-D uniform) are generated using the code below: +```python +# synthesize 2-D uniform data in gan_trainer.py:114 +def load_uniform_data(): + data = numpy.random.rand(1000000, 2).astype('float32') + return data +``` + +The generator and discriminator network are built using fully-connected layer and batch_norm layer, and are defined in gan_conf.py. + +To train the GAN model, one can use the command below. The flag -d specifies the training data (cifar, mnist or uniform) and flag --useGpu specifies whether to use gpu for training (0 is cpu, 1 is gpu). +```bash +$python gan_trainer.py -d uniform --useGpu 1 +``` +The generated samples can be found in ./uniform_samples/ and one example is shown below as Figure 2. One can see that it roughly recovers the 2D uniform distribution. + +
+<p align="center">
+    <img src="./uniform_sample.png">
+</p>
+<p align="center">
+    Figure 2. Uniform Sample
+</p>
+ +## MNIST Example +### Data preparation +To download the MNIST data, one can use the following commands: +```bash +$cd data/ +$./get_mnist_data.sh +``` + +### Model description +Following the DC-Gan paper (https://arxiv.org/abs/1511.06434), we use convolution/convolution-transpose layer in the discriminator/generator network to better deal with images. The details of the network structures are defined in gan_conf_image.py. + +### Training the model +To train the GAN model on mnist data, one can use the following command: +```bash +$python gan_trainer.py -d mnist --useGpu 1 +``` +The generated sample images can be found at ./mnist_samples/ and one example is shown below as Figure 3. +
+<p align="center">
+    <img src="./mnist_sample.png">
+</p>
+<p align="center">
+    Figure 3. MNIST Sample
+</p>
diff --git a/doc/tutorials/gan/mnist_sample.png b/doc/tutorials/gan/mnist_sample.png new file mode 100644 index 0000000000000000000000000000000000000000..f9c7bf7ddd7f148eac4fe347e9c38afaa8876760 Binary files /dev/null and b/doc/tutorials/gan/mnist_sample.png differ diff --git a/doc/tutorials/gan/uniform_sample.png b/doc/tutorials/gan/uniform_sample.png new file mode 100644 index 0000000000000000000000000000000000000000..4a96c45cae82673f5a1df986f2643a8026da7937 Binary files /dev/null and b/doc/tutorials/gan/uniform_sample.png differ diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index ed69bd764f30ac4047895ff539a0b70dfa5aac61..da6dad10cd807654f9ddd03beeb29cef69fc8de0 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -1,10 +1,12 @@ set(API_SOURCES Arguments.cpp ConfigParser.cpp + Evaluator.cpp GradientMachine.cpp Matrix.cpp Parameter.cpp ParameterOptimizer.cpp + ParameterUpdater.cpp SequenceGenerator.cpp Trainer.cpp Util.cpp @@ -63,6 +65,15 @@ install(DIRECTORY ${PROJ_ROOT}/paddle/dist/ add_custom_target(python_api_wheel ALL DEPENDS ${PROJ_ROOT}/paddle/dist/.timestamp) +add_dependencies(python_api_wheel python_swig_sources + paddle_parameter + paddle_math + paddle_utils + paddle_gserver + paddle_pserver + paddle_trainer + paddle_api + paddle_cuda) if(WITH_TESTING) add_subdirectory(test) diff --git a/paddle/utils/DisableCopy.h b/paddle/api/Evaluator.cpp similarity index 60% rename from paddle/utils/DisableCopy.h rename to paddle/api/Evaluator.cpp index 41de98bbde664651803c8db4c0cd7216b2ff4231..c30e09876397e37ef9ed4ec3200d1aa372ceb609 100644 --- a/paddle/utils/DisableCopy.h +++ b/paddle/api/Evaluator.cpp @@ -11,13 +11,19 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include "PaddleAPI.h" +#include "PaddleAPIPrivate.h" -#pragma once +Evaluator::Evaluator() : m(new EvaluatorPrivate()) {} +Evaluator::~Evaluator() { delete m; } -/** - * Disable copy macro. 
- */ -#define DISABLE_COPY(CLASS_NAME) \ - CLASS_NAME(CLASS_NAME &&) = delete; \ - CLASS_NAME(const CLASS_NAME &other) = delete; \ - CLASS_NAME &operator=(const CLASS_NAME &other) = delete +void Evaluator::start() { m->rawPtr->start(); } + +void Evaluator::finish() { m->rawPtr->finish(); } + +std::string Evaluator::toString() { + std::ostringstream sout; + m->rawPtr->printStats(sout); + return sout.str(); +} diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp index 297eaa19bb9981c7f07c90763d76494b7910af93..66115f8293b905809639afff779abfdb2bb3a54e 100644 --- a/paddle/api/GradientMachine.cpp +++ b/paddle/api/GradientMachine.cpp @@ -64,6 +64,18 @@ GradientMachine* GradientMachine::createByModelConfig( return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types); } +void GradientMachine::start() { m->machine->start(); } + +void GradientMachine::finish() { m->machine->finish(); } + +void GradientMachine::onPassEnd() { m->machine->onPassEnd(); } + +void GradientMachine::prefetch(const Arguments& inArgs) { + auto& in = + m->cast>(inArgs.getInternalArgumentsPtr()); + m->machine->prefetch(in); +} + void GradientMachine::forward(const Arguments& inArgs, Arguments* outArgs, PassType passType) { @@ -158,3 +170,13 @@ SequenceGenerator* GradientMachine::asSequenceGenerator( r->setBeamSize(beam_size); return r; } + +Evaluator* GradientMachine::makeEvaluator() { + auto ev = new Evaluator(); + ev->m->rawPtr = m->machine->makeEvaluator(); + return ev; +} + +void GradientMachine::eval(Evaluator* evaluator) { + m->machine->eval(evaluator->m->rawPtr); +} diff --git a/paddle/api/Paddle.swig b/paddle/api/Paddle.swig index 9194a6371be9e00c037967464ee2b63c1e4f6192..3365927f9b59936244230bed439808fa7ead2c61 100644 --- a/paddle/api/Paddle.swig +++ b/paddle/api/Paddle.swig @@ -96,7 +96,9 @@ namespace std { %rename(__getitem__) Vector::get; %rename(__setitem__) Vector::set; %rename(__len__) Vector::getSize; +%rename(__len__) Parameter::getSize; %rename(__call__) ParameterTraverseCallback::apply; +%rename(__repr__) Evaluator::toString; %apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) { (float* data, int dim1, int dim2) @@ -167,6 +169,7 @@ namespace std { %newobject GradientMachine::asSequenceGenerator; %newobject GradientMachine::getParameter; %newobject GradientMachine::getLayerOutput; +%newobject GradientMachine::makeEvaluator; %newobject TrainerConfig::createFromTrainerConfigFile; %newobject TrainerConfig::getModelConfig; %newobject TrainerConfig::getOptimizationConfig; @@ -174,6 +177,7 @@ namespace std { %newobject Parameter::getConfig; %newobject ParameterOptimizer::create; %newobject ParameterOptimizer::needSpecialTraversal; +%newobject ParameterUpdater::createLocalUpdater; %feature("director") UpdateCallback; %feature("autodoc", 1); // To generate method stub, for code hint in ide @@ -193,4 +197,4 @@ namespace std { %ignore OptimizationConfigPrivate; %ignore ParameterTraverseCallbackPrivate; %include "utils/GlobalConstants.h" -%include "api/PaddleAPI.h" \ No newline at end of file +%include "api/PaddleAPI.h" diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h index 84a66719c33678fc4aeb038bb81a6b7c5d0c93fb..09c891871a5ca8571216d211203fe8643fc3a63f 100644 --- a/paddle/api/PaddleAPI.h +++ b/paddle/api/PaddleAPI.h @@ -20,15 +20,11 @@ limitations under the License. */ #include #include #include "paddle/utils/GlobalConstants.h" -#include "paddle/utils/TypeDefs.h" +#include "paddle/utils/common.h" /// Import PaddlePaddle's enumeration into global namespace. 
using namespace paddle::enumeration_wrapper; // NOLINT -#define DISABLE_COPY_AND_ASSIGN(classname) \ - classname(const classname& other); \ - classname& operator=(const classname& other) - /** * @brief Initialize paddle. * @@ -102,7 +98,7 @@ const size_t NO_SPARSE_ID = -1UL; struct MatrixPrivate; class Matrix { Matrix(); // User Cannot Create Matrix. - DISABLE_COPY_AND_ASSIGN(Matrix); + DISABLE_COPY(Matrix); static Matrix* createByPaddleMatrixPtr(void* sharedPtr); public: @@ -242,7 +238,7 @@ private: struct VectorPrivate; class Vector { - DISABLE_COPY_AND_ASSIGN(Vector); + DISABLE_COPY(Vector); Vector(); static Vector* createByPaddleVectorPtr(void* ptr); @@ -322,7 +318,7 @@ private: struct IVectorPrivate; class IVector { IVector(); - DISABLE_COPY_AND_ASSIGN(IVector); + DISABLE_COPY(IVector); static IVector* createByPaddleVectorPtr(void* ptr); public: @@ -402,7 +398,7 @@ struct ArgumentsPrivate; class Arguments { private: Arguments(); // Internal Create. - DISABLE_COPY_AND_ASSIGN(Arguments); + DISABLE_COPY(Arguments); public: /** @@ -472,7 +468,7 @@ enum GradientMatchineCreateMode { struct ParameterConfigPrivate; class ParameterConfig { - DISABLE_COPY_AND_ASSIGN(ParameterConfig); + DISABLE_COPY(ParameterConfig); ParameterConfig(); /** @@ -502,7 +498,7 @@ private: struct OptimizationConfigPrivate; class OptimizationConfig { - DISABLE_COPY_AND_ASSIGN(OptimizationConfig); + DISABLE_COPY(OptimizationConfig); OptimizationConfig(); public: @@ -519,6 +515,7 @@ private: friend class TrainerConfig; friend class ParameterOptimizer; + friend class ParameterUpdater; friend class Trainer; }; @@ -526,7 +523,7 @@ struct ParameterPrivate; class Parameter { private: Parameter(); - DISABLE_COPY_AND_ASSIGN(Parameter); + DISABLE_COPY(Parameter); public: virtual ~Parameter(); @@ -549,6 +546,8 @@ public: ParameterConfig* getConfig(); void setValueUpdated(); + size_t getSize() const; + private: static Parameter* createFromRawPtr(void* ptr); static Parameter* createFromSharedPtr(void* ptr); @@ -557,6 +556,7 @@ private: ParameterPrivate* m; friend class UpdateCallbackWrapper; friend class GradientMachine; + friend class ParameterUpdater; }; struct ModelConfigPrivate; @@ -568,7 +568,7 @@ struct ModelConfigPrivate; class ModelConfig { private: ModelConfig(); - DISABLE_COPY_AND_ASSIGN(ModelConfig); + DISABLE_COPY(ModelConfig); public: virtual ~ModelConfig(); @@ -589,7 +589,7 @@ struct TrainerConfigPrivate; class TrainerConfig { private: TrainerConfig(); - DISABLE_COPY_AND_ASSIGN(TrainerConfig); + DISABLE_COPY(TrainerConfig); public: virtual ~TrainerConfig(); @@ -629,7 +629,7 @@ public: struct ParameterTraverseCallbackPrivate; class ParameterTraverseCallback { - DISABLE_COPY_AND_ASSIGN(ParameterTraverseCallback); + DISABLE_COPY(ParameterTraverseCallback); ParameterTraverseCallback(); public: @@ -651,7 +651,7 @@ private: */ struct ParameterOptimizerPrivate; class ParameterOptimizer { - DISABLE_COPY_AND_ASSIGN(ParameterOptimizer); + DISABLE_COPY(ParameterOptimizer); ParameterOptimizer(); public: @@ -683,12 +683,12 @@ private: }; class SequenceGenerator; - +class Evaluator; struct GradientMachinePrivate; class GradientMachine { private: GradientMachine(); - DISABLE_COPY_AND_ASSIGN(GradientMachine); + DISABLE_COPY(GradientMachine); public: virtual ~GradientMachine(); @@ -714,6 +714,23 @@ public: GradientMatchineCreateMode mode = CREATE_MODE_NORMAL, const std::vector& parameterTypes = defaultParamTypes); + /** + * @brief finish + */ + void finish(); + + void start(); + + /** + * Prefetch row ids of sparse parameter. 
+ */ + void prefetch(const Arguments& inArgs); + + /** + * Do some thing when train pass ended. + */ + void onPassEnd(); + /** * The forward stage of GradientMachine. * @@ -761,6 +778,10 @@ public: size_t max_length = 100UL, size_t beam_size = -1UL); + Evaluator* makeEvaluator(); + + void eval(Evaluator* evaluator); + private: GradientMachinePrivate* m; @@ -772,6 +793,109 @@ private: // Not to use c++ 11 init-list, so we use static var as function default arg. static std::vector defaultParamTypes; friend class Trainer; + friend class ParameterUpdater; +}; + +struct ParameterUpdaterPrivate; +class ParameterUpdater { +private: + ParameterUpdater(); + +public: + static ParameterUpdater* createLocalUpdater(OptimizationConfig* config); + ~ParameterUpdater(); + + /** + * @brief initialize Parameter Updater by GradientMachine. + * @param gm + */ + void init(const GradientMachine& gm); + + /** + * @brief begin of a training/testing of one pass. + */ + void startPass(); + + /** + * @brief end of a traning/testing of one pass. + */ + void finishPass(); + + /** + * @brief begin of a training/testing of one batch. + * @param data batch's size + * @return PassType, mostly will be training. + */ + PassType startBatch(size_t batchSize); + + /** + * @brief end of a traning/testing of one batch + * @param cost current batch cost. + */ + void finishBatch(float cost); + + /** + * @brief update a parameter (by local optimizer or by cluster pserver) + * @param param + */ + void update(Parameter* param); + + /** + * @brief restore the average parameter. + * @note It is only used in AverageOptimizer. Restore will get the current + * PARAMETER_VALUE back. + */ + void restore(); + + /** + * @brief apply. Store the average parameter. + * @note It is only used in AverageOptimizer. Apply will store the current + * PARAMETER_VALUE to buffer, calcaualte current Average Parameter, and save + * it to PARAMETER_VALUE. + */ + void apply(); + + /** + * @brief catchUpWith The Regularization will be delayed in many situations( + * pserver, local sparse). Catch Up means catch the regularization up, apply + * regularization to all params. + */ + void catchUpWith(); + +private: + ParameterUpdaterPrivate* m; +}; + +struct EvaluatorPrivate; +class Evaluator { +private: + Evaluator(); + DISABLE_COPY(Evaluator); + +public: + ~Evaluator(); + + /** + * @brief begin an evaluate stage. + */ + void start(); + + /** + * @brief end an evaluate stage. + */ + void finish(); + + /** + * @brief toString will get a evaluate result. + * + * __repr__ method in python + */ + std::string toString(); + +private: + EvaluatorPrivate* m; + + friend class GradientMachine; }; struct TrainerPrivate; @@ -780,7 +904,7 @@ private: TrainerPrivate* m; Trainer(); Trainer(TrainerConfig* optConfig, GradientMachine* gm); - DISABLE_COPY_AND_ASSIGN(Trainer); + DISABLE_COPY(Trainer); public: virtual ~Trainer(); @@ -846,7 +970,7 @@ public: struct SequenceGeneratorPrivate; class SequenceGenerator { - DISABLE_COPY_AND_ASSIGN(SequenceGenerator); + DISABLE_COPY(SequenceGenerator); SequenceGenerator(); public: diff --git a/paddle/api/PaddleAPIPrivate.h b/paddle/api/PaddleAPIPrivate.h index d2b56fc41c8aadb136ad6812f848e764e031073c..f41352bfec7c3333bde9509957aba8c5f373b9f2 100644 --- a/paddle/api/PaddleAPIPrivate.h +++ b/paddle/api/PaddleAPIPrivate.h @@ -11,12 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ - +#pragma once +#include +#include "PaddleAPI.h" +#include "paddle/gserver/evaluators/Evaluator.h" #include "paddle/gserver/gradientmachines/GradientMachine.h" +#include "paddle/parameter/ParameterUpdaterBase.h" #include "paddle/trainer/TrainerConfigHelper.h" -#pragma once - struct GradientMachinePrivate { std::shared_ptr machine; @@ -65,3 +67,31 @@ struct ArgumentsPrivate { return *(std::shared_ptr*)(rawPtr); } }; + +struct ParameterUpdaterPrivate { + std::unique_ptr updater; +}; + +struct ParameterPrivate { + std::shared_ptr sharedPtr; + paddle::Parameter* rawPtr; // rawPtr only used in ParameterUpdater, + // in other situation sharedPtr should + // contains value. + + ParameterPrivate() : sharedPtr(nullptr), rawPtr(nullptr) {} + + paddle::Parameter* getPtr() { + if (sharedPtr) { + return sharedPtr.get(); + } else { + return rawPtr; + } + } +}; + +struct EvaluatorPrivate { + paddle::Evaluator* rawPtr; + + EvaluatorPrivate() : rawPtr(nullptr) {} + ~EvaluatorPrivate() { delete rawPtr; } +}; diff --git a/paddle/api/Parameter.cpp b/paddle/api/Parameter.cpp index 4eed00a84a695f2c48ff93b33419ae2b3dd03768..ddc00d8d1af4c58d7e2233423bea916408bee92b 100644 --- a/paddle/api/Parameter.cpp +++ b/paddle/api/Parameter.cpp @@ -14,21 +14,7 @@ limitations under the License. */ #include "paddle/parameter/Parameter.h" #include "PaddleAPI.h" - -struct ParameterPrivate { - std::shared_ptr sharedPtr; - paddle::Parameter* rawPtr; - - ParameterPrivate() : sharedPtr(nullptr), rawPtr(nullptr) {} - - paddle::Parameter* getPtr() { - if (sharedPtr) { - return sharedPtr.get(); - } else { - return rawPtr; - } - } -}; +#include "PaddleAPIPrivate.h" Parameter::Parameter() : m(new ParameterPrivate()) {} @@ -70,3 +56,5 @@ ParameterConfig* Parameter::getConfig() { size_t Parameter::getID() const { return m->getPtr()->getID(); } void Parameter::setValueUpdated() { m->getPtr()->setValueUpdated(); } + +size_t Parameter::getSize() const { return m->getPtr()->getSize(); } diff --git a/paddle/api/ParameterUpdater.cpp b/paddle/api/ParameterUpdater.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7cd8ed7e3907489a60f37090df6f51492def2612 --- /dev/null +++ b/paddle/api/ParameterUpdater.cpp @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "PaddleAPI.h" + +#include "PaddleAPIPrivate.h" +#include "paddle/trainer/ThreadParameterUpdater.h" + +ParameterUpdater::ParameterUpdater() : m(new ParameterUpdaterPrivate()) {} + +ParameterUpdater *ParameterUpdater::createLocalUpdater( + OptimizationConfig *config) { + auto param = new ParameterUpdater(); + param->m->updater.reset(new paddle::SgdThreadUpdater(config->m->getConfig())); + return param; +} + +ParameterUpdater::~ParameterUpdater() { delete m; } + +void ParameterUpdater::init(const GradientMachine &gm) { + m->updater->init(gm.m->machine->getNonStaticParameters()); +} + +void ParameterUpdater::startPass() { m->updater->startPass(); } + +void ParameterUpdater::finishPass() { m->updater->finishPass(); } + +PassType ParameterUpdater::startBatch(size_t batchSize) { + return m->updater->startBatch((int64_t)batchSize); +} + +void ParameterUpdater::finishBatch(float cost) { + m->updater->finishBatch(cost); +} + +void ParameterUpdater::update(Parameter *param) { + auto paddleParam = param->m->getPtr(); + m->updater->update(paddleParam); +} + +void ParameterUpdater::restore() { m->updater->restore(); } + +void ParameterUpdater::apply() { m->updater->apply(); } + +void ParameterUpdater::catchUpWith() { m->updater->catchUpWith(); } diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp index 874f2fd044e9e86b44f8ca69f08bdfd3287d4749..db8f005929d90f718fc1ad42c60b68108ff55005 100644 --- a/paddle/api/Vector.cpp +++ b/paddle/api/Vector.cpp @@ -253,7 +253,7 @@ void Vector::copyToNumpyArray(float** view_m_data, int* dim1) { *view_m_data = new float[*dim1]; if (auto cpuVec = dynamic_cast(m->vec.get())) { std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1)); - } else if (auto gpuVec = dynamic_cast(m->vec.get())) { + } else if (auto gpuVec = dynamic_cast(m->vec.get())) { hl_memcpy_device2host( *view_m_data, gpuVec->getData(), sizeof(float) * (*dim1)); } else { diff --git a/paddle/api/paddle_ld_flags.py b/paddle/api/paddle_ld_flags.py index 7c8206e3fe09704debf5268f02128cc59e72af8d..b4d27b1cc728f92b2210f30b69f3f5899fe81d65 100644 --- a/paddle/api/paddle_ld_flags.py +++ b/paddle/api/paddle_ld_flags.py @@ -141,9 +141,12 @@ try: def c_flag(self): if self.with_coverage: - return ["-fprofile-arcs", "-ftest-coverage", "-O0", "-g"] + return [ + "-fprofile-arcs", "-ftest-coverage", "-O0", "-g", + "-std=c++11" + ] else: - return None + return ["-std=c++11"] except ImportError: class PaddleLDFlag(object): diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h index 84c5f2d5c91feb7896643d2c5f60a279ebe944e7..5b9884b786530aee91312c7547496c94cd7a89cb 100644 --- a/paddle/cuda/include/hl_base.h +++ b/paddle/cuda/include/hl_base.h @@ -16,7 +16,31 @@ limitations under the License. */ #define HL_BASE_H_ #include -#include "paddle/utils/TypeDefs.h" + +#ifdef PADDLE_TYPE_DOUBLE +#define HL_FLOAT_MAX 3.40282347e+38F +#define HL_FLOAT_MIN 1.17549435e-38F +using real = double; +#else +#define HL_FLOAT_MAX 1.7976931348623157e+308 +#define HL_FLOAT_MIN 2.2250738585072014e-308 +using real = float; +#endif + +/** + * The maximum input value for exp, used to avoid overflow problem. + * currently only used for tanh function. + */ +#define EXP_MAX_INPUT 40.0 + +/** + * @brief DIVUP(x, y) is similar to ceil(x / y). + * @note For CUDA, DIVUP will be used to specify + * the size of blockDim. 
+ */ +#ifndef DIVUP +#define DIVUP(x, y) (((x) + (y)-1) / (y)) +#endif /** * HPPL is an internal high performance parallel computing library @@ -181,46 +205,6 @@ typedef struct { size_t nnz; } _hl_sparse_matrix_s, *hl_sparse_matrix_s; -#ifndef PADDLE_TYPE_DOUBLE -/** - * HPPL data type: real (float or double) - * - * if real == float - * - * HL_FLOAT_MAX: 3.40282347e+38F - * - * HL_FLOAT_MIN: 1.17549435e-38F - */ -#define HL_FLOAT_MAX 3.40282347e+38F -/** - * if real == double - * - * HL_FLOAT_MAX: 1.7976931348623157e+308 - * - * HL_FLOAT_MIN: 2.2250738585072014e-308 - */ -#define HL_FLOAT_MIN 1.17549435e-38F -#else -#define HL_FLOAT_MAX 1.7976931348623157e+308 -#define HL_FLOAT_MIN 2.2250738585072014e-308 -#endif - -/** - * The maximum input value for exp, used to avoid overflow problem. - * - * Currently only used for tanh function. - */ -#define EXP_MAX_INPUT 40.0 - -/** - * @brief DIVUP(x, y) is similar to ceil(x / y). - * @note For CUDA, DIVUP will be used to specify - * the size of blockDim. - */ -#ifndef DIVUP -#define DIVUP(x, y) (((x) + (y)-1) / (y)) -#endif - #ifdef __NVCC__ #include "cuda_runtime.h" diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h index 9b7f7e36cedaa230ae0694d87cc033bd6fa6e652..5f031fc7c0761a8fe97eb16fe1dd8e0a1debfcdb 100644 --- a/paddle/gserver/dataproviders/DataProvider.h +++ b/paddle/gserver/dataproviders/DataProvider.h @@ -34,8 +34,8 @@ limitations under the License. */ #include "paddle/utils/Logging.h" #include "paddle/utils/Queue.h" #include "paddle/utils/ThreadLocal.h" -#include "paddle/utils/TypeDefs.h" #include "paddle/utils/Util.h" +#include "paddle/utils/common.h" namespace paddle { /** diff --git a/paddle/gserver/layers/GruCompute.h b/paddle/gserver/layers/GruCompute.h index 42c0019319ac9f20f9c3349fb2429c30f03d682b..a56af21317d1d43c836f7fe599a4dc614804bfec 100644 --- a/paddle/gserver/layers/GruCompute.h +++ b/paddle/gserver/layers/GruCompute.h @@ -16,7 +16,7 @@ limitations under the License. */ #include "ModelConfig.pb.h" #include "hl_gpu.h" -#include "paddle/utils/TypeDefs.h" +#include "paddle/utils/common.h" namespace paddle { diff --git a/paddle/gserver/layers/LstmCompute.h b/paddle/gserver/layers/LstmCompute.h index 140a4c6ecf5cfaf1045cec3ca2db5d4f2e54aca4..0d65b4158ebdc04f199048bbba98317c89fc8beb 100644 --- a/paddle/gserver/layers/LstmCompute.h +++ b/paddle/gserver/layers/LstmCompute.h @@ -16,7 +16,7 @@ limitations under the License. */ #include "ModelConfig.pb.h" #include "hl_gpu.h" -#include "paddle/utils/TypeDefs.h" +#include "paddle/utils/common.h" namespace paddle { diff --git a/paddle/gserver/layers/MultinomialSampler.h b/paddle/gserver/layers/MultinomialSampler.h index 677b047029305549084770bdb5eadfeaafbfac8a..b48073c80b6f57cd86ceb80b9d749548c3acc1ac 100644 --- a/paddle/gserver/layers/MultinomialSampler.h +++ b/paddle/gserver/layers/MultinomialSampler.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include -#include "paddle/utils/TypeDefs.h" +#include "paddle/utils/common.h" namespace paddle { diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h index 2933c20fbad930248c41969d88d45cf397b9dcf8..8f9bc9e823eb8062535920361899ce3cc06ec3a7 100644 --- a/paddle/math/BaseMatrix.h +++ b/paddle/math/BaseMatrix.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include #include "TensorExpression.h" -#include "paddle/utils/TypeDefs.h" +#include "paddle/utils/common.h" namespace paddle { diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index 25ce09e346694298e4901e52ab1ec6a3a8044263..bda863de38675fe481544a7e82b69f445df361bd 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -27,7 +27,7 @@ limitations under the License. */ #include "MemoryHandle.h" #include "Vector.h" #include "paddle/utils/ThreadLocal.h" -#include "paddle/utils/TypeDefs.h" +#include "paddle/utils/common.h" namespace paddle { diff --git a/paddle/math/TensorExpression.h b/paddle/math/TensorExpression.h index 9bd789e8c511f33d8415e421281e99eb10fc63fe..f3d60e400380f7d7d645559318837b0d7706661d 100644 --- a/paddle/math/TensorExpression.h +++ b/paddle/math/TensorExpression.h @@ -17,7 +17,7 @@ limitations under the License. */ #include #include "hl_tensor_ops.h" #include "paddle/utils/Logging.h" -#include "paddle/utils/TypeDefs.h" +#include "paddle/utils/common.h" namespace paddle { diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h index 8a24103bd4107035c8068c24ec3be6ec06957112..b4347a70f874a2a1bf933bbea4d1b15385f36090 100644 --- a/paddle/math/Vector.h +++ b/paddle/math/Vector.h @@ -22,7 +22,7 @@ limitations under the License. */ #include "BaseMatrix.h" #include "MemoryHandle.h" #include "paddle/utils/Thread.h" -#include "paddle/utils/TypeDefs.h" +#include "paddle/utils/common.h" namespace paddle { diff --git a/paddle/parameter/ParallelParameter.h b/paddle/parameter/ParallelParameter.h index 417e386dc74d308a6c0aefa2640f0f37de8dbf1f..1ee220d2dc1a26b3f394ca673975cc827f450206 100644 --- a/paddle/parameter/ParallelParameter.h +++ b/paddle/parameter/ParallelParameter.h @@ -28,7 +28,7 @@ limitations under the License. */ #include "paddle/parameter/ParameterUpdateFunctions.h" #include "paddle/utils/Flags.h" #include "paddle/utils/Locks.h" -#include "paddle/utils/TypeDefs.h" +#include "paddle/utils/common.h" #include "ParameterConfig.pb.h" diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h index 532c6770e596c33dfe7fd42f32157b2c6c19e18e..e05137b315f254795de26a5ff0ac977e7968f4d8 100644 --- a/paddle/parameter/Parameter.h +++ b/paddle/parameter/Parameter.h @@ -29,8 +29,8 @@ limitations under the License. */ #include "paddle/utils/GlobalConstants.h" #include "paddle/utils/Locks.h" #include "paddle/utils/ThreadLocal.h" -#include "paddle/utils/TypeDefs.h" #include "paddle/utils/Util.h" +#include "paddle/utils/common.h" namespace paddle { diff --git a/paddle/parameter/ParameterUpdateFunctions.h b/paddle/parameter/ParameterUpdateFunctions.h index 2d277e47e7eafc118fa37343e93e8a331a260aa9..2cb379871716ffd9e75eede607276b6b3f200e6b 100644 --- a/paddle/parameter/ParameterUpdateFunctions.h +++ b/paddle/parameter/ParameterUpdateFunctions.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/math/Vector.h" -#include "paddle/utils/TypeDefs.h" +#include "paddle/utils/common.h" namespace paddle { diff --git a/paddle/pserver/BaseClient.h b/paddle/pserver/BaseClient.h index 262afafbe2d61305a158d945fac2d3b265012cbd..ccf05ae1ca3ab76fbe9d36237969207768de4dd2 100644 --- a/paddle/pserver/BaseClient.h +++ b/paddle/pserver/BaseClient.h @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/math/Matrix.h" #include "paddle/pserver/ProtoServer.h" #include "paddle/utils/Queue.h" -#include "paddle/utils/TypeDefs.h" +#include "paddle/utils/common.h" namespace paddle { diff --git a/paddle/pserver/ParameterClient2.h b/paddle/pserver/ParameterClient2.h index eed71ccb43b0fec76a74a7f00662c32c97c26ff4..70cfc6d70072f399ef97eef1a0e6111a127cbd9f 100644 --- a/paddle/pserver/ParameterClient2.h +++ b/paddle/pserver/ParameterClient2.h @@ -26,8 +26,8 @@ limitations under the License. */ #include "paddle/utils/Flags.h" #include "paddle/utils/Locks.h" #include "paddle/utils/Queue.h" -#include "paddle/utils/TypeDefs.h" #include "paddle/utils/Util.h" +#include "paddle/utils/common.h" #include "ParameterService.pb.h" diff --git a/paddle/pserver/ParameterServer2.h b/paddle/pserver/ParameterServer2.h index b0cf22e1fb158e76fcee1ce6ef1f375995803ce6..79d1eb97ff149f4f5ca9a924c1b0b7ba629f1e33 100644 --- a/paddle/pserver/ParameterServer2.h +++ b/paddle/pserver/ParameterServer2.h @@ -32,7 +32,7 @@ limitations under the License. */ #include "paddle/utils/Locks.h" #include "paddle/utils/Stat.h" #include "paddle/utils/ThreadLocal.h" -#include "paddle/utils/TypeDefs.h" +#include "paddle/utils/common.h" #include "ParameterService.pb.h" diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py index edcefba6a854df518fd2eb8c1fea5b72c5f5d6a8..981d10afda2671be9e8f0da1a4bee755f7aa9d61 100644 --- a/paddle/py_paddle/dataprovider_converter.py +++ b/paddle/py_paddle/dataprovider_converter.py @@ -15,6 +15,7 @@ import paddle.trainer.PyDataProvider2 as dp2 import collections import swig_paddle +import numpy __all__ = ['DataProviderConverter'] @@ -35,18 +36,18 @@ class IScanner(object): class DenseScanner(IScanner): def __init__(self, input_type, pos): IScanner.__init__(self, input_type, pos) - self.__mat__ = [] - self.__height__ = 0 + self.__mat__ = None def scan(self, dat): - self.__mat__.extend(dat) - self.__height__ += 1 + if self.__mat__ is None: + self.__mat__ = numpy.array([dat], dtype='float32') + else: + self.__mat__ = numpy.append(self.__mat__, [dat], axis=0) def finish_scan(self, argument): assert isinstance(argument, swig_paddle.Arguments) assert isinstance(self.input_type, dp2.InputType) - m = swig_paddle.Matrix.createDense(self.__mat__, self.__height__, - self.input_type.dim, False) + m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, False) argument.setSlotValue(self.pos, m) diff --git a/paddle/setup.py.in b/paddle/setup.py.in index b4c38a41b86683f89b6d02e9db97b75e9dca89ea..464ad632868bd1fd4d88547212421302ca0b2116 100644 --- a/paddle/setup.py.in +++ b/paddle/setup.py.in @@ -30,8 +30,10 @@ is_lin = (system == 'linux') # The extra links will passed from COMAKE # because generate paddle LDFLAGS is too complicated to do in setup.py # it just read COMAKE generated LDFLAGS. +extra_comps = [] extra_links = [] obj = api.paddle_ld_flags.PaddleLDFlag() +extra_comps = obj.c_flag() ldflags = obj.ldflag_str() if ldflags is not None: extra_links.extend(ldflags.split(" ")) @@ -51,20 +53,15 @@ elif is_osx == True: include_dirs = [np.get_include(), "../"] # include numpy and paddle. -extra_c = obj.c_flag() - -attr=dict() -if extra_c is not None: - attr["extra_compile_args"] = extra_c - setup(name="py_paddle", version="@PADDLE_VERSION@", ext_modules=[ Extension('py_paddle._swig_paddle', # Build SWIG Extension. 
['Paddle_wrap.cxx'], + language = "c++", include_dirs = include_dirs, extra_link_args = extra_links, - **attr + extra_compile_args = extra_comps ) ], packages=['py_paddle'], diff --git a/paddle/trainer/ThreadParameterUpdater.h b/paddle/trainer/ThreadParameterUpdater.h index 880f1f9ddc49a1193ce23901419d988cae84eb88..bc08a9e9f0eda1cab7776ba76c67e88add1028a9 100644 --- a/paddle/trainer/ThreadParameterUpdater.h +++ b/paddle/trainer/ThreadParameterUpdater.h @@ -33,8 +33,8 @@ namespace paddle { because at the current moment, the merging on CPU is happening on the main thread, and the its parameter size can be much larger than the one GPU. Thus, for GPU, the parameter updates happens in updateImpl() function, which - is called by gradient machines as a callback function as a callback function - supplied to backward() and forwardBackward(). + is called by gradient machines as a callback function supplied to backward() + and forwardBackward(). For CPU, the parameter updates happens in separate threads maintained by this class. */ diff --git a/paddle/utils/CpuId.h b/paddle/utils/CpuId.h index 7a354da75851ed7cca4e85e77714624634951f00..1218e8194c4e837ca880744f92e769a68ba474de 100644 --- a/paddle/utils/CpuId.h +++ b/paddle/utils/CpuId.h @@ -11,7 +11,7 @@ limitations under the License. */ #pragma once -#include "DisableCopy.h" +#include "common.h" namespace paddle { diff --git a/paddle/utils/Locks.h b/paddle/utils/Locks.h index 0f922f3548d97eb16ca897564faf1bf083f0d5ac..a21872e89ebc172b87c8b5c3731a89302f34f521 100644 --- a/paddle/utils/Locks.h +++ b/paddle/utils/Locks.h @@ -19,7 +19,7 @@ limitations under the License. */ #include #include -#include "DisableCopy.h" +#include "common.h" namespace paddle { diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h index e5a89070f1a953d70a43321cb5417656c907ee9d..dc15ada5862d648af27aa1b0e8c8a5cce012ded8 100644 --- a/paddle/utils/Util.h +++ b/paddle/utils/Util.h @@ -26,12 +26,11 @@ limitations under the License. */ #include #include -#include "DisableCopy.h" #include "Logging.h" #include "TrainerConfig.pb.h" +#include "common.h" #include "Flags.h" -#include "TypeDefs.h" #include "hl_gpu.h" /** diff --git a/paddle/utils/Version.h b/paddle/utils/Version.h index d1a07d9485076e5382d47f7408fcbf032166b1ed..aa5df3243893145dbcc7e7ef2592555fc1c88fc9 100644 --- a/paddle/utils/Version.h +++ b/paddle/utils/Version.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include #include -#include "TypeDefs.h" +#include "common.h" namespace paddle { diff --git a/paddle/utils/TypeDefs.h b/paddle/utils/common.h similarity index 71% rename from paddle/utils/TypeDefs.h rename to paddle/utils/common.h index c50a05e82daefd1273c896f3603957f4484ecd5d..202a9d980d8350c230daaf473dd34d4069479e5f 100644 --- a/paddle/utils/TypeDefs.h +++ b/paddle/utils/common.h @@ -14,13 +14,20 @@ limitations under the License. */ #pragma once +/** + * Disable copy macro. 
+ */ +#define DISABLE_COPY(class_name) \ + class_name(class_name &&) = delete; \ + class_name(const class_name &other) = delete; \ + class_name &operator=(const class_name &other) = delete + namespace paddle { + #ifdef PADDLE_TYPE_DOUBLE -typedef double real; +using real = double; #else -typedef float real; +using real = float; #endif } // namespace paddle - -using paddle::real; diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 8397659fff03eb6a04f428326e8317040ee7923d..bdd0e001fe59875f6dfa4c64e99b52783ac1cee2 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -3419,8 +3419,35 @@ def register_parse_config_hook(f): _parse_config_hooks.add(f) -def parse_config(config_file, config_arg_str): +def update_g_config(): ''' + Update g_config after execute config_file or config_functions. + ''' + for k, v in settings.iteritems(): + if v is None: + continue + g_config.opt_config.__setattr__(k, v) + + for k, v in trainer_settings.iteritems(): + if v is None: + continue + g_config.__setattr__(k, v) + + for name in g_config.model_config.input_layer_names: + assert name in g_layer_map, \ + 'input name "%s" does not correspond to a layer name' % name + assert (g_layer_map[name].type == "data" or g_layer_map[name].type == "data_trim"), \ + 'The type of input layer "%s" is not "data"' % name + for name in g_config.model_config.output_layer_names: + assert name in g_layer_map, \ + 'input name "%s" does not correspond to a layer name' % name + return g_config + + +def parse_config(trainer_config, config_arg_str): + ''' + @param trainer_config: can be a string of config file name or a function name + with config logic @param config_arg_str: a string of the form var1=val1,var2=val2. It will be passed to config script as a dictionary CONFIG_ARGS ''' @@ -3454,46 +3481,21 @@ def parse_config(config_file, config_arg_str): g_root_submodel.is_recurrent_layer_group = False g_current_submodel = g_root_submodel - # for paddle on spark, need support non-file config. - # you can use parse_config like below: - # - # from paddle.trainer.config_parser import parse_config - # def configs(): - # #your paddle config code, which is same as config file. - # - # config = parse_config(configs, "is_predict=1") - # # then you get config proto object. 
-    if hasattr(config_file, '__call__'):
-        config_file.func_globals.update(
+    if hasattr(trainer_config, '__call__'):
+        trainer_config.func_globals.update(
             make_config_environment("", config_args))
-        config_file()
+        trainer_config()
     else:
-        execfile(config_file, make_config_environment(config_file, config_args))
-    for k, v in settings.iteritems():
-        if v is None:
-            continue
-        g_config.opt_config.__setattr__(k, v)
-
-    for k, v in trainer_settings.iteritems():
-        if v is None:
-            continue
-        g_config.__setattr__(k, v)
+        execfile(trainer_config,
+                 make_config_environment(trainer_config, config_args))

-    for name in g_config.model_config.input_layer_names:
-        assert name in g_layer_map, \
-            'input name "%s" does not correspond to a layer name' % name
-        assert (g_layer_map[name].type == "data" or g_layer_map[name].type == "data_trim"), \
-            'The type of input layer "%s" is not "data"' % name
-    for name in g_config.model_config.output_layer_names:
-        assert name in g_layer_map, \
-            'input name "%s" does not correspond to a layer name' % name
-    return g_config
+    return update_g_config()


-def parse_config_and_serialize(config_file, config_arg_str):
+def parse_config_and_serialize(trainer_config, config_arg_str):
     try:
-        config = parse_config(config_file, config_arg_str)
-        # logger.info(config)
+        config = parse_config(trainer_config, config_arg_str)
+        #logger.info(config)
         return config.SerializeToString()
     except:
         traceback.print_exc()
diff --git a/python/paddle/trainer_config_helpers/__init__.py b/python/paddle/trainer_config_helpers/__init__.py
index a2335768b92b66e3bfc0fb0d2562fb24ad291258..13155ebddbb49c502d9d4110704ab09f49825be2 100644
--- a/python/paddle/trainer_config_helpers/__init__.py
+++ b/python/paddle/trainer_config_helpers/__init__.py
@@ -20,6 +20,6 @@ from layers import *
 from networks import *
 from optimizers import *
 from attrs import *
-
+from config_parser_utils import *
 # This will enable operator overload for LayerOutput
-import math as layer_math
+import layer_math
diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py
index 59bb18bfcab30540bd38ca8d1cb300813d30fee8..bf0208834600fef3bcf1b0496da8f5f77aea44c5 100644
--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -19,34 +19,34 @@ __all__ = [


 def convert_and_compare(x, Type):
-    """
-    Convert x to be the same type as Type and then convert back to
-    check whether there is a loss of information
-    :param x: object to be checked
-    :param Type: target type to check x over
-
+    """
+    Convert x to be the same type as Type and then convert back to
+    check whether there is a loss of information
+    :param x: object to be checked
+    :param Type: target type to check x over
+
     """
     return type(x)(Type(x)) == x


 def is_compatible_with(x, Type):
-    """
-    Check if x has a type compatible with Type
-    :param x: object to be checked
-    :param Type: target type to check x over
-
+    """
+    Check if x has a type compatible with Type
+    :param x: object to be checked
+    :param Type: target type to check x over
+
     """
     if type(x) == Type:
         return True
     try:
         if float == Type or int == Type:
-            # avoid those types that can be converted to float/int but not very
-            # meaningful and could potentially lead to error
-            # i.e., str and bool typed value should not be used for initializing float/int variable
+            # avoid those types that can be converted to float/int but not very
+            # meaningful and could potentially lead to error
+            # i.e., str and bool typed value should not be used for initializing float/int variable
             if not isinstance(x, str) and not isinstance(x, bool):
                 return convert_and_compare(x, Type)
         elif bool == Type:
-            # should not use string type to initialize bool variable
+            # should not use string type to initialize bool variable
             if not isinstance(x, str):
                 return convert_and_compare(x, Type)
         else:
@@ -88,6 +88,10 @@ class ParameterAttribute(object):
     :type learning_rate: float or None
     :param momentum: The parameter momentum. None means use global value.
     :type momentum: float or None
+    :param gradient_clipping_threshold: gradient clipping threshold. If a
+                                        gradient value is larger than this
+                                        threshold, it will be clipped.
+    :type gradient_clipping_threshold: float
     :param sparse_update: Enable sparse update for this parameter. It will
                           enable both local and remote sparse update.
     :type sparse_update: bool
@@ -104,6 +108,7 @@ class ParameterAttribute(object):
                  l2_rate=None,
                  learning_rate=None,
                  momentum=None,
+                 gradient_clipping_threshold=None,
                  sparse_update=False):
         # initialize strategy.
         if is_static:
@@ -152,6 +157,11 @@ class ParameterAttribute(object):
             self.attr['sparse_update'] = True
             self.attr['sparse_remote_update'] = True

+        if gradient_clipping_threshold is not None and \
+                is_compatible_with(gradient_clipping_threshold, float):
+            self.attr['gradient_clipping_threshold'] = \
+                gradient_clipping_threshold
+
     def set_default_parameter_name(self, name):
         """
         Set default parameter name. If parameter not set, then will use default
diff --git a/python/paddle/trainer_config_helpers/config_parser.py b/python/paddle/trainer_config_helpers/config_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b91b8d2824cd89ac0d6da696492bd9289b6e5f4
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/config_parser.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer.config_parser as config_parser
+'''
+This file is a wrapper of the formal config_parser. The main idea of this file
+is to separate different config logic into different functions, such as network
+configuration and optimizer configuration.
+'''
+
+__all__ = [
+    "parse_trainer_config", "parse_network_config", "parse_optimizer_config"
+]
+
+
+def parse_trainer_config(trainer_conf, config_arg_str):
+    return config_parser.parse_config(trainer_conf, config_arg_str)
+
+
+def parse_network_config(network_conf):
+    config = config_parser.parse_config(network_conf, '')
+    return config.model_config
+
+
+def parse_optimizer_config(optimizer_conf):
+    config = config_parser.parse_config(optimizer_conf, '')
+    return config.opt_config
diff --git a/python/paddle/trainer_config_helpers/config_parser_utils.py b/python/paddle/trainer_config_helpers/config_parser_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..681b177a55f48d02a8ff792945dd7cc3b05cd976
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/config_parser_utils.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer.config_parser as config_parser
+'''
+This file is a wrapper of the formal config_parser. The main idea of this file
+is to separate different config logic into different functions, such as network
+configuration and optimizer configuration.
+'''
+
+__all__ = [
+    "parse_trainer_config", "parse_network_config", "parse_optimizer_config"
+]
+
+
+def parse_trainer_config(trainer_conf, config_arg_str):
+    return config_parser.parse_config(trainer_conf, config_arg_str)
+
+
+def parse_network_config(network_conf, config_arg_str=''):
+    config = config_parser.parse_config(network_conf, config_arg_str)
+    return config.model_config
+
+
+def parse_optimizer_config(optimizer_conf, config_arg_str=''):
+    config = config_parser.parse_config(optimizer_conf, config_arg_str)
+    return config.opt_config
diff --git a/python/paddle/trainer_config_helpers/math.py b/python/paddle/trainer_config_helpers/layer_math.py
similarity index 100%
rename from python/paddle/trainer_config_helpers/math.py
rename to python/paddle/trainer_config_helpers/layer_math.py
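
Note on the attrs.py change above: gradient_clipping_threshold is only recorded in the parameter attribute dict when it passes the is_compatible_with(..., float) check. A minimal usage sketch; the layer names and sizes below are made up for illustration and are not part of this patch:

    from paddle.trainer_config_helpers import *

    # Hypothetical layers; gradients of this fc_layer's weight parameter will be
    # clipped at the given threshold, per the new ParameterAttribute argument.
    img = data_layer(name='pixel', size=784)
    hidden = fc_layer(
        input=img,
        size=200,
        param_attr=ParameterAttribute(gradient_clipping_threshold=10.0))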
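
Note on config_parser_utils.py: because __init__.py now does `from config_parser_utils import *`, the three parse_* helpers are importable directly from paddle.trainer_config_helpers. A rough sketch of the intended split; the config function below is a hypothetical example, and it relies on parse_config accepting either a file path or a callable (the hasattr check near the top of this section):

    from paddle.trainer_config_helpers import *

    def my_optimizer_conf():  # hypothetical optimizer-only config function
        settings(
            learning_rate=1e-3,
            learning_method=AdamOptimizer(),
            batch_size=128)

    # Returns only the opt_config sub-message of the parsed trainer config;
    # parse_network_config(...) returns model_config in the same way.
    opt_proto = parse_optimizer_config(my_optimizer_conf)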