diff --git a/AUTHORS.md b/AUTHORS.md
new file mode 100644
index 0000000000000000000000000000000000000000..d5baee2161aa1d5360056e03ca67d5b2fe9ff7d2
--- /dev/null
+++ b/AUTHORS.md
@@ -0,0 +1,28 @@
+| Github account | name |
+|---|---|
+| reyoung | Yang Yu |
+| gangliao | Gang Liao |
+| luotao01 | Tao Luo |
+| jacquesqiao | Long-Fei Qiao |
+| qingqing01 | Qing-Qing Dang |
+| hedaoyuan | Dao-Yuan He |
+| wangyang59 | Yang Wang |
+| QiJune | Jun Qi |
+| tianbingsz | Tian-Bing Xu |
+| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
+| typhoonzero | Yi Wu |
+| backyes | Yan-Fei Wang |
+| pengli09 | Peng Li |
+| livc | Zhao Li |
+| Xreki | Yi-Qun Liu |
+| Yancey1989 | Xu Yan |
+| emailweixu | Wei Xu |
+| wen-bo-yang | Wen-Bo Yang |
+| helinwang | He-Lin Wang |
+| lcy-seso | Ying Cao |
+| Zrachel | Rui-Qing Zhang |
+| Haichao-Zhang | Hai-Chao Zhang |
+| gongweibao | Wei-Bao Gong |
+| lzhao4ever | Liang Zhao |
+| zhouxiao-coder | Xiao Zhou |
+| lipeng-unisound | Peng Li |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1a59db8c71bf3b1ea472c1ee56a1cd97de42dad8..aa4f1eaff9125f2ff11a6ef83e503acd56b79e21 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,19 +1,19 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License
 
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
-set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
 
 include(system)
 
@@ -50,6 +50,7 @@ option(WITH_DOC         "Compile PaddlePaddle with documentation"       OFF)
 option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
 option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
+option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -75,6 +76,13 @@ endif(ANDROID)
 
 set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING
   "A path setting third party libraries download & build directories.")
+
+if (WITH_C_API AND WITH_PYTHON)
+  message(WARNING "It is suggest not embedded a python interpreter in Paddle "
+    "when using C-API. It will give an unpredictable behavior when using a "
+    "different Python interpreter from compiling.")
+endif()
+
 ########################################################################################
 
 include(external/zlib)      # download, build, install zlib
diff --git a/Dockerfile b/Dockerfile
index f12be36ceb764a535e8a87b7071757f1ef3dada7..c3ad0c9c2f6d619f2b2ef2bcf21429937d06dd6e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -46,6 +46,11 @@ RUN pip install --upgrade pip && \
     pip install pre-commit 'requests==2.9.2' 'ipython==5.3.0' && \
     pip install 'ipykernel==4.6.0' 'jupyter==1.0.0'
 
+# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
+# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
+RUN apt-get install -y libssl-dev libffi-dev
+RUN pip install certifi urllib3[secure]
+
 RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
     cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \
     cd .. && rm -rf cmake-3.4.1
diff --git a/RELEASE.md b/RELEASE.md
index 9a09644b681b2ae4e922d92ed29500205c2a6ca4..146f7afa7dfbc152500b82fde28445ae3155c16c 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,8 +1,6 @@
 # Release v0.10.0
 
-We are glad to release version 0.10.0.  In this version, we are happy to
-release the
-new
+We are glad to release version 0.10.0.  In this version, we are happy to release the new 
 [Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/).
 
 - Our old Python API is kind of out of date.  It's hard to learn and hard to
diff --git a/authors b/authors
deleted file mode 100644
index daac4ec5d8173cba95df9f9b3c69c02b5256f5b2..0000000000000000000000000000000000000000
--- a/authors
+++ /dev/null
@@ -1,56 +0,0 @@
-Cao, Ying
-Cheng, Yujuan
-Dang, Qingqing
-Dong, Tengfei
-Du, Dalong
-Feng, Shouqiang
-Gao, Haoyuan
-Han, Baochang
-Han, Jinchen
-Hao, Nanyu
-He, Daoyuan
-He, Zhengyan
-Hou, Jue
-Huang, Chang
-Huang, Zhiheng
-Hu, Na
-Kong, Qi
-Liao, Gang
-Li, Bo
-Li, Jiajie
-Li, Jing
-Li, Lei
-Li, Peng
-Liu, Sheng
-Liu, Yuan
-Li, Yuze
-Luo, Heng
-Luo, Tao
-Lyu, Qin
-Mao, Hongyue
-Qian, Xiaojun
-Qiao, Longfei
-Qi, Jun
-Qin, Duohao
-Shen, Guolong
-Shi, Guangchuan
-Song, Xiang
-Wang, Helin
-Wang, Jiang
-Wang, Yanfei
-Wang, Yi
-Wang, Yong
-Weng, Renliang
-Xu, Tianbing
-Xu, Wei
-Xu, Xingyu
-Yan, Chong
-Yan, Chunwei
-Yang, Yi
-Yu, Yang
-Yu, Yinan
-Zhang, Jian
-Zhang, Ruiqing
-Zhang, Weide
-Zhao, Liang
-Zhou, Jie
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index b8bf1bb07a1f779354b2c10071264bf41d279f6c..aebb5d9fcb186005607c4849b70ecb61de771deb 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -5,7 +5,7 @@
 # If any cblas implementation found, the following variable will be set.
 #    CBLAS_PROVIDER  # one of MKL, ATLAS, OPENBLAS, REFERENCE
 #    CBLAS_INC_DIR   # the include directory for cblas.
-#    CBLAS_LIBS      # a list of libraries should be linked by paddle. 
+#    CBLAS_LIBS      # a list of libraries should be linked by paddle.
 #                    # Each library should be full path to object file.
 #
 # User should set one of MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT, REFERENCE_CBLAS_ROOT
@@ -63,11 +63,11 @@ set(ATLAS_LIB_SEARCH_PATHS
         /usr/lib/atlas
         /usr/lib/atlas-base   # special for ubuntu 14.04.
     )
-find_path(ATLAS_INC_DIR NAMES cblas.h 
+find_path(ATLAS_INC_DIR NAMES cblas.h
   PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
 find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
   PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
-find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3 
+find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
   PATHS ${ATLAS_LIB_SEARCH_PATHS})
 find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3
   PATHS ${ATLAS_LIB_SEARCH_PATHS})
@@ -76,11 +76,12 @@ if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB AND NOT CBLAS_FOUND)
   set(CBLAS_PROVIDER ATLAS)
   set(CBLAS_INC_DIR ${ATLAS_INC_DIR})
   set(CBLAS_LIBRARIES ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
-  add_definitions(-DPADDLE_USE_ATLAS)  
+  add_definitions(-DPADDLE_USE_ATLAS)
   message(STATUS "Found ATLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
   set(CBLAS_FOUND ON)
   if(ATLAS_CLAPACK_INC_DIR)
     add_definitions(-DPADDLE_USE_LAPACK)
+    set(CBLAS_INC_DIR ${CBLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
     message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})")
   endif()
   return()
@@ -124,7 +125,7 @@ endif()
 ## Then find the reference-cblas.  www.netlib.org/blas/
 
 
-set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH 
+set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH
   "Folder contains reference-cblas")
 set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
   ${REFERENCE_CBLAS_ROOT}/include
diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake
index 38c636b30edc0af1c07255814e8bc2b1ad9514da..02a5c0b2c9be782c459a255c6ffd6ba6441f2693 100644
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -34,7 +34,7 @@ set(IGNORE_PATTERN
 #
 # first argument: target name to attach
 # rest arguments: source list to check code style.
-# 
+#
 # NOTE: If WITH_STYLE_CHECK is OFF, then this macro just do nothing.
 macro(add_style_check_target TARGET_NAME)
     if(WITH_STYLE_CHECK)
@@ -48,13 +48,17 @@ macro(add_style_check_target TARGET_NAME)
                 if(filename MATCHES ${pattern})
                     message(STATUS "DROP LINT ${filename}")
                     set(LINT OFF)
-                endif() 
+                endif()
             endforeach()
             if(LINT MATCHES ON)
-                add_custom_command(TARGET ${TARGET_NAME}
+                get_filename_component(base_filename ${filename} NAME)
+                set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
+                add_custom_command(OUTPUT ${CUR_GEN}
                     PRE_BUILD
                     COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
-                                "--filter=${STYLE_FILTER}" ${filename}
+                                "--filter=${STYLE_FILTER}"
+                                "--write-success=${CUR_GEN}" ${filename}
+                    DEPENDS ${filename}
                     WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
             endif()
         endforeach()
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 2df042d226af8308d00f7870e7d2de0eacfdf07e..a9db4e8ba410c718f1ee4d69f4551e2773c60125 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -20,7 +20,7 @@ FIND_PACKAGE(Protobuf ${PROTOBUF_VERSION})
 IF(PROTOBUF_FOUND)
     EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION)
     STRING(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}")
-    IF (${PROTOBUF_VERSION} VERSION_LESS "3.1.0")
+    IF ("${PROTOBUF_VERSION}" VERSION_LESS "3.1.0")
         SET(PROTOBUF_FOUND OFF)
     ENDIF()
 ENDIF(PROTOBUF_FOUND)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 7eb92efcb00fa18461e61e0508b485c13ef23a1f..7a996dea92b13bdac054a987a004a3d54ff02da2 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -197,3 +197,4 @@ if(CUDA_ARCH)
 endif()
 
 set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
+
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 3ca06665ab2385e34302a6bcce7ada549ea1e247..75a9d8fc25674e1dd0f5b73cd0ccde48204f63aa 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -28,6 +28,11 @@ ELSE(WIN32)
         STRING(REGEX MATCH "[0-9]+.[0-9]+" VERSION "${MACOSX_VERSION}")
         SET(MACOS_VERSION ${VERSION})
         SET(HOST_SYSTEM "macosx")
+        IF(NOT DEFINED ENV{MACOSX_DEPLOYMENT_TARGET})
+            # Set cache variable - end user may change this during ccmake or cmake-gui configure.
+            SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING
+                "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.")
+        ENDIF()
     ELSE(APPLE)
 
         IF(EXISTS "/etc/issue")
diff --git a/demo/word2vec/train_v2.py b/demo/word2vec/api_train_v2.py
similarity index 76%
rename from demo/word2vec/train_v2.py
rename to demo/word2vec/api_train_v2.py
index 7d952b446f9db432062fc3305a6b65b0ad66dd47..c0940f0e56eafa22f8aeb7052c0ddc79d8862917 100644
--- a/demo/word2vec/train_v2.py
+++ b/demo/word2vec/api_train_v2.py
@@ -1,27 +1,40 @@
+import gzip
 import math
 
 import paddle.v2 as paddle
 
-dictsize = 1953
 embsize = 32
 hiddensize = 256
 N = 5
 
 
 def wordemb(inlayer):
-    wordemb = paddle.layer.table_projection(
+    wordemb = paddle.layer.embedding(
         input=inlayer,
         size=embsize,
         param_attr=paddle.attr.Param(
             name="_proj",
             initial_std=0.001,
             learning_rate=1,
-            l2_rate=0, ))
+            l2_rate=0,
+            sparse_update=True))
     return wordemb
 
 
 def main():
-    paddle.init(use_gpu=False, trainer_count=1)
+    # for local training
+    cluster_train = False
+
+    if not cluster_train:
+        paddle.init(use_gpu=False, trainer_count=1)
+    else:
+        paddle.init(
+            use_gpu=False,
+            trainer_count=2,
+            port=7164,
+            ports_num=1,
+            ports_num_for_sparse=1,
+            num_gradient_servers=1)
     word_dict = paddle.dataset.imikolov.build_dict()
     dict_size = len(word_dict)
     firstword = paddle.layer.data(
@@ -57,6 +70,9 @@ def main():
     def event_handler(event):
         if isinstance(event, paddle.event.EndIteration):
             if event.batch_id % 100 == 0:
+                with gzip.open("batch-" + str(event.batch_id) + ".tar.gz",
+                               'w') as f:
+                    trainer.save_parameter_to_tar(f)
                 result = trainer.test(
                     paddle.batch(
                         paddle.dataset.imikolov.test(word_dict, N), 32))
@@ -65,11 +81,15 @@ def main():
                     result.metrics)
 
     cost = paddle.layer.classification_cost(input=predictword, label=nextword)
+
     parameters = paddle.parameters.create(cost)
-    adam_optimizer = paddle.optimizer.Adam(
+    adagrad = paddle.optimizer.AdaGrad(
         learning_rate=3e-3,
         regularization=paddle.optimizer.L2Regularization(8e-4))
-    trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer)
+    trainer = paddle.trainer.SGD(cost,
+                                 parameters,
+                                 adagrad,
+                                 is_local=not cluster_train)
     trainer.train(
         paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
         num_passes=30,
diff --git a/doc/api/v1/trainer_config_helpers/layers.rst b/doc/api/v1/trainer_config_helpers/layers.rst
index 24389c2d8574dfda4bec9298776aa6b1aee51535..75c1b35246486528524fd5baa04704249d5a9275 100644
--- a/doc/api/v1/trainer_config_helpers/layers.rst
+++ b/doc/api/v1/trainer_config_helpers/layers.rst
@@ -498,6 +498,12 @@ hsigmoid
     :members: hsigmoid
     :noindex:
 
+smooth_l1_cost
+--------------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: smooth_l1_cost
+    :noindex:
+
 Check Layer 
 ============
 
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index 2a02baf17ba0d1119a8d222024616ef8ae33f8d5..154cfe24432f3e43ed724a45273b4a582b45f73d 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -419,6 +419,11 @@ hsigmoid
 ..  autoclass:: paddle.v2.layer.hsigmoid
     :noindex:
 
+smooth_l1_cost
+--------------
+..  autoclass:: paddle.v2.layer.smooth_l1_cost
+    :noindex:
+
 Check Layer 
 ============
 
diff --git a/doc/design/dist/README.md b/doc/design/cluster_train/README.md
similarity index 86%
rename from doc/design/dist/README.md
rename to doc/design/cluster_train/README.md
index 1788208bcabca30f66cb1c80e80f6b824c0d9579..b88a8f382bfdbbf113d2bb99e12caaa09f9cd8c5 100644
--- a/doc/design/dist/README.md
+++ b/doc/design/cluster_train/README.md
@@ -17,12 +17,16 @@ A training job will be created once user asks Paddle cloud to train a model. The
 
 1. the *master process*, which dispatches tasks to
 1. one or more *trainer processes*, which run distributed training and synchronize gradients/models via
-1. one or more *parameter server processes*, where each holds a shard of the global model.
+1. one or more *parameter server processes*, where each holds a shard of the global model and receives the uploaded gradients from every *trainer process*, so that it can run the optimization functions to update its parameters.
 
 Their relation is illustrated in the following graph:
 
 <img src="src/paddle-model-sharding.png"/>
 
+By coordinating these processes, PaddlePaddle supports both Synchronous Stochastic Gradient Descent (sync SGD) and Asynchronous Stochastic Gradient Descent (async SGD) to train user-defined neural network topologies.
+
+When training with sync SGD, the parameter servers wait for all trainers to finish uploading their gradients and then send the updated parameters to the trainers; training cannot proceed until the trainers have received the updated parameters. This creates a synchronization point between trainers. When training with async SGD, each trainer uploads its gradients and downloads new parameters individually, without synchronizing with other trainers. Using async SGD will be faster in terms of time per pass, but has more noise in the gradients since trainers are likely to have a stale model.
+
 ### Master Process
 
 The master process will:
@@ -31,7 +35,7 @@ The master process will:
 - Keep track of training progress on the dataset with [task queue](#task-queue). A training job will iterate on the dataset for a full pass until it goes into next pass.
 
 
-#### Task 
+#### Task
 
 A task is a data shard to be trained. The total number of tasks will be much bigger than the total number of trainers. The number of data instances inside a task will be much bigger than the mini-batch size.
 
@@ -78,7 +82,7 @@ The communication pattern between the trainers and the parameter servers depends
 - Synchronous Stochastic Gradient Descent (sync-SGD)
 
 	Parameter server will wait for all trainer finish n-th mini-batch calculation and send their gradients before broadcasting new parameters to every trainer. Every trainer will wait for the new parameters before starting n+1-th mini-batch.
-  
+
 - Asynchronous Stochastic Gradient Descent (async-SGD)
 
 	There will no synchronization between different trainers, and parameter server updates its parameter as soon as it receives new gradient:
@@ -118,8 +122,6 @@ When the master is started by the Kubernetes, it executes the following steps at
 1. Watches the trainer prefix keys `/trainer/` on etcd to find the live trainers.
 1. Starts dispatching the tasks to the trainers, and updates task queue using an etcd transaction to ensure lock is held during the update.
 
-The master process will kill itself if its etcd lease expires.
-
 When the master process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in few minutes.
 
 ### Trainer Process
@@ -132,6 +134,8 @@ When the trainer is started by the Kubernetes, it executes the following steps a
 
 If trainer's etcd lease expires, it will try set key `/trainer/<unique ID>` again so that the master process can discover the trainer again.
 
+When a trainer fails, Kubernetes will try to restart it. The recovered trainer will fetch tasks from the TODO queue and continue training.
+
 ### Parameter Server Process
 
 When the parameter server is started by Kubernetes, it executes the following steps at startup:
@@ -140,11 +144,11 @@ When the parameter server is started by Kubernetes, it executes the following st
 1. Search through etcd keys `/ps/<index>` (`/ps/0`, `/ps/1`, ...) to find the first non-existant key whose index is smaller than the total number of parameter servers. Set the key using a transaction to avoid concurrent writes. The parameter server's index is inferred from the key name.
 
 	The desired number of parameter servers is 3:
-	
+
 	<img src="src/paddle-ps-0.png"/>
-	
+
 	The third parameter server joined:
-	
+
 	<img src="src/paddle-ps-1.png"/>
 
 1. The parameter server can load parameters if there are already saved parameters in the save path (inferred from its index).
@@ -153,6 +157,13 @@ When the parameter server is started by Kubernetes, it executes the following st
 If the parameter server's etcd lease expires, the parameter server will kill itself.
 
 
+## Parameter Server Checkpointing
+See [here](./checkpointing.md)
+
+## Storing and dispatching training data
+See [here](./data_dispatch.md)
+
+
 ## Dynamic Scaling
 
 ### Trainer Scaling
diff --git a/doc/design/cluster_train/checkpointing.md b/doc/design/cluster_train/checkpointing.md
new file mode 100644
index 0000000000000000000000000000000000000000..c87ef2c7d2636208866d05456d5d44316d0bb200
--- /dev/null
+++ b/doc/design/cluster_train/checkpointing.md
@@ -0,0 +1,44 @@
+## 模型参数检查点（Checkpointing）
+模型数据检查点的实现，可以有效地避免parameter server的单点或多点同时故障。模型参数检查点通过定期向磁盘上保存一份存储在parameter server内存中的模型数据的完整镜像，来保证训练过程可以从中间状态重新启动。在一个不可中断并缺少备份的训练任务中，可以通过阶段性的保存每个parameter server的数据快照（snapshot）到 ***分布式存储服务*** 达到容灾的目的，比如每隔10分钟保存最新的快照，并删除更早的快照。在出现单点故障时，只需要恢复这台节点，或者将这台节点迁移到另一个节点并启动即可恢复训练任务。
+
+<img src="src/checkpointing.png" width="500"/>
+
+### 快照保存的设计如下：
+
+说明：
+
+* parameter server在集群中启动后，自动挂载分布式存储目录，并把快照保存到这个目录下。
+* ***注：每个parameter server的检查点各自独立保存，暂时不考虑多个parameter server同步的保存一个特定时间点的全局检查点，因为这样做也没法保证消除随机性。***
+
+检查点保存程序流程：
+
+1. 如果满足条件"每隔10分钟"时，parameter server会获取parameters内存的`read_lock`，启动一个新的线程开始保存检查点。如果已经正在执行保存检查点的线程，则忽略。由于对parameters的更新需要获取parameters内存的`write_lock`，所以在写入快照的过程中，parameter server会暂停参数更新并等待。
+2. parameter server生成一个UUID，向指定的目录中一个新的文件（文件名为此UUID）写入快照数据。在快照写入完成后，计算这个文件的MD5 sum。然后在etcd的`/checkpoints/[pserver_id]`中写入json内容：`{"uuid": [UUID], "md5": "MD5 sum", "timestamp": xxxx}`。
+3. 删除磁盘目录中不是当前uuid的快照文件。
+4. 释放对parameters内存的锁定，停止保存检查点的线程。
+
+这里需要用户额外注意，在您的实际环境中，训练任务的运行可能会占满trainer和parameter server之间的网络带宽，如果parameter server此时还需要通过网络访问分布式存储以保存快照，可能会造成网络拥塞，而出现阶段性的运行停滞。
+
+### 从快照恢复
+
+在parameter server第一次启动或任意时间parameter server故障后被Kubernetes重新启动，则需要回滚到上一个检查点：
+
+  1. 从etcd中读取节点：`/checkpoints/[pserver_id]`获取最新的检查点的文件uuid
+  1. 从磁盘文件中加载uuid文件名的检查点快照文件，并加载其中的参数
+  1. 如果上面两步出现错误，则使用启动参数定义的初始化方法初始化参数
+  1. 开始提供服务
+
+## TODO List
+### 推测执行/加速执行（TODO）
+在异构集群中，如果存在某些trainer执行速度过慢会影响整体集群的速度（如图中Trainer 1），此时master将负责启动一个新的Trainer（Accelerate Trainer 2），使用同样的训练数据block。哪个trainer先完成block的训练，则把另一个慢速的kill掉。
+
+### 动态扩容/缩容
+目前只考虑动态扩容trainer数量，可以减小系统复杂性。
+
+## 术语
+* model: 指深度学习训练之后得到的所有参数，使用这个神经网络可以完成对新数据的预测
+* parameters: 神经网络中的参数，包括权重w和偏置b。一个神经网络的模型由大量的参数组成
+* shard: 分片，通常指将一个整体拆分成多份的其中的一份。
+* model shard: 将一个神经网络参数拆分成多份，每个shard分别存储在其中一台parameter server之上
+* parameter block: 多个parameter block构成一个model shard
+* 单点故障: 任意时刻只可能同时有一台服务器故障。由于集群中同时存在两台机器故障的概率极低（（平均故障率*平均故障修复时间）^2），只对特殊在线系统考虑两台以上同时故障的容灾。
diff --git a/doc/design/cluster_train/data_dispatch.md b/doc/design/cluster_train/data_dispatch.md
new file mode 100644
index 0000000000000000000000000000000000000000..a3eb4e28db0782bbf88520d25023cf031e620a63
--- /dev/null
+++ b/doc/design/cluster_train/data_dispatch.md
@@ -0,0 +1,120 @@
+## 训练数据的存储和分发
+
+### 流程介绍
+生产环境中的训练数据集通常体积很大，并被存储在诸如Hadoop HDFS，Ceph，AWS S3之类的分布式存储之上。这些分布式存储服务通常会把数据切割成多个分片分布式的存储在多个节点之上。这样就可以在云端执行多种数据类计算任务，包括：
+
+* 数据预处理任务
+* Paddle训练任务
+* 在线模型预测服务
+
+<img src="src/paddle-cloud-in-data-center.png" width="500"/>
+
+在上图中显示了在一个实际生产环境中的应用（人脸识别）的数据流图。生产环境的日志数据会通过实时流的方式（Kafka）和离线数据的方式（HDFS）存储，并在集群中运行多个分布式数据处理任务，比如流式数据处理（online data process），离线批处理（offline data process）完成数据的预处理，提供给paddle作为训练数据。用户也可以上传labeled data到分布式存储补充训练数据。在paddle之上运行的深度学习训练输出的模型会提供给在线人脸识别的应用使用。
+
+### 训练数据的存储
+
+选择CephFS作为训练数据的存储服务。
+
+在Kubernetes上运行的不同的计算框架，可以通过Volume或PersistentVolume挂载存储空间到每个容器中。
+
+在CephFS存储系统中的公开目录，需要保存一些预置的公开数据集（比如MNIST, BOW, ImageNet数据集等），并且可以被提交的job直接使用。
+
+### 文件预处理
+
+在数据集可以被训练之前，文件需要预先被转换成PaddlePaddle集群内部的存储格式（SSTable）。我们提供两个转换方式：
+
+- 提供给用户本地转换的库，用户可以编写程序完成转换。
+- 用户可以上传自己的数据集，在集群运行MapReduce job完成转换。
+
+转换生成的文件名会是以下格式：
+
+```text
+name_prefix-aaaaa-of-bbbbb
+```
+
+"aaaaa"和"bbbbb"都是五位的数字，每一个文件是数据集的一个shard，"aaaaa"代表shard的index，"bbbbb"代表这个shard的最大index。
+
+比如ImageNet这个数据集可能被分成1000个shard，它们的文件名是：
+```text
+imagenet-00000-of-00999
+imagenet-00001-of-00999
+...
+imagenet-00999-of-00999
+```
+
+#### 转换库
+
+无论是在本地或是云端转换，我们都提供Python的转换库，接口是：
+```python
+def convert(output_path, reader, num_shards, name_prefix)
+```
+
+- `output_path`: directory in which output files will be saved.
+- `reader`: a [data reader](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md#data-reader-interface), from which the convert program will read data instances.
+- `num_shards`: the number of shards that the dataset will be partitioned into.
+- `name_prefix`: the name prefix of generated files.
+
+`reader`每次输出一个data instance，这个instance可以是单个值，或者用tuple表示的多个值：
+
+```python
+yield 1 # 单个值
+yield numpy.random.uniform(-1, 1, size=28*28) # 单个值
+yield numpy.random.uniform(-1, 1, size=28*28), 0 # 多个值
+```
+
+每个值的类型可以是整型、浮点型数据、字符串，或者由它们组成的list，以及numpy.ndarray。如果是其它类型，会被Pickle序列化成字符串。
+
+### 示例程序
+
+#### 使用转换库
+
+以下`reader_creator`生成的`reader`每次输出一个data instance，每个data instance包含两个值：numpy.ndarray类型的值和整型的值：
+```python
+def reader_creator():
+	def reader():
+		for i in range(1000):
+			yield numpy.random.uniform(-1, 1, size=28*28), 0 # 多个值
+	return reader
+```
+
+把`reader_creator`生成的`reader`传入`convert`函数即可完成转换：
+```python
+convert("./", reader_creator(), 100, "random_images")
+```
+
+以上命令会在当前目录下生成100个文件：
+```text
+random_images-00000-of-00099
+random_images-00001-of-00099
+...
+random_images-00099-of-00099
+```
+
+#### 进行训练
+
+PaddlePaddle提供专用的[data reader creator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md#python-data-reader-design-doc)，生成给定SSTable文件对应的data reader。**无论在本地还是在云端，reader的使用方式都是一致的**：
+
+```python
+# ...
+reader = paddle.reader.creator.SSTable("/home/random_images-*-of-*")
+batch_reader = paddle.batch(reader, 128)
+trainer.train(batch_reader, ...)
+```
+
+以上代码的reader输出的data instance与生成数据集时，reader输出的data instance是一模一样的。
+
+### 上传训练文件
+
+使用下面命令，可以把本地的数据上传到存储集群中。
+
+```bash
+paddle cp filenames pfs://home/folder/
+```
+
+比如，把之前示例中转换完毕的random_images数据集上传到云端的`/home/`可以用以下指令：
+```bash
+paddle cp random_images-*-of-* pfs://home/
+```
+## TODO
+
+### 支持用户自定义的数据预处理job
diff --git a/doc/design/cluster_train/src/checkpointing.png b/doc/design/cluster_train/src/checkpointing.png
new file mode 100644
index 0000000000000000000000000000000000000000..c221e8474f90f37e31416cbb19c9452207a0d14c
Binary files /dev/null and b/doc/design/cluster_train/src/checkpointing.png differ
diff --git a/doc/design/cluster_train/src/data_dispatch.png b/doc/design/cluster_train/src/data_dispatch.png
new file mode 100644
index 0000000000000000000000000000000000000000..5bdcc24d6a6d193cb014f8c38b362451fded5e54
Binary files /dev/null and b/doc/design/cluster_train/src/data_dispatch.png differ
diff --git a/doc/design/cluster_train/src/paddle-cloud-in-data-center.png b/doc/design/cluster_train/src/paddle-cloud-in-data-center.png
new file mode 100644
index 0000000000000000000000000000000000000000..da5d1a77562480ad1d886f5f21dbd84001d3d508
Binary files /dev/null and b/doc/design/cluster_train/src/paddle-cloud-in-data-center.png differ
diff --git a/doc/design/dist/src/paddle-etcd.graffle b/doc/design/cluster_train/src/paddle-etcd.graffle
similarity index 100%
rename from doc/design/dist/src/paddle-etcd.graffle
rename to doc/design/cluster_train/src/paddle-etcd.graffle
diff --git a/doc/design/dist/src/paddle-etcd.png b/doc/design/cluster_train/src/paddle-etcd.png
similarity index 100%
rename from doc/design/dist/src/paddle-etcd.png
rename to doc/design/cluster_train/src/paddle-etcd.png
diff --git a/doc/design/dist/src/paddle-model-sharding.graffle b/doc/design/cluster_train/src/paddle-model-sharding.graffle
similarity index 100%
rename from doc/design/dist/src/paddle-model-sharding.graffle
rename to doc/design/cluster_train/src/paddle-model-sharding.graffle
diff --git a/doc/design/dist/src/paddle-model-sharding.png b/doc/design/cluster_train/src/paddle-model-sharding.png
similarity index 100%
rename from doc/design/dist/src/paddle-model-sharding.png
rename to doc/design/cluster_train/src/paddle-model-sharding.png
diff --git a/doc/design/dist/src/paddle-ps-0.png b/doc/design/cluster_train/src/paddle-ps-0.png
similarity index 100%
rename from doc/design/dist/src/paddle-ps-0.png
rename to doc/design/cluster_train/src/paddle-ps-0.png
diff --git a/doc/design/dist/src/paddle-ps-1.png b/doc/design/cluster_train/src/paddle-ps-1.png
similarity index 100%
rename from doc/design/dist/src/paddle-ps-1.png
rename to doc/design/cluster_train/src/paddle-ps-1.png
diff --git a/doc/design/dist/src/paddle-ps.graffle b/doc/design/cluster_train/src/paddle-ps.graffle
similarity index 100%
rename from doc/design/dist/src/paddle-ps.graffle
rename to doc/design/cluster_train/src/paddle-ps.graffle
diff --git a/doc/design/dist/src/paddle-task-queues.graffle b/doc/design/cluster_train/src/paddle-task-queues.graffle
similarity index 100%
rename from doc/design/dist/src/paddle-task-queues.graffle
rename to doc/design/cluster_train/src/paddle-task-queues.graffle
diff --git a/doc/design/dist/src/paddle-task-queues.png b/doc/design/cluster_train/src/paddle-task-queues.png
similarity index 100%
rename from doc/design/dist/src/paddle-task-queues.png
rename to doc/design/cluster_train/src/paddle-task-queues.png
diff --git a/doc/design/dist/src/paddle-task-states.graffle b/doc/design/cluster_train/src/paddle-task-states.graffle
similarity index 100%
rename from doc/design/dist/src/paddle-task-states.graffle
rename to doc/design/cluster_train/src/paddle-task-states.graffle
diff --git a/doc/design/dist/src/paddle-task-states.png b/doc/design/cluster_train/src/paddle-task-states.png
similarity index 100%
rename from doc/design/dist/src/paddle-task-states.png
rename to doc/design/cluster_train/src/paddle-task-states.png
diff --git a/doc/design/cluster_train/src/trainer.graffle b/doc/design/cluster_train/src/trainer.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..42384a3f059966e22e22f5fa4295cc9ead5cef83
Binary files /dev/null and b/doc/design/cluster_train/src/trainer.graffle differ
diff --git a/doc/design/cluster_train/src/trainer.png b/doc/design/cluster_train/src/trainer.png
new file mode 100644
index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0
Binary files /dev/null and b/doc/design/cluster_train/src/trainer.png differ
diff --git a/doc/design/images/replica.png b/doc/design/images/replica.png
new file mode 100644
index 0000000000000000000000000000000000000000..ef59e56b01d792a059279e6bb9a29f3db6a59a41
Binary files /dev/null and b/doc/design/images/replica.png differ
diff --git a/doc/design/images/two_phase_commit.png b/doc/design/images/two_phase_commit.png
new file mode 100644
index 0000000000000000000000000000000000000000..ef6f7317bd440cc7d9fe08fcbbf2b7a542f99049
Binary files /dev/null and b/doc/design/images/two_phase_commit.png differ
diff --git a/doc/design/multi_language_interface/why_plain_c.md b/doc/design/multi_language_interface/00.why_plain_c.md
similarity index 94%
rename from doc/design/multi_language_interface/why_plain_c.md
rename to doc/design/multi_language_interface/00.why_plain_c.md
index a3f41ca7b93de8a55d927c88812802ef12246182..a1443093342c5a3ed698fb6b52a751dfc7cb5319 100644
--- a/doc/design/multi_language_interface/why_plain_c.md
+++ b/doc/design/multi_language_interface/00.why_plain_c.md
@@ -58,32 +58,32 @@ typedef void* paddle_matrix;
 typedef int paddle_error;
 
 extern "C"
-paddle_error paddle_matrix_shape(paddle_matrix matrix,
-                                 uint64_t* width,
-                                 uint64_t* height);
+paddle_error paddle_matrix_get_shape(paddle_matrix matrix,
+                                     uint64_t* width,
+                                     uint64_t* height);
 ```
 而在CPP里面实现这个C的接口，文件 `paddle_matrix.cpp`
 
 ```cpp
-#include "paddle/math/matrix.hpp"
+#include "paddle/math/matrix.h"
 extern "C"
 paddle_error paddle_matrix_shape(paddle_matrix matrix,
                                  uint64_t *width,
                                  uint64_t *height) {
-  auto m = (paddle::math::matrix*)(matrix);
+  auto m = (paddle::capi::CMatrix*)(matrix);
   *width = m->width();
   *height = m->height();
 }
 ```
 
-其中`paddle/math/matrix.hpp`文件内容为:
+其中`paddle/capi/CMatrix.hpp`文件内容为:
 
 ```cpp
 namespace paddle {
 namespace math {  
 
-class Matrix {
-  //...
+class CMatrix {
+  std::shared_ptr<paddle::Matrix> mat;
 };
 
 }  // namespace math
@@ -113,6 +113,6 @@ class Matrix {
 | 手写多语言绑定 | 不使用SWIG | 使用SWIG需要多语言绑定的开发人员熟练掌握SWIG配置，社区参与困难。SWIG生成的代码不能保证多语言代码风格的一致性 |
 
 
-## 简单实现
+## 实现
 
-TBD
+参考[Inference implementation](01.inference_implementation.md)
diff --git a/doc/design/multi_language_interface/01.inference_implementation.md b/doc/design/multi_language_interface/01.inference_implementation.md
new file mode 100644
index 0000000000000000000000000000000000000000..9820284523246a062581f322616d196f575c9d29
--- /dev/null
+++ b/doc/design/multi_language_interface/01.inference_implementation.md
@@ -0,0 +1,131 @@
+# C-API 模型推断实现文档
+
+本文档描述Paddle C-API的实现细节。Paddle C-API是多语言API的基础部分。Paddle需要暴露的API很多。先实现模型推断的API，通过模型推断API的实现作为一个样例，来进行讨论。至于为什么需要C-API，请参考[Why Plain C](./00.why_plain_c.md)。
+
+## Table of Contents
+   * [C-API 模型推断实现文档](#c-api-模型推断实现文档)
+      * [暴露接口原则](#暴露接口原则)
+      * [目录结构](#目录结构)
+      * [实现方式](#实现方式)
+         * [capi.h](#capih)
+         * [具体某种类型的头文件](#具体某种类型的头文件)
+         * [capi_private.h](#capi_privateh)
+         * [具体某种类型的实现文件](#具体某种类型的实现文件)
+         * [libpaddle_capi_shared.{so, dylib}](#libpaddle_capi_sharedso-dylib)
+         * [libpaddle_capi_whole.a](#libpaddle_capi_wholea)
+         * [examples](#examples)
+      * [编译选项](#编译选项)
+
+
+## 暴露接口原则
+
+1. 所有的接口均为C接口。即使用`extern "C"`
+2. 除构造某种类型的函数(`paddle_matrix_create`等)，其他函数均返回`paddle_error`。且调用时不能抛出异常或出现运行时错误。
+3. 所有类型名为`paddle_类型名`，所有与类型相关的函数，函数名为`paddle_类型名_函数名`
+4. 如果某一个Paddle Core概念(GradientMachine/Matrix)需要被暴露到其他语言，那么
+	* 为了暴露的接口尽量简单。只暴露概念的接口，而不暴露概念的实现。即暴露`GradientMachine`或者`Matrix`但不暴露`RecurrentGradientMachine`和`CpuSparseMatrix`。
+	* 暴露这个概念的必要函数。`必要`是指完成某一个任务所需的最少函数。
+5. 不在`capi`接口层做过多封装。
+	* 如果某一个Paddle概念必须要暴露，但是又过于琐碎。不在`capi`这一层进行封装，而是直接修改Paddle Core。让Paddle核心中，这一概念不再琐碎。
+
+
+## 目录结构
+
+```text
+Paddle
+  `-- paddle
+        `-- capi
+              `-- examples  # The example project for C-API.
+              `-- tests  # unittests for C-API
+              `-- capi.h  # C-API header file.
+              `-- capi_private.h  # The shared header file between implementation sources.
+              `-- matrix.{h, cpp}
+              `-- gradient_machine.{h, cpp}
+              `-- ...
+```
+
+
+Paddle的C-API目录结构如上图表所示。这个目录中除了`capi_private.h`之外的所有头文件，均会被安装到include/paddle路径下。C-API生成的二进制文件会被安装到`lib`目录下。即，安装后的目录结构为
+
+```text
+`-- include
+      `-- paddle
+             `-- capi.h
+             `-- matrix.h
+             `-- gradient_machine.h
+             `-- ...
+`-- lib
+     `-- libpaddle_capi_shared.{so, dylib}  # On macOS, the dynamic library's file name extension is `dylib`
+     `-- libpaddle_capi_whole.a  # static library for all symbols of Paddle.
+```
+
+## 实现方式
+
+下面分别介绍某一类文件的实现方式。
+
+### capi.h
+
+`capi.h`是用户使用C-API时所唯一需要引入的头文件。在`capi.h`中，引入了类型的头文件，`matrix.h`, `gradient_machine.h`。在引入其他类型的头文件时，使用相对路径的引用方式。即`#include "matrix.h"`
+
+### 具体某种类型的头文件
+
+具体某种类型的头文件，即例如`matrix.h`，`gradient_machine.h`等。在这些头文件中，包含了某种类型的类型定义和暴露的全部函数。
+
+这个头文件不假设其他文件的引用顺序，即使用户直接引用某种类型的头文件，也不应该报错(虽然不鼓励这样)。如果某一个类型需要引用另一个类型，例如`gradient_machine`需要引用`matrix`，则直接引入另一种类型的头文件，即`#include "matrix.h"`。
+
+### capi_private.h
+
+`capi_private.h`是各个实现中共享的头文件，它主要包含了实际暴露的类型结构。在用户使用C-API时，Paddle的类型全部退化成`void *`，即`typedef void* paddle_matrix`。但，对于每种C-API暴露的类型，均是在`capi_private.h`中实现的结构体。
+
+```cpp
+struct CMatrix {
+   int type = MatrixType;
+   std::shared_ptr<paddle::Matrix> mat;
+};
+```
+
+通常，这个结构体包含两个项目。
+
+* `type`是一个类型的标志。对于每种类型，type字段均不尽相同。这样，即使C-API接受的类型全是`void *`，我们也可以确定每一个参数的类型。
+
+  ```cpp
+  void some_c_api_function(void* some_instance) {
+     int* type = (int *) some_instance;
+     switch (*type) {
+       case MatrixType:
+         CMatrix* mat = (CMatrix *) some_instance;
+         ...
+       ...
+     }
+  }
+  ```
+* 这个结构体中的另一个项目是，Paddle Core中这一类型接口的智能指针(shared_ptr)。
+	* 使用智能指针的原因是: 用户可以安全的释放某个C-API的实例，而不必在意Paddle Core是否还在使用这个实例。
+	* 例如，用户通过C-API获得了神经网络的参数实例。当用户使用完这个参数后，直接删除这个参数即可。即便Paddle Core中的模型还在使用这个参数，这个参数也不会一并删除。
+
+### 具体某种类型的实现文件
+
+具体某种类型的实现文件，即`matrix.cpp`, `gradient_machine.cpp`等文件。在这些文件中，使用C++ 11实现了C-API的接口，并且使用`extern "C"`导出这些接口。在实现过程中，对输入参数的安全性进行了必要的判断，并将C-API接口的参数转发给`Paddle Core`。
+
+### libpaddle\_capi_shared.{so, dylib}
+
+`libpaddle_capi_shared`是C-API导出的动态库。这个动态库的连接参数与Paddle的其他二进制(例如`paddle_trainer`)类似。用户可以直接使用这个动态库来引入Paddle C-API。具体使用方法为`-lpaddle_capi_shared`。
+
+### libpaddle\_capi_whole.a
+
+`libpaddle_capi_whole`是C-API导出的静态库。这个静态库包含了Paddle的全部符号。他是将`libpaddle_gserver.a`, `libpaddle_math.a`, `libpaddle_capi.a`等全部静态库中的目标文件全部打包后产生的文件。具体使用方法为`--whole-archive -lpaddle_capi_whole --no-whole-archive`。
+
+
+### examples
+
+在样例中，使用`C99`开发了模型预测的样例代码。具体请参考[example/README.md](../../../paddle/capi/examples/README.md)。
+
+## 编译选项
+
+C-API的编译选项默认关闭，打开这个编译选项，需要在cmake的时候，设置
+
+```bash
+cmake ${YOUR_SOURCE_ROOT} -DWITH_C_API=ON -DWITH_PYTHON=OFF -DWITH_SWIG_PY=OFF
+```
+
+编译C-API的时候推荐Paddle不嵌入Python解释器，也不生成`SWIG`接口，具体原因参考[Why Plain C](./00.why_plain_c.md)。
diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md
new file mode 100644
index 0000000000000000000000000000000000000000..3692a5248a355cfcfd1cfd0911d43d65166921b1
--- /dev/null
+++ b/doc/design/releasing_process.md
@@ -0,0 +1,58 @@
+# Paddle发行规范
+
+Paddle使用git-flow branching model做分支管理，使用[Semantic Versioning](http://semver.org/)标准表示Paddle版本号。
+
+Paddle每次发新的版本，遵循以下流程:
+
+1. 从`develop`分支派生出新的分支，分支名为`release/版本号`。例如，`release/0.10.0`
+2. 将新分支的版本打上tag，tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`，第二个为`0.10.0rc2`，依次类推。
+3. 对这个版本的提交，做如下几个操作:
+	* 编译这个版本的Docker发行镜像，发布到dockerhub。如果失败，修复Docker编译镜像问题，Patch号加一，返回第二步
+	* 编译这个版本的Ubuntu Deb包。如果失败，修复Ubuntu Deb包编译问题，Patch号加一，返回第二步。
+	* 使用Regression Test List作为检查列表，测试Docker镜像/ubuntu安装包的功能正确性
+		* 如果失败，记录下所有失败的例子，在这个`release/版本号`分支中，修复所有bug后，Patch号加一，返回第二步
+4. 第三步完成后，将`release/版本号`分支合入master分支。将master分支的合入commit打上tag，tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
+5. 编译master分支的Docker发行镜像，发布到dockerhub。编译ubuntu的deb包，发布到github release页面
+6. 协同完成Release Note的书写
+
+
+需要注意的是:
+
+* `release/版本号`分支一旦建立，一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭，方便测试人员测试Paddle的行为。
+* 在`release/版本号`分支存在的时候，如果有bugfix的行为，需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。
+
+# Paddle 分支规范
+
+Paddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，并适应github的特性做了一些区别。
+
+* Paddle的主版本库遵循[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范。其中:
+	* `master`分支为稳定(stable branch)版本分支。每一个`master`分支的版本都是经过单元测试和回归测试的版本。
+	* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试，但并没有经过回归测试。
+	* `release/版本号`分支为每一次Release时建立的临时分支。在这个阶段的代码正在经历回归测试。
+
+* 其他用户的fork版本库并不需要严格遵守[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，但所有fork的版本库的所有分支都相当于特性分支。
+	* 建议，开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支
+	* 建议，开发者fork的版本库中，再基于`develop`版本fork出自己的功能分支。
+	* 当功能分支开发完毕后，向Paddle的主版本库提交`Pull Request`，进而进行代码评审。
+		* 在评审过程中，开发者修改自己的代码，可以继续在自己的功能分支提交代码。 
+
+* BugFix分支也是在开发者自己的fork版本库维护，与功能分支不同的是，BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支，同时提起`Pull Request`。
+
+# Paddle回归测试列表
+
+本列表说明Paddle发版之前需要测试的功能点。
+
+## Paddle Book中所有章节
+
+Paddle每次发版本首先要保证Paddle Book中所有章节功能的正确性。功能的正确性包括验证Paddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
+
+| | 新手入门章节 | 识别数字 | 图像分类 | 词向量 | 情感分析 | 语义角色标注 | 机器翻译 | 个性化推荐 |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| API.V2 + Docker + GPU  |  |  |  |  |  |  |  |  |
+| API.V2 + Docker + CPU  |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Docker + GPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Docker + CPU |  |  |  |  |  |  |  |  |
+| API.V2 + Ubuntu + GPU |  |  |  |  |  |  |  |  |
+| API.V2 + Ubuntu + CPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Ubuntu + GPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Ubuntu + CPU |  |  |  |  |  |  |  |  |
diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst
index 5051a892304fdc8b0f1a19a7d4560d5ee007c47d..d536f53abc031e9d279ace0e231a381a2f1e81b6 100644
--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -8,7 +8,8 @@ PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两
 如何构建PaddlePaddle的文档
 ==========================
 
-PaddlePaddle的文档构建有直接构建和基于Docker构建两种方式。构建PaddlePaddle文档需要准备的环境相对较复杂，所以我们推荐使用基于Docker来构建PaddlePaddle的文档。
+PaddlePaddle的文档构建有直接构建和基于Docker构建两种方式，我们提供了一个构建脚本build_docs.sh来进行构建。
+构建PaddlePaddle文档需要准备的环境相对较复杂，所以我们推荐使用基于Docker来构建PaddlePaddle的文档。
 
 
 使用Docker构建PaddlePaddle的文档
@@ -16,39 +17,62 @@ PaddlePaddle的文档构建有直接构建和基于Docker构建两种方式。
 
 使用Docker构建PaddlePaddle的文档，需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。安装好Docker之后可以使用源码目录下的脚本构建文档，即
 
-..	code-block:: bash
+..  code-block:: bash
 
-	cd TO_YOUR_PADDLE_CLONE_PATH
-	cd paddle/scripts/tools/build_docs
-	bash build_docs.sh
+    cd TO_YOUR_PADDLE_CLONE_PATH
+    cd paddle/scripts/tools/build_docs
+    bash build_docs.sh with_docker
 
-编译完成后，该目录下会生成如下两个子目录\:
+编译完成后，会在当前目录生成两个子目录\:
 
 * doc 英文文档目录
 * doc_cn 中文文档目录
 
 打开浏览器访问对应目录下的index.html即可访问本地文档。
 
-..	code-block:: bash
-
-	open doc_cn/index.html
 
 
 直接构建PaddlePaddle的文档
 --------------------------
 
-TBD
+因为PaddlePaddle的v2 api文档生成过程依赖于py_paddle Python包，用户需要首先确认py_paddle包已经安装。
+
+..  code-block:: bash
+
+    python -c "import py_paddle"
+
+如果提示错误，那么用户需要在本地编译安装PaddlePaddle，请参考 `源码编译文档 <http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html>`_ 。
+注意，用户在首次编译安装PaddlePaddle时，请将WITH_DOC选项关闭。在编译安装正确之后，请再次确认py_paddle包已经安装，即可进行下一步操作。
+
+如果提示正确，可以执行以下命令编译生成文档，即
+
+..  code-block:: bash
+
+    cd TO_YOUR_PADDLE_CLONE_PATH
+    cd paddle/scripts/tools/build_docs
+    bash build_docs.sh local
+
+编译完成之后，会在当前目录生成两个子目录\:
+
+* doc 英文文档目录
+* doc_cn 中文文档目录
+
+打开浏览器访问对应目录下的index.html即可访问本地文档。
+
 
 如何书写PaddlePaddle的文档
 ==========================
 
-TBD
+PaddlePaddle文档使用 `sphinx`_ 自动生成，用户可以参考sphinx教程进行书写。
 
 如何更新www.paddlepaddle.org文档
 ================================
 
-TBD
+开发者给PaddlePaddle代码增加的注释以PR的形式提交到github中，提交方式可参见 `贡献文档 <http://paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_ 。
+目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://www.paddlepaddle.org/develop/doc_cn/>`_ 和
+`英文文档 <http://www.paddlepaddle.org/develop/doc/>`_ 。
+
 
 
-..	_cmake: https://cmake.org/
-..	_sphinx: http://www.sphinx-doc.org/en/1.4.8/
+..  _cmake: https://cmake.org/
+..  _sphinx: http://www.sphinx-doc.org/en/1.4.8/
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 9d6d67e62c106b2298ce1ebae5633d03bba1e684..eff296bcb0174c71e05ee169c85236343a3e1164 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -9,6 +9,10 @@ add_subdirectory(pserver)
 add_subdirectory(trainer)
 add_subdirectory(scripts)
 
+if(WITH_C_API)
+    add_subdirectory(capi)
+endif()
+
 if(WITH_SWIG_PY)
   configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
           ${CMAKE_CURRENT_SOURCE_DIR}/setup.py)
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index c4f5dca26cc6a5e9fdd23ee27b594ced29a25c7a..d51204012171c9887acd5f578f913143182efe36 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 #include <vector>
+#include "paddle/gserver/gradientmachines/GradientMachine.h"
 #include "paddle/utils/Common.h"
 #include "paddle/utils/GlobalConstants.h"
 
@@ -468,8 +469,10 @@ private:
 };
 
 enum GradientMatchineCreateMode {
-  CREATE_MODE_NORMAL = 0,
-  CREATE_MODE_TESTING = 4
+  CREATE_MODE_NORMAL = paddle::GradientMachine::kNormal,
+  CREATE_MODE_SGD_SPARSE_CPU_TRAINING =
+      paddle::GradientMachine::kSgdSparseCpuTraining,
+  CREATE_MODE_TESTING = paddle::GradientMachine::kTesting
 };
 
 struct ParameterConfigPrivate;
@@ -817,7 +820,8 @@ private:
 public:
   static ParameterUpdater* createLocalUpdater(OptimizationConfig* config);
   static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
-                                               int passCount);
+                                               int passCount,
+                                               bool useSparseUpdater);
   ~ParameterUpdater();
 
   /**
@@ -855,6 +859,13 @@ public:
    */
   void update(Parameter* param);
 
+  /**
+   * @brief only get required sparse rows by default.
+   * @param fullSize: get full matrix parameter if *fullSize* set
+   * @param apply: get PARAMETER_APPLY on pserver if *apply* set
+   */
+  void getParametersRemote(bool fullSize = false, bool apply = false);
+
   /**
    * @brief restore the average parameter.
    * @note It is only used in AverageOptimizer. Restore will get the current
diff --git a/paddle/api/ParameterUpdater.cpp b/paddle/api/ParameterUpdater.cpp
index 75b0ae7cb6cc8c9ad0f8fe69963b7439a44bf55e..79921ea6e787f3c0ebecaad6a9a54bac92211320 100644
--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
@@ -29,10 +29,22 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater(
 }
 
 ParameterUpdater *ParameterUpdater::createRemoteUpdater(
-    OptimizationConfig *config, int passCount) {
+    OptimizationConfig *config, int passCount, bool useSparseUpdater) {
   auto updater = new ParameterUpdater();
-  updater->m->updater.reset(new paddle::RemoteParameterUpdater(
-      config->m->getConfig(), passCount, nullptr));
+  auto remoteUpdater = new paddle::RemoteParameterUpdater(
+      config->m->getConfig(), passCount, nullptr);
+  if (useSparseUpdater) {
+    std::unique_ptr<paddle::ParameterUpdater> remoteUpdaterPtr(remoteUpdater);
+    auto sparseRemoteUpdater =
+        new paddle::SparseRemoteParameterUpdaterComposite(
+            config->m->getConfig(),
+            passCount,
+            false,
+            std::move(remoteUpdaterPtr));
+    updater->m->updater.reset(sparseRemoteUpdater);
+  } else {
+    updater->m->updater.reset(remoteUpdater);
+  }
   return updater;
 }
 
@@ -59,6 +71,10 @@ void ParameterUpdater::update(Parameter *param) {
   m->updater->update(paddleParam);
 }
 
+void ParameterUpdater::getParametersRemote(bool fullSize, bool apply) {
+  m->updater->getParametersRemote(fullSize, apply);
+}
+
 void ParameterUpdater::restore() { m->updater->restore(); }
 
 void ParameterUpdater::apply() { m->updater->apply(); }
diff --git a/paddle/capi/Arguments.cpp b/paddle/capi/Arguments.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8b81ec69e60399af86f055d2258276ac06e0b13a
--- /dev/null
+++ b/paddle/capi/Arguments.cpp
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "arguments.h"
+#include "capi_private.h"
+
+using paddle::capi::cast;
+
+#define castArg(v) cast<paddle::capi::CArguments>(v)
+#define castIVec(v) cast<paddle::capi::CIVector>(v)
+
+extern "C" {
+paddle_arguments paddle_arguments_create_none() {
+  return new paddle::capi::CArguments();
+}
+
+paddle_error paddle_arguments_destroy(paddle_arguments args) {
+  if (args == nullptr) return kPD_NULLPTR;
+  delete castArg(args);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_get_size(paddle_arguments args, uint64_t* size) {
+  if (args == nullptr || size == nullptr) return kPD_NULLPTR;
+  *size = castArg(args)->args.size();
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_resize(paddle_arguments args, uint64_t size) {
+  if (args == nullptr) return kPD_NULLPTR;
+  castArg(args)->args.resize(size);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_set_value(paddle_arguments args,
+                                        uint64_t ID,
+                                        paddle_matrix mat) {
+  if (args == nullptr || mat == nullptr) return kPD_NULLPTR;
+  auto m = paddle::capi::cast<paddle::capi::CMatrix>(mat);
+  if (m->mat == nullptr) return kPD_NULLPTR;
+  auto a = castArg(args);
+  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
+  a->args[ID].value = m->mat;
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_get_value(paddle_arguments args,
+                                        uint64_t ID,
+                                        paddle_matrix mat) {
+  if (args == nullptr || mat == nullptr) return kPD_NULLPTR;
+  auto m = paddle::capi::cast<paddle::capi::CMatrix>(mat);
+  auto a = castArg(args);
+  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
+  m->mat = a->args[ID].value;
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_get_ids(paddle_arguments args,
+                                      uint64_t ID,
+                                      paddle_ivector ids) {
+  if (args == nullptr || ids == nullptr) return kPD_NULLPTR;
+  auto iv = castIVec(ids);
+  auto a = castArg(args);
+  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
+  iv->vec = a->args[ID].ids;
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_set_ids(paddle_arguments args,
+                                      uint64_t ID,
+                                      paddle_ivector ids) {
+  //! TODO(lizhao): Complete this method.
+  if (args == nullptr || ids == nullptr) return kPD_NULLPTR;
+  auto iv = paddle::capi::cast<paddle::capi::CIVector>(ids);
+  if (iv->vec == nullptr) return kPD_NULLPTR;
+  auto a = castArg(args);
+  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
+  a->args[ID].ids = iv->vec;
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_set_sequence_start_pos(paddle_arguments args,
+                                                     uint64_t ID,
+                                                     uint32_t nestedLevel,
+                                                     paddle_ivector seqPos) {
+  if (args == nullptr || seqPos == nullptr) return kPD_NULLPTR;
+  auto iv = paddle::capi::cast<paddle::capi::CIVector>(seqPos);
+  if (iv->vec == nullptr) return kPD_NULLPTR;
+  auto a = castArg(args);
+  return a->accessSeqPos(ID, nestedLevel, [&iv](paddle::ICpuGpuVectorPtr& ptr) {
+    ptr = std::make_shared<paddle::ICpuGpuVector>(iv->vec);
+  });
+}
+
+paddle_error paddle_arguments_get_sequence_start_pos(paddle_arguments args,
+                                                     uint64_t ID,
+                                                     uint32_t nestedLevel,
+                                                     paddle_ivector seqPos) {
+  // Reads the sequence start positions selected by (ID, nestedLevel) into the
+  // seqPos handle. Returns kPD_NULLPTR for null handles, or whatever error
+  // accessSeqPos reports for the given ID / nestedLevel.
+  if (args == nullptr || seqPos == nullptr) return kPD_NULLPTR;
+  auto iv = paddle::capi::cast<paddle::capi::CIVector>(seqPos);
+  auto a = castArg(args);
+  // The position pointer handed to the callback may be null (e.g. when no
+  // start positions were ever set); report that as kPD_NULLPTR instead of
+  // dereferencing it.
+  bool unset = false;
+  paddle_error err = a->accessSeqPos(
+      ID, nestedLevel, [&iv, &unset](paddle::ICpuGpuVectorPtr& ptr) {
+        if (ptr == nullptr) {
+          unset = true;
+          return;
+        }
+        iv->vec = ptr->getMutableVector(false);
+      });
+  if (err != kPD_NO_ERROR) return err;
+  return unset ? kPD_NULLPTR : kPD_NO_ERROR;
+}
+}
diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1b52a79cebb1210b09fc9f30282bfd799a35dcf9
--- /dev/null
+++ b/paddle/capi/CMakeLists.txt
@@ -0,0 +1,73 @@
+if (WITH_DOUBLE)
+  set(PADDLE_FLOAT_TYPE double)
+else ()
+  set(PADDLE_FLOAT_TYPE float)
+endif()
+
+# config.h is generated for the C-API. It records Paddle's build configuration
+# as a header, so users who include the C-API headers get the matching
+# configuration without explicitly passing -DPADDLE_WITH_DOUBLE when building
+# their own libraries.
+configure_file(config.h.in config.h @ONLY)
+
+# Public C-API headers (installed below). file(GLOB) returns absolute paths, so
+# capi_private.h is named absolutely too; otherwise REMOVE_ITEM matches nothing.
+file(GLOB CAPI_HEADERS *.h)
+set(CAPI_PRIVATE_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/capi_private.h")
+list(REMOVE_ITEM CAPI_HEADERS ${CAPI_PRIVATE_HEADER})
+file(GLOB CAPI_SOURCES *.cpp)
+
+# building paddle_capi
+add_library(paddle_capi STATIC ${CAPI_HEADERS} ${CAPI_PRIVATE_HEADER}
+  ${CAPI_SOURCES})
+
+target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
+
+add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADERS}
+  ${CAPI_PRIVATE_HEADER})  # CAPI_HEADERS (plural) is the list globbed above
+
+add_dependencies(paddle_capi gen_proto_cpp)
+
+
+# combine all paddle static libraries together, into libpaddle_capi_whole.a
+# user should use PaddleCAPI as -lpaddle_capi_whole
+set(capi_whole_library libpaddle_capi_whole.a)
+add_custom_target(paddle_capi_whole ALL
+        COMMAND mkdir -p o_files/capi && cd o_files/capi/ && ar -x $<TARGET_FILE:paddle_capi>
+        COMMAND mkdir -p o_files/utils && cd o_files/utils/ && ar -x $<TARGET_FILE:paddle_utils>
+        COMMAND mkdir -p o_files/parameter && cd o_files/parameter/ && ar -x $<TARGET_FILE:paddle_parameter>
+        COMMAND mkdir -p o_files/math && cd o_files/math/  && ar -x $<TARGET_FILE:paddle_math>
+        COMMAND mkdir -p o_files/cuda && cd o_files/cuda/ && ar -x $<TARGET_FILE:paddle_cuda>
+        COMMAND mkdir -p o_files/function && cd o_files/function/ && ar -x $<TARGET_FILE:paddle_function>
+        COMMAND mkdir -p o_files/gserver && cd o_files/gserver/ && ar -x $<TARGET_FILE:paddle_gserver>
+        COMMAND mkdir -p o_files/proto && cd o_files/proto/ && ar -x $<TARGET_FILE:paddle_proto>
+        COMMAND mkdir -p o_files/network && cd o_files/network/ && ar -x $<TARGET_FILE:paddle_network>
+        COMMAND mkdir -p o_files/pserver && cd o_files/pserver/ && ar -x $<TARGET_FILE:paddle_pserver>
+        COMMAND ar crs ${capi_whole_library} `find ./o_files -name '*.o'`
+        COMMAND rm -rf o_files
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+        DEPENDS paddle_capi paddle_utils paddle_parameter paddle_math
+                paddle_cuda paddle_function paddle_gserver
+                paddle_proto paddle_pserver paddle_network
+        )
+set_target_properties(paddle_capi_whole
+  PROPERTIES IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library})
+
+add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
+target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
+link_paddle_exe(paddle_capi_shared)
+
+# install library & headers.
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library} DESTINATION lib)
+install(FILES ${CAPI_HEADERS} DESTINATION include/paddle)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle)
+install(TARGETS paddle_capi_shared DESTINATION lib)
+
+# this variable used for unittest
+set(PADDLE_CAPI_INC_PATH
+  ${CMAKE_CURRENT_BINARY_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR})
+
+if (WITH_TESTING)
+  add_subdirectory(tests)
+endif()
diff --git a/paddle/capi/Main.cpp b/paddle/capi/Main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7f24561e9aafc1e900f6371ad3c7e5a45033a9ef
--- /dev/null
+++ b/paddle/capi/Main.cpp
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fenv.h>
+#include <stdlib.h>
+#include <string.h>
+#include <vector>
+#include "capi_private.h"
+#include "main.h"
+#include "paddle/trainer/TrainerConfigHelper.h"
+#include "paddle/utils/Excepts.h"
+#include "paddle/utils/PythonUtil.h"
+
+// Runs the start-up sequence used by paddle_init: paddle::initMain, then
+// paddle::initPython, then enables the FE_INVALID / FE_DIVBYZERO / FE_OVERFLOW
+// floating-point exceptions through feenableexcept (a glibc extension).
+static void initPaddle(int argc, char** argv) {
+  paddle::initMain(argc, argv);
+  paddle::initPython(argc, argv);
+  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
+}
+
+extern "C" {
+paddle_error paddle_init(int argc, char** argv) {
+  // Bad input must yield an error code, never an exception or a crash.
+  if (argc < 0) return kPD_OUT_OF_RANGE;
+  if (argc > 0 && argv == nullptr) return kPD_NULLPTR;
+  std::vector<char*> realArgv(1, strdup(""));  // slot 0: dummy program name
+  realArgv.insert(realArgv.end(), argv, argv + argc);
+  realArgv.push_back(nullptr);  // keep the argv[argc] == NULL convention
+  initPaddle(argc + 1, realArgv.data());
+  free(realArgv[0]);
+  return kPD_NO_ERROR;
+}
+}
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d898ebe2612d749ca261d35139d1cd45bd355eef
--- /dev/null
+++ b/paddle/capi/Matrix.cpp
@@ -0,0 +1,123 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "capi_private.h"
+#include "hl_cuda.h"
+#include "matrix.h"
+
+#define cast(v) paddle::capi::cast<paddle::capi::CMatrix>(v)
+extern "C" {
+paddle_matrix paddle_matrix_create(uint64_t height,
+                                   uint64_t width,
+                                   bool useGpu) {
+  auto ptr = new paddle::capi::CMatrix();
+  ptr->mat = paddle::Matrix::create(height, width, false, useGpu);
+  return ptr;
+}
+
+paddle_matrix paddle_matrix_create_none() {
+  return new paddle::capi::CMatrix();
+}
+
+paddle_error paddle_matrix_destroy(paddle_matrix mat) {
+  if (mat == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  delete ptr;
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_matrix_set_row(paddle_matrix mat,
+                                   uint64_t rowID,
+                                   paddle_real* rowArray) {
+  // Copies getWidth() values from rowArray into row `rowID` of the matrix.
+  // Returns kPD_NULLPTR for a null handle, an empty handle or a null source
+  // buffer, and kPD_OUT_OF_RANGE when rowID is not below getHeight().
+  if (mat == nullptr || rowArray == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  if (ptr->mat == nullptr) return kPD_NULLPTR;
+  if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
+  paddle::real* buf = ptr->mat->getRowBuf(rowID);
+  size_t width = ptr->mat->getWidth();
+#ifndef PADDLE_ONLY_CPU
+  hl_memcpy(buf, rowArray, sizeof(paddle::real) * width);
+#else
+  std::copy(rowArray, rowArray + width, buf);
+#endif
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_matrix_get_row(paddle_matrix mat,
+                                   uint64_t rowID,
+                                   paddle_real** rawRowBuffer) {
+  // Stores the matrix's getRowBuf(rowID) pointer in *rawRowBuffer.
+  // rawRowBuffer is written below, so a null output pointer is rejected just
+  // like a null or empty matrix handle.
+  if (mat == nullptr || rawRowBuffer == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  if (ptr->mat == nullptr) return kPD_NULLPTR;
+  if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
+  *rawRowBuffer = ptr->mat->getRowBuf(rowID);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_matrix_get_shape(paddle_matrix mat,
+                                     uint64_t* height,
+                                     uint64_t* width) {
+  if (mat == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  // A handle from paddle_matrix_create_none() carries no matrix yet.
+  if (ptr->mat == nullptr) return kPD_NULLPTR;
+  // Either output may be null when the caller wants only one dimension.
+  if (height != nullptr) *height = ptr->mat->getHeight();
+  if (width != nullptr) *width = ptr->mat->getWidth();
+  return kPD_NO_ERROR;
+}
+}
+
+paddle_matrix paddle_matrix_create_sparse(
+    uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu) {
+  auto ptr = new paddle::capi::CMatrix();
+  ptr->mat = paddle::Matrix::createSparseMatrix(
+      height,
+      width,
+      nnz,
+      isBinary ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
+      paddle::SPARSE_CSR,
+      false,
+      useGpu);
+  return ptr;
+}
+
+paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
+                                            int* rowArray,
+                                            uint64_t rowSize,
+                                            int* colArray,
+                                            uint64_t colSize,
+                                            float* valueArray,
+                                            uint64_t valueSize) {
+  // Fills a CPU sparse matrix from caller-supplied row, column and value
+  // arrays. valueArray may be null only when valueSize is zero.
+  if (mat == nullptr) return kPD_NULLPTR;
+  auto handle = cast(mat);
+  const bool valuesMissing = (valueSize != 0 && valueArray == nullptr);
+  if (rowArray == nullptr || colArray == nullptr || valuesMissing ||
+      handle->mat == nullptr) {
+    return kPD_NULLPTR;
+  }
+  // Anything that is not a paddle::CpuSparseMatrix is reported as unsupported.
+  auto sparse = dynamic_cast<paddle::CpuSparseMatrix*>(handle->mat.get());
+  if (sparse == nullptr) return kPD_NOT_SUPPORTED;
+
+  std::vector<int> rows(rowArray, rowArray + rowSize);
+  std::vector<int> cols(colArray, colArray + colSize);
+  std::vector<paddle_real> values;
+  if (valueSize != 0) {
+    values.assign(valueArray, valueArray + valueSize);
+  }
+  sparse->copyFrom(rows, cols, values);
+  return kPD_NO_ERROR;
+}
diff --git a/paddle/capi/Vector.cpp b/paddle/capi/Vector.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..564708e963b4068da074c1fcc9aac0fade0f65b9
--- /dev/null
+++ b/paddle/capi/Vector.cpp
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "capi_private.h"
+#include "vector.h"
+
+using paddle::capi::cast;
+
+extern "C" {
+
+paddle_ivector paddle_ivector_create_none() {
+  return new paddle::capi::CIVector();
+}
+
+// Creates an integer-vector handle from `array`.
+// With copy == true a new IVector of `size` elements is created and filled
+// through copyFrom(array, size); otherwise the IVector is created directly
+// from `array`.
+// NOTE(review): the non-copy path presumably wraps the caller's buffer without
+// taking ownership, so `array` would have to outlive the handle - confirm
+// against paddle::IVector::create. `array` is not null-checked here.
+paddle_ivector paddle_ivector_create(int* array,
+                                     uint64_t size,
+                                     bool copy,
+                                     bool useGPU) {
+  auto ptr = new paddle::capi::CIVector();
+  if (copy) {
+    ptr->vec = paddle::IVector::create(size, useGPU);
+    ptr->vec->copyFrom(array, size);
+  } else {
+    ptr->vec = paddle::IVector::create(array, size, useGPU);
+  }
+  return ptr;
+}
+
+paddle_error paddle_ivector_destroy(paddle_ivector ivec) {
+  if (ivec == nullptr) return kPD_NULLPTR;
+  delete cast<paddle::capi::CIVector>(ivec);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_ivector_get(paddle_ivector ivec, int** buffer) {
+  if (ivec == nullptr || buffer == nullptr) return kPD_NULLPTR;
+  auto v = cast<paddle::capi::CIVector>(ivec);
+  if (v->vec == nullptr) return kPD_NULLPTR;
+  *buffer = v->vec->getData();
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_ivector_resize(paddle_ivector ivec, uint64_t size) {
+  if (ivec == nullptr) return kPD_NULLPTR;
+  auto v = cast<paddle::capi::CIVector>(ivec);
+  if (v->vec == nullptr) return kPD_NULLPTR;
+  v->vec->resize(size);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_ivector_get_size(paddle_ivector ivec, uint64_t* size) {
+  // Writes the vector's element count to *size. size is dereferenced below,
+  // so a null output pointer is rejected like a null or empty handle.
+  if (ivec == nullptr || size == nullptr) return kPD_NULLPTR;
+  auto v = cast<paddle::capi::CIVector>(ivec);
+  if (v->vec == nullptr) return kPD_NULLPTR;
+  *size = v->vec->getSize();
+  return kPD_NO_ERROR;
+}
+}
diff --git a/paddle/capi/arguments.h b/paddle/capi/arguments.h
new file mode 100644
index 0000000000000000000000000000000000000000..d71ea26a5d1aff130d974541532fda3b09bf6fe5
--- /dev/null
+++ b/paddle/capi/arguments.h
@@ -0,0 +1,145 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_ARGUMENTS_H__
+#define __PADDLE_CAPI_ARGUMENTS_H__
+
+#include <stdint.h>
+#include "config.h"
+#include "error.h"
+#include "matrix.h"
+#include "vector.h"
+
+/**
+ * Arguments functions. Each argument represents one layer output; an
+ * `arguments` object is an array of such arguments.
+ */
+typedef void* paddle_arguments;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief paddle_arguments_create_none Create an array of arguments whose size
+ * is zero.
+ * @return Arguments
+ */
+PD_API paddle_arguments paddle_arguments_create_none();
+
+/**
+ * @brief paddle_arguments_destroy Destroy the arguments array.
+ * @param [in] args arguments array to destroy
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_destroy(paddle_arguments args);
+
+/**
+ * @brief paddle_arguments_get_size Get the size of the arguments array.
+ * @param [in] args arguments array
+ * @param [out] size receives the array size
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_get_size(paddle_arguments args,
+                                              uint64_t* size);
+
+/**
+ * @brief paddle_arguments_resize Resize an arguments array.
+ * @param [in] args arguments array.
+ * @param [in] size target size of array
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_resize(paddle_arguments args,
+                                            uint64_t size);
+
+/**
+ * @brief paddle_arguments_set_value Set the value matrix of one argument in
+ *        the array, whose index is `ID`.
+ * @param [in] args arguments array
+ * @param [in] ID array index
+ * @param [in] mat matrix handle
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_set_value(paddle_arguments args,
+                                               uint64_t ID,
+                                               paddle_matrix mat);
+
+/**
+ * @brief paddle_arguments_get_value Get the value matrix of one argument in
+ *        the array, whose index is `ID`.
+ * @param [in] args arguments array
+ * @param [in] ID array index
+ * @param [out] mat existing matrix handle that receives the value (the
+ *        examples pass one created with paddle_matrix_create_none())
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_get_value(paddle_arguments args,
+                                               uint64_t ID,
+                                               paddle_matrix mat);
+
+/**
+ * @brief paddle_arguments_get_ids Get the integer vector of one argument in
+ *        the array, whose index is `ID`.
+ * @param [in] args arguments array
+ * @param [in] ID array index
+ * @param [out] ids integer vector handle that receives the ids
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_get_ids(paddle_arguments args,
+                                             uint64_t ID,
+                                             paddle_ivector ids);
+
+/**
+ * @brief paddle_arguments_set_ids Set the integer vector of one argument in
+ *        the array, whose index is `ID`.
+ * @param [in] args arguments array
+ * @param [in] ID array index
+ * @param [in] ids integer vector handle
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_set_ids(paddle_arguments args,
+                                             uint64_t ID,
+                                             paddle_ivector ids);
+
+/**
+ * @brief paddle_arguments_set_sequence_start_pos Set the sequence start
+ *        position vector of one argument in the array, whose index is `ID`.
+ * @param [in] args arguments array
+ * @param [in] ID array index
+ * @param [in] nestedLevel sequence nesting level. NOTE(review): presumably 0
+ *        selects the sequence positions and 1 the sub-sequence positions, as
+ *        in CArguments::accessSeqPos -- confirm against the implementation.
+ * @param [in] seqPos sequence position array.
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_arguments_set_sequence_start_pos(paddle_arguments args,
+                                        uint64_t ID,
+                                        uint32_t nestedLevel,
+                                        paddle_ivector seqPos);
+/**
+ * @brief paddle_arguments_get_sequence_start_pos Get the sequence start
+ *        position vector of one argument in the array, whose index is `ID`.
+ * @param [in] args arguments array
+ * @param [in] ID array index
+ * @param [in] nestedLevel sequence nesting level. NOTE(review): presumably 0
+ *        selects the sequence positions and 1 the sub-sequence positions, as
+ *        in CArguments::accessSeqPos -- confirm against the implementation.
+ * @param [out] seqPos sequence position array
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_arguments_get_sequence_start_pos(paddle_arguments args,
+                                        uint64_t ID,
+                                        uint32_t nestedLevel,
+                                        paddle_ivector seqPos);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/paddle/capi/capi.h b/paddle/capi/capi.h
new file mode 100644
index 0000000000000000000000000000000000000000..4097a1a35a64347f0d79b004371df26551e51bbe
--- /dev/null
+++ b/paddle/capi/capi.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_H__
+#define __PADDLE_CAPI_H__
+
+/**
+ * Paddle C API. It will replace SWIG as the multi-language API for model
+ * training & inference. Currently it is only used for model inference.
+ *
+ * NOTE: This is an experimental API, it could be changed.
+ */
+#include "arguments.h"
+#include "config.h"
+#include "error.h"
+#include "gradient_machine.h"
+#include "main.h"
+#include "matrix.h"
+#include "vector.h"
+
+#endif  // __PADDLE_CAPI_H__
diff --git a/paddle/capi/capi_private.h b/paddle/capi/capi_private.h
new file mode 100644
index 0000000000000000000000000000000000000000..c7cdbd5f6f347150c02764a86f8ffb0c068e872e
--- /dev/null
+++ b/paddle/capi/capi_private.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "capi.h"
+#include "paddle/gserver/gradientmachines/GradientMachine.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+#include "paddle/parameter/Argument.h"
+#pragma once
+
+namespace paddle {
+namespace capi {
+
+// Tag identifying which kind of C-API wrapper struct a handle points to.
+enum CType { kIVECTOR = 0, kMATRIX, kARGUMENTS, kGRADIENT_MACHINE };
+
+// Declares the `type` tag member; the wrapper structs in this header use it
+// as their first member.
+#define STRUCT_HEADER CType type;
+
+// Wrapper that holds only the type tag.
+// NOTE(review): presumably used to read `type` from an opaque handle before
+// casting to the concrete wrapper -- confirm against callers.
+struct CHeader {
+  STRUCT_HEADER
+};
+
+// Wrapper behind a paddle_ivector handle: the type tag plus the integer
+// vector it refers to.
+struct CIVector {
+  STRUCT_HEADER
+  IVectorPtr vec;
+
+  // Only sets the tag; `vec` is left default-constructed.
+  CIVector() : type(kIVECTOR) {}
+};
+
+// Wrapper behind a matrix handle: the type tag plus the matrix it refers to.
+struct CMatrix {
+  STRUCT_HEADER
+  MatrixPtr mat;
+
+  // Only sets the tag; `mat` is left default-constructed.
+  CMatrix() : type(kMATRIX) {}
+};
+
+// Wrapper behind a paddle_arguments handle: the type tag plus an array of
+// paddle::Argument.
+struct CArguments {
+  STRUCT_HEADER
+  std::vector<paddle::Argument> args;
+
+  CArguments() : type(kARGUMENTS) {}
+
+  // Invokes `callback` on a sequence-start-position field of args[ID]:
+  // nestedLevel 0 selects sequenceStartPositions, 1 selects
+  // subSequenceStartPositions. Returns kPD_OUT_OF_RANGE, without invoking
+  // `callback`, when ID or nestedLevel is out of range.
+  template <typename T>
+  paddle_error accessSeqPos(uint64_t ID, uint32_t nestedLevel, T callback) {
+    if (ID >= args.size() || nestedLevel > 1) return kPD_OUT_OF_RANGE;
+    auto& target = args[ID];
+    if (nestedLevel == 0) {
+      callback(target.sequenceStartPositions);
+    } else {
+      callback(target.subSequenceStartPositions);
+    }
+    return kPD_NO_ERROR;
+  }
+};
+
+// Wrapper behind a gradient-machine handle: the type tag plus the gradient
+// machine it refers to.
+struct CGradientMachine {
+  STRUCT_HEADER
+  paddle::GradientMachinePtr machine;
+
+  // Only sets the tag; `machine` is left default-constructed.
+  CGradientMachine() : type(kGRADIENT_MACHINE) {}
+};
+
+// Converts an opaque C-API handle into a pointer to the wrapper type T.
+// No check is made that the handle really points to a T.
+template <typename T>
+inline T* cast(void* ptr) {
+  T* typed = static_cast<T*>(ptr);
+  return typed;
+}
+}  // namespace capi
+}  // namespace paddle
diff --git a/paddle/capi/config.h.in b/paddle/capi/config.h.in
new file mode 100644
index 0000000000000000000000000000000000000000..d205307588eb60b2e11accb9f825391f7c1453f2
--- /dev/null
+++ b/paddle/capi/config.h.in
@@ -0,0 +1,10 @@
+#ifndef __PADDLE_PADDLE_CAPI_CONFIG_H_INCLUDED__
+#define __PADDLE_PADDLE_CAPI_CONFIG_H_INCLUDED__
+
+typedef @PADDLE_FLOAT_TYPE@ paddle_real;
+
+// Since we only support linux and macos in compile, always use clang or
+// gcc 4.8+. DLL_IMPORT/DLL_EXPORT is as simple as below.
+#define PD_API __attribute__((visibility("default")))
+
+#endif
diff --git a/paddle/capi/error.h b/paddle/capi/error.h
new file mode 100644
index 0000000000000000000000000000000000000000..44d8c2040d1aad698398089baeee6f13c3deeb55
--- /dev/null
+++ b/paddle/capi/error.h
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_ERROR_H__
+#define __PADDLE_CAPI_ERROR_H__
+
+/**
+ * Error Type for Paddle API.
+ */
+typedef enum {
+  /* The call succeeded. */
+  kPD_NO_ERROR = 0,
+  /* A required handle or pointer (or the object behind it) was null. */
+  kPD_NULLPTR = 1,
+  /* An index or nesting level was outside the valid range. */
+  kPD_OUT_OF_RANGE = 2,
+  /* NOTE(review): presumably a protobuf parsing failure -- confirm. */
+  kPD_PROTOBUF_ERROR = 3,
+  /* NOTE(review): presumably an unsupported operation -- confirm. */
+  kPD_NOT_SUPPORTED = 4,
+  /* NOTE(review): presumably any failure not covered above -- confirm. */
+  kPD_UNDEFINED_ERROR = -1,
+} paddle_error;
+
+#endif
diff --git a/paddle/capi/examples/.gitignore b/paddle/capi/examples/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..2caa0a5a298d8cec0d996c3774b6f42060a0d41a
--- /dev/null
+++ b/paddle/capi/examples/.gitignore
@@ -0,0 +1,2 @@
+*.bin
+build-*
diff --git a/paddle/capi/examples/README.md b/paddle/capi/examples/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..14013e281ff50279473dfc4da46aaef4f8b7ea9a
--- /dev/null
+++ b/paddle/capi/examples/README.md
@@ -0,0 +1,3 @@
+# C-API Example Usage
+
+* [Model Inference](./model_inference/README.md)
diff --git a/paddle/capi/examples/model_inference/README.md b/paddle/capi/examples/model_inference/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..58e6c83140b5f33ddfd1f027b6624a26f842a2f8
--- /dev/null
+++ b/paddle/capi/examples/model_inference/README.md
@@ -0,0 +1,42 @@
+# Use C-API for Model Inference
+
+There are several examples in this directory about how to use Paddle C-API for model inference.
+
+## Convert configuration file to protobuf binary.
+
+First, the user should convert Paddle's model configuration file into a protobuf binary file. In each example directory, there is a file named `convert_protobin.sh`. It converts `trainer_config.py` into `trainer_config.bin`.
+
+The `convert_protobin.sh` script is very simple: it just invokes the `dump_config` Python module to dump the binary file. The command-line usage is:
+
+```bash
+python -m paddle.utils.dump_config YOUR_CONFIG_FILE 'CONFIG_EXTRA_ARGS' --binary > YOUR_CONFIG_FILE.bin
+```
+
+## Initialize paddle
+
+```c++
+char* argv[] = {"--use_gpu=False"};
+paddle_init(1, (char**)argv);
+```
+
+We must initialize the global context before invoking any other Paddle interface. The initialization arguments are the same as the `paddle_trainer` command-line arguments; `paddle train --help` shows the full list. The most important argument is `use_gpu`, which selects whether the GPU is used.
+
+## Load network and parameters
+
+```c
+paddle_gradient_machine machine;
+paddle_gradient_machine_create_for_inference(&machine, config_file_content, content_size);
+paddle_gradient_machine_load_parameter_from_disk(machine, "./some_where_to_params");
+```
+
+The gradient machine is a Paddle concept that represents a neural network which can run forward and backward passes. We can create a gradient machine for model inference and load the parameter files from disk.
+
+Moreover, to run inference in multiple threads, we can create a thread-local gradient machine that shares the same parameters by using the `paddle_gradient_machine_create_shared_param` API. Please refer to `multi_thread` for an example.
+
+## Create input
+
+The input of a neural network is an `arguments`. The examples in this directory will show how to construct different types of inputs for prediction. Please look at `dense`, `sparse_binary`, `sequence` for details.
+
+## Get inference
+
+After invoking `paddle_gradient_machine_forward`, we can get the output of the neural network. The `value` matrix of the output arguments stores the network's output values. If the output layer uses `SoftmaxActivation`, the `value` matrix holds the class probabilities for each input sample: its height is the number of samples and its width is the number of categories.
diff --git a/paddle/capi/examples/model_inference/common/common.h b/paddle/capi/examples/model_inference/common/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..a78522e4a7c3cb34b341b7f4c89b53d32b72f114
--- /dev/null
+++ b/paddle/capi/examples/model_inference/common/common.h
@@ -0,0 +1,26 @@
+#ifndef __CAPI_EXAMPLE_COMMON_H__
+#define __CAPI_EXAMPLE_COMMON_H__
+#include <stdio.h>
+#include <stdlib.h>
+
+#define CHECK(stmt)                                                \
+  do {                                                             \
+    paddle_error __err__ = stmt;                                   \
+    if (__err__ != kPD_NO_ERROR) {                                 \
+      fprintf(stderr, "Invoke paddle error %d \n" #stmt, __err__); \
+      exit(__err__);                                               \
+    }                                                              \
+  } while (0)
+
+/**
+ * Reads the whole file `filename` into a freshly malloc'ed buffer.
+ *
+ * @param filename path of the file to read
+ * @param [out] size receives the file length in bytes; written only on
+ *        success
+ * @return the buffer (the caller must free() it), or NULL when the file
+ *         cannot be opened, measured, allocated for, or read completely
+ */
+void* read_config(const char* filename, long* size) {
+  // Binary mode: the content is read back byte-for-byte, not as text.
+  FILE* file = fopen(filename, "rb");
+  if (file == NULL) return NULL;
+
+  void* buf = NULL;
+  long length = -1;
+  if (fseek(file, 0L, SEEK_END) == 0) length = ftell(file);
+  if (length >= 0 && fseek(file, 0L, SEEK_SET) == 0) {
+    // malloc(0) may return NULL; request at least one byte so that an empty
+    // file is not mistaken for a failure.
+    buf = malloc(length > 0 ? (size_t)length : 1);
+    if (buf != NULL && fread(buf, 1, (size_t)length, file) != (size_t)length) {
+      // Short read: do not hand back a partially filled buffer.
+      free(buf);
+      buf = NULL;
+    }
+  }
+  fclose(file);
+  if (buf != NULL) *size = length;
+  return buf;
+}
+#endif
diff --git a/paddle/capi/examples/model_inference/dense/CMakeLists.txt b/paddle/capi/examples/model_inference/dense/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..008a488fd9e6fdca2c4cb92bf1b8c41fce1835a9
--- /dev/null
+++ b/paddle/capi/examples/model_inference/dense/CMakeLists.txt
@@ -0,0 +1,6 @@
+# Builds the dense-input C-API inference example from main.c.
+# The minimum version is declared before project() so policies are set first;
+# it is 3.1 because the C_STANDARD target property used below needs CMake 3.1.
+cmake_minimum_required(VERSION 3.1)
+project(dense C)
+add_executable(${PROJECT_NAME} main.c)
+set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
+target_link_libraries(${PROJECT_NAME} PRIVATE paddle_capi_shared)
diff --git a/paddle/capi/examples/model_inference/dense/convert_protobin.sh b/paddle/capi/examples/model_inference/dense/convert_protobin.sh
new file mode 100755
index 0000000000000000000000000000000000000000..30ffc316ecb76cd9c8e2b628f85484a990ac6da8
--- /dev/null
+++ b/paddle/capi/examples/model_inference/dense/convert_protobin.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+python -m paddle.utils.dump_config trainer_config.py '' --binary > trainer_config.bin
diff --git a/paddle/capi/examples/model_inference/dense/main.c b/paddle/capi/examples/model_inference/dense/main.c
new file mode 100644
index 0000000000000000000000000000000000000000..3e6bd5285058a297c4574631e2a5c033b83936e8
--- /dev/null
+++ b/paddle/capi/examples/model_inference/dense/main.c
@@ -0,0 +1,69 @@
+#include <paddle/capi.h>
+#include <time.h>
+#include "../common/common.h"
+
+#define CONFIG_BIN "./trainer_config.bin"
+
+// Runs one forward pass of the network described by CONFIG_BIN on a single
+// random 784-value input row and prints the first 10 output values.
+// Returns 0 on success and 1 when the config file cannot be read; any failing
+// Paddle call exits through CHECK.
+int main() {
+  // Initialize Paddle
+  char* argv[] = {"--use_gpu=False"};
+  CHECK(paddle_init(1, (char**)argv));
+
+  // Reading config binary file. It is generated by `convert_protobin.sh`
+  long size;
+  void* buf = read_config(CONFIG_BIN, &size);
+  if (buf == NULL) {
+    // Without this check a missing config would pass a NULL buffer and an
+    // uninitialized size on to the gradient machine.
+    fprintf(stderr, "Cannot read config file %s\n", CONFIG_BIN);
+    return 1;
+  }
+
+  // Create a gradient machine for inference.
+  paddle_gradient_machine machine;
+  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
+  CHECK(paddle_gradient_machine_randomize_param(machine));
+
+  // Loading parameter. Uncomment the following line and change the directory.
+  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
+  //                                                "./some_where_to_params"));
+  paddle_arguments in_args = paddle_arguments_create_none();
+
+  // There is only one input of this network.
+  CHECK(paddle_arguments_resize(in_args, 1));
+
+  // Create input matrix.
+  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
+                                           /* size */ 784,
+                                           /* useGPU */ false);
+  srand(time(0));
+  paddle_real* array;
+
+  // Get First row.
+  CHECK(paddle_matrix_get_row(mat, 0, &array));
+
+  // Fill the input row with random values in [0, 1].
+  for (int i = 0; i < 784; ++i) {
+    array[i] = rand() / ((float)RAND_MAX);
+  }
+
+  CHECK(paddle_arguments_set_value(in_args, 0, mat));
+
+  paddle_arguments out_args = paddle_arguments_create_none();
+  CHECK(paddle_gradient_machine_forward(machine,
+                                        in_args,
+                                        out_args,
+                                        /* isTrain */ false));
+  paddle_matrix prob = paddle_matrix_create_none();
+
+  CHECK(paddle_arguments_get_value(out_args, 0, prob));
+
+  CHECK(paddle_matrix_get_row(prob, 0, &array));
+
+  printf("Prob: ");
+  for (int i = 0; i < 10; ++i) {
+    printf("%.2f ", array[i]);
+  }
+  printf("\n");
+
+  CHECK(paddle_matrix_destroy(prob));
+  CHECK(paddle_arguments_destroy(out_args));
+  CHECK(paddle_matrix_destroy(mat));
+  CHECK(paddle_arguments_destroy(in_args));
+  CHECK(paddle_gradient_machine_destroy(machine));
+  // read_config() allocates the buffer with malloc(); release it last.
+  free(buf);
+
+  return 0;
+}
diff --git a/paddle/capi/examples/model_inference/dense/trainer_config.py b/paddle/capi/examples/model_inference/dense/trainer_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..873ec119e7a3d4debe50af2ba259ace50b0cbf7c
--- /dev/null
+++ b/paddle/capi/examples/model_inference/dense/trainer_config.py
@@ -0,0 +1,18 @@
+# Network configuration for the dense example: a 784-value input, one
+# 200-unit fully connected hidden layer, and a 10-way softmax output.
+from paddle.trainer_config_helpers import *
+
+# Input layer named 'pixel' with 784 values per sample.
+img = data_layer(name='pixel', size=784)
+
+# Hidden fully connected layer; weight and bias get explicit parameter names.
+hidden = fc_layer(
+    input=img,
+    size=200,
+    param_attr=ParamAttr(name='hidden.w'),
+    bias_attr=ParamAttr(name='hidden.b'))
+
+# Output layer: 10 units with softmax activation.
+prob = fc_layer(
+    input=hidden,
+    size=10,
+    act=SoftmaxActivation(),
+    param_attr=ParamAttr(name='prob.w'),
+    bias_attr=ParamAttr(name='prob.b'))
+
+# Declare `prob` as the network output.
+outputs(prob)
diff --git a/paddle/capi/examples/model_inference/multi_thread/.gitignore b/paddle/capi/examples/model_inference/multi_thread/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..fab7372d796ea95c80d02df6caa7eb2b411a7ac1
--- /dev/null
+++ b/paddle/capi/examples/model_inference/multi_thread/.gitignore
@@ -0,0 +1,73 @@
+# This file is used to ignore files which are generated
+# ----------------------------------------------------------------------------
+
+*~
+*.autosave
+*.a
+*.core
+*.moc
+*.o
+*.obj
+*.orig
+*.rej
+*.so
+*.so.*
+*_pch.h.cpp
+*_resource.rc
+*.qm
+.#*
+*.*#
+core
+!core/
+tags
+.DS_Store
+.directory
+*.debug
+Makefile*
+*.prl
+*.app
+moc_*.cpp
+ui_*.h
+qrc_*.cpp
+Thumbs.db
+*.res
+*.rc
+/.qmake.cache
+/.qmake.stash
+
+# qtcreator generated files
+*.pro.user*
+
+# xemacs temporary files
+*.flc
+
+# Vim temporary files
+.*.swp
+
+# Visual Studio generated files
+*.ib_pdb_index
+*.idb
+*.ilk
+*.pdb
+*.sln
+*.suo
+*.vcproj
+*vcproj.*.*.user
+*.ncb
+*.sdf
+*.opensdf
+*.vcxproj
+*vcxproj.*
+
+# MinGW generated files
+*.Debug
+*.Release
+
+# Python byte code
+*.pyc
+
+# Binaries
+# --------
+*.dll
+*.exe
+
diff --git a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..98e411ddc02a46034e8f6ceb00657622d998c9f3
--- /dev/null
+++ b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Builds the multi-threaded C-API inference example from main.c.
+# The minimum version is declared before project() so policies are set first;
+# it is 3.1 because C_STANDARD and the Threads::Threads imported target both
+# need CMake 3.1.
+cmake_minimum_required(VERSION 3.1)
+project(multi_thread C)
+# main.c uses pthreads, so a thread library is mandatory.
+find_package(Threads REQUIRED)
+add_executable(${PROJECT_NAME} main.c)
+set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
+target_link_libraries(${PROJECT_NAME} PRIVATE paddle_capi_shared
+  Threads::Threads)
diff --git a/paddle/capi/examples/model_inference/multi_thread/convert_protobin.sh b/paddle/capi/examples/model_inference/multi_thread/convert_protobin.sh
new file mode 120000
index 0000000000000000000000000000000000000000..3c1b3533523cf1709720d11df7b8e311e0577fe7
--- /dev/null
+++ b/paddle/capi/examples/model_inference/multi_thread/convert_protobin.sh
@@ -0,0 +1 @@
+../dense/convert_protobin.sh
\ No newline at end of file
diff --git a/paddle/capi/examples/model_inference/multi_thread/main.c b/paddle/capi/examples/model_inference/multi_thread/main.c
new file mode 100644
index 0000000000000000000000000000000000000000..d7675cd80a52f752b1a8567dae34123978113831
--- /dev/null
+++ b/paddle/capi/examples/model_inference/multi_thread/main.c
@@ -0,0 +1,98 @@
+#include <paddle/capi.h>
+#include <pthread.h>
+#include <time.h>
+#include "../common/common.h"
+
+#define CONFIG_BIN "./trainer_config.bin"
+#define NUM_THREAD 4
+#define NUM_ITER 1000
+
+pthread_mutex_t mutex;
+
+// Worker entry point. `gm_ptr` is the gradient machine this thread uses.
+// Runs NUM_ITER forward passes on random 784-value inputs, printing the first
+// 10 output values of each pass while holding `mutex`, then destroys every
+// object it used, including the gradient machine. Always returns NULL.
+void* thread_main(void* gm_ptr) {
+  paddle_gradient_machine local_machine = (paddle_gradient_machine)(gm_ptr);
+  paddle_arguments inputs = paddle_arguments_create_none();
+  // One sample (row) of 784 values, not on the GPU.
+  paddle_matrix input_mat = paddle_matrix_create(/* sample_num */ 1,
+                                                 /* size */ 784,
+                                                 /* useGPU */ false);
+  paddle_arguments outputs = paddle_arguments_create_none();
+  paddle_matrix probs = paddle_matrix_create_none();
+
+  int iter = 0;
+  while (iter < NUM_ITER) {
+    // The network takes exactly one input argument.
+    CHECK(paddle_arguments_resize(inputs, 1));
+
+    // Fill the first row of the input matrix with random values.
+    paddle_real* row;
+    CHECK(paddle_matrix_get_row(input_mat, 0, &row));
+    for (int col = 0; col != 784; ++col) {
+      row[col] = rand() / ((float)RAND_MAX);
+    }
+    CHECK(paddle_arguments_set_value(inputs, 0, input_mat));
+
+    CHECK(paddle_gradient_machine_forward(local_machine,
+                                          inputs,
+                                          outputs,
+                                          /* isTrain */ false));
+
+    // `row` now points at the first row of the output matrix.
+    CHECK(paddle_arguments_get_value(outputs, 0, probs));
+    CHECK(paddle_matrix_get_row(probs, 0, &row));
+
+    // Hold the mutex so that lines from different threads do not interleave.
+    pthread_mutex_lock(&mutex);
+    printf("Prob: ");
+    for (int cls = 0; cls != 10; ++cls) {
+      printf("%.2f ", row[cls]);
+    }
+    printf("\n");
+    pthread_mutex_unlock(&mutex);
+
+    ++iter;
+  }
+
+  CHECK(paddle_matrix_destroy(probs));
+  CHECK(paddle_arguments_destroy(outputs));
+  CHECK(paddle_matrix_destroy(input_mat));
+  CHECK(paddle_arguments_destroy(inputs));
+  CHECK(paddle_gradient_machine_destroy(local_machine));
+  return NULL;
+}
+
+// Creates one gradient machine from CONFIG_BIN, then starts NUM_THREAD worker
+// threads, each running thread_main on its own machine created with
+// paddle_gradient_machine_create_shared_param from the first one.
+// Returns 0 on success and 1 when the config file cannot be read or a thread
+// cannot be started; any failing Paddle call exits through CHECK.
+int main() {
+  // Initialize Paddle
+  char* argv[] = {"--use_gpu=False"};
+  CHECK(paddle_init(1, (char**)argv));
+
+  // Reading config binary file. It is generated by `convert_protobin.sh`
+  long size;
+  void* buf = read_config(CONFIG_BIN, &size);
+  if (buf == NULL) {
+    // Without this check a missing config would pass a NULL buffer and an
+    // uninitialized size on to the gradient machine.
+    fprintf(stderr, "Cannot read config file %s\n", CONFIG_BIN);
+    return 1;
+  }
+
+  // Create a gradient machine for inference.
+  paddle_gradient_machine machine;
+  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
+  CHECK(paddle_gradient_machine_randomize_param(machine));
+
+  // Loading parameter. Uncomment the following line and change the directory.
+  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
+  //                                                "./some_where_to_params"));
+  srand(time(0));
+  pthread_mutex_init(&mutex, NULL);
+
+  pthread_t threads[NUM_THREAD];
+
+  for (int i = 0; i < NUM_THREAD; ++i) {
+    paddle_gradient_machine thread_local_machine;
+    CHECK(paddle_gradient_machine_create_shared_param(
+        machine, buf, size, &thread_local_machine));
+    // Joining a thread that was never started is undefined, so stop here if
+    // the thread cannot be created.
+    int rc = pthread_create(&threads[i], NULL, thread_main, thread_local_machine);
+    if (rc != 0) {
+      fprintf(stderr, "pthread_create failed with error %d\n", rc);
+      return 1;
+    }
+  }
+
+  for (int i = 0; i < NUM_THREAD; ++i) {
+    pthread_join(threads[i], NULL);
+  }
+
+  pthread_mutex_destroy(&mutex);
+
+  // Each worker destroyed its own machine; release the base machine and the
+  // config buffer (allocated by read_config() with malloc()) here.
+  CHECK(paddle_gradient_machine_destroy(machine));
+  free(buf);
+
+  return 0;
+}
diff --git a/paddle/capi/examples/model_inference/multi_thread/trainer_config.py b/paddle/capi/examples/model_inference/multi_thread/trainer_config.py
new file mode 120000
index 0000000000000000000000000000000000000000..70cfb1f7f4cfe9afa6ccbd6f2f419aa286970bbe
--- /dev/null
+++ b/paddle/capi/examples/model_inference/multi_thread/trainer_config.py
@@ -0,0 +1 @@
+../dense/trainer_config.py
\ No newline at end of file
diff --git a/paddle/capi/examples/model_inference/sequence/.gitignore b/paddle/capi/examples/model_inference/sequence/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..fab7372d796ea95c80d02df6caa7eb2b411a7ac1
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sequence/.gitignore
@@ -0,0 +1,73 @@
+# This file is used to ignore files which are generated
+# ----------------------------------------------------------------------------
+
+*~
+*.autosave
+*.a
+*.core
+*.moc
+*.o
+*.obj
+*.orig
+*.rej
+*.so
+*.so.*
+*_pch.h.cpp
+*_resource.rc
+*.qm
+.#*
+*.*#
+core
+!core/
+tags
+.DS_Store
+.directory
+*.debug
+Makefile*
+*.prl
+*.app
+moc_*.cpp
+ui_*.h
+qrc_*.cpp
+Thumbs.db
+*.res
+*.rc
+/.qmake.cache
+/.qmake.stash
+
+# qtcreator generated files
+*.pro.user*
+
+# xemacs temporary files
+*.flc
+
+# Vim temporary files
+.*.swp
+
+# Visual Studio generated files
+*.ib_pdb_index
+*.idb
+*.ilk
+*.pdb
+*.sln
+*.suo
+*.vcproj
+*vcproj.*.*.user
+*.ncb
+*.sdf
+*.opensdf
+*.vcxproj
+*vcxproj.*
+
+# MinGW generated files
+*.Debug
+*.Release
+
+# Python byte code
+*.pyc
+
+# Binaries
+# --------
+*.dll
+*.exe
+
diff --git a/paddle/capi/examples/model_inference/sequence/CMakeLists.txt b/paddle/capi/examples/model_inference/sequence/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..71b73acba7cdea1c869ec6061df379c3f7cb45db
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sequence/CMakeLists.txt
@@ -0,0 +1,6 @@
+# Builds the sequence-input C-API inference example from main.c.
+# The minimum version is declared before project() so policies are set first;
+# it is 3.1 because the C_STANDARD target property used below needs CMake 3.1.
+cmake_minimum_required(VERSION 3.1)
+project(sequence C)
+add_executable(${PROJECT_NAME} main.c)
+set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
+target_link_libraries(${PROJECT_NAME} PRIVATE paddle_capi_shared)
diff --git a/paddle/capi/examples/model_inference/sequence/convert_protobin.sh b/paddle/capi/examples/model_inference/sequence/convert_protobin.sh
new file mode 120000
index 0000000000000000000000000000000000000000..3c1b3533523cf1709720d11df7b8e311e0577fe7
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sequence/convert_protobin.sh
@@ -0,0 +1 @@
+../dense/convert_protobin.sh
\ No newline at end of file
diff --git a/paddle/capi/examples/model_inference/sequence/main.c b/paddle/capi/examples/model_inference/sequence/main.c
new file mode 100644
index 0000000000000000000000000000000000000000..50bc0c9201f207eff7389bfbee3bc2e43261b19a
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sequence/main.c
@@ -0,0 +1,70 @@
+#include <paddle/capi.h>
+#include <time.h>
+#include "../common/common.h"
+
+#define CONFIG_BIN "./trainer_config.bin"
+
+// Runs one forward pass of the network described by CONFIG_BIN on a single
+// sequence of word ids and prints the first 2 output values.
+// Returns 0 on success and 1 when the config file cannot be read; any failing
+// Paddle call exits through CHECK.
+int main() {
+  // Initialize Paddle
+  char* argv[] = {"--use_gpu=False"};
+  CHECK(paddle_init(1, (char**)argv));
+
+  // Reading config binary file. It is generated by `convert_protobin.sh`
+  long size;
+  void* buf = read_config(CONFIG_BIN, &size);
+  if (buf == NULL) {
+    // Without this check a missing config would pass a NULL buffer and an
+    // uninitialized size on to the gradient machine.
+    fprintf(stderr, "Cannot read config file %s\n", CONFIG_BIN);
+    return 1;
+  }
+
+  // Create a gradient machine for inference.
+  paddle_gradient_machine machine;
+  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
+  CHECK(paddle_gradient_machine_randomize_param(machine));
+
+  // Loading parameter. Uncomment the following line and change the directory.
+  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
+  //                                                "./some_where_to_params"));
+  paddle_arguments in_args = paddle_arguments_create_none();
+
+  // There is only one input of this network.
+  CHECK(paddle_arguments_resize(in_args, 1));
+
+  // Create input ids.
+  int sentence_ids[] = {83, 48, 20, 84, 394, 853, 64, 53, 64};
+
+  // copy == false: the vector is created directly from `sentence_ids`.
+  paddle_ivector sentence = paddle_ivector_create(
+      sentence_ids, sizeof(sentence_ids) / sizeof(int), false, false);
+  CHECK(paddle_arguments_set_ids(in_args, 0, sentence));
+
+  // A single sequence spanning all ids: starts at 0, ends at the id count.
+  int seq_pos_array[] = {0, sizeof(sentence_ids) / sizeof(int)};
+
+  paddle_ivector seq_pos = paddle_ivector_create(
+      seq_pos_array, sizeof(seq_pos_array) / sizeof(int), false, false);
+
+  CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, seq_pos));
+
+  paddle_arguments out_args = paddle_arguments_create_none();
+  CHECK(paddle_gradient_machine_forward(machine,
+                                        in_args,
+                                        out_args,
+                                        /* isTrain */ false));
+  paddle_matrix prob = paddle_matrix_create_none();
+
+  CHECK(paddle_arguments_get_value(out_args, 0, prob));
+
+  paddle_real* array;
+
+  CHECK(paddle_matrix_get_row(prob, 0, &array));
+
+  printf("Prob: ");
+  for (int i = 0; i < 2; ++i) {
+    printf("%.2f ", array[i]);
+  }
+  printf("\n");
+
+  CHECK(paddle_matrix_destroy(prob));
+  CHECK(paddle_arguments_destroy(out_args));
+  CHECK(paddle_ivector_destroy(seq_pos));
+  CHECK(paddle_ivector_destroy(sentence));
+  CHECK(paddle_arguments_destroy(in_args));
+  CHECK(paddle_gradient_machine_destroy(machine));
+  // read_config() allocates the buffer with malloc(); release it last.
+  free(buf);
+
+  return 0;
+}
diff --git a/paddle/capi/examples/model_inference/sequence/trainer_config.py b/paddle/capi/examples/model_inference/sequence/trainer_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bbc7a909aa03950ce621efa43fa47d9cdd016f8
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sequence/trainer_config.py
@@ -0,0 +1,13 @@
+# Network configuration for the sequence example: word ids -> 64-dim
+# embedding -> LSTM -> last step of the sequence -> 2-way softmax.
+from paddle.trainer_config_helpers import *
+
+# Size of the 'sentence' input layer (presumably the vocabulary size --
+# confirm).
+WORD_DIM = 3000
+
+sentence = data_layer(name='sentence', size=WORD_DIM)
+# Embedding of size 64; its parameter is initialized within [0.5, 1.0].
+sentence_embedding = embedding_layer(
+    input=sentence,
+    size=64,
+    param_attr=ParameterAttribute(
+        initial_max=1.0, initial_min=0.5))
+lstm = simple_lstm(input=sentence_embedding, size=64)
+# Keep only the last element of the LSTM output sequence.
+lstm_last = last_seq(input=lstm)
+# Network output: fully connected layer of size 2 with softmax activation.
+outputs(fc_layer(input=lstm_last, size=2, act=SoftmaxActivation()))
diff --git a/paddle/capi/examples/model_inference/sparse_binary/.gitignore b/paddle/capi/examples/model_inference/sparse_binary/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..fab7372d796ea95c80d02df6caa7eb2b411a7ac1
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sparse_binary/.gitignore
@@ -0,0 +1,73 @@
+# This file is used to ignore files which are generated
+# ----------------------------------------------------------------------------
+
+*~
+*.autosave
+*.a
+*.core
+*.moc
+*.o
+*.obj
+*.orig
+*.rej
+*.so
+*.so.*
+*_pch.h.cpp
+*_resource.rc
+*.qm
+.#*
+*.*#
+core
+!core/
+tags
+.DS_Store
+.directory
+*.debug
+Makefile*
+*.prl
+*.app
+moc_*.cpp
+ui_*.h
+qrc_*.cpp
+Thumbs.db
+*.res
+*.rc
+/.qmake.cache
+/.qmake.stash
+
+# qtcreator generated files
+*.pro.user*
+
+# xemacs temporary files
+*.flc
+
+# Vim temporary files
+.*.swp
+
+# Visual Studio generated files
+*.ib_pdb_index
+*.idb
+*.ilk
+*.pdb
+*.sln
+*.suo
+*.vcproj
+*vcproj.*.*.user
+*.ncb
+*.sdf
+*.opensdf
+*.vcxproj
+*vcxproj.*
+
+# MinGW generated files
+*.Debug
+*.Release
+
+# Python byte code
+*.pyc
+
+# Binaries
+# --------
+*.dll
+*.exe
+
diff --git a/paddle/capi/examples/model_inference/sparse_binary/CMakeLists.txt b/paddle/capi/examples/model_inference/sparse_binary/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c82195688902ac70346fd5204fb14e28886fb51f
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sparse_binary/CMakeLists.txt
@@ -0,0 +1,7 @@
+cmake_minimum_required(VERSION 2.8)  # first, so policies are set for project()
+project(sparse_binary)
+aux_source_directory(. SRC_LIST)  # collects every source file in this directory
+add_executable(${PROJECT_NAME} ${SRC_LIST})
+find_package (Threads)
+set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
+target_link_libraries(${PROJECT_NAME} -lpaddle_capi_shared)
diff --git a/paddle/capi/examples/model_inference/sparse_binary/convert_protobin.sh b/paddle/capi/examples/model_inference/sparse_binary/convert_protobin.sh
new file mode 120000
index 0000000000000000000000000000000000000000..3c1b3533523cf1709720d11df7b8e311e0577fe7
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sparse_binary/convert_protobin.sh
@@ -0,0 +1 @@
+../dense/convert_protobin.sh
\ No newline at end of file
diff --git a/paddle/capi/examples/model_inference/sparse_binary/main.c b/paddle/capi/examples/model_inference/sparse_binary/main.c
new file mode 100644
index 0000000000000000000000000000000000000000..8ba67aee560239d3050c7f40198d20df99ec370e
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sparse_binary/main.c
@@ -0,0 +1,70 @@
+#include <paddle/capi.h>
+#include <time.h>
+#include "../common/common.h"
+
+#define CONFIG_BIN "./trainer_config.bin"
+
+int main() {
+  // Initialize Paddle (CPU only).
+  char* argv[] = {"--use_gpu=False"};
+  CHECK(paddle_init(1, (char**)argv));
+
+  // Reading config binary file. It is generated by `convert_protobin.sh`
+  long size;
+  void* buf = read_config(CONFIG_BIN, &size);
+
+  // Create a gradient machine for inference.
+  paddle_gradient_machine machine;
+  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
+  CHECK(paddle_gradient_machine_randomize_param(machine));
+
+  // Loading parameter. Uncomment the following line and change the directory.
+  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
+  //                                                "./some_where_to_params"));
+  paddle_arguments in_args = paddle_arguments_create_none();
+
+  // There is only one input of this network.
+  CHECK(paddle_arguments_resize(in_args, 1));
+
+  // Create input matrix: 1 x 784, sparse binary, 3 non-zero entries, on CPU.
+  paddle_matrix mat = paddle_matrix_create_sparse(1, 784, 3, true, false);
+  srand(time(0));  // NOTE(review): no rand() call follows here -- confirm need
+  paddle_real* array;
+  int colBuf[] = {9, 93, 109};
+  int rowBuf[] = {0, sizeof(colBuf) / sizeof(int)};
+  // CSR input: rowBuf slices colBuf per row; binary matrix, so no value array.
+  CHECK(paddle_matrix_sparse_copy_from(mat,
+                                       rowBuf,
+                                       sizeof(rowBuf) / sizeof(int),
+                                       colBuf,
+                                       sizeof(colBuf) / sizeof(int),
+                                       NULL,
+                                       0));
+
+  CHECK(paddle_arguments_set_value(in_args, 0, mat));
+
+  paddle_arguments out_args = paddle_arguments_create_none();
+  CHECK(paddle_gradient_machine_forward(machine,
+                                        in_args,
+                                        out_args,
+                                        /* isTrain */ false));
+  paddle_matrix prob = paddle_matrix_create_none();
+  // `prob` starts as an empty handle; get_value points it at output 0.
+  CHECK(paddle_arguments_get_value(out_args, 0, prob));
+
+  CHECK(paddle_matrix_get_row(prob, 0, &array));
+  // Print 10 entries of row 0 (assumes the output is >= 10 wide -- TODO confirm).
+  printf("Prob: ");
+  for (int i = 0; i < 10; ++i) {
+    printf("%.2f ", array[i]);
+  }
+  printf("\n");
+  // Release every handle created above.
+  CHECK(paddle_matrix_destroy(prob));
+  CHECK(paddle_arguments_destroy(out_args));
+  CHECK(paddle_matrix_destroy(mat));
+  CHECK(paddle_arguments_destroy(in_args));
+  CHECK(paddle_gradient_machine_destroy(machine));
+
+  return 0;
+}
diff --git a/paddle/capi/examples/model_inference/sparse_binary/trainer_config.py b/paddle/capi/examples/model_inference/sparse_binary/trainer_config.py
new file mode 120000
index 0000000000000000000000000000000000000000..70cfb1f7f4cfe9afa6ccbd6f2f419aa286970bbe
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sparse_binary/trainer_config.py
@@ -0,0 +1 @@
+../dense/trainer_config.py
\ No newline at end of file
diff --git a/paddle/capi/gradient_machine.cpp b/paddle/capi/gradient_machine.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..00f76e0152366834eafc22df710cf3d6c7b8471f
--- /dev/null
+++ b/paddle/capi/gradient_machine.cpp
@@ -0,0 +1,123 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gradient_machine.h"
+#include "capi_private.h"
+#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
+
+#define cast(v) paddle::capi::cast<paddle::capi::CGradientMachine>(v)
+// Mode values handed to paddle::GradientMachine::create in this file.
+enum GradientMatchineCreateMode {
+  CREATE_MODE_NORMAL = 0,
+  CREATE_MODE_TESTING = 4  // used when creating a machine for inference
+};
+
+namespace paddle {
+// NeuralNetwork subclass that only forwards its constructor arguments.
+class MyNeuralNetwork : public NeuralNetwork {
+public:
+  MyNeuralNetwork(const std::string& name, NeuralNetwork* network)
+      : NeuralNetwork(name, network) {}
+};
+// NOTE(review): looks like a factory hook called from outside this file -- confirm.
+NeuralNetwork* newCustomNerualNetwork(const std::string& name,
+                                      NeuralNetwork* network) {
+  return new MyNeuralNetwork(name, network);
+}
+}  // namespace paddle
+
+extern "C" {
+paddle_error paddle_gradient_machine_create_for_inference(
+    paddle_gradient_machine* machine, void* modelConfigProtobuf, int size) {
+  // `machine` is written below and the buffer is parsed, so both must be set.
+  if (!machine || !modelConfigProtobuf) return kPD_NULLPTR;
+  paddle::ModelConfig config;
+  const bool parsed = config.ParseFromArray(modelConfigProtobuf, size);
+  if (!parsed || !config.IsInitialized()) return kPD_PROTOBUF_ERROR;
+  // Owned by a unique_ptr until complete, so nothing leaks if create() throws.
+  std::unique_ptr<paddle::capi::CGradientMachine> ptr(
+      new paddle::capi::CGradientMachine());
+  ptr->machine.reset(paddle::GradientMachine::create(
+      config, CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE}));
+  *machine = ptr.release();  // hand ownership to the caller's handle
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_destroy(paddle_gradient_machine machine) {
+  delete cast(machine);  // frees the CGradientMachine wrapper behind the handle
+  return kPD_NO_ERROR;   // no failure path: success is always reported
+}
+
+paddle_error paddle_gradient_machine_load_parameter_from_disk(
+    paddle_gradient_machine machine, const char* path) {
+  // Needs a live handle, a wrapped machine and a path; otherwise kPD_NULLPTR.
+  auto* wrapper = cast(machine);
+  if (!wrapper || !path || !wrapper->machine) return kPD_NULLPTR;
+  wrapper->machine->loadParameters(path);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_forward(paddle_gradient_machine machine,
+                                             paddle_arguments inArgs,
+                                             paddle_arguments outArgs,
+                                             bool isTrain) {
+  auto* gm = cast(machine);
+  auto* input = paddle::capi::cast<paddle::capi::CArguments>(inArgs);
+  auto* output = paddle::capi::cast<paddle::capi::CArguments>(outArgs);
+  if (!gm || !input || !output || !gm->machine) return kPD_NULLPTR;
+  // Map the C-level flag onto the pass type that forward() takes.
+  const auto pass = isTrain ? paddle::PASS_TRAIN : paddle::PASS_TEST;
+  gm->machine->forward(input->args, &output->args, pass);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_create_shared_param(
+    paddle_gradient_machine origin,
+    void* modelConfigProtobuf,
+    int size,
+    paddle_gradient_machine* slave) {
+  // Builds a new machine from the config whose PARAMETER_VALUE buffers are
+  // shared with `origin`; every pointer argument is required.
+  auto o = cast(origin);
+  if (!o || !slave || !modelConfigProtobuf || !o->machine) return kPD_NULLPTR;
+  paddle::ModelConfig config;
+  if (!config.ParseFromArray(modelConfigProtobuf, size) ||
+      !config.IsInitialized()) {
+    return kPD_PROTOBUF_ERROR;
+  }
+  // Hand the network to the wrapper before init() so it is freed on a throw.
+  std::unique_ptr<paddle::capi::CGradientMachine> ptr(
+      new paddle::capi::CGradientMachine());
+  auto nn = paddle::NeuralNetwork::create(config);
+  ptr->machine.reset(nn);
+  nn->init(config,
+           [o](int paramId, paddle::Parameter* param) {
+             auto p = o->machine->getParameters()[paramId];
+             param->enableSharedType(paddle::PARAMETER_VALUE,
+                                     p->getBuf(paddle::PARAMETER_VALUE));
+           },
+           {paddle::PARAMETER_VALUE},
+           false);
+  *slave = ptr.release();
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_randomize_param(
+    paddle_gradient_machine machine) {
+  auto m = cast(machine);  // null handle or empty wrapper -> kPD_NULLPTR
+  if (m == nullptr || m->machine == nullptr) return kPD_NULLPTR;
+  m->machine->randParameters();  // delegates to the wrapped machine
+  return kPD_NO_ERROR;
+}
+}  // extern "C": closed after the last C API definition in this file
diff --git a/paddle/capi/gradient_machine.h b/paddle/capi/gradient_machine.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7e2dd9bf8037ed474971624d4518160604abe4d
--- /dev/null
+++ b/paddle/capi/gradient_machine.h
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_GRADIENT_MACHINE_H__
+#define __PADDLE_CAPI_GRADIENT_MACHINE_H__
+#include "arguments.h"
+#include "config.h"
+#include "error.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**
+ * @brief GradientMachine means a neural network.
+ */
+typedef void* paddle_gradient_machine;
+
+/**
+ * @brief Create a gradient machine used for model inference.
+ * @param [out] machine that used for model inference.
+ * @param [in] modelConfigProtobuf serialized paddle::ModelConfig buffer.
+ * @param [in] size length of modelConfigProtobuf in bytes.
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_gradient_machine_create_for_inference(
+    paddle_gradient_machine* machine, void* modelConfigProtobuf, int size);
+
+/**
+ * @brief Load parameter from disk.
+ * @param machine Gradient Machine.
+ * @param path local directory path.
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_gradient_machine_load_parameter_from_disk(
+    paddle_gradient_machine machine, const char* path);
+
+/**
+ * @brief Forward a gradient machine
+ * @param machine Gradient machine
+ * @param inArgs input arguments
+ * @param outArgs output arguments
+ * @param isTrain is train or not
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_gradient_machine_forward(paddle_gradient_machine machine,
+                                paddle_arguments inArgs,
+                                paddle_arguments outArgs,
+                                bool isTrain);
+
+/**
+ * @brief Create a gradient machine, which parameters are shared from another
+ *        gradient machine.
+ * @param [in] origin gradient machine
+ * @param [in] modelConfigProtobuf model config protobuf
+ * @param [in] size of model config buffer.
+ * @param [out] slave gradient machine, the output value.
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_gradient_machine_create_shared_param(paddle_gradient_machine origin,
+                                            void* modelConfigProtobuf,
+                                            int size,
+                                            paddle_gradient_machine* slave);
+/** @brief Randomize the parameters of the given gradient machine. */
+PD_API paddle_error
+paddle_gradient_machine_randomize_param(paddle_gradient_machine machine);
+
+/**
+ * @brief Destroy a gradient machine
+ * @param machine that need to destroy
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_gradient_machine_destroy(paddle_gradient_machine machine);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/paddle/capi/main.h b/paddle/capi/main.h
new file mode 100644
index 0000000000000000000000000000000000000000..893ebcbd58dd24cf835fb2005865c94c9ba2a810
--- /dev/null
+++ b/paddle/capi/main.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_MAIN_H__
+#define __PADDLE_CAPI_MAIN_H__
+#include "config.h"
+#include "error.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Initialize Paddle.
+ */
+PD_API paddle_error paddle_init(int argc, char** argv);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/paddle/capi/matrix.h b/paddle/capi/matrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..f15f7f3bbbd1457617111f827d2182ae6b7d9fdb
--- /dev/null
+++ b/paddle/capi/matrix.h
@@ -0,0 +1,125 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_MATRIX_H__
+#define __PADDLE_CAPI_MATRIX_H__
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "config.h"
+#include "error.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Matrix functions. Return will be a paddle_error type.
+ */
+typedef void* paddle_matrix;
+
+/**
+ * @brief paddle_matrix_create Create a dense matrix
+ * @param height matrix height.
+ * @param width matrix width
+ * @param useGpu use GPU or not
+ * @return Matrix handler
+ */
+PD_API paddle_matrix paddle_matrix_create(uint64_t height,
+                                          uint64_t width,
+                                          bool useGpu);
+
+/**
+ * @brief paddle_matrix_create_sparse Create a sparse matrix.
+ * @param height the matrix height.
+ * @param width the matrix width.
+ * @param nnz the number of non-zero elements.
+ * @param isBinary is binary (either 1 or 0 in matrix) or not.
+ * @param useGpu is using GPU or not.
+ * @return paddle_matrix.
+ */
+PD_API paddle_matrix paddle_matrix_create_sparse(
+    uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu);
+
+/**
+ * @brief paddle_matrix_destroy Destroy a matrix.
+ * @param mat
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_matrix_destroy(paddle_matrix mat);
+
+/**
+ * @brief paddle_matrix_set_row Set a row to matrix.
+ * @param mat Target Matrix
+ * @param rowID Index of row
+ * @param rowArray Row data.
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat,
+                                          uint64_t rowID,
+                                          paddle_real* rowArray);
+
+/**
+ * @brief paddle_matrix_get_row Get raw row buffer from matrix
+ * @param [in] mat Target matrix
+ * @param [in] rowID Index of row.
+ * @param [out] rawRowBuffer Row Buffer
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat,
+                                          uint64_t rowID,
+                                          paddle_real** rawRowBuffer);
+
+/**
+ * @brief paddle_matrix_create_none Create a none (empty) matrix handle.
+ * @return paddle_matrix handle; release it with paddle_matrix_destroy.
+ */
+PD_API paddle_matrix paddle_matrix_create_none(void);
+
+/**
+ * @brief paddle_matrix_get_shape Get the shape of matrix
+ * @param mat target matrix
+ * @param height The height of matrix
+ * @param width The width of matrix
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_matrix_get_shape(paddle_matrix mat,
+                                            uint64_t* height,
+                                            uint64_t* width);
+
+/**
+ * @brief paddle_matrix_sparse_copy_from Copy from a CSR format matrix
+ * @param [out] mat output matrix
+ * @param [in] rowArray row array. The array slices in column array.
+ * @param [in] rowSize length of row array.
+ * @param [in] colArray the column array. It means the non-zero element indices
+ * in each row.
+ * @param [in] colSize length of column array.
+ * @param [in] valueArray the value array. It means the non-zero element values.
+ * NULL if the matrix is binary.
+ * @param [in] valueSize length of value array. Zero if the matrix is binary.
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
+                                                   int* rowArray,
+                                                   uint64_t rowSize,
+                                                   int* colArray,
+                                                   uint64_t colSize,
+                                                   float* valueArray,
+                                                   uint64_t valueSize);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/paddle/capi/tests/.gitignore b/paddle/capi/tests/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..7ab6be95e397fa8f0339294a00c2f057bc116792
--- /dev/null
+++ b/paddle/capi/tests/.gitignore
@@ -0,0 +1,2 @@
+w
+b
diff --git a/paddle/capi/tests/CMakeLists.txt b/paddle/capi/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d73f6b7733950bd472a46afb21694aac943fc909
--- /dev/null
+++ b/paddle/capi/tests/CMakeLists.txt
@@ -0,0 +1,14 @@
+add_unittest(capi_test_mats test_Vector.cpp
+  test_Matrix.cpp test_Arguments.cpp)
+# Both test binaries add PADDLE_CAPI_INC_PATH to their include path.
+target_include_directories(capi_test_mats PUBLIC ${PADDLE_CAPI_INC_PATH})
+target_link_libraries(capi_test_mats paddle_capi)
+
+# Registered by hand: runs through .set_python_path.sh from the test source dir.
+add_unittest_without_exec(capi_test_gradientMachine test_GradientMachine.cpp)
+target_include_directories(capi_test_gradientMachine PUBLIC
+  ${PADDLE_CAPI_INC_PATH})
+target_link_libraries(capi_test_gradientMachine paddle_capi)
+add_test(NAME capi_test_gradientMachine
+  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/capi_test_gradientMachine
+  WORKING_DIRECTORY ${PROJ_ROOT}/paddle/capi/tests)
diff --git a/paddle/capi/tests/test_Arguments.cpp b/paddle/capi/tests/test_Arguments.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4792ceb49a7816f47ebf9b653d7f34e08f4a85bf
--- /dev/null
+++ b/paddle/capi/tests/test_Arguments.cpp
@@ -0,0 +1,129 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <functional>
+#include "capi.h"
+#include "gtest/gtest.h"
+#include "paddle/utils/ThreadLocal.h"
+
+static std::vector<paddle_real> randomBuffer(size_t bufSize) {
+  // Returns bufSize samples drawn from uniform_real_distribution(-1.0, 1.0)
+  // using paddle's thread-local random engine.
+  std::uniform_real_distribution<paddle_real> uniform(-1.0, 1.0);
+  auto& engine = paddle::ThreadLocalRandomEngine::get();
+  std::vector<paddle_real> samples(bufSize);
+  // Slots are filled front to back, so draws happen in index order.
+  for (auto& sample : samples) sample = uniform(engine);
+  return samples;
+}
+
+TEST(CAPIArguments, create) {
+  //! TODO(yuyang18): Test GPU Code.
+  paddle_arguments args = paddle_arguments_create_none();
+  uint64_t size;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_size(args, &size));
+  ASSERT_EQ(0UL, size);  // a freshly created arguments handle has no slots
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
+}
+
+TEST(CAPIArguments, value) {
+  paddle_arguments args = paddle_arguments_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
+  // Fill a 128 x 64 CPU matrix with random rows and store it in slot 0.
+  paddle_matrix mat = paddle_matrix_create(128, 64, false);
+  for (size_t i = 0; i < 128; ++i) {
+    std::vector<paddle_real> sampleBuf = randomBuffer(64);
+    ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_row(mat, i, sampleBuf.data()));
+  }
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_value(args, 0, mat));
+
+  paddle_matrix val = paddle_matrix_create_none();
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_value(args, 0, val));
+  // The handle read back must expose the very same row buffers as `mat`.
+  for (size_t i = 0; i < 128; ++i) {
+    paddle_real* row1;
+    paddle_real* row2;
+
+    ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, i, &row1));
+    ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(val, i, &row2));
+    ASSERT_EQ(row1, row2);
+  }
+
+  paddle_ivector ivec = paddle_ivector_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(val));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
+}
+
+TEST(CAPIArguments, ids) {
+  paddle_arguments args = paddle_arguments_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
+  // Store a 3-element int vector as the ids of slot 0.
+  paddle_ivector ivec;
+  int array[3] = {1, 2, 3};
+  ivec = paddle_ivector_create(array, 3, true, false);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_ids(args, 0, ivec));
+  // Only the status code of the read-back is checked, not the contents.
+  paddle_ivector val = paddle_ivector_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_ids(args, 0, val));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(val));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
+}
+
+template <typename T1, typename T2>
+void testSequenceHelper(T1 setter, T2 getter) {
+  paddle_arguments args = paddle_arguments_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
+  // Round-trip a 3-element vector through setter(args, 0, .)/getter(args, 0, .).
+  paddle_ivector ivec;
+  int array[3] = {1, 2, 3};
+  ivec = paddle_ivector_create(array, 3, true, false);
+  ASSERT_EQ(kPD_NO_ERROR, setter(args, 0, ivec));
+
+  paddle_ivector val = paddle_ivector_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, getter(args, 0, val));
+  uint64_t size;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get_size(val, &size));
+  ASSERT_EQ(3UL, size);  // keeps the array[i] reads below in bounds
+  int* rawBuf;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get(val, &rawBuf));
+  for (size_t i = 0; i < size; ++i) {
+    ASSERT_EQ(array[i], rawBuf[i]);
+  }
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(val));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
+}
+
+TEST(CAPIArguments, Sequence) {
+  auto testSequence = [](uint32_t nestedLevel) {  // fixes arg 3 of both calls
+    testSequenceHelper(std::bind(paddle_arguments_set_sequence_start_pos,
+                                 std::placeholders::_1,
+                                 std::placeholders::_2,
+                                 nestedLevel,
+                                 std::placeholders::_3),
+                       std::bind(paddle_arguments_get_sequence_start_pos,
+                                 std::placeholders::_1,
+                                 std::placeholders::_2,
+                                 nestedLevel,
+                                 std::placeholders::_3));
+  };
+  for (uint32_t i = 0; i < 2; ++i) {  // test seq and sub-seq.
+    testSequence(i);
+  }
+}
diff --git a/paddle/capi/tests/test_GradientMachine.cpp b/paddle/capi/tests/test_GradientMachine.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..89aa64608dd79ea8a8f5add724d9ea79e5abff16
--- /dev/null
+++ b/paddle/capi/tests/test_GradientMachine.cpp
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/gserver/gradientmachines/GradientMachine.h>
+#include <paddle/trainer/TrainerConfigHelper.h>
+#include <stdlib.h>
+#include <string.h>
+#include <type_traits>
+#include "capi.h"
+#include "paddle/utils/ThreadLocal.h"
+
+static std::vector<paddle_real> randomBuffer(size_t bufSize) {
+  // Returns bufSize samples drawn from uniform_real_distribution(-1.0, 1.0)
+  // using paddle's thread-local random engine.
+  std::uniform_real_distribution<paddle_real> uniform(-1.0, 1.0);
+  auto& engine = paddle::ThreadLocalRandomEngine::get();
+  std::vector<paddle_real> samples(bufSize);
+  // Slots are filled front to back, so draws happen in index order.
+  for (auto& sample : samples) sample = uniform(engine);
+  return samples;
+}
+
+TEST(GradientMachine, testPredict) {
+  //! TODO(yuyang18): Test GPU Code.
+  paddle::TrainerConfigHelper config("./test_predict_network.py");
+  std::string buffer;
+  ASSERT_TRUE(config.getModelConfig().SerializeToString(&buffer));
+  paddle_gradient_machine machine;
+  // C-API machine built from the serialized ModelConfig.
+  ASSERT_EQ(kPD_NO_ERROR,
+            paddle_gradient_machine_create_for_inference(
+                &machine, &buffer[0], (int)buffer.size()));
+  std::unique_ptr<paddle::GradientMachine> gm(
+      paddle::GradientMachine::create(config.getModelConfig()));
+  ASSERT_NE(nullptr, gm);
+  gm->randParameters();
+  gm->saveParameters("./");
+  // Load the parameters that the reference machine `gm` just saved to "./".
+  ASSERT_EQ(kPD_NO_ERROR,
+            paddle_gradient_machine_load_parameter_from_disk(machine, "./"));
+  // Create a second machine sharing parameters; the swap makes it `machine`.
+  paddle_gradient_machine machineSlave;
+  ASSERT_EQ(kPD_NO_ERROR,
+            paddle_gradient_machine_create_shared_param(
+                machine, &buffer[0], (int)buffer.size(), &machineSlave));
+  std::swap(machineSlave, machine);
+  paddle_arguments outArgs = paddle_arguments_create_none();
+
+  paddle_arguments inArgs = paddle_arguments_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(inArgs, 1));
+  paddle_matrix mat = paddle_matrix_create(1, 100, false);
+  static_assert(std::is_same<paddle_real, paddle::real>::value, "");
+  // Write the random sample straight into the matrix's row-0 buffer.
+  auto data = randomBuffer(100);
+  paddle_real* rowPtr;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr));
+  memcpy(rowPtr, data.data(), data.size() * sizeof(paddle_real));
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_value(inArgs, 0, mat));
+  ASSERT_EQ(kPD_NO_ERROR,
+            paddle_gradient_machine_forward(machine, inArgs, outArgs, false));
+
+  uint64_t sz;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_size(outArgs, &sz));
+  ASSERT_EQ(1UL, sz);
+  // From here on `mat` refers to output 0 of the C-API forward pass.
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_value(outArgs, 0, mat));
+  std::vector<paddle::Argument> paddleInArgs;
+  std::vector<paddle::Argument> paddleOutArgs;
+  paddleInArgs.resize(1);
+  paddleInArgs[0].value =
+      paddle::Matrix::create(data.data(), 1, 100, false, false);
+  // Run the same input through the C++ machine `gm` as the reference.
+  gm->forward(paddleInArgs, &paddleOutArgs, paddle::PASS_TEST);
+
+  auto matPaddle = paddleOutArgs[0].value;
+
+  uint64_t height, width;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
+  ASSERT_EQ(matPaddle->getHeight(), height);
+  ASSERT_EQ(matPaddle->getWidth(), width);
+  // The first `width` values of both outputs must agree within 1e-5.
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr));
+  for (size_t i = 0; i < width; ++i) {
+    ASSERT_NEAR(matPaddle->getData()[i], rowPtr[i], 1e-5);
+  }
+  // Tear down; the second swap restores the original handle names.
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(inArgs));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(outArgs));
+  std::swap(machineSlave, machine);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machineSlave));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machine));
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  // paddle_init takes mutable char*, so pass heap copies and free them after.
+  std::vector<char*> argvs;
+  argvs.push_back(strdup("--use_gpu=false"));
+  paddle_error err = paddle_init((int)argvs.size(), argvs.data());
+  for (auto each : argvs) free(each);
+  // Do not run the suite if Paddle failed to initialize; report failure.
+  return err == kPD_NO_ERROR ? RUN_ALL_TESTS() : 1;
+}
diff --git a/paddle/capi/tests/test_Matrix.cpp b/paddle/capi/tests/test_Matrix.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4bf9a9d6a9f9161561e9e5612edd2c93cab7ac5b
--- /dev/null
+++ b/paddle/capi/tests/test_Matrix.cpp
@@ -0,0 +1,47 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "capi.h"
+#include "gtest/gtest.h"
+
+TEST(CAPIMatrix, create) {
+  //! TODO(yuyang18): Test GPU Code.
+  paddle_matrix mat = paddle_matrix_create(128, 32, false);
+  std::vector<paddle_real> sampleRow;
+  sampleRow.resize(32);
+  for (size_t i = 0; i < sampleRow.size(); ++i) {
+    sampleRow[i] = 1.0 / (i + 1.0);
+  }
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_row(mat, 0, sampleRow.data()));
+  ASSERT_EQ(kPD_OUT_OF_RANGE,  // row index 128 is one past the last row
+            paddle_matrix_set_row(mat, 128, sampleRow.data()));
+  // Read row 0 back and compare it with what was written.
+  paddle_real* arrayPtr;
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &arrayPtr));
+  for (size_t i = 0; i < sampleRow.size(); ++i) {
+    ASSERT_NEAR(sampleRow[i], arrayPtr[i], 1e-5);
+  }
+  // The reported shape must match the 128 x 32 requested at creation.
+  uint64_t height, width;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
+  ASSERT_EQ(128UL, height);
+  ASSERT_EQ(32UL, width);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+}
+
+TEST(CAPIMatrix, createNone) {
+  paddle_matrix mat = paddle_matrix_create_none();  // handle with no data set
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));  // must still destroy
+}
diff --git a/paddle/capi/tests/test_Vector.cpp b/paddle/capi/tests/test_Vector.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..365160dc9a08e6b6fc07fb685d5149d1e078da9b
--- /dev/null
+++ b/paddle/capi/tests/test_Vector.cpp
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "capi.h"
+#include "gtest/gtest.h"
+
+TEST(CAPIVector, create) {
+  //! TODO(yuyang18): Test GPU Code.
+  int array[3] = {1, 2, 3};
+  paddle_ivector vec = paddle_ivector_create(array, 3, true, false);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_resize(vec, 1000));
+  uint64_t size;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get_size(vec, &size));
+  ASSERT_EQ(1000UL, size);  // the resize must be reflected by get_size
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(vec));
+}
+
+TEST(CAPIVector, createNone) {
+  paddle_ivector vec = paddle_ivector_create_none();  // handle with no data set
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(vec));  // must still destroy
+}
diff --git a/paddle/capi/tests/test_predict_network.py b/paddle/capi/tests/test_predict_network.py
new file mode 100644
index 0000000000000000000000000000000000000000..82ef5cb1a70398df65ace3c802076743c3ebe341
--- /dev/null
+++ b/paddle/capi/tests/test_predict_network.py
@@ -0,0 +1,13 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=100)
+# Minimal network: a size-100 data layer feeding one size-100 fc layer.
+x = data_layer(name='x', size=100)
+
+y = fc_layer(
+    input=x,
+    size=100,
+    bias_attr=ParamAttr(name='b'),  # bias parameter is named 'b'
+    param_attr=ParamAttr(name='w'))  # layer parameter is named 'w'
+
+outputs(y)
diff --git a/paddle/capi/vector.h b/paddle/capi/vector.h
new file mode 100644
index 0000000000000000000000000000000000000000..a92aeff16425779bf63a7ffd7217709b6bf3cd05
--- /dev/null
+++ b/paddle/capi/vector.h
@@ -0,0 +1,89 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_VECTOR_H__
+#define __PADDLE_CAPI_VECTOR_H__
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "config.h"
+#include "error.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Opaque int vector handle. Most functions below return a paddle_error.
+ */
+typedef void* paddle_ivector;
+
+/**
+ * @brief Create an empty int vector. It is just a handle and stores nothing.
+ *        Used to receive output from other APIs.
+ * @return The empty int vector handle.
+ */
+PD_API paddle_ivector paddle_ivector_create_none();
+
+/**
+ * @brief paddle_ivector_create create a paddle int vector.
+ * @param array: input array.
+ * @param size: input array size.
+ * @param copy: memory copy or just use same memory. True if copy.
+ * @param useGPU: True to use GPU.
+ * @return the created int vector handle.
+ */
+PD_API paddle_ivector paddle_ivector_create(int* array,
+                                            uint64_t size,
+                                            bool copy,
+                                            bool useGPU);
+
+/**
+ * @brief paddle_ivector_destroy destroy an int vector.
+ * @param ivec vector to be destroyed.
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_ivector_destroy(paddle_ivector ivec);
+
+/**
+ * @brief paddle_ivector_get get raw buffer stored inside this int vector. It
+ * could be GPU memory if this int vector is stored in GPU.
+ * @param [in] ivec int vector
+ * @param [out] buffer the returned buffer pointer.
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_ivector_get(paddle_ivector ivec, int** buffer);
+
+/**
+ * @brief paddle_ivector_resize resize the int vector.
+ * @param [in] ivec: int vector
+ * @param [in] size: the new size
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_ivector_resize(paddle_ivector ivec, uint64_t size);
+
+/**
+ * @brief paddle_ivector_get_size get the size of int vector.
+ * @param [in] ivec: int vector
+ * @param [out] size: receives the size of this int vector.
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_ivector_get_size(paddle_ivector ivec,
+                                            uint64_t* size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp
index 0f5d6a848d406d14984a0b6edad8192dab42e88b..1b25172ca5c0c4e64db01806fb8239af7e06d90d 100644
--- a/paddle/function/ContextProjectionOpTest.cpp
+++ b/paddle/function/ContextProjectionOpTest.cpp
@@ -28,11 +28,12 @@ void testMatrixProjectionForward(int context_start,
                std::max(0, (int)(context_start + context_length - 1));
   if (pad == 0) is_padding = false;
 
-  FunctionCompare test("ContextProjectionForward",
-                       FuncConfig()
-                           .set("context_length", context_length)
-                           .set("context_start", context_start)
-                           .set("begin_pad", std::max(0, -context_start)));
+  FunctionCompare test(
+      "ContextProjectionForward",
+      FuncConfig()
+          .set("context_length", context_length)
+          .set("context_start", context_start)
+          .set("begin_pad", (size_t)std::max(0, -context_start)));
 
   // prepare input arguments
   test.addSequence(SequenceIdArg(TensorShape{batch_size}));
@@ -51,7 +52,7 @@ void testMatrixProjectionForward(int context_start,
 }
 
 void testMatrixProjectionBackward(int context_start,
-                                  int context_length,
+                                  size_t context_length,
                                   bool is_padding,
                                   size_t batch_size,
                                   size_t input_dim) {
@@ -59,13 +60,14 @@ void testMatrixProjectionBackward(int context_start,
                std::max(0, (int)(context_start + context_length - 1));
   if (pad == 0) is_padding = false;
 
-  FunctionCompare test("ContextProjectionBackward",
-                       FuncConfig()
-                           .set("context_length", context_length)
-                           .set("context_start", context_start)
-                           .set("begin_pad", std::max(0, -context_start))
-                           .set("is_padding", is_padding)
-                           .set("total_pad", pad));
+  FunctionCompare test(
+      "ContextProjectionBackward",
+      FuncConfig()
+          .set("context_length", context_length)
+          .set("context_start", context_start)
+          .set("begin_pad", (size_t)std::max(0, -context_start))
+          .set("is_padding", is_padding)
+          .set("total_pad", pad));
 
   // prepare input arguments
   test.addSequence(SequenceIdArg(TensorShape{batch_size}));
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
index 6ae60102b3e431727c0954e8b8073bfe0534f8ee..3159026e6b92355ba7480b09535388c969a504e2 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -518,7 +518,7 @@ void TrainerThread::computeThread() {
         backward();
         break;
       case MultiGradientMachine::TASK_COPY_IN_ARGS:
-        copyInArgs();
+        batchSize_ = copyInArgs();
         inArgsCopied_ = true;
         multiMachine_->waitForCopyInArgs();
         break;
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index 4ae5b828707eb8412e98cbefcf3949d62e81ad1e..69d5830dd2a1afb93948eacec1cb4309cf8c6109 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -217,7 +217,7 @@ void SmoothL1CostLayer::forwardImp(Matrix& output,
     targetCpu->copyFrom(target);
     outputCpu->copyFrom(output);
     labelCpu->copyFrom(*label.value);
-    targetCpu->smoothL1(*outputCpu, *(labelCpu));
+    targetCpu->smoothL1(*outputCpu, *labelCpu);
     target.copyFrom(*targetCpu);
   } else {
     target.smoothL1(output, *label.value);
diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h
index 569a6840f0d4432cc827219f590b821df115c7ea..14c0b33ec1a628521ae2d694dda8da553c29fd38 100644
--- a/paddle/gserver/layers/CostLayer.h
+++ b/paddle/gserver/layers/CostLayer.h
@@ -164,9 +164,11 @@ public:
  * tasks.
  * \f[
  * L =
- *   (output - label)^2 * 0.5  / -1 < (output - label) < 1 /
- *   (output - label) - 0.5    / otherwise  /
+ *   0.5 * x^2    / |x| < 1 /
+ *   |x| - 0.5    / otherwise /
  * \f]
+ *
+ * x = output - label
  */
 class SmoothL1CostLayer : public CostLayer {
 public:
diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h
index 7c4bea072157aac17787afab184b51c09ff656f2..47182c9ecc695f4d79089d06d6a1a61b878ce409 100644
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -14,20 +14,18 @@ limitations under the License. */
 
 #pragma once
 
-#include <paddle/parameter/Argument.h>
 #include <functional>
 #include <memory>
 #include "ModelConfig.pb.h"
 #include "paddle/function/Function.h"
+#include "paddle/gserver/activations/ActivationFunction.h"
 #include "paddle/math/CpuSparseMatrix.h"
+#include "paddle/parameter/Argument.h"
 #include "paddle/parameter/Parameter.h"
+#include "paddle/parameter/Weight.h"
 #include "paddle/utils/ClassRegistrar.h"
 #include "paddle/utils/Util.h"
 
-#include <paddle/parameter/ParallelParameter.h>
-#include <paddle/parameter/Weight.h>
-#include "paddle/gserver/activations/ActivationFunction.h"
-
 /// Macro for registering a layer type.
 /// Example: REGISTER_LAYER(crf_error, CRFDecodingErrorLayer);
 #define REGISTER_LAYER(__type_name, __class_name) \
diff --git a/paddle/gserver/layers/TransLayer.cpp b/paddle/gserver/layers/TransLayer.cpp
index d1fa90f38415c53bd1c56df4a6c4be0508004bc6..4150f1727d8a1a3c1ed21b01944040977d2db315 100644
--- a/paddle/gserver/layers/TransLayer.cpp
+++ b/paddle/gserver/layers/TransLayer.cpp
@@ -56,7 +56,14 @@ void TransLayer::backward(const UpdateCallback& callback) {
     return;
   }
   MatrixPtr preGrad = getInputGrad(0);
-  outputGrad->transpose(preGrad, false);
+  if (preGrad) {
+    MatrixPtr transGrad = Matrix::create(preGrad->getHeight(),
+                                         preGrad->getWidth(),
+                                         /* trans= */ false,
+                                         preGrad->useGpu());
+    outputGrad->transpose(transGrad, false);
+    preGrad->add(*transGrad);
+  }
 }
 
 }  // namespace paddle
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 193b876c31626cee2e6763b3ab540e1a808fe1b0..e1e8e7fae7ca4c96206d60703db1f35aa1196875 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1679,13 +1679,29 @@ TEST(Layer, smooth_l1) {
   TestConfig config;
   config.layerConfig.set_type("smooth_l1");
 
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 1, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 200, 0});
+  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 200, 0});
+  config.layerConfig.add_inputs();
   config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "smooth_l1", 100, false, useGpu, false);
+  }
+}
+
+TEST(Layer, TransLayer) {
+  TestConfig config;
+  const int height = 128;
+  const int width = 1028;
+  config.layerConfig.set_type("trans");
+  config.layerConfig.set_size(width);
+
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_0", /* dim= */ height * width, /* paraSize= */ 0});
   config.layerConfig.add_inputs();
 
   for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "smooth_l1", 100, false, useGpu, false, 2.0);
+    testLayerGrad(config, "trans", height, /* trans= */ false, useGpu);
   }
 }
 
diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
index 150850da4d49a2320acc70ed370cf8728d5c9def..4a846397e6cf3100f948af46874b0739e32bf4a5 100644
--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 #include <paddle/gserver/gradientmachines/GradientMachine.h>
+#include <paddle/parameter/ParameterUpdateFunctions.h>
 #include <paddle/trainer/Trainer.h>
 #include <paddle/trainer/TrainerInternal.h>
 #include <paddle/utils/PythonUtil.h>
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 55a7344495f8e57dc95095ab1b81b45008fa9acc..6ac61be0bf1b7a4e308705617faf5af2886a4082 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -3616,17 +3616,18 @@ void CpuMatrix::smoothL1(Matrix& output, Matrix& label) {
   CHECK_EQ(output.getHeight(), numSamples);
   CHECK_EQ(label.getWidth(), dim);
   CHECK_EQ(getWidth(), (size_t)1);
-  real* out = output.getData();
+
   real* cost = getData();
+  real* out = output.getData();
   real* lbl = label.getData();
 
-  for (size_t i = 0; i < numSamples; ++i, out += dim, cost += dim, lbl += dim) {
+  for (size_t i = 0; i < numSamples; ++i, out += dim, lbl += dim) {
     for (size_t j = 0; j < dim; ++j) {
-      cost[j] = std::fabs(out[j] - lbl[j]);
-      if (cost[j] < 1.0)
-        cost[j] = 0.5 * cost[j] * cost[j];
+      real absVal = std::fabs(out[j] - lbl[j]);
+      if (absVal < 1.0)
+        cost[i] += 0.5 * absVal * absVal;
       else
-        cost[j] = cost[j] - 0.5;
+        cost[i] += absVal - 0.5;
     }
   }
 }
@@ -3640,17 +3641,20 @@ void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label) {
   CHECK_EQ(label.getHeight(), numSamples);
   CHECK_EQ(output.getHeight(), numSamples);
   CHECK_EQ(label.getWidth(), dim);
-  CHECK_EQ(getWidth(), (size_t)1);
+  CHECK_EQ(getWidth(), dim);
+
   real* out = output.getData();
-  real* cost = getData();
   real* lbl = label.getData();
+  real* grad = getData();
 
-  // f'(x) = x         if |x| < 1
-  //       = sign(x)   otherwise
-  for (size_t i = 0; i < numSamples; ++i, out += dim, cost += dim, lbl += dim) {
+  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim, lbl += dim) {
     for (size_t j = 0; j < dim; ++j) {
-      cost[j] = out[j] - lbl[j];
-      if (std::fabs(cost[j]) >= 1) cost[j] = (0 < cost[j]) - (cost[j] < 0);
+      real val = out[j] - lbl[j];
+      if (std::fabs(val) < 1) {
+        grad[j] += val;
+      } else {
+        grad[j] += (real(0) < val) - (val < real(0));
+      }
     }
   }
 }
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index dd19fe516fbf724a86479e6f27032614ab4c6106..5210fe3fa1f3e221d7025edbc8a511d74ddaed51 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -235,8 +235,10 @@ TEST(Matrix, unary) {
       testMatrixTranspose(height, width);
       testMatrixRotate(height, width);
     }
-    // inverse
+// inverse
+#ifdef PADDLE_USE_LAPACK
     testMatrixInverse(height);
+#endif
   }
 }
 
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index 645bf737990638df042723ed827d0823cb201e72..6d9365af2d14673146d9e427138bf6dd5f5b41b6 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -570,7 +570,7 @@ void Argument::poolSequenceWithStride(const Argument& input,
 
   CHECK(input.sequenceStartPositions);
   CHECK_EQ(input.hasSubseq(), 0UL);
-  CHECK_GT(stride, 0) << "stride must larger than 0";
+  CHECK_GT(stride, 0UL) << "stride must larger than 0";
   size_t numSequences = input.getNumSequences();
   ICpuGpuVector::resizeOrCreate(
       sequenceStartPositions, numSequences + 1, false);
diff --git a/paddle/parameter/ParallelParameter.cpp b/paddle/parameter/ParallelParameter.cpp
deleted file mode 100644
index cea77e5b1787c25ecb9ccd42e948bf90973fd4cb..0000000000000000000000000000000000000000
--- a/paddle/parameter/ParallelParameter.cpp
+++ /dev/null
@@ -1,209 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fstream>
-#include "paddle/utils/Logging.h"
-
-#include "ParallelParameter.h"
-
-namespace paddle {
-
-UpdateFunction paramUpdateFunctions[UPDATE_TYPE_NUM] = {
-    nullptr,  // &ParallelParameter::singleUpdate,  /* single thread */
-    nullptr,  // &ParallelParameter::controlUpdate,    /* controller thread */
-    &ParallelParameter::majorUpdate, /* major thread */
-    &ParallelParameter::minorUpdate, /* minor thread */
-
-    nullptr,                         /* master */
-    &ParallelParameter::slaveUpdate, /* slave */
-};
-ParallelParameterPtr ParallelParameter::create(TrainerRole role,
-                                               ParameterPtr localParam,
-                                               int asyncCount) {
-  ParallelParameterPtr ptr = nullptr;
-  switch (role) {
-    case TRAINER_ROLE_CONTROL:
-    case TRAINER_ROLE_MAJOR:
-    case TRAINER_ROLE_MINOR:
-      ptr = std::make_shared<SyncParameter>(role, localParam);
-      break;
-    case TRAINER_ROLE_MASTER:
-    case TRAINER_ROLE_SLAVE:
-      ptr = std::make_shared<AsyncParameter>(role, asyncCount, localParam);
-      break;
-    default:
-      LOG(FATAL) << "unknown role " << role << "\n";
-  }
-  return ptr;
-}
-void ParallelParameter::syncUpdate(TrainerRole role, real learnRate) {
-  if (paramUpdateFunctions[role]) {
-    (this->*paramUpdateFunctions[role])(learnRate);
-  }
-}
-
-void SyncParameter::attachControlParam(ParallelParameterPtr controler) {
-  controlParam_ = controler;
-}
-
-void SyncParameter::attachMajorParam(ParallelParameterPtr partner) {
-  majorPartners_.push_back(partner);
-  if (role_ == TRAINER_ROLE_CONTROL) {
-    localParam_->setSharedCount(majorPartners_.size());
-  }
-  // partnerParam_ = partner;
-}
-
-void SyncParameter::attachMinorParam(ParallelParameterPtr partner,
-                                     int deviceId) {
-  minorPartners_.push_back(partner);
-  minorDeviceIds_.push_back(deviceId);
-  // partnerParam_ = partner;
-}
-
-void SyncParameter::waitAllMajorGradReady() {
-  for (size_t i = 0; i < majorPartners_.size(); i++) {
-    majorPartners_[i]->waitGradReady();
-    partnerParam_ = majorPartners_[i]->getLocalParameter();
-    VectorPtr localGrad = localParam_->getBuf(PARAMETER_GRADIENT);
-    VectorPtr patnrGrad = partnerParam_->getBuf(PARAMETER_GRADIENT);
-    if (FLAGS_use_gpu) hl_set_device(minorDeviceIds_[i]);
-    localGrad->add(*patnrGrad);
-  }
-}
-
-void SyncParameter::synchronizeParamter() {
-  valueSem_->wait();
-  if (role_ == TRAINER_ROLE_MINOR) {
-    /* copy the value from controller */
-    VectorPtr cntrlVec =
-        (controlParam_->getLocalParameter())->getBuf(PARAMETER_VALUE);
-    VectorPtr localVec = localParam_->getBuf(PARAMETER_VALUE);
-    localVec->copyFrom(*cntrlVec);
-
-    /* dispatch the value to major */
-    for (size_t i = 0; i < majorPartners_.size(); i++) {
-      VectorPtr majorVec =
-          (majorPartners_[i]->getLocalParameter())->getBuf(PARAMETER_VALUE);
-      majorVec->copyFrom(*localVec);
-      majorPartners_[i]->postValueReady();
-    }
-  }
-}
-
-void SyncParameter::singleUpdate(real learnRate) {
-  CHECK(role_ == TRAINER_ROLE_SINGLE);
-  localParam_->updateWithGradient(learnRate);
-}
-
-void SyncParameter::controlUpdate(const UpdateCallback &callBack) {
-  CHECK(role_ == TRAINER_ROLE_CONTROL);
-  CHECK(gradSem_ != NULL && valueSem_ != NULL);
-  CHECK(majorPartners_.size());
-
-  /* update */
-  if (callBack) {
-    callBack(localParam_.get());
-    localParam_->clearGradient();
-  }
-
-  for (size_t i = 0; i < minorPartners_.size(); i++) {
-    minorPartners_[i]->postValueReady();
-  }
-}
-
-void SyncParameter::majorUpdate(real learnRate) {
-  (void)learnRate;
-  CHECK(role_ == TRAINER_ROLE_MAJOR);
-  CHECK(gradSem_ != NULL && valueSem_ != NULL);
-  CHECK(minorPartners_.size() && controlParam_);
-
-  /* wait the minor-Gradient is ready */
-  for (size_t i = 0; i < minorPartners_.size(); i++) {
-    minorPartners_[i]->waitGradReady();
-    partnerParam_ = minorPartners_[i]->getLocalParameter();
-    VectorPtr localGrad = localParam_->getBuf(PARAMETER_GRADIENT);
-    VectorPtr minorGrad = partnerParam_->getBuf(PARAMETER_GRADIENT);
-    localGrad->add(*minorGrad);
-  }
-
-  /* notice the controller that the gradient is ready */
-  gradSem_->post();
-}
-
-void SyncParameter::minorUpdate(real learnRate) {
-  (void)learnRate;
-  CHECK(role_ == TRAINER_ROLE_MINOR);
-  CHECK(gradSem_ != NULL && valueSem_ != NULL);
-
-  // notice the major that the gradient is ready
-  gradSem_->post();
-}
-
-AsyncParameter::AsyncParameter(TrainerRole role,
-                               int asyncCount,
-                               ParameterPtr localParam)
-    : ParallelParameter(role, localParam) {
-  asyncCount_ = asyncCount;
-  accumCounter_ = 0;
-  gradientAccum_ = Vector::create(localParam->getSize(), localParam->useGpu());
-  gradientAccum_->zeroMem();
-}
-
-void AsyncParameter::slaveUpdate(real learnRate) {
-  /* increase the accumCounter_ */
-  accumCounter_++;
-
-  /* accumulate the gradient to the buffer */
-  VectorPtr grad = localParam_->getBuf(PARAMETER_GRADIENT);
-  gradientAccum_->add(*grad);
-
-  /* if need to be synchronized with the master */
-  if (accumCounter_ == asyncCount_) {
-    gradSem_->post();
-    // accumCounter_ = 0; NOTICE: the upper-function need to reset the counter
-  } else {  // self update
-    localParam_->updateWithGradient(learnRate);
-  }
-  localParam_->clearGradient();
-}
-
-bool AsyncParameter::masterUpdate(ParallelParameterPtr slaveParam,
-                                  const UpdateCallback &callback) {
-  CHECK(slaveParam && callback);
-
-  /* wait the slave is ready */
-  if (!slaveParam->timeWaitGradReady(5)) {
-    return false;
-  }
-
-  AsyncParameter *asyncParam = dynamic_cast<AsyncParameter *>(slaveParam.get());
-
-  /* get the accum-gradient to update local parameter */
-  VectorPtr slaveVec = asyncParam->getAccum();
-  localParam_->getBuf(PARAMETER_GRADIENT)->copyFrom(*slaveVec);
-  callback(localParam_.get());
-  // slaveVec->zeroMem();
-
-  /* copy the newest parameter-value to the slave */
-  slaveVec = (slaveParam->getLocalParameter())->getBuf(PARAMETER_VALUE);
-  slaveVec->copyFrom(*(localParam_->getBuf(PARAMETER_VALUE)));
-
-  /* release the semphore */
-  slaveParam->postValueReady();
-
-  return true;
-}
-
-}  // namespace paddle
diff --git a/paddle/parameter/ParallelParameter.h b/paddle/parameter/ParallelParameter.h
deleted file mode 100644
index 2e7c18b8084dc25b9f2f7630390bb4553ac703c9..0000000000000000000000000000000000000000
--- a/paddle/parameter/ParallelParameter.h
+++ /dev/null
@@ -1,244 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-
-#include <sys/time.h>
-#include <unistd.h>
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include "hl_gpu.h"
-#include "paddle/math/Vector.h"
-#include "paddle/parameter/Parameter.h"
-#include "paddle/parameter/ParameterUpdateFunctions.h"
-#include "paddle/utils/Common.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/Locks.h"
-
-#include "ParameterConfig.pb.h"
-
-namespace paddle {
-
-class ParallelParameter;
-class SyncParameter;
-class AsyncParameter;
-
-typedef std::shared_ptr<ParallelParameter> ParallelParameterPtr;
-
-const int UPDATE_TYPE_NUM = 32;
-
-/**
- * TrainRole denotes the role of current training, different roles have
- * different jobs.
- *
- * control, major, minor are three kinds of role to support mutiple GPUs
- * parallel SGD training. SM on GPU card has two groups, each group
- * consist of a major and a minor.
- *
- * @param    single  single GPU card single thread training.
- *
- *
- * @param    control current parameter updates via control role,
- *                   not participate in real training. control role is
- *                   responsible for merging all major's gradient and
- *                   update parameter value.
- *
- * @param    major   major role paticipates in real training, when local
- *                   gradient is ready, merge its corresponding minor's
- *                   gradient and notify controller: this group's gradient
- *                   is already ready.
- *
- * @param    minor   minor role participates in real training, when local
- *                   gradient is ready, only notify its corresponding major.
- *                   In order to maximum apportion jobs, after controller
- *                   updates the paramemter value, each group's minior
- *                   reponses to dispatch the latest model into local and
- *                   major.
- */
-enum TrainerRole {
-  TRAINER_ROLE_SINGLE,
-  TRAINER_ROLE_CONTROL,
-  TRAINER_ROLE_MAJOR,
-  TRAINER_ROLE_MINOR,
-  TRAINER_ROLE_MASTER,
-  TRAINER_ROLE_SLAVE
-};
-typedef void (ParallelParameter::*UpdateFunction)(real learnRate);
-
-class ParallelParameter {
-public:
-  static ParallelParameterPtr create(TrainerRole role,
-                                     ParameterPtr localParam,
-                                     int asyncCount = 1);
-
-  ParallelParameter(TrainerRole role, ParameterPtr localParam) {
-    role_ = role;
-    gradSem_.reset(new Semaphore(0));
-    valueSem_.reset(new Semaphore(0));
-    localParam_ = localParam;
-  }
-
-  virtual ~ParallelParameter() {}
-
-  ParameterPtr getLocalParameter() { return localParam_; }
-  bool timeWaitGradReady(int sec) {
-    struct timespec ts;
-    ts.tv_nsec = 0;
-    ts.tv_sec = time(NULL) + sec;
-    return gradSem_->timeWait(&ts);
-  }
-  void waitGradReady() { gradSem_->wait(); }
-  void postValueReady() { valueSem_->post(); }
-
-  void syncUpdate(TrainerRole role, real learnRate);
-
-  virtual void synchronizeParamter() = 0;
-
-  /**
-   * for synchronous
-   */
-  virtual void singleUpdate(real learnRate) { (void)learnRate; }
-
-  virtual void controlUpdate(const UpdateCallback& callback) { (void)callback; }
-
-  virtual void majorUpdate(real learnRate) { (void)learnRate; }
-
-  virtual void minorUpdate(real learnRate) { (void)learnRate; }
-
-  /**
-   * for asynchronous
-   */
-  virtual void slaveUpdate(real learnRate) { (void)learnRate; }
-
-protected:
-  TrainerRole role_;
-  ParameterPtr localParam_;
-  std::unique_ptr<Semaphore>
-      gradSem_;  /// wether the local parameter-gradient is ready
-  std::unique_ptr<Semaphore>
-      valueSem_;  /// wether the local parameter-value is updated
-};
-
-/**
- * this class is designed for multi-threading training.
- *
- * "Synchronous" means multiple GPUs calculate 1/4 mini-Batch,
- * but will get only one gradient
- */
-class SyncParameter : public ParallelParameter {
-public:
-  SyncParameter(TrainerRole role, ParameterPtr localParam)
-      : ParallelParameter(role, localParam) {
-    controlParam_ = nullptr;
-    majorPartners_.clear();
-    minorPartners_.clear();
-  }
-  ~SyncParameter() {
-    majorPartners_.clear();
-    minorPartners_.clear();
-  }
-  void attachControlParam(ParallelParameterPtr controler);
-
-  void attachMajorParam(ParallelParameterPtr partner);
-
-  void attachMinorParam(ParallelParameterPtr partner, int deviceId);
-
-  void waitAllMajorGradReady();
-
-  void synchronizeParamter();
-
-  void singleUpdate(real learnRate);
-
-  void controlUpdate(const UpdateCallback& callback);
-
-  void majorUpdate(real learnRate);
-
-  void minorUpdate(real learnRate);
-
-  std::vector<ParallelParameterPtr>& getMajorPartners() {
-    return majorPartners_;
-  }
-
-  std::vector<ParallelParameterPtr>& getMinorPartners() {
-    return minorPartners_;
-  }
-
-private:
-  // The following variables are used in a multithreaded training situation
-  // partnerParam_ is local-parameter's partner
-  // controlParam_ is the controller-thread 's parameter
-  ParameterPtr partnerParam_;
-  std::vector<ParallelParameterPtr> majorPartners_;
-  std::vector<ParallelParameterPtr> minorPartners_;
-  std::vector<int> minorDeviceIds_;
-  ParallelParameterPtr controlParam_;
-};
-
-class AsyncParameter : public ParallelParameter {
-public:
-  AsyncParameter(TrainerRole role, int asyncCount, ParameterPtr localParam);
-
-  void clearCounter() { accumCounter_ = 0; }
-
-  VectorPtr getAccum() { return gradientAccum_; }
-
-  void synchronizeParamter() {
-    if (accumCounter_ == asyncCount_) {
-      valueSem_->wait();
-      clearCounter();
-      gradientAccum_->zeroMem();
-    }
-  }
-
-  /**
-   * When asynchronous training, update strategy including slave and master.
-   *
-   * slave: If in range asyncCount, adopting self-update method.
-   *        If beyond asyncCount, waiting for master to update.
-   */
-  void slaveUpdate(real learnRate);
-
-  /**
-   * When asynchronous training, update strategy including slave and master.
-   *
-   * master: it only polls slaves, do not training data.
-   *         If slave's gradient is ready, fetch it.
-   *         Update master's parameter, then copy it into
-   *         corresponding slave.
-   */
-  bool masterUpdate(ParallelParameterPtr slaveParam,
-                    const UpdateCallback& callback);
-
-private:
-  /**
-   * When asynchronous training, every aysnc trainer needs to
-   * accumulate a number of batch gradient.
-   *
-   * gradientAccum_ is used to save the sum of gradients.
-   */
-  VectorPtr gradientAccum_;
-
-  /// Asynchronous count.
-  int asyncCount_;
-  /// Accumulate counter of current gradients.
-  int accumCounter_;
-};
-
-typedef std::map<std::string, ParallelParameterPtr> ParallelParameterMap;
-
-}  // namespace paddle
diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp
index 1ccded818796798105a889df978618688b56ed36..b8efabbe2a0b54edec64f6cee62b44c76ca7bf10 100644
--- a/paddle/parameter/Parameter.cpp
+++ b/paddle/parameter/Parameter.cpp
@@ -271,55 +271,6 @@ SparsePrefetchRowCpuMatrix* Parameter::getPrefetchMatrix() {
   return nullptr;
 }
 
-void Parameter::updateWithGradient(real learningRate) {
-  sgdUpdate(learningRate * config_.learning_rate(),
-            config_.momentum(),
-            config_.decay_rate(),
-            bufs_[PARAMETER_VALUE].get(),
-            bufs_[PARAMETER_GRADIENT].get(),
-            bufs_[PARAMETER_MOMENTUM].get());
-}
-
-void Parameter::updateWithGradient(real learningRate,
-                                   MatrixPtr gradMat,
-                                   IVectorPtr t0,
-                                   int currentTime,
-                                   bool fini) {
-  SparseRowCpuMatrix* sparseMat =
-      dynamic_cast<SparseRowCpuMatrix*>(gradMat.get());
-  CHECK(sparseMat);
-  CHECK_EQ(config_.momentum(), 0.0f)
-      << "not support momentum in sparse input sgd";
-  bool useL1 = (config_.decay_rate_l1() != 0.0f);
-  sparseMat->sgdUpdate(*bufs_[PARAMETER_VALUE],
-                       *t0,
-                       learningRate * config_.learning_rate(),
-                       currentTime,
-                       useL1 ? config_.decay_rate_l1() : config_.decay_rate(),
-                       useL1,
-                       fini);
-}
-
-void Parameter::updateWithGradient(real learningRate,
-                                   VectorPtr gradVec,
-                                   bool normalUpdate) {
-  if (normalUpdate) {
-    sgdUpdate(learningRate * config_.learning_rate(),
-              config_.momentum(),
-              config_.decay_rate(),
-              bufs_[PARAMETER_VALUE].get(),
-              gradVec.get(),
-              bufs_[PARAMETER_MOMENTUM].get());
-  } else {
-    size_t size = gradVec->getSize();
-    real* mom = bufs_[PARAMETER_MOMENTUM]->getData();
-    real* grad = gradVec->getData();
-    real* value = bufs_[PARAMETER_VALUE]->getData();
-    hl_matrix_add(mom, grad, mom, 1, size, 1.0f, learningRate);
-    hl_matrix_add(value, grad, value, 1, size, 1.0f, learningRate);
-  }
-}
-
 void Parameter::incUpdate(const UpdateCallback& callback) {
   // Static parameter is fixed, and does not need to be updated
   if (isStatic()) {
diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h
index 72c8336799133ad3f5855b0c1aa06639179ff70a..36d2b65f3bd1056a4ac6a1029000fe4cce6420ce 100644
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -223,29 +223,6 @@ public:
 
   bool isValueUpdated() const { return updated_; }
 
-  /**
-   * Update bufs_[PARAMETER_VALUE] using bufs_[PARAMETER_GRADIENT]
-   */
-  void updateWithGradient(real learningRate);
-
-  /**
-   * Update bufs_[PARAMETER_VALUE] using sparse row grad matrix.
-   *
-   * @see SparseRowCpuMatrix::sgdUpdate for more information.
-   */
-  void updateWithGradient(real learningRate,
-                          MatrixPtr gradMat,
-                          IVectorPtr t0,
-                          int currentTime,
-                          bool fini = false);
-
-  /**
-   * This function is used to calculate multiple gpus, but only as a candidate
-   */
-  void updateWithGradient(real learningRate,
-                          VectorPtr grad,
-                          bool normalUpdate = true);
-
   /**
    * Save parameter value to a file
    */
diff --git a/paddle/scripts/cpplint.py b/paddle/scripts/cpplint.py
index 157ce7b44ac3cfe3a8ca5eda78e959cf7be4cc5b..dff4339ea33b72e22104a56183e3302067dc583d 100644
--- a/paddle/scripts/cpplint.py
+++ b/paddle/scripts/cpplint.py
@@ -58,6 +58,7 @@ _USAGE = """
 Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
                    [--counting=total|toplevel|detailed] [--root=subdir]
                    [--linelength=digits]
+                   [--write-success=success_status_file]
         <file> [file] ...
 
   The style guidelines this tries to follow are those in
@@ -499,6 +500,8 @@ _line_length = 80
 # This is set by --extensions flag.
 _valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh'])
 
+_write_success = None
+
 
 def ParseNolintSuppressions(filename, raw_line, linenum, error):
     """Updates the global list of error-suppressions.
@@ -6337,7 +6340,7 @@ def ParseArguments(args):
     try:
         (opts, filenames) = getopt.getopt(args, '', [
             'help', 'output=', 'verbose=', 'counting=', 'filter=', 'root=',
-            'linelength=', 'extensions='
+            'linelength=', 'extensions=', 'write-success='
         ])
     except getopt.GetoptError:
         PrintUsage('Invalid arguments.')
@@ -6382,6 +6385,9 @@ def ParseArguments(args):
                 _valid_extensions = set(val.split(','))
             except ValueError:
                 PrintUsage('Extensions must be comma seperated list.')
+        elif opt == '--write-success':
+            global _write_success
+            _write_success = val
 
     if not filenames:
         PrintUsage('No files were specified.')
@@ -6408,6 +6414,10 @@ def main():
         ProcessFile(filename, _cpplint_state.verbose_level)
     _cpplint_state.PrintErrorCounts()
 
+    if _cpplint_state.error_count == 0 and _write_success is not None:
+        with open(_write_success, 'a'):
+            os.utime(_write_success, None)
+
     sys.exit(_cpplint_state.error_count > 0)
 
 
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 10bd9d0a6da175c8f691602782e2173d4e6e538c..4172063d923f939dac7229573bc087ec8c62b844 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -52,7 +52,13 @@ if [ ${WITH_DOC} == "ON" ]; then
           -DWITH_SWIG_PY=ON \
           -DWITH_STYLE_CHECK=OFF
     make paddle_docs paddle_docs_cn
+    DOC_DIR="/paddle/paddle/scripts/tools/build_docs/"
+    mkdir -p $DOC_DIR/doc
+    mkdir -p $DOC_DIR/doc_cn
+    cp -r /paddle/build_doc/doc/en/html/* $DOC_DIR/doc
+    cp -r /paddle/build_doc/doc/cn/html/* $DOC_DIR/doc_cn
     popd
+    rm -rf /paddle/build_doc
 fi
 # generate deb package for current build
 cpack -D CPACK_GENERATOR='DEB' ..
diff --git a/paddle/scripts/tools/build_docs/Dockerfile b/paddle/scripts/tools/build_docs/Dockerfile
deleted file mode 100644
index 78dc756bd1175019d90fc852635497fea1eb55e2..0000000000000000000000000000000000000000
--- a/paddle/scripts/tools/build_docs/Dockerfile
+++ /dev/null
@@ -1,7 +0,0 @@
-FROM paddledev/paddle:cpu-devel-latest
-COPY build.sh /
-RUN pip install sphinx &&\
-    pip install sphinx_rtd_theme &&\
-    apt install -y doxygen graphviz &&\
-    pip install recommonmark numpy protobuf==2.6.1
-CMD /build.sh
diff --git a/paddle/scripts/tools/build_docs/build.sh b/paddle/scripts/tools/build_docs/build.sh
deleted file mode 100755
index a23b6e61d45926e77015365627bfb7dca303ac65..0000000000000000000000000000000000000000
--- a/paddle/scripts/tools/build_docs/build.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-set -ex
-
-mkdir -p /build
-cd /build
-cmake /paddle -DWITH_DOC=ON
-make paddle_docs paddle_docs_cn -j `nproc`
-mkdir -p /output/doc
-mkdir -p /output/doc_cn
-cp -r doc/html/* /output/doc/
-cp -r doc_cn/html/* /output/doc_cn/
-cd /
-rm -rf /paddle/build
diff --git a/paddle/scripts/tools/build_docs/build_docs.sh b/paddle/scripts/tools/build_docs/build_docs.sh
index 9f8b80435c8fb17907d7da52c864a448f0d8d136..00123dcb87d6147d8ccea645c4fd605239760388 100755
--- a/paddle/scripts/tools/build_docs/build_docs.sh
+++ b/paddle/scripts/tools/build_docs/build_docs.sh
@@ -1,4 +1,36 @@
 #!/bin/bash
 set -e
-docker build . -t paddle_build_doc
-docker run --rm -v $PWD/../../../../:/paddle -v $PWD:/output paddle_build_doc
+function usage(){
+        echo "usage: build_doc [--help] [<args>]"
+        echo "This script generates doc and doc_cn in the script's directory."
+        echo "These are common commands used in various situations:"
+        echo "    with_docker       build doc and doc_cn with docker"
+        echo "    local             build doc and doc_cn locally"
+}
+
+
+case "$1" in
+    "with_docker")
+        docker run --rm -v $PWD/../../../../:/paddle \
+            -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_DOC=ON" paddledev/paddle:dev
+        ;;
+    "local")
+        mkdir -p doc
+        mkdir -p doc_cn
+        PADDLE_SOURCE_DIR=$PWD/../../../../
+        mkdir -p $PADDLE_SOURCE_DIR/build_doc
+        pushd $PADDLE_SOURCE_DIR/build_doc
+        cmake .. -DWITH_DOC=ON
+        make paddle_docs paddle_docs_cn
+        popd
+        cp -r $PADDLE_SOURCE_DIR/build_doc/doc/en/html/* doc
+        cp -r $PADDLE_SOURCE_DIR/build_doc/doc/cn/html/* doc_cn
+        rm -rf $PADDLE_SOURCE_DIR/build_doc
+        ;;
+    "--help")
+        usage
+        ;;
+    *)
+        usage
+        ;;
+esac
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index dc89419c40f8d527a3de0dc90ede0397f6f650c5..32e31fe2c446fb5d5e2df0819749a60cb8afdfd2 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2119,6 +2119,7 @@ define_cost('MultiBinaryLabelCrossEntropy', 'multi_binary_label_cross_entropy')
 define_cost('SoftBinaryClassCrossEntropy', 'soft_binary_class_cross_entropy')
 define_cost('HuberTwoClass', 'huber')
 define_cost('SumCost', 'sum_cost')
+define_cost('SmoothL1Cost', 'smooth_l1')
 
 
 @config_layer('hsigmoid')
diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py
index 7b76e87f045e638d0a78e1ef5a191d465b7d79d7..7ae9e5cb3050fa6f70fa84785a1ddbdc68c70235 100644
--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -196,7 +196,7 @@ class ExtraLayerAttribute(object):
                       <https://www.cs.toronto.edu/~hinton/absps/
                       JMLRdropout.pdf>`_.
     :type drop_rate: float
-    :param device: device ID of layer. device=-1, use CPU. device>0, use GPU.
+    :param device: device ID of layer. device=-1, use CPU. device>=0, use GPU.
                    The details allocation in parallel_nn please refer to `here
                    <http://www.paddlepaddle.org/doc/ui/cmd_argument/
                    use_case.html#case-2-specify-layers-in-different-devices>`_.
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 3a0c5cb27c321d3722c2bb87c47b8b6cfd4d2a44..31652613fb3a55636b32babbc4bde60d65776c61 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -117,6 +117,7 @@ __all__ = [
     'spp_layer',
     'pad_layer',
     'eos_layer',
+    'smooth_l1_cost',
     'layer_support',
 ]
 
@@ -202,6 +203,7 @@ class LayerType(object):
     SOFT_BIN_CLASS_CROSS_ENTROPY = "soft_binary_class_cross_entropy"
     MULTI_BIN_LABEL_CROSS_ENTROPY = "multi_binary_label_cross_entropy"
     SUM_COST = "sum_cost"
+    SMOOTH_L1 = "smooth_l1"
 
     @staticmethod
     def is_layer_type(type_name):
@@ -1348,9 +1350,9 @@ def last_seq(input,
     """
     Get Last Timestamp Activation of a sequence.
 
-    If stride > 0, this layer slides a window whose size is determined by stride, 
-    and return the last value of the window as the output. Thus, a long sequence 
-    will be shorten. Note that for sequence with sub-sequence, the default value 
+    If stride > 0, this layer slides a window whose size is determined by stride,
+    and return the last value of the window as the output. Thus, a long sequence
+    will be shorten. Note that for sequence with sub-sequence, the default value
     of stride is -1.
 
     The simple usage is:
@@ -1364,7 +1366,7 @@ def last_seq(input,
     :type name: basestring
     :param input: Input layer name.
     :type input: LayerOutput
-    :param stride: window size.  
+    :param stride: window size.
     :type stride: Int
     :param layer_attr: extra layer attributes.
     :type layer_attr: ExtraLayerAttribute.
@@ -1404,9 +1406,9 @@ def first_seq(input,
     """
     Get First Timestamp Activation of a sequence.
 
-    If stride > 0, this layer slides a window whose size is determined by stride, 
-    and return the first value of the window as the output. Thus, a long sequence 
-    will be shorten. Note that for sequence with sub-sequence, the default value 
+    If stride > 0, this layer slides a window whose size is determined by stride,
+    and return the first value of the window as the output. Thus, a long sequence
+    will be shorten. Note that for sequence with sub-sequence, the default value
     of stride is -1.
 
     The simple usage is:
@@ -1420,7 +1422,7 @@ def first_seq(input,
     :type name: basestring
     :param input: Input layer name.
     :type input: LayerOutput
-    :param stride: window size.  
+    :param stride: window size.
     :type stride: Int
     :param layer_attr: extra layer attributes.
     :type layer_attr: ExtraLayerAttribute.
@@ -1560,7 +1562,7 @@ def seq_reshape_layer(input,
                       bias_attr=None):
     """
     A layer for reshaping the sequence. Assume the input sequence has T instances,
-    the dimension of each instance is M, and the input reshape_size is N, then the 
+    the dimension of each instance is M, and the input reshape_size is N, then the
     output sequence has T*M/N instances, the dimension of each instance is N.
 
     Note that T*M/N must be an integer.
@@ -2117,8 +2119,8 @@ def img_conv_layer(input,
     :param trans: true if it is a convTransLayer, false if it is a convLayer
     :type trans: bool
     :param layer_type: specify the layer_type, default is None. If trans=True,
-                       layer_type has to be "exconvt" or "cudnn_convt", 
-                       otherwise layer_type has to be either "exconv" or 
+                       layer_type has to be "exconvt" or "cudnn_convt",
+                       otherwise layer_type has to be either "exconv" or
                        "cudnn_conv"
     :type layer_type: String
     :return: LayerOutput object.
@@ -2336,9 +2338,9 @@ def spp_layer(input,
 
     ..  code-block:: python
 
-        spp = spp_layer(input=data, 
-                        pyramid_height=2, 
-                        num_channels=16, 
+        spp = spp_layer(input=data,
+                        pyramid_height=2,
+                        num_channels=16,
                         pool_type=MaxPooling())
 
     :param name: layer name.
@@ -2432,7 +2434,7 @@ def img_cmrnorm_layer(input,
     The example usage is:
 
     ..  code-block:: python
-    
+
         norm = img_cmrnorm_layer(input=net, size=5)
 
     :param name: layer name.
@@ -2493,7 +2495,7 @@ def batch_norm_layer(input,
     The example usage is:
 
     ..  code-block:: python
-    
+
         norm = batch_norm_layer(input=net, act=ReluActivation())
 
     :param name: layer name.
@@ -2794,11 +2796,11 @@ def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
     """
     Concat sequence a with sequence b.
 
-    Inputs: 
+    Inputs:
       - a = [a1, a2, ..., an]
       - b = [b1, b2, ..., bn]
       - Note that the length of a and b should be the same.
-        
+
     Output: [a1, b1, a2, b2, ..., an, bn]
 
     The example usage is:
@@ -3634,9 +3636,15 @@ def beam_search(step,
                 simple_rnn += last_time_step_output
             return simple_rnn
 
+        generated_word_embedding = GeneratedInput(
+                               size=target_dictionary_dim,
+                               embedding_name="target_language_embedding",
+                               embedding_size=word_vector_dim)
+
         beam_gen = beam_search(name="decoder",
                                step=rnn_step,
-                               input=[StaticInput(encoder_last)],
+                               input=[StaticInput(encoder_last),
+                                      generated_word_embedding],
                                bos_id=0,
                                eos_id=1,
                                beam_size=5)
@@ -3655,7 +3663,8 @@ def beam_search(step,
                  You can refer to the first parameter of recurrent_group, or
                  demo/seqToseq/seqToseq_net.py for more details.
     :type step: callable
-    :param input: Input data for the recurrent unit
+    :param input: Input data for the recurrent unit, which should include the
+                  previously generated words as a GeneratedInput object.
     :type input: list
     :param bos_id: Index of the start symbol in the dictionary. The start symbol
                    is a special token for NLP task, which indicates the
@@ -5322,8 +5331,6 @@ def multi_binary_label_cross_entropy(input,
     :type input: LayerOutput
     :param label: The input label.
     :type input: LayerOutput
-    :param type: The type of cost.
-    :type type: basestring
     :param name: The name of this layers. It is not necessary.
     :type name: None|basestring
     :param coeff: The coefficient affects the gradient in the backward.
@@ -5352,3 +5359,52 @@ def multi_binary_label_cross_entropy(input,
         LayerType.MULTI_BIN_LABEL_CROSS_ENTROPY,
         parents=[input, label],
         size=1)
+
+
+@wrap_name_default()
+@layer_support()
+def smooth_l1_cost(input, label, name=None, layer_attr=None):
+    """
+    This is a smoothed variant of the L1 loss. It requires that the
+    sizes of input and label are equal. The formula is as follows,
+
+    .. math::
+
+        L = \sum_{i} smooth_{L1}(input_i - label_i)
+
+    in which
+
+    .. math::
+
+        smooth_{L1}(x) = \\begin{cases} 0.5x^2& \\text{if}  \\ |x| < 1 \\\\ |x|-0.5& \\text{otherwise} \end{cases}
+
+    More details can be found by referring to `Fast R-CNN
+    <https://arxiv.org/pdf/1504.08083v2.pdf>`_
+
+    .. code-block:: python
+
+       cost = smooth_l1_cost(input=input_layer,
+                             label=label_layer)
+
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param label: The input label.
+    :type label: LayerOutput
+    :param name: The name of this layer. It is not necessary.
+    :type name: None|basestring
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    assert isinstance(label, LayerOutput)
+    assert input.size == label.size
+
+    Layer(
+        name=name,
+        type=LayerType.SMOOTH_L1,
+        inputs=[input.name, label.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.SMOOTH_L1, parents=[input, label], size=1)
diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
index 93dd7796c246ae81a146759df7e0c19e334375f1..6c860fd49702ebc93612114011361efb885c62ec 100644
--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
@@ -9,8 +9,7 @@ add_test(NAME test_reset_hook
         ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
     WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
 
-add_paddle_exe(protobuf_equal
-  ProtobufEqualMain.cpp)
+add_paddle_exe(protobuf_equal ProtobufEqualMain.cpp)
 add_test(NAME test_layerHelpers
   COMMAND
   ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index c9178e3c6a46a2d663ec368569e529e780b76a6f..c5dc8e1aab08d38936d8636c219571d0cf6f4906 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -5,6 +5,6 @@ last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
 img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
 test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
-test_seq_concat_reshape)
+test_seq_concat_reshape test_pad test_smooth_l1)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
index ee5961af75ebb33af52f9add645f793015288f4e..8a318879630cd491573afcaf798dda2ca75e335d 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
@@ -11,6 +11,9 @@ for conf in ${configs[*]}
 do
     echo "Generating " $conf
     $1 -m paddle.utils.dump_config $conf.py > $protostr/$conf.protostr.unittest
+    if [ ! -f "$protostr/$conf.protostr" ]; then 
+        cp $protostr/$conf.protostr.unittest $protostr/$conf.protostr
+    fi
     cat ${conf}.py |$1 test_config_parser_for_non_file_config.py > $protostr/$conf.protostr.non_file_config.unittest
 done
 
@@ -18,5 +21,8 @@ for conf in ${whole_configs[*]}
 do
     echo "Generating " $conf
     $1 -m paddle.utils.dump_config $conf.py "" --whole > $protostr/$conf.protostr.unittest
+    if [ ! -f "$protostr/$conf.protostr" ]; then 
+        cp $protostr/$conf.protostr.unittest $protostr/$conf.protostr
+    fi
     cat ${conf}.py |$1 test_config_parser_for_non_file_config.py --whole > $protostr/$conf.protostr.non_file_config.unittest
 done
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..15c6ab4dc8e61dedc10acaa49db7d8ae136d4952
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
@@ -0,0 +1,120 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 2016
+  active_type: ""
+  height: 48
+  width: 42
+}
+layers {
+  name: "__conv_0__"
+  type: "exconv"
+  size: 32256
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___conv_0__.w0"
+    conv_conf {
+      filter_size: 3
+      channels: 1
+      stride: 1
+      padding: 1
+      groups: 1
+      filter_channels: 1
+      output_x: 42
+      img_size: 42
+      caffe_mode: true
+      filter_size_y: 3
+      padding_y: 1
+      stride_y: 1
+      output_y: 48
+      img_size_y: 48
+    }
+  }
+  bias_parameter_name: "___conv_0__.wbias"
+  num_filters: 16
+  shared_biases: true
+  height: 48
+  width: 42
+}
+layers {
+  name: "__pool_0__"
+  type: "pool"
+  size: 8064
+  active_type: ""
+  inputs {
+    input_layer_name: "__conv_0__"
+    pool_conf {
+      pool_type: "max-projection"
+      channels: 16
+      size_x: 2
+      stride: 2
+      output_x: 21
+      img_size: 42
+      padding: 0
+      size_y: 2
+      stride_y: 2
+      output_y: 24
+      img_size_y: 48
+      padding_y: 0
+    }
+  }
+  height: 24
+  width: 21
+}
+layers {
+  name: "__pad_0__"
+  type: "pad"
+  size: 14175
+  active_type: ""
+  inputs {
+    input_layer_name: "__pool_0__"
+    pad_conf {
+      image_conf {
+        channels: 16
+        img_size: 21
+        img_size_y: 24
+      }
+      pad_c: 2
+      pad_c: 3
+      pad_h: 1
+      pad_h: 2
+      pad_w: 3
+      pad_w: 1
+    }
+  }
+  height: 27
+  width: 25
+}
+parameters {
+  name: "___conv_0__.w0"
+  size: 144
+  initial_mean: 0.0
+  initial_std: 0.471404520791
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___conv_0__.wbias"
+  size: 16
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 16
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data"
+output_layer_names: "__pad_0__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__conv_0__"
+  layer_names: "__pool_0__"
+  layer_names: "__pad_0__"
+  input_layer_names: "data"
+  output_layer_names: "__pad_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_smooth_l1.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_smooth_l1.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..4aa041ea2e173a6cc2ab21e3c9ea703601929cde
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_smooth_l1.protostr
@@ -0,0 +1,40 @@
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "label"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "__smooth_l1_cost_0__"
+  type: "smooth_l1"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+  }
+  inputs {
+    input_layer_name: "label"
+  }
+  coeff: 1.0
+}
+input_layer_names: "input"
+input_layer_names: "label"
+output_layer_names: "__smooth_l1_cost_0__"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "label"
+  layer_names: "__smooth_l1_cost_0__"
+  input_layer_names: "input"
+  input_layer_names: "label"
+  output_layer_names: "__smooth_l1_cost_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_pad.py b/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
index bb5f13410dbbbaeea9e28c271d33a15fb3000dcf..491e8c8caab38eb7c24e5461107ab5a9d63b12ef 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
@@ -2,7 +2,7 @@ from paddle.trainer_config_helpers import *
 
 settings(batch_size=1000, learning_rate=1e-5)
 
-data = data_layer(name='data', size=2304, height=48, width=42)
+data = data_layer(name='data', size=2016, height=48, width=42)
 
 conv = img_conv_layer(
     input=data,
@@ -13,8 +13,7 @@ conv = img_conv_layer(
     act=LinearActivation(),
     bias_attr=True)
 
-pool = img_pool_layer(
-    input=conv, num_channels=8, pool_size=2, stride=2, pool_type=MaxPooling())
+pool = img_pool_layer(input=conv, pool_size=2, stride=2, pool_type=MaxPooling())
 
 pad = pad_layer(input=pool, pad_c=[2, 3], pad_h=[1, 2], pad_w=[3, 1])
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py b/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py
new file mode 100644
index 0000000000000000000000000000000000000000..66629662dd9166766daaf707409b720f56ef1405
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py
@@ -0,0 +1,7 @@
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='input', size=300)
+lbl = data_layer(name='label', size=300)
+smooth_l1 = smooth_l1_cost(input=data, label=lbl)
+
+outputs(smooth_l1)
diff --git a/python/paddle/utils/dump_config.py b/python/paddle/utils/dump_config.py
index 73bf349c46726163d664c374aa47598871b90106..d27af7f76246a4c9db9a43c17715506d82031b9c 100644
--- a/python/paddle/utils/dump_config.py
+++ b/python/paddle/utils/dump_config.py
@@ -20,6 +20,7 @@ __all__ = []
 
 if __name__ == '__main__':
     whole_conf = False
+    binary = False
     if len(sys.argv) == 2:
         conf = parse_config(sys.argv[1], '')
     elif len(sys.argv) == 3:
@@ -28,6 +29,8 @@ if __name__ == '__main__':
         conf = parse_config(sys.argv[1], sys.argv[2])
         if sys.argv[3] == '--whole':
             whole_conf = True
+        elif sys.argv[3] == '--binary':
+            binary = True
     else:
         raise RuntimeError()
 
@@ -36,4 +39,7 @@ if __name__ == '__main__':
     if whole_conf:
         print conf
     else:
-        print conf.model_config
+        if binary:
+            sys.stdout.write(conf.model_config.SerializeToString())
+        else:
+            print conf.model_config
diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
index 41fda1e8f24cdef13d8ab3645862814100a1cd4c..81af0a8e66a44a3476206147684d81bcac1be372 100644
--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
@@ -80,7 +80,7 @@ def train100():
 
 def test100():
     """
-    CIFAR-100 test set cretor.
+    CIFAR-100 test set creator.
 
     It returns a reader creator, each sample in the reader is image pixels in
     [0, 1] and label in [0, 9].
@@ -107,7 +107,7 @@ def train10():
 
 def test10():
     """
-    CIFAR-10 test set cretor.
+    CIFAR-10 test set creator.
 
     It returns a reader creator, each sample in the reader is image pixels in
     [0, 1] and label in [0, 9].
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
index c1347d3c66da858104858bfb6739d84051322146..435556b2921b7976bbc61160ce3812949981c9e7 100644
--- a/python/paddle/v2/dataset/mnist.py
+++ b/python/paddle/v2/dataset/mnist.py
@@ -93,7 +93,7 @@ def train():
 
 def test():
     """
-    MNIST test set cretor.
+    MNIST test set creator.
 
     It returns a reader creator, each sample in the reader is image pixels in
     [0, 1] and label in [0, 9].
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
index feefd7d758ba09f5d8f818ca1b12b00c5f0e9797..5e99d4a241b7fe2b0f9ff4ba191db4b341c4d30e 100644
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -38,12 +38,35 @@ class Optimizer(object):
         assert isinstance(tmp, swig_api.ParameterOptimizer)
         return tmp.getParameterTypes()
 
-    def create_local_updater(self):
+    def __create_local_updater__(self):
         return swig_api.ParameterUpdater.createLocalUpdater(self.__opt_conf__)
 
-    def create_remote_updater(self, pass_num):
-        return swig_api.ParameterUpdater.createRemoteUpdater(self.__opt_conf__,
-                                                             pass_num)
+    def __create_remote_updater__(self, pass_num, use_sparse_updater):
+        return swig_api.ParameterUpdater.createRemoteUpdater(
+            self.__opt_conf__, pass_num, use_sparse_updater)
+
+    def create_updater(self, is_local, num_passes, use_sparse_updater):
+        """
+        Create the proper parameter updater according to the configuration.
+        :param is_local: create a local updater if True, otherwise a remote one
+        :param num_passes: the remote parameter updater uses this to configure
+        the parameter server.
+        :param use_sparse_updater: when the remote updater is used and some
+        parameter is sparse, some extra work is needed:
+
+        ..  code-block:: python
+
+            if use_sparse_remote_updater:
+                        gradient_machine.prefetch(in_args)
+                        parameter_updater.getParametersRemote()
+        :return: parameter_updater
+        """
+        if is_local:
+            parameter_updater = self.__create_local_updater__()
+        else:
+            parameter_updater = self.__create_remote_updater__(
+                num_passes, use_sparse_updater)
+        return parameter_updater
 
 
 class Momentum(Optimizer):
diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py
index 737b6bf1e2eb60281d4d6e92667d9fe91e243704..ff28c85c53dc8255b6ad5e3975b07f72a9a64e4b 100644
--- a/python/paddle/v2/topology.py
+++ b/python/paddle/v2/topology.py
@@ -73,6 +73,18 @@ class Topology(object):
 
         assert isinstance(self.__model_config__, ModelConfig)
 
+    def use_sparse_updater(self):
+        """
+        Check whether any parameter requires a sparse update.
+        :return: True if any parameter enables a sparse (remote) update.
+        """
+        use_sparse = False
+        for parameter in self.__model_config__.parameters:
+            if parameter.sparse_update or parameter.sparse_remote_update:
+                use_sparse = True
+                break
+        return use_sparse
+
     def proto(self):
         return self.__model_config__
 
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index 68b4967cc031dfa2dd164d822aff97585f923e48..ec9fcfb749f1a858713d3d6672118e521fbdcb32 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -2,6 +2,8 @@
 Module Trainer
 """
 import collections
+import gzip
+import os
 
 import py_paddle.swig_paddle as api
 
@@ -42,7 +44,12 @@ class SGD(object):
     :type extra_layers: paddle.v2.config_base.Layer
     """
 
-    def __init__(self, cost, parameters, update_equation, extra_layers=None):
+    def __init__(self,
+                 cost,
+                 parameters,
+                 update_equation,
+                 extra_layers=None,
+                 is_local=True):
 
         if not isinstance(parameters, v2_parameters.Parameters):
             raise TypeError('parameters should be parameters')
@@ -55,20 +62,48 @@ class SGD(object):
         self.__topology__ = topology
         self.__parameters__ = parameters
         self.__topology_in_proto__ = topology.proto()
+        self.__is_local__ = is_local
 
-        # In local mode, disable sparse_remote_update.
-        for param in self.__topology_in_proto__.parameters:
-            if param.sparse_remote_update:
-                param.sparse_remote_update = False
+        self.__use_sparse_updater__ = self.__topology__.use_sparse_updater()
+        # In local mode, disable sparse_remote_update.
+        if is_local:
+            for param in self.__topology_in_proto__.parameters:
+                if param.sparse_remote_update:
+                    param.sparse_remote_update = False
 
+        self.__gm_create_mode__ = api.CREATE_MODE_NORMAL if not \
+            self.__use_sparse_updater__ else api.CREATE_MODE_SGD_SPARSE_CPU_TRAINING
         self.__data_types__ = topology.data_type()
         gm = api.GradientMachine.createFromConfigProto(
-            self.__topology_in_proto__, api.CREATE_MODE_NORMAL,
+            self.__topology_in_proto__, self.__gm_create_mode__,
             self.__optimizer__.enable_types())
         assert isinstance(gm, api.GradientMachine)
         self.__gradient_machine__ = gm
         self.__gradient_machine__.randParameters()
-        parameters.append_gradient_machine(gm)
+        self.__parameters__.append_gradient_machine(gm)
+        self.__parameter_updater__ = None
+
+    def __use_remote_sparse_updater__(self):
+        return self.__use_sparse_updater__ and not self.__is_local__
+
+    def __prepare_parameter__(self, in_args):
+        """
+        prepare parameter before forward backward.
+        1. When use remote sparse updater, parameters should be got
+        from ps according to input arguments.
+        :param in_args: input arguments of this batch.
+        :return:
+        """
+        if self.__use_remote_sparse_updater__():
+            self.__gradient_machine__.prefetch(in_args)
+            self.__parameter_updater__.getParametersRemote()
+
+    def save_parameter_to_tar(self, f):
+        self.__parameter_updater__.catchUpWith()
+        self.__parameter_updater__.apply()
+        self.__parameter_updater__.getParametersRemote(True, True)
+        self.__parameters__.to_tar(f)
+        self.__parameter_updater__.restore()
 
     def train(self, reader, num_passes=1, event_handler=None, feeding=None):
         """
@@ -90,8 +125,9 @@ class SGD(object):
             event_handler = default_event_handler
         __check_train_args__(**locals())
 
-        updater = self.__optimizer__.create_local_updater()
-        updater.init(self.__gradient_machine__)
+        self.__parameter_updater__ = self.__optimizer__.create_updater(
+            self.__is_local__, num_passes, self.__use_sparse_updater__)
+        self.__parameter_updater__.init(self.__gradient_machine__)
 
         self.__gradient_machine__.start()
         batch_evaluator = self.__gradient_machine__.makeEvaluator()
@@ -103,23 +139,26 @@ class SGD(object):
         for pass_id in xrange(num_passes):
             event_handler(v2_event.BeginPass(pass_id))
             pass_evaluator.start()
-            updater.startPass()
+            self.__parameter_updater__.startPass()
             for batch_id, data_batch in enumerate(reader()):
                 batch_evaluator.start()
                 event_handler(
                     v2_event.BeginIteration(
                         pass_id=pass_id, batch_id=batch_id))
-                pass_type = updater.startBatch(len(data_batch))
-                self.__gradient_machine__.forwardBackward(
-                    feeder(data_batch), out_args, pass_type)
+                pass_type = self.__parameter_updater__.startBatch(
+                    len(data_batch))
+                in_args = feeder(data_batch)
+                self.__prepare_parameter__(in_args)
+                self.__gradient_machine__.forwardBackward(in_args, out_args,
+                                                          pass_type)
                 self.__gradient_machine__.eval(pass_evaluator)
                 self.__gradient_machine__.eval(batch_evaluator)
                 for each_param in self.__gradient_machine__.getNonStaticParameters(
                 ):
-                    updater.update(each_param)
+                    self.__parameter_updater__.update(each_param)
                 cost_sum = out_args.sum()
                 cost = cost_sum / len(data_batch)
-                updater.finishBatch(cost)
+                self.__parameter_updater__.finishBatch(cost)
                 batch_evaluator.finish()
                 event_handler(
                     v2_event.EndIteration(
@@ -128,7 +167,7 @@ class SGD(object):
                         cost=cost,
                         evaluator=batch_evaluator))
 
-            updater.finishPass()
+            self.__parameter_updater__.finishPass()
             pass_evaluator.finish()
             event_handler(v2_event.EndPass(pass_id, evaluator=pass_evaluator))
         self.__gradient_machine__.finish()
@@ -152,8 +191,9 @@ class SGD(object):
         num_samples = 0.0
         for data_batch in reader():
             num_samples += len(data_batch)
-            self.__gradient_machine__.forward(
-                feeder(data_batch), out_args, api.PASS_TEST)
+            in_args = feeder(data_batch)
+            self.__prepare_parameter__(in_args)
+            self.__gradient_machine__.forward(in_args, out_args, api.PASS_TEST)
             total_cost += out_args.sum()
             self.__gradient_machine__.eval(evaluator)
 
diff --git a/python/setup.py.in b/python/setup.py.in
index 59e967df7198c111e0b6ca7f994d32bf32f0d426..2ec0841b14d3f1b91722ec635604f181ed3ec860 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -13,6 +13,7 @@ packages=['paddle',
 setup(name='paddle',
       version='${PADDLE_VERSION}',
       description='Parallel Distributed Deep Learning',
       packages=packages,
       install_requires=[
+          "requests",
           "numpy",