diff --git a/.travis.yml b/.travis.yml
index 44b755ee32d204c883f0d74e7ad0f78380918954..a53bd1809416d6f14a1ec7f603622d3303d1ab28 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,23 +1,21 @@
 language: cpp
 cache:
   directories:
-    - $HOME/third_party
     - $HOME/.ccache
     - $HOME/.cache/pip
+    - $TRAVIS_BUILD_DIR/build/third_party
 sudo: required
 dist: trusty
 os:
   - linux
 env:
-  - JOB=DOCS
-  - JOB=BUILD_AND_TEST
-  - JOB=PRE_COMMIT
+  - JOB=build_doc
+  - JOB=check_style
 addons:
   apt:
     packages:
       - gcc-4.8
       - g++-4.8
-      - gfortran-4.8
       - git
       - build-essential
       - python
@@ -34,27 +32,17 @@ addons:
       - libtool
       - ccache
 before_install:
-  - |
-    if [ ${JOB} == "BUILD_AND_TEST" ]; then
-      local change_list=`git diff --name-only $TRAVIS_COMMIT_RANGE`
-      if [ $? -eq 0 ]; then  # if git diff return no zero, then rerun unit test.
-        if ! echo ${change_list} | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)'
-        then
-          echo "Only markdown docs were updated, stopping build process."
-          exit
-        fi
-      fi
-    fi
-  - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
+  - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
   # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python 
   # protobuf version.
   - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
   - pip install rarfile
+  - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
   - |
     function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
-  - | 
-    timeout 2580 paddle/scripts/travis/main.sh  # 43min timeout
+  - |
+    timeout 2580 paddle/scripts/travis/${JOB}.sh  # 43min timeout
     RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
 notifications:
   email:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 79210d043648de5d493f0b998eeb885c993a6106..3c719d35eced2420b7891dbaf507ba07cd78baf8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,6 +47,7 @@ option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
 option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
 option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
+option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -70,7 +71,7 @@ if(ANDROID)
         "Disable RDMA when cross-compiling for Android" FORCE)
 endif(ANDROID)
 
-set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING
+set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
   "A path setting third party libraries download & build directories.")
 
 if (WITH_C_API AND WITH_PYTHON)
@@ -107,6 +108,7 @@ include(configure)          # add paddle env configuration
 include_directories("${PROJ_ROOT}")
 include_directories("${PROJ_ROOT}/paddle/cuda/include")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
+include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/cclient")
 
 set(EXTERNAL_LIBS
     ${GFLAGS_LIBRARIES}
@@ -128,6 +130,11 @@ add_subdirectory(proto)
 add_subdirectory(paddle)
 add_subdirectory(python)
 
+if(WITH_GOLANG)
+    #TODO (add go/master/c back when fixed)
+    add_subdirectory(go/pserver/cclient)
+endif(WITH_GOLANG)
+
 if(WITH_DOC)
     add_subdirectory(doc)
 endif()
diff --git a/Dockerfile b/Dockerfile
index 39af60966b6cab7d8b9e644f4ea658613f8ba518..bf227737c5a67b006ccc221235daf6d8ad7b3bd8 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -25,7 +25,7 @@ COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
     apt-get install -y \
     git python-pip python-dev openssh-server bison  \
-    wget unzip tar xz-utils bzip2 gzip coreutils  \
+    wget unzip tar xz-utils bzip2 gzip coreutils ntp \
     curl sed grep graphviz libjpeg-dev zlib1g-dev  \
     python-numpy python-matplotlib gcc g++ \
     automake locales clang-format-3.8 swig doxygen cmake  \
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 5e507e78f74eee885922f502f35e3c15fafb622d..e8425aedbdd269d54035a0457fa37e0ba834427a 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -40,6 +40,10 @@ if(NOT CMAKE_CROSSCOMPILING)
     endif()
 endif()
 
+if(NOT WITH_GOLANG)
+    add_definitions(-DPADDLE_WITHOUT_GOLANG)
+endif(NOT WITH_GOLANG)
+
 if(NOT WITH_GPU)
     add_definitions(-DPADDLE_ONLY_CPU)
     add_definitions(-DHPPL_STUB_FUNC)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 2341e3785bd8e951e10e3f6bbf8a32f63e4ae44d..5b9d9844ed21ceb507a8e01676c3533f4e3dd8fb 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -21,7 +21,8 @@ IF(NOT ${CBLAS_FOUND})
     SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
     SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
 
-    SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/${LIBRARY_PREFIX}openblas${STATIC_LIBRARY_SUFFIX}"
+    SET(CBLAS_LIBRARIES
+        "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
         CACHE FILEPATH "openblas library." FORCE)
 
     SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs)
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 7340394b1e1fad9e1893ac87d62febb8dd72751c..d43badc1da50723d5d3dbd1f19f0bd4ef4d24737 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -14,11 +14,41 @@
 
 INCLUDE(ExternalProject)
 
+# Print the protobuf settings and define the imported targets protobuf, protobuf_lite and
+# protoc, each depending on the targets in ARGN; RETURN() then leaves the calling file.
 macro(PROMPT_PROTOBUF_LIB)
+    SET(protobuf_DEPS ${ARGN})
+
     MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}")
     MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}")
     MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}")
     INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
+
+    # Assuming that all the protobuf libraries are of the same type as PROTOBUF_LIBRARY.
+    IF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_STATIC_LIBRARY_SUFFIX}$")
+        SET(protobuf_LIBTYPE STATIC)
+    ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$")
+        SET(protobuf_LIBTYPE SHARED)
+    ELSE()
+        MESSAGE(FATAL_ERROR "Unknown library type: ${PROTOBUF_LIBRARY}")
+    ENDIF()
+
+    ADD_LIBRARY(protobuf ${protobuf_LIBTYPE} IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET protobuf PROPERTY IMPORTED_LOCATION ${PROTOBUF_LIBRARY})
+
+    ADD_LIBRARY(protobuf_lite ${protobuf_LIBTYPE} IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET protobuf_lite PROPERTY IMPORTED_LOCATION ${PROTOBUF_LITE_LIBRARY})
+
+    ADD_LIBRARY(protoc ${protobuf_LIBTYPE} IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOBUF_PROTOC_LIBRARY})
+
+    FOREACH(dep ${protobuf_DEPS})
+        ADD_DEPENDENCIES(protobuf ${dep})
+        ADD_DEPENDENCIES(protobuf_lite ${dep})
+        ADD_DEPENDENCIES(protoc ${dep})
+    ENDFOREACH()
+
+    LIST(APPEND external_project_dependencies protobuf)
     RETURN()
 endmacro()
 macro(SET_PROTOBUF_VERSION)
@@ -43,22 +73,23 @@ if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
 endif()
 
 FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
-    SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_NAME})
-    SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_NAME})
+    STRING(REPLACE "extern_" "" TARGET_DIR_NAME "${TARGET_NAME}")
+    SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME})
+    SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_DIR_NAME})
 
     SET(${TARGET_NAME}_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE)
     SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE)
     SET(${TARGET_NAME}_LITE_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${STATIC_LIBRARY_SUFFIX}"
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}"
          PARENT_SCOPE)
     SET(${TARGET_NAME}_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${STATIC_LIBRARY_SUFFIX}"
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}"
          PARENT_SCOPE)
     SET(${TARGET_NAME}_PROTOC_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${STATIC_LIBRARY_SUFFIX}"
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}"
          PARENT_SCOPE)
     SET(${TARGET_NAME}_PROTOC_EXECUTABLE
-        "${PROTOBUF_INSTALL_DIR}/bin/protoc${EXECUTABLE_SUFFIX}"
+        "${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}"
          PARENT_SCOPE)
 
     SET(OPTIONAL_CACHE_ARGS "")
@@ -109,6 +140,8 @@ IF(NOT CMAKE_CROSSCOMPILING)
         SET_PROTOBUF_VERSION()
         IF("${PROTOBUF_VERSION}" VERSION_LESS "3.1.0")
             SET(PROTOBUF_FOUND OFF)
+        ELSE()
+            PROMPT_PROTOBUF_LIB()
         ENDIF()
     ENDIF(PROTOBUF_FOUND)
 ELSE()
@@ -120,18 +153,22 @@ ELSE()
 ENDIF()
 
 IF(NOT PROTOBUF_FOUND)
-    build_protobuf(protobuf FALSE)
-    LIST(APPEND external_project_dependencies protobuf)
+    build_protobuf(extern_protobuf FALSE)
 
-    SET(PROTOBUF_INCLUDE_DIR ${protobuf_INCLUDE_DIR}
+    SET(PROTOBUF_INCLUDE_DIR ${extern_protobuf_INCLUDE_DIR}
         CACHE PATH "protobuf include directory." FORCE)
-    IF(NOT CMAKE_CROSSCOMPILING)
-        SET(PROTOBUF_PROTOC_EXECUTABLE ${protobuf_PROTOC_EXECUTABLE}
+    SET(PROTOBUF_LITE_LIBRARY ${extern_protobuf_LITE_LIBRARY}
+        CACHE FILEPATH "protobuf lite library." FORCE)
+    SET(PROTOBUF_LIBRARY ${extern_protobuf_LIBRARY}
+        CACHE FILEPATH "protobuf library." FORCE)
+    SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY}
+        CACHE FILEPATH "protoc library." FORCE)
+
+    IF(CMAKE_CROSSCOMPILING)
+        PROMPT_PROTOBUF_LIB(protobuf_host extern_protobuf)
+    ELSE()
+        SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE}
             CACHE FILEPATH "protobuf executable." FORCE)
+        PROMPT_PROTOBUF_LIB(extern_protobuf)
     ENDIF()
-    SET(PROTOBUF_LITE_LIBRARY ${protobuf_LITE_LIBRARY} CACHE FILEPATH "protobuf lite library." FORCE)
-    SET(PROTOBUF_LIBRARY ${protobuf_LIBRARY} CACHE FILEPATH "protobuf library." FORCE)
-    SET(PROTOBUF_PROTOC_LIBRARY ${protobuf_PROTOC_LIBRARY} CACHE FILEPATH "protoc library." FORCE)
 ENDIF(NOT PROTOBUF_FOUND)
-
-PROMPT_PROTOBUF_LIB()
\ No newline at end of file
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 43cd6b398b1caac55b938d576b96eb0282c00fda..11c1f677ae5b308558b54bf49caf168cf6023444 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -11,22 +11,80 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+#
 
 
-# To simplify the build process of PaddlePaddle, we defined couple of
-# fundamental abstractions, e.g., how to build library, binary and
-# test in C++, CUDA and Go.
+# generic.cmake defines CMake functions that look like Bazel's
+# building rules (https://bazel.build/).
 #
+# 
 # -------------------------------------------
-#    C++	      CUDA C++	      Go
+#     C++        CUDA C++       Go
 # -------------------------------------------
-# cc_library	 nv_library	  go_library
-# cc_binary  	 nv_binary	  go_binary
-# cc_test        nv_test	  go_test
+# cc_library    nv_library   go_library
+# cc_binary     nv_binary    go_binary
+# cc_test       nv_test      go_test
 # -------------------------------------------
+# 
+# To build a static library example.a from example.cc using the system
+# compiler (like GCC):
+# 
+#   cc_library(example SRCS example.cc)
+# 
+# To build a static library example.a from multiple source files
+# example{1,2,3}.cc:
+# 
+#   cc_library(example SRCS example1.cc example2.cc example3.cc)
+# 
+# To build a shared library example.so from example.cc:
+# 
+#   cc_library(example SHARED SRCS example.cc)
+# 
+# To build a library using Nvidia's NVCC from .cu file(s), use the nv_
+# prefixed version:
+# 
+#   nv_library(example SRCS example.cu)
+# 
+# To specify that a library new_example.a depends on other libraries:
+# 
+#   cc_library(new_example SRCS new_example.cc DEPS example)
+# 
+# Static libraries can be composed of other static libraries:
+# 
+#   cc_library(composed DEPS dependent1 dependent2 dependent3)
+# 
+# To build an executable binary file from some source files and
+# dependent libraries:
+# 
+#   cc_binary(example SRCS main.cc something.cc DEPS example1 example2)
+# 
+# To build an executable binary file using NVCC, use the nv_ prefixed
+# version:
+# 
+#   nv_binary(example SRCS main.cc something.cu DEPS example1 example2)
+# 
+# To build a unit test binary, which is an executable binary with
+# GoogleTest linked:
+# 
+#   cc_test(example_test SRCS example_test.cc DEPS example)
+# 
+# To build a unit test binary using NVCC, use the nv_ prefixed version:
+# 
+#   nv_test(example_test SRCS example_test.cu DEPS example)
+#
+# It is pretty often that executable and test binaries depend on
+# pre-defined external libraries like glog and gflags defined in
+# /cmake/external/*.cmake:
+#
+#   cc_test(example_test SRCS example_test.cc DEPS example glog gflags)
+#
+# To build a go static library using Golang, use the go_ prefixed version:
+#
+#   go_library(example STATIC)
+#
+# To build a go shared library using Golang, use the go_ prefixed version:
 #
-# cmake_parse_arguments can help us to achieve this goal.
-# https://cmake.org/cmake/help/v3.0/module/CMakeParseArguments.html
+#   go_library(example SHARED)
 #
 
 if(NOT APPLE)
@@ -34,33 +92,92 @@ if(NOT APPLE)
     link_libraries(${CMAKE_THREAD_LIBS_INIT})
 endif(NOT APPLE)
 
-# cc_library parses tensor.cc and figures out that target also depend on tensor.h.
-# cc_library(tensor
-#   SRCS
-#   tensor.cc
-#   DEPS
-#   variant)
+function(merge_static_libs TARGET_NAME)
+  set(libs ${ARGN})
+  list(REMOVE_DUPLICATES libs)
+  # Merge the static library targets given in ARGN into one static library TARGET_NAME.
+  # First get the file names of the libraries to be merged
+  foreach(lib ${libs})
+    get_target_property(libtype ${lib} TYPE)
+    if(NOT libtype STREQUAL "STATIC_LIBRARY")
+      message(FATAL_ERROR "merge_static_libs can only process static libraries")
+    endif()
+    set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
+  endforeach()
+
+  if(APPLE) # Use OSX's libtool to merge archives
+    add_custom_target(${TARGET_NAME}_archive
+      COMMAND libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+      DEPENDS ${libs}
+      )
+    add_library(${TARGET_NAME} STATIC IMPORTED GLOBAL)
+    set_property(TARGET ${TARGET_NAME} PROPERTY
+      IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a")
+    add_dependencies(${TARGET_NAME} ${TARGET_NAME}_archive)
+  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
+    foreach(lib ${libs})
+      set(objlistfile ${lib}.objlist) # list of objects in the input library
+      set(objdir ${lib}.objdir)
+
+      add_custom_command(OUTPUT ${objdir}
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir})
+
+      add_custom_command(OUTPUT ${objlistfile}
+        COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
+        COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ../${objlistfile}
+        DEPENDS ${lib} ${objdir}
+        WORKING_DIRECTORY ${objdir})
+
+      # Empty dummy source file that goes into merged library
+      set(mergebase ${lib}.mergebase.c)
+      add_custom_command(OUTPUT ${mergebase}
+        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
+        DEPENDS ${objlistfile})
+
+      list(APPEND mergebases "${mergebase}")
+    endforeach()
+
+    # We need a target for the output merged library
+    add_library(${TARGET_NAME} STATIC ${mergebases})
+    set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
+    # Add every library's objects: the list file and object dir are named per ${lib}
+    foreach(lib ${libs})
+      add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+        COMMAND ${CMAKE_AR} ru ${outlibfile} @"../${lib}.objlist"
+        WORKING_DIRECTORY ${lib}.objdir)
+    endforeach()
+
+    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+      COMMAND ${CMAKE_RANLIB} ${outlibfile})
+  endif()
+endfunction(merge_static_libs)
+
 function(cc_library TARGET_NAME)
-  set(options OPTIONAL)
+  set(options STATIC static SHARED shared)
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS)
   cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  if (${cc_library_OPTIONAL} STREQUAL "SHARED")
-    add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
-  else()
-    add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
-  endif()
-  if (cc_library_DEPS)
-    add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
-  endif()
+  if (cc_library_SRCS) # sources given: compile them into a library
+    if (cc_library_SHARED OR cc_library_shared) # build *.so
+      add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
+    else() # static is the default
+      add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
+    endif()
+    if (cc_library_DEPS)
+      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
+    endif()
+  else(cc_library_SRCS) # no sources: compose the DEPS static libraries into one
+    if (cc_library_DEPS)
+      merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
+    else() # neither SRCS nor DEPS: stop the configure step
+      message(FATAL_ERROR "Please specify source file or library in cc_library.")
+    endif()
+  endif(cc_library_SRCS)
 endfunction(cc_library)
 
-# cc_binary parses tensor.cc and figures out that target also depend on tensor.h.
-# cc_binary(tensor
-#   SRCS
-#   tensor.cc)
 function(cc_binary TARGET_NAME)
-  set(options OPTIONAL)
+  set(options "")
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS)
   cmake_parse_arguments(cc_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -71,13 +188,6 @@ function(cc_binary TARGET_NAME)
   endif()
 endfunction(cc_binary)
 
-# The dependency to target tensor implies that if any of
-# tensor{.h,.cc,_test.cc} is changed, tensor_test need to be re-built.
-# cc_test(tensor_test
-#   SRCS
-#   tensor_test.cc
-#   DEPS
-#   tensor)
 function(cc_test TARGET_NAME)
   if(WITH_TESTING)
     set(options "")
@@ -91,28 +201,28 @@ function(cc_test TARGET_NAME)
   endif()
 endfunction(cc_test)
 
-# Suppose that ops.cu includes global functions that take Tensor as
-# their parameters, so ops depend on tensor. This implies that if
-# any of tensor.{h.cc}, ops.{h,cu} is changed, ops need to be re-built.
-# nv_library(ops
-#   SRCS
-#   ops.cu
-#   DEPS
-#   tensor)
 function(nv_library TARGET_NAME)
   if (WITH_GPU)
-    set(options OPTIONAL)
+    set(options STATIC static SHARED shared)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    if (${nv_library_OPTIONAL} STREQUAL "SHARED")
-      cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
-    else()
-      cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
-    endif()
-    if (nv_library_DEPS)
-      add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
-    endif()
+    if(nv_library_SRCS) # sources given: compile them with cuda_add_library
+      if (nv_library_SHARED OR nv_library_shared) # build *.so
+        cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
+      else() # static is the default
+        cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
+      endif()
+      if (nv_library_DEPS)
+        add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
+      endif()
+    else(nv_library_SRCS) # no sources: compose the DEPS static libraries into one
+      if (nv_library_DEPS)
+        merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})
+      else() # neither SRCS nor DEPS: stop the configure step
+        message(FATAL_ERROR "Please specify source file or library in nv_library.")
+      endif()
+    endif(nv_library_SRCS)
   endif()
 endfunction(nv_library)
 
@@ -130,13 +240,6 @@ function(nv_binary TARGET_NAME)
   endif()
 endfunction(nv_binary)
 
-# The dependency to target tensor implies that if any of
-# ops{.h,.cu,_test.cu} is changed, ops_test need to be re-built.
-# nv_test(ops_test
-#   SRCS
-#   ops_test.cu
-#   DEPS
-#   ops)
 function(nv_test TARGET_NAME)
   if (WITH_GPU AND WITH_TESTING)
     set(options "")
@@ -152,42 +255,53 @@ endfunction(nv_test)
 
 set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
 file(MAKE_DIRECTORY ${GOPATH})
+set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle/Paddle")
 
-# Because api.go defines a GO wrapper to ops and tensor, it depends on
-# both.  This implies that if any of tensor.{h,cc}, ops.{h,cu}, or
-# api.go is changed, api need to be re-built.
-# go_library(api
-#   SRCS
-#   api.go
-#   DEPS
-#   tensor # Because ops depend on tensor, this line is optional.
-#   ops)
 function(go_library TARGET_NAME)
-  set(options OPTIONAL)
+  set(options STATIC static SHARED shared)
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
+  set(multiValueArgs DEPS)
   cmake_parse_arguments(go_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  if (${go_library_OPTIONAL} STREQUAL "SHARED")
+
+  if (go_library_SHARED OR go_library_shared)
     set(BUILD_MODE "-buildmode=c-shared")
-    if(APPLE)
-      set(LIB_NAME "lib${TARGET_NAME}.dylib")
-    else()
-      set(LIB_NAME "lib${TARGET_NAME}.so")
-    endif()
+    set(LIB_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}")
   else()
     set(BUILD_MODE "-buildmode=c-archive")
-    set(LIB_NAME "lib${TARGET_NAME}.a")
+    set(LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}")
   endif()
-  add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
+
+  # Build a library target from a generated dummy C file so that `make target_name` works
+  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
+  file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+  if (go_library_SHARED OR go_library_shared)
+    add_library(${TARGET_NAME} SHARED ${dummyfile})
+  else()
+    add_library(${TARGET_NAME} STATIC ${dummyfile})
+  endif()
+  if(go_library_DEPS)
+    add_dependencies(${TARGET_NAME} ${go_library_DEPS})
+  endif(go_library_DEPS)
+
+  # We need to symlink the Paddle directory into GOPATH. If we
+  # don't, and we have code that depends on Paddle, `go get
+  # ./...` will download a new Paddle repo from GitHub,
+  # without the changes in our current Paddle repo that we
+  # want to build.
+  file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
+  add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+    COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
+    # Symlink Paddle directory into GOPATH
+    COMMAND mkdir -p ${PADDLE_IN_GOPATH}
+    COMMAND rm -rf ${PADDLE_IN_GOPATH}
+    COMMAND ln -sf ${CMAKE_SOURCE_DIR} ${PADDLE_IN_GOPATH}
+    # Automatically get all dependencies specified in the source code
+    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get -d ./...
+    # Build the Go sources into the library file that was removed above
     COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
     -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
-    ${go_library_SRCS}
+    ${GO_SOURCE}
     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-  add_custom_target(${TARGET_NAME}_lib ALL DEPENDS ${TARGET_NAME}_timestamp ${go_library_DEPS})
-  add_library(${TARGET_NAME} STATIC IMPORTED)
-  set_property(TARGET ${TARGET_NAME} PROPERTY
-    IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}")
-  add_dependencies(${TARGET_NAME} ${TARGET_NAME}_lib)
 endfunction(go_library)
 
 function(go_binary TARGET_NAME)
@@ -217,10 +331,3 @@ function(go_test TARGET_NAME)
   add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_test_DEPS})
   add_test(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME})
 endfunction(go_test)
-
-# go_extern will download extern go project.
-# go_extern(target_name extern_source)
-# go_extern(go_redis github.com/hoisie/redis)
-function(go_extern TARGET_NAME)
-  add_custom_target(${TARGET_NAME} env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get ${ARGN})
-endfunction(go_extern)
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 904652413e026e3a7f3f2a19f48f4e906ce6babb..adf5e2c539740076ad1808353522c7467d765e64 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -33,6 +33,7 @@ ELSE(WIN32)
             SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING
                 "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.")
         ENDIF()
+        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -framework CoreFoundation -framework Security") # append, keep existing flags
     ELSE(APPLE)
 
         IF(EXISTS "/etc/issue")
@@ -84,24 +85,6 @@ IF(DEFINED CMAKE_SYSTEM_NAME)
     ENDIF()
 ENDIF()
 
-# prefix and suffix on different os
-IF(WIN32)
-    SET(LIBRARY_PREFIX "")
-    SET(SHARED_LIBRARY_SUFFIX ".dll")
-    SET(STATIC_LIBRARY_SUFFIX ".lib")
-    SET(EXECUTABLE_SUFFIX ".exe")
-ELSE(WIN32)
-    SET(LIBRARY_PREFIX "lib")
-    IF(APPLE)
-        SET(SHARED_LIBRARY_SUFFIX ".dylib")
-    ELSE(APPLE)
-        SET(SHARED_LIBRARY_SUFFIX ".so")
-    ENDIF(APPLE)
-
-    SET(STATIC_LIBRARY_SUFFIX ".a")
-    SET(EXECUTABLE_SUFFIX "")
-ENDIF(WIN32)
-
 # external dependencies log output
 SET(EXTERNAL_PROJECT_LOG_ARGS
     LOG_DOWNLOAD    0     # Wrap download in script to log output
diff --git a/cmake/util.cmake b/cmake/util.cmake
index 8c9143462227e7081142f6be250b1a45e4b6d51b..87ad9d91d8701c56255c1e7f224764998df634a7 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -84,6 +84,7 @@ function(link_paddle_exe TARGET_NAME)
         paddle_parameter
         paddle_proto
         paddle_cuda
+        paddle_optimizer
         ${EXTERNAL_LIBS}
         ${CMAKE_THREAD_LIBS_INIT}
         ${CMAKE_DL_LIBS}
diff --git a/doc/api/v2/config/evaluators.rst b/doc/api/v2/config/evaluators.rst
index 39db51fa4abc370855ca3f2778b47464f33b6fce..9ac972fb193a2fb525edc507f7ba1303d2c8eabe 100644
--- a/doc/api/v2/config/evaluators.rst
+++ b/doc/api/v2/config/evaluators.rst
@@ -99,3 +99,12 @@ value_printer
 ..  automodule:: paddle.v2.evaluator
     :members:  value_printer
     :noindex:
+
+Detection
+=========
+
+detection_map
+-------------
+..  automodule:: paddle.v2.evaluator
+    :members:  detection_map
+    :noindex:
diff --git a/doc/design/cluster_train/pserver_client.md b/doc/design/cluster_train/pserver_client.md
index b3e4079010490b69db1de28157f0cab80cad2381..474b8c572cd92fc87e9f7f3f2b19d12cccd158de 100644
--- a/doc/design/cluster_train/pserver_client.md
+++ b/doc/design/cluster_train/pserver_client.md
@@ -74,14 +74,25 @@ typedef enum {
 typedef struct {
   char*               name;
   paddle_element_type element_type;
-  void*               content;
+  unsigned char*      content;
   int                 content_len;
 } paddle_parameter, paddle_gradient;
 
-typedef struct paddle_pserver_client paddle_pserver_client;
+typedef int paddle_pserver_client;
 
-paddle_pserver_client* paddle_new_pserver_client();
-void paddle_pserver_client_release(paddle_pserver_client* client);
+/**
+ * @brief creates a pserver client that talks to etcd for coordination.
+ */
+paddle_pserver_client paddle_new_etcd_pserver_client(char* etcd_addr);
+
+/**
+ * @brief creates a pserver client given pserver addresses.
+ *
+ * @param pserver_addrs comma-separated pserver addresses.
+ * @param selected if current pserver client is selected to initialize all parameter servers.
+ */
+paddle_pserver_client paddle_new_pserver_client(char* pserver_addrs, int selected);
+void paddle_pserver_client_release(paddle_pserver_client c);
 
 /**
  * @brief paddle_begin_init_params begins to initialize parameters on
@@ -95,7 +106,7 @@ void paddle_pserver_client_release(paddle_pserver_client* client);
  * @return 1 if the trainer is selected to initialize parameter
  * servers, otherwise 0.
  */
-int paddle_begin_init_params(paddle_pserver_client* client);
+int paddle_begin_init_params(paddle_pserver_client client);
 
 /**
  * @brief paddle_init_param initializes the parameter on parameter
@@ -109,7 +120,7 @@ int paddle_begin_init_params(paddle_pserver_client* client);
  * @paddle_begin_init_param). Or simply exit the program and wait for
  * the cluster management system to restart the trainer.
  */
-int paddle_init_param(paddle_pserver_client* client, paddle_parameter param, const unsigned char* param_config_proto, int config_len);
+int paddle_init_param(paddle_pserver_client client, paddle_parameter param, const unsigned char* param_config_proto, int config_len);
 
 /**
  * @brief paddle_finish_init_params tells parameter servers client has
@@ -120,7 +131,7 @@ int paddle_init_param(paddle_pserver_client* client, paddle_parameter param, con
  * @paddle_begin_init_param). Or simply exit the program and wait for
  * the cluster management system to restart the trainer.
  */
-int paddle_finish_init_params(paddle_pserver_client* client);
+int paddle_finish_init_params(paddle_pserver_client client);
 
 /**
  * @brief paddle_send_grads sends gradients to parameter servers for
@@ -131,7 +142,7 @@ int paddle_finish_init_params(paddle_pserver_client* client);
  * @param learning_rate the learning rate for the gradients.
  * @return 0 if successful, otherwise -1.
  */
-int paddle_send_grads(paddle_pserver_client* client, const paddle_gradient* grads, int len);
+int paddle_send_grads(paddle_pserver_client client, const paddle_gradient* grads, int len);
 
 /**
  * @brief paddle_get_params gets parameters from parameter servers.
@@ -139,13 +150,15 @@ int paddle_send_grads(paddle_pserver_client* client, const paddle_gradient* grad
  * paddle_get_params will block until parameters are initialized on
  * the parameter servers.
  *
- * @param names the array of names of the parameters to get.
- * @param dst the destination array of parameters to save to.
+ * @param dst the destination array of parameter pointers to save to.
+ * The parameter pointer must be pre-populated with the required parameter name,
+ * and the content of parameter must be pre-allocated of the size of required
+ * parameter on pserver.
  * @param len the length of the names array and the paddle_parameter
  * array.
  * @return 0 if successful, otherwise -1.
  */
-int paddle_get_params(paddle_pserver_client* client, const char** names, paddle_parameter* dst, int len);
+int paddle_get_params(paddle_pserver_client client, paddle_parameter** dst, int len);
 
 /**
  * @brief paddle_save_model indicates parameters to save the parameter
@@ -154,5 +167,5 @@ int paddle_get_params(paddle_pserver_client* client, const char** names, paddle_
  * @param path the path to save parameters.
  * @return 0 if successful, otherwise -1.
  */
-int paddle_save_model(paddle_pserver_client* client, const char* path);
+int paddle_save_model(paddle_pserver_client client, const char* path);
 ```
diff --git a/doc/design/cluster_train/remote_parameter_updater.md b/doc/design/cluster_train/remote_parameter_updater.md
new file mode 100644
index 0000000000000000000000000000000000000000..6e8e5938455b869e0f3367794c41250340b37f77
--- /dev/null
+++ b/doc/design/cluster_train/remote_parameter_updater.md
@@ -0,0 +1,21 @@
+# Design Doc: Remote Parameter Updater for Cluster Train
+
+For an overview of distributed training, please refer to the [distributed training design doc](README.md). In this design doc, we will discuss the parameter updater, which uses the parameter server cclient ([The Client Library of Parameter Server Design Doc](pserver_client.md)) to manage and update parameters.
+
+## Parameter Updater
+
+The Parameter Updater is used by the trainer to manage and update parameters. There are mainly two kinds of parameter updaters: local and remote. Since this design is for cluster training, we will only discuss the remote parameter updater here.
+
+### Remote Parameter Updater
+
+The Remote Parameter Updater manages parameters on remote parameter servers through the client that communicates with the pserver ([The Client Library of Parameter Server Design Doc](pserver_client.md)).
+
+In the PaddlePaddle Python V2 API, the trainer is implemented in Python; it holds an instance of the parameter updater and calls its functions directly. In this design, we will also expose the API of RemoteParameterUpdater to Python with SWIG.
+
+#### Sparse Remote Parameter Updater
+
+Since we will only implement dense parameter management for now, the mechanism for sparse parameters will be discussed in the next stage.
+
+### Interface Design
+
+TBD
diff --git a/doc/design/scope.md b/doc/design/scope.md
new file mode 100644
index 0000000000000000000000000000000000000000..2ff416f06e8ada48b1d4922f8869a106f35799e2
--- /dev/null
+++ b/doc/design/scope.md
@@ -0,0 +1,124 @@
+# Design of Scope in Paddle
+
+## Overview
+
+Scope is an important concept in programming languages, which defines a program region in which a set of bindings between names and entities applies. In a specific scope, a valid name is uniquely associated with an entity, such as a variable. In another scope, the same name may refer to another entity or to nothing at all. A scope clearly restricts the visibility and validity of names in a program. Hence **Scope** is introduced to PaddlePaddle to manage variables in context. But different from the original abstract concept, Scope now becomes an object with two important attributes:
+
+- Scope is an association of a name to variable.
+- Variables in a parent scope can be retrieved from local scope.
+
+A detailed explanation of these two attributes follows.
+
+
+## Scope is an association of a name to variable.
+
+Scope is an association of a name to variable. All variables belong to `Scope`. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. One net can run in different scopes and update different variables in each scope.
+
+
+1. Scope only contains a map of a name to variable.
+
+   All parameters, data, states in a Net should be variables and stored inside a scope. Each op should get inputs and outputs to do computation from a scope, such as data buffer, state(momentum) etc.
+
+1. A variable can only be created by a Scope and can only be obtained from a Scope. Users cannot create or get a variable outside a scope. This is a constraint of our framework, and it keeps our framework simple and clear.
+
+1. Scope only contains methods that are used to Create and Get Variables. Scope does not contain Operators and has no information to run them.
+    `Net` is designed to drive the computation and Scope only contains a map of variables. There is no computation logic inside a `Scope`. Scope just handles the lifetime management of variables.
+    - `Create` is used to create a Variable by its name and add the mapping relation.
+    - `Get` is used to find a Variable by name.
+
+1. Every variable only belongs to one certain Scope.
+
+   A variable cannot belong to multiple scopes. If you want to use variables from a parent scope, you can use `parent scope`.
+
+1. Scope should destruct all Variables inside it when it is itself destructed. Users should never store a `Variable` pointer somewhere else.
+
+   Because a Variable can only be obtained from a Scope, when destroying a Scope we also need to destroy all the Variables in it. If a user stores a `Variable` pointer in a private data member or some global variable, the pointer will become invalid when the associated `Scope` is destroyed.
+
+```cpp
+class Scope {
+ public:
+  Variable* CreateVariable(const std::string& name);
+  const Variable* GetVariable(const std::string& name) const;
+
+ private:
+  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+};
+```
+
+
+## Parent scope and local scope
+
+Just like [scope](https://en.wikipedia.org/wiki/Scope_(computer_science)) in programming languages, `Scope` in the neural network can also be a local scope. There are two attributes about local scope.
+
+1.  We can create local variables in a local scope. When that local scope is destroyed, all of its local variables should also be destroyed.
+2.  Variables in a parent scope can be retrieved from local scopes of that parent scope, i.e., when a user gets a variable from a scope, the scope first searches for this variable locally. If there is no such variable in the local scope, `scope` will keep searching in its parent, until the variable is found or there is no parent.
+
+```cpp
+class Scope {
+ public:
+  Scope(const std::shared_ptr<Scope>& scope): parent_(scope) {}
+
+  Variable* GetVariable(const std::string& name) const {
+    Variable* var = GetVarLocally(name);
+    if (var != nullptr) {
+      return var;
+    } else if (parent_ != nullptr) {
+      return parent_->GetVariable(name);
+    } else {
+      return nullptr;
+    }
+  }
+
+ private:
+  std::shared_ptr<Scope> parent_ {nullptr};
+};
+```
+
+In the `Scope` class, there is a private data member called `parent_`. `parent_` is a smart pointer to its parent scope. When a user calls `Get` for a variable by its `name`, the `name` is first searched for inside the current scope. If the variable cannot be found locally and the parent scope is not a `nullptr`, the variable will be searched for inside that parent scope. The default value of the `parent_` pointer is `nullptr`, which means that the scope is a global scope when `parent_` is `nullptr`.
+
+A local scope is very useful when we implement a Recurrent Neural Network. Each timestep of an RNN should be a `Net`. Each `Net` of a timestep (`StepNet` for short) should use an independent local scope, just like variables in a while loop are inside a local scope in programming languages. By using a single `StepNet` and changing the local scope, we can implement an RNN easily.
+
+# Interface Design
+
+```cpp
+class Variable {
+ private:
+  Variable() = default;
+  friend class Scope;
+};
+
+class Scope {
+ private:
+  Scope(const std::shared_ptr<Scope>& parent = nullptr);
+
+ public:
+  static std::shared_ptr<Scope> Create(const std::shared_ptr<Scope>& parent = nullptr);
+
+  // return nullptr if not found.
+  Variable* GetVariable(const std::string& name) const;
+
+  // return Error if already contains same name variable.
+  Error CreateVariable(const std::string& name);
+
+ private:
+  std::shared_ptr<Scope> parent_;
+  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+};
+```
+## Only scope can create a variable
+
+To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `CreateVariable` can construct `Variable`.
+
+## When scope destroyed, all variables inside this scope should be destroyed together
+
+The scope holds unique pointers to all variables. Users can call `GetVariable` on a scope, but they should not hold the returned pointer as a member variable, because when the scope is destroyed, all variables inside it are destroyed together.
+
+## Sharing a parent scope
+
+A local scope contains a `parent_` pointer, so scopes form a linked list. We use a `shared_ptr` because while a local scope is in use, its parents must not be destroyed.
+
+Also, as the parent scope is held by a `shared_ptr`, we can only `Create()` a scope as a shared pointer. We cannot construct a scope variable directly, because it could not be passed to another scope as its `parent` pointer.
+
+## Orthogonal interface
+
+`GetVariable` will return `nullptr` when `name` is not found, so it can be used as a `Contains` method. `CreateVariable` will return an `Error` when there is a name conflict locally. By combining `GetVariable` and `CreateVariable`, we can implement `CreateOrGetVariable` easily.
diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
index 69f4501f370dcc9d603ec54a63d68568d66e832e..c0608ede8e57b224dae4b3d510d704a8b0918b53 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -22,6 +22,7 @@ To compile the source code, your computer must be equipped with the following de
 - **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X)
 - **BLAS**: MKL, OpenBlas or ATLAS
 - **Python**: only support Python 2.7
+- **Go**
 
 **Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
 For CUDA 8.0, GCC versions later than 5.3 are not supported!
@@ -107,6 +108,18 @@ As a simple example, consider the following:
     sudo apt-get install -y python python-pip python-numpy libpython-dev bison
     sudo pip install 'protobuf==3.1.0.post1'
 
+    # Install Go
+    # You can follow https://golang.org/doc/install for a detailed explanation.
+    wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
+    tar -C $HOME -xzf go.tgz && \
+    mkdir $HOME/gopath && \
+    rm go.tgz
+
+    # Setup environment variables
+    export GOROOT=$HOME/go
+    export GOPATH=$HOME/gopath
+    export PATH=$PATH:$GOROOT/bin
+
     # install cmake 3.4
     curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
         cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \
diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/deep_model/rnn/index_cn.rst
index 9e805ca85191b793c8798a239927a318c70b96f5..9ecab5594cff47cde4700b7ce0f58013a960a16e 100644
--- a/doc/howto/deep_model/rnn/index_cn.rst
+++ b/doc/howto/deep_model/rnn/index_cn.rst
@@ -4,6 +4,7 @@ RNN相关模型
 ..  toctree::
   :maxdepth: 1
 
+  rnn_config_cn.rst
   recurrent_group_cn.md
   hierarchical_layer_cn.rst
   hrnn_rnn_api_compare_cn.rst
diff --git a/doc/howto/deep_model/rnn/index_en.rst b/doc/howto/deep_model/rnn/index_en.rst
index 13a153b05c578e0af82ee29db5ea27fd4b6d6f59..7adc79873d699fdfd5a85034bcef964dd1f19132 100644
--- a/doc/howto/deep_model/rnn/index_en.rst
+++ b/doc/howto/deep_model/rnn/index_en.rst
@@ -1,2 +1,7 @@
 RNN Models
 ==========
+
+..  toctree::
+  :maxdepth: 1
+
+  rnn_config_en.rst
diff --git a/doc/howto/deep_model/rnn/rnn_config_cn.rst b/doc/howto/deep_model/rnn/rnn_config_cn.rst
index ac2bd0775f4ab2e0a0c37462e2c23001123b152b..4d684cf8ad5a8082cf31fb27027119b3d3e700b6 100644
--- a/doc/howto/deep_model/rnn/rnn_config_cn.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_cn.rst
@@ -5,36 +5,13 @@ RNN配置
 中配置循环神经网络（RNN）。PaddlePaddle
 高度支持灵活和高效的循环神经网络配置。 在本教程中，您将了解如何：
 
--  准备用来学习循环神经网络的序列数据。
 -  配置循环神经网络架构。
 -  使用学习完成的循环神经网络模型生成序列。
 
 我们将使用 vanilla 循环神经网络和 sequence to sequence
 模型来指导你完成这些步骤。sequence to sequence
-模型的代码可以在\ ``demo / seqToseq``\ 找到。
-
-准备序列数据
-------------
-
-PaddlePaddle
-不需要对序列数据进行任何预处理，例如填充。唯一需要做的是将相应类型设置为输入。例如，以下代码段定义了三个输入。
-它们都是序列，它们的大小是\ ``src_dict``\ ，\ ``trg_dict``\ 和\ ``trg_dict``\ ：
-
-.. code:: python
-
-    settings.input_types = [
-      integer_value_sequence(len(settings.src_dict)),
-      integer_value_sequence(len(settings.trg_dict)),
-      integer_value_sequence(len(settings.trg_dict))]
-
-在\ ``process``\ 函数中，每个\ ``yield``\ 函数将返回三个整数列表。每个整数列表被视为一个整数序列：
-
-.. code:: python
-
-    yield src_ids, trg_ids, trg_ids_next
-
-有关如何编写数据提供程序的更多细节描述，请参考 :ref:`api_pydataprovider2` 。完整的数据提供文件在
-``demo/seqToseq/dataprovider.py``\ 。
+模型的代码可以在 `book/08.machine_translation <https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation>`_ 找到。
+wmt14数据的提供文件在 `python/paddle/v2/dataset/wmt14.py <https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/dataset/wmt14.py>`_ 。
 
 配置循环神经网络架构
 --------------------
@@ -85,19 +62,19 @@ vanilla
                    act=None,
                    rnn_layer_attr=None):
         def __rnn_step__(ipt):
-           out_mem = memory(name=name, size=size)
-           rnn_out = mixed_layer(input = [full_matrix_projection(ipt),
-                                          full_matrix_projection(out_mem)],
-                                 name = name,
-                                 bias_attr = rnn_bias_attr,
-                                 act = act,
-                                 layer_attr = rnn_layer_attr,
-                                 size = size)
+           out_mem = paddle.layer.memory(name=name, size=size)
+           rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt),
+                                                 paddle.layer.full_matrix_projection(input=out_mem)],
+                                        name = name,
+                                        bias_attr = rnn_bias_attr,
+                                        act = act,
+                                        layer_attr = rnn_layer_attr,
+                                        size = size)
            return rnn_out
-        return recurrent_group(name='%s_recurrent_group' % name,
-                               step=__rnn_step__,
-                               reverse=reverse,
-                               input=input)
+        return paddle.layer.recurrent_group(name='%s_recurrent_group' % name,
+                                            step=__rnn_step__,
+                                            reverse=reverse,
+                                            input=input)
 
 PaddlePaddle
 使用“Memory”（记忆模块）实现单步函数。\ **Memory**\ 是在PaddlePaddle中构造循环神经网络时最重要的概念。
@@ -140,43 +117,52 @@ Sequence to Sequence Model with Attention
 .. code:: python
 
     # 定义源语句的数据层
-    src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
+    src_word_id = paddle.layer.data(
+        name='source_language_word',
+        type=paddle.data_type.integer_value_sequence(source_dict_dim))
     # 计算每个词的词向量
-    src_embedding = embedding_layer(
+    src_embedding = paddle.layer.embedding(
         input=src_word_id,
         size=word_vector_dim,
-        param_attr=ParamAttr(name='_source_language_embedding'))
+        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
     # 应用前向循环神经网络
-    src_forward = grumemory(input=src_embedding, size=encoder_size)
+    src_forward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size)
     # 应用反向递归神经网络（reverse=True表示反向循环神经网络）
-    src_backward = grumemory(input=src_embedding,
-                              size=encoder_size,
-                              reverse=True)
+    src_backward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size, reverse=True)
     # 将循环神经网络的前向和反向部分混合在一起
-    encoded_vector = concat_layer(input=[src_forward, src_backward])
+    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
 
     # 投射编码向量到 decoder_size
-    encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)],
-                               size = decoder_size)
+    encoded_proj = paddle.layer.mixed(
+        size=decoder_size,
+        input=paddle.layer.full_matrix_projection(encoded_vector))
 
     # 计算反向RNN的第一个实例
-    backward_first = first_seq(input=src_backward)
+    backward_first = paddle.layer.first_seq(input=src_backward)
 
     # 投射反向RNN的第一个实例到 decoder size
-    decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation())
+    decoder_boot = paddle.layer.mixed(
+       size=decoder_size,
+       act=paddle.activation.Tanh(),
+       input=paddle.layer.full_matrix_projection(backward_first))
 
 解码器使用 ``recurrent_group`` 来定义循环神经网络。单步函数和输出函数在
 ``gru_decoder_with_attention`` 中定义：
 
 .. code:: python
 
-    group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
-                  StaticInput(input=encoded_proj,is_seq=True)]
-    trg_embedding = embedding_layer(
-        input=data_layer(name='target_language_word',
-                         size=target_dict_dim),
-        size=word_vector_dim,
-        param_attr=ParamAttr(name='_target_language_embedding'))
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
+    trg_embedding = paddle.layer.embedding(
+            input=paddle.layer.data(
+                name='target_language_word',
+                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+            size=word_vector_dim,
+            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+    # 将目标词向量加入解码器的输入列表
     group_inputs.append(trg_embedding)
 
     # 对于配备有注意力机制的解码器，在训练中，
@@ -185,9 +171,10 @@ Sequence to Sequence Model with Attention
     # StaticInput 意味着不同时间步的输入都是相同的值，
     # 否则它以一个序列输入，不同时间步的输入是不同的。
     # 所有输入序列应该有相同的长度。
-    decoder = recurrent_group(name=decoder_group_name,
-                              step=gru_decoder_with_attention,
-                              input=group_inputs)
+    decoder = paddle.layer.recurrent_group(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs)
 
 单步函数的实现如下所示。首先，它定义解码网络的\ **Memory**\ 。然后定义
 attention，门控循环单元单步函数和输出函数：
@@ -198,27 +185,32 @@ attention，门控循环单元单步函数和输出函数：
         # 定义解码器的Memory
         # Memory的输出定义在 gru_step 内
         # 注意 gru_step 应该与它的Memory名字相同
-        decoder_mem = memory(name='gru_decoder',
-                             size=decoder_size,
-                             boot_layer=decoder_boot)
+        decoder_mem = paddle.layer.memory(
+            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
         # 计算 attention 加权编码向量
-        context = simple_attention(encoded_sequence=enc_vec,
-                                   encoded_proj=enc_proj,
-                                   decoder_state=decoder_mem)
+        context = paddle.networks.simple_attention(
+            encoded_sequence=enc_vec,
+            encoded_proj=enc_proj,
+            decoder_state=decoder_mem)
         # 混合当前词向量和attention加权编码向量
-        decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context),
-                                               full_matrix_projection(current_word)],
-                                     size = decoder_size * 3)
+        decoder_inputs = paddle.layer.mixed(
+            size=decoder_size * 3,
+            input=[
+                paddle.layer.full_matrix_projection(input=context),
+                paddle.layer.full_matrix_projection(input=current_word)
+            ])
         # 定义门控循环单元循环神经网络单步函数
-        gru_step = gru_step_layer(name='gru_decoder',
-                                  input=decoder_inputs,
-                                  output_mem=decoder_mem,
-                                  size=decoder_size)
+        gru_step = paddle.layer.gru_step(
+            name='gru_decoder',
+            input=decoder_inputs,
+            output_mem=decoder_mem,
+            size=decoder_size)
         # 定义输出函数
-        out = mixed_layer(input=[full_matrix_projection(input=gru_step)],
-                          size=target_dict_dim,
-                          bias_attr=True,
-                          act=SoftmaxActivation())
+        out = paddle.layer.mixed(
+            size=target_dict_dim,
+            bias_attr=True,
+            act=paddle.activation.Softmax(),
+            input=paddle.layer.full_matrix_projection(input=gru_step))
         return out
 
 生成序列
@@ -238,41 +230,32 @@ attention，门控循环单元单步函数和输出函数：
    -  ``beam_size``: beam search 算法中的beam大小。
    -  ``max_length``: 生成序列的最大长度。
 
--  使用 ``seqtext_printer_evaluator``
-   根据索引矩阵和字典打印文本。这个函数需要设置：
-
-   -  ``id_input``: 数据的整数ID，用于标识生成的文件中的相应输出。
-   -  ``dict_file``: 用于将词ID转换为词的字典文件。
-   -  ``result_file``: 生成结果文件的路径。
-
 代码如下：
 
 .. code:: python
 
-    group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
-                  StaticInput(input=encoded_proj,is_seq=True)]
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
     # 在生成时，解码器基于编码源序列和最后生成的目标词预测下一目标词。
     # 编码源序列（编码器输出）必须由只读Memory的 StaticInput 指定。
     # 这里， GeneratedInputs 自动获取上一个生成的词，并在最开始初始化为起始词，如 <s>。
-    trg_embedding = GeneratedInput(
-        size=target_dict_dim,
-        embedding_name='_target_language_embedding',
-        embedding_size=word_vector_dim)
+    trg_embedding = paddle.layer.GeneratedInput(
+            size=target_dict_dim,
+            embedding_name='_target_language_embedding',
+            embedding_size=word_vector_dim)
     group_inputs.append(trg_embedding)
-    beam_gen = beam_search(name=decoder_group_name,
-                           step=gru_decoder_with_attention,
-                           input=group_inputs,
-                           bos_id=0, # Beginnning token.
-                           eos_id=1, # End of sentence token.
-                           beam_size=beam_size,
-                           max_length=max_length)
-
-    seqtext_printer_evaluator(input=beam_gen,
-                              id_input=data_layer(name="sent_id", size=1),
-                              dict_file=trg_dict_path,
-                              result_file=gen_trans_file)
-    outputs(beam_gen)
-
-注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅 :ref:`semantic_role_labeling` 了解更多详细信息。
-
-完整的配置文件在\ ``demo/seqToseq/seqToseq_net.py``\ 。
+    beam_gen = paddle.layer.beam_search(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs,
+            bos_id=0, # Beginning token.
+            eos_id=1, # End of sentence token.
+            beam_size=beam_size,
+            max_length=max_length)
+
+    return beam_gen
+
+注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅 `book/06.understand_sentiment <https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment>`_ 了解更多详细信息。
+
+完整的配置文件在 `book/08.machine_translation/train.py <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py>`_ 。
diff --git a/doc/howto/deep_model/rnn/rnn_config_en.rst b/doc/howto/deep_model/rnn/rnn_config_en.rst
index 73f5d5371fcd3ce95253cad47b0d8e738284441c..2b581290a41005c04cb1d8b6febe57f17d2416d3 100644
--- a/doc/howto/deep_model/rnn/rnn_config_en.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_en.rst
@@ -3,34 +3,11 @@ RNN Configuration
 
 This tutorial will guide you how to configure recurrent neural network in PaddlePaddle. PaddlePaddle supports highly flexible and efficient recurrent neural network configuration. In this tutorial, you will learn how to:
 
-- prepare sequence data for learning recurrent neural networks.
 - configure recurrent neural network architecture.
 - generate sequence with learned recurrent neural network models.
 
-We will use vanilla recurrent neural network, and sequence to sequence model to guide you through these steps. The code of sequence to sequence model can be found at :code:`demo/seqToseq`.
-
-=====================
-Prepare Sequence Data
-=====================
-
-PaddlePaddle does not need any preprocessing to sequence data, such as padding. The only thing that needs to be done is to set the type of the corresponding type to input. For example, the following code snippets defines three input. All of them are sequences, and the size of them are :code:`src_dict`, :code:`trg_dict`, and :code:`trg_dict`:
-
-.. code-block:: python
-
-    settings.input_types = [
-      integer_value_sequence(len(settings.src_dict)),
-      integer_value_sequence(len(settings.trg_dict)),
-      integer_value_sequence(len(settings.trg_dict))]
-
-
-Then at the :code:`process` function, each :code:`yield` function will return three integer lists. Each integer list is treated as a sequence of integers:
-
-.. code-block:: python
-
-    yield src_ids, trg_ids, trg_ids_next
-
-
-For more details description of how to write a data provider, please refer to :ref:`api_pydataprovider2` . The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`.
+We will use a vanilla recurrent neural network and a sequence to sequence model to guide you through these steps. The code of the sequence to sequence model can be found at `book/08.machine_translation <https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation>`_ .
+The data preparation code for this model can be found at `python/paddle/v2/dataset/wmt14.py <https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/dataset/wmt14.py>`_ .
 
 ===============================================
 Configure Recurrent Neural Network Architecture
@@ -75,19 +52,19 @@ Its **output function** simply takes :math:`x_t` as the output.
                    act=None,
                    rnn_layer_attr=None):
         def __rnn_step__(ipt):
-           out_mem = memory(name=name, size=size)
-           rnn_out = mixed_layer(input = [full_matrix_projection(ipt),
-                                          full_matrix_projection(out_mem)],
-                                 name = name,
-                                 bias_attr = rnn_bias_attr,
-                                 act = act,
-                                 layer_attr = rnn_layer_attr,
-                                 size = size)
+           out_mem = paddle.layer.memory(name=name, size=size)
+           rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt),
+                                                 paddle.layer.full_matrix_projection(input=out_mem)],
+                                        name = name,
+                                        bias_attr = rnn_bias_attr,
+                                        act = act,
+                                        layer_attr = rnn_layer_attr,
+                                        size = size)
            return rnn_out
-        return recurrent_group(name='%s_recurrent_group' % name,
-                               step=__rnn_step__,
-                               reverse=reverse,
-                               input=input)
+        return paddle.layer.recurrent_group(name='%s_recurrent_group' % name,
+                                            step=__rnn_step__,
+                                            reverse=reverse,
+                                            input=input)
 
 
 PaddlePaddle uses memory to construct step function. **Memory** is the most important concept when constructing recurrent neural networks in PaddlePaddle. A memory is a state that is used recurrently in step functions, such as :math:`x_{t+1} = f_x(x_t)`. One memory contains an **output** and a **input**. The output of memory at the current time step is utilized as the input of the memory at the next time step. A memory can also has a **boot layer**, whose output is utilized as the initial value of the memory. In our case, the output of the gated recurrent unit is employed as the output memory. Notice that the name of the layer :code:`rnn_out` is the same as the name of :code:`out_mem`. This means the output of the layer :code:`rnn_out` (:math:`x_{t+1}`) is utilized as the **output** of :code:`out_mem` memory.
@@ -113,43 +90,52 @@ We also project the encoder vector to :code:`decoder_size` dimensional space, ge
 .. code-block:: python
 
     # Define the data layer of the source sentence.
-    src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
+    src_word_id = paddle.layer.data(
+        name='source_language_word',
+        type=paddle.data_type.integer_value_sequence(source_dict_dim))
     # Calculate the word embedding of each word.
-    src_embedding = embedding_layer(
+    src_embedding = paddle.layer.embedding(
         input=src_word_id,
         size=word_vector_dim,
-        param_attr=ParamAttr(name='_source_language_embedding'))
+        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
     # Apply forward recurrent neural network.
-    src_forward = grumemory(input=src_embedding, size=encoder_size)
+    src_forward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size)
     # Apply backward recurrent neural network. reverse=True means backward recurrent neural network.
-    src_backward = grumemory(input=src_embedding,
-                              size=encoder_size,
-                              reverse=True)
+    src_backward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size, reverse=True)
     # Mix the forward and backward parts of the recurrent neural network together.
-    encoded_vector = concat_layer(input=[src_forward, src_backward])
+    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
 
     # Project encoding vector to decoder_size.
-    encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)],
-                               size = decoder_size)
+    encoded_proj = paddle.layer.mixed(
+        size=decoder_size,
+        input=paddle.layer.full_matrix_projection(encoded_vector))
 
     # Compute the first instance of the backward RNN.
-    backward_first = first_seq(input=src_backward)
+    backward_first = paddle.layer.first_seq(input=src_backward)
 
     # Project the first instance of backward RNN to decoder size.
-    decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation())
+    decoder_boot = paddle.layer.mixed(
+       size=decoder_size,
+       act=paddle.activation.Tanh(),
+       input=paddle.layer.full_matrix_projection(backward_first))
 
 
 The decoder uses :code:`recurrent_group` to define the recurrent neural network. The step and output functions are defined in :code:`gru_decoder_with_attention`:
 
 .. code-block:: python
 
-    group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
-                  StaticInput(input=encoded_proj,is_seq=True)]
-    trg_embedding = embedding_layer(
-        input=data_layer(name='target_language_word',
-                         size=target_dict_dim),
-        size=word_vector_dim,
-        param_attr=ParamAttr(name='_target_language_embedding'))
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
+    trg_embedding = paddle.layer.embedding(
+            input=paddle.layer.data(
+                name='target_language_word',
+                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+            size=word_vector_dim,
+            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+    # Add the target word embedding to the decoder inputs.
     group_inputs.append(trg_embedding)
 
     # For decoder equipped with attention mechanism, in training,
@@ -158,9 +144,10 @@ The decoder uses :code:`recurrent_group` to define the recurrent neural network.
     # StaticInput means the same value is utilized at different time steps.
     # Otherwise, it is a sequence input. Inputs at different time steps are different.
     # All sequence inputs should have the same length.
-    decoder = recurrent_group(name=decoder_group_name,
-                              step=gru_decoder_with_attention,
-                              input=group_inputs)
+    decoder = paddle.layer.recurrent_group(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs)
 
 
 The implementation of the step function is listed as below. First, it defines the **memory** of the decoder network. Then it defines attention, gated recurrent unit step function, and the output function:
@@ -171,27 +158,32 @@ The implementation of the step function is listed as below. First, it defines th
         # Defines the memory of the decoder.
         # The output of this memory is defined in gru_step.
         # Notice that the name of gru_step should be the same as the name of this memory.
-        decoder_mem = memory(name='gru_decoder',
-                             size=decoder_size,
-                             boot_layer=decoder_boot)
+        decoder_mem = paddle.layer.memory(
+            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
         # Compute attention weighted encoder vector.
-        context = simple_attention(encoded_sequence=enc_vec,
-                                   encoded_proj=enc_proj,
-                                   decoder_state=decoder_mem)
+        context = paddle.networks.simple_attention(
+            encoded_sequence=enc_vec,
+            encoded_proj=enc_proj,
+            decoder_state=decoder_mem)
         # Mix the current word embedding and the attention weighted encoder vector.
-        decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context),
-                                               full_matrix_projection(current_word)],
-                                     size = decoder_size * 3)
+        decoder_inputs = paddle.layer.mixed(
+            size=decoder_size * 3,
+            input=[
+                paddle.layer.full_matrix_projection(input=context),
+                paddle.layer.full_matrix_projection(input=current_word)
+            ])
         # Define Gated recurrent unit recurrent neural network step function.
-        gru_step = gru_step_layer(name='gru_decoder',
-                                  input=decoder_inputs,
-                                  output_mem=decoder_mem,
-                                  size=decoder_size)
+        gru_step = paddle.layer.gru_step(
+            name='gru_decoder',
+            input=decoder_inputs,
+            output_mem=decoder_mem,
+            size=decoder_size)
         # Defines the output function.
-        out = mixed_layer(input=[full_matrix_projection(input=gru_step)],
-                          size=target_dict_dim,
-                          bias_attr=True,
-                          act=SoftmaxActivation())
+        out = paddle.layer.mixed(
+            size=target_dict_dim,
+            bias_attr=True,
+            act=paddle.activation.Softmax(),
+            input=paddle.layer.full_matrix_projection(input=gru_step))
         return out
 
 
@@ -207,45 +199,37 @@ After training the model, we can use it to generate sequences. A common practice
   - :code:`eos_id`: the end token. Every sentence ends with the end token.
   - :code:`beam_size`: the beam size used in beam search.
   - :code:`max_length`: the maximum length of the generated sentences.
-
-* use :code:`seqtext_printer_evaluator` to print text according to index matrix and dictionary. This function needs to set:
-
-  - :code:`id_input`: the integer ID of the data, used to identify the corresponding output in the generated files.
-  - :code:`dict_file`: the dictionary file for converting word id to word.
-  - :code:`result_file`: the path of the generation result file.
     
 The code is listed below:
 
 .. code-block:: python
 
-    group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
-                  StaticInput(input=encoded_proj,is_seq=True)]
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
     # In generation, decoder predicts a next target word based on
     # the encoded source sequence and the last generated target word.
     # The encoded source sequence (encoder's output) must be specified by
     # StaticInput which is a read-only memory.
     # Here, GeneratedInputs automatically fetchs the last generated word,
     # which is initialized by a start mark, such as <s>.
-    trg_embedding = GeneratedInput(
-        size=target_dict_dim,
-        embedding_name='_target_language_embedding',
-        embedding_size=word_vector_dim)
+    trg_embedding = paddle.layer.GeneratedInput(
+            size=target_dict_dim,
+            embedding_name='_target_language_embedding',
+            embedding_size=word_vector_dim)
     group_inputs.append(trg_embedding)
-    beam_gen = beam_search(name=decoder_group_name,
-                           step=gru_decoder_with_attention,
-                           input=group_inputs,
-                           bos_id=0, # Beginnning token.
-                           eos_id=1, # End of sentence token.
-                           beam_size=beam_size,
-                           max_length=max_length)
+    beam_gen = paddle.layer.beam_search(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs,
+            bos_id=0, # Beginning token.
+            eos_id=1, # End of sentence token.
+            beam_size=beam_size,
+            max_length=max_length)
 
-    seqtext_printer_evaluator(input=beam_gen,
-                              id_input=data_layer(name="sent_id", size=1),
-                              dict_file=trg_dict_path,
-                              result_file=gen_trans_file)
-    outputs(beam_gen)
+    return beam_gen
 
 
-Notice that this generation technique is only useful for decoder like generation process. If you are working on sequence tagging tasks, please refer to :ref:`semantic_role_labeling` for more details.
+Notice that this generation technique is only useful for decoder-like generation processes. If you are working on sequence tagging tasks, please refer to `book/06.understand_sentiment <https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment>`_ for more details.
 
-The full configuration file is located at :code:`demo/seqToseq/seqToseq_net.py`.
+The full configuration file is located at `book/08.machine_translation/train.py <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py>`_ .
diff --git a/go/cmake/CMakeDetermineGoCompiler.cmake b/go/cmake/CMakeDetermineGoCompiler.cmake
deleted file mode 100644
index a9bb6906c7440782bd648bb7505a548248a11bb0..0000000000000000000000000000000000000000
--- a/go/cmake/CMakeDetermineGoCompiler.cmake
+++ /dev/null
@@ -1,44 +0,0 @@
-if(NOT CMAKE_Go_COMPILER)
-  if(NOT $ENV{GO_COMPILER} STREQUAL "")
-    get_filename_component(CMAKE_Go_COMPILER_INIT $ENV{GO_COMPILER} PROGRAM PROGRAM_ARGS CMAKE_Go_FLAGS_ENV_INIT)
-
-    if(CMAKE_Go_FLAGS_ENV_INIT)
-      set(CMAKE_Go_COMPILER_ARG1 "${CMAKE_Go_FLAGS_ENV_INIT}" CACHE STRING "First argument to Go compiler")
-    endif()
-
-    if(NOT EXISTS ${CMAKE_Go_COMPILER_INIT})
-      message(SEND_ERROR "Could not find compiler set in environment variable GO_COMPILER:\n$ENV{GO_COMPILER}.")
-    endif()
-
-  endif()
-
-  set(Go_BIN_PATH
-    $ENV{GOPATH}
-    $ENV{GOROOT}
-    $ENV{GOROOT}/../bin
-    $ENV{GO_COMPILER}
-    /usr/bin
-    /usr/local/bin
-    )
-
-  if(CMAKE_Go_COMPILER_INIT)
-    set(CMAKE_Go_COMPILER ${CMAKE_Go_COMPILER_INIT} CACHE PATH "Go Compiler")
-  else()
-    find_program(CMAKE_Go_COMPILER
-      NAMES go
-      PATHS ${Go_BIN_PATH}
-    )
-    EXEC_PROGRAM(${CMAKE_Go_COMPILER} ARGS version OUTPUT_VARIABLE GOLANG_VERSION)
-    STRING(REGEX MATCH "go[0-9]+.[0-9]+.[0-9]+[ /A-Za-z0-9]*" VERSION "${GOLANG_VERSION}")
-    message("-- The Golang compiler identification is ${VERSION}")
-    message("-- Check for working Golang compiler: ${CMAKE_Go_COMPILER}")
-  endif()
-
-endif()
-
-mark_as_advanced(CMAKE_Go_COMPILER)
-
-configure_file(${CMAKE_MODULE_PATH}/CMakeGoCompiler.cmake.in
-  ${CMAKE_PLATFORM_INFO_DIR}/CMakeGoCompiler.cmake @ONLY)
-
-set(CMAKE_Go_COMPILER_ENV_VAR "GO_COMPILER")
diff --git a/go/cmake/CMakeGoCompiler.cmake.in b/go/cmake/CMakeGoCompiler.cmake.in
deleted file mode 100644
index a71f08e064656fbaad8cfa77aea6f216515712ef..0000000000000000000000000000000000000000
--- a/go/cmake/CMakeGoCompiler.cmake.in
+++ /dev/null
@@ -1,8 +0,0 @@
-set(CMAKE_Go_COMPILER "@CMAKE_Go_COMPILER@")
-set(CMAKE_Go_COMPILER_LOADED 1)
-
-set(CMAKE_Go_SOURCE_FILE_EXTENSIONS go)
-set(CMAKE_Go_LINKER_PREFERENCE 40)
-set(CMAKE_Go_OUTPUT_EXTENSION .o)
-set(CMAKE_Go_OUTPUT_EXTENSION_REPLACE 1)
-set(CMAKE_Go_COMPILER_ENV_VAR "GO_COMPILER")
diff --git a/go/cmake/CMakeGoInformation.cmake b/go/cmake/CMakeGoInformation.cmake
deleted file mode 100644
index ba51ac93fcd429478f324b66bd5129d94ea2a8f4..0000000000000000000000000000000000000000
--- a/go/cmake/CMakeGoInformation.cmake
+++ /dev/null
@@ -1,7 +0,0 @@
-if(NOT CMAKE_Go_COMPILE_OBJECT)
-  set(CMAKE_Go_COMPILE_OBJECT "go tool compile -l -N -o <OBJECT> <SOURCE> ")
-endif()
-
-if(NOT CMAKE_Go_LINK_EXECUTABLE)
-  set(CMAKE_Go_LINK_EXECUTABLE "go tool link -o <TARGET> <OBJECTS>  ")
-endif()
diff --git a/go/cmake/CMakeTestGoCompiler.cmake b/go/cmake/CMakeTestGoCompiler.cmake
deleted file mode 100644
index b9891b015baced05b51e34dba562fd98a84fe14c..0000000000000000000000000000000000000000
--- a/go/cmake/CMakeTestGoCompiler.cmake
+++ /dev/null
@@ -1 +0,0 @@
-set(CMAKE_Go_COMPILER_WORKS 1 CACHE INTERNAL "")
diff --git a/go/cmake/flags.cmake b/go/cmake/flags.cmake
deleted file mode 100644
index a167c432a920e9ee93878603f3b946e8593412f6..0000000000000000000000000000000000000000
--- a/go/cmake/flags.cmake
+++ /dev/null
@@ -1,45 +0,0 @@
-# Setting Paddle Compile Flags
-include(CheckCXXCompilerFlag)
-include(CheckCCompilerFlag)
-include(CheckCXXSymbolExists)
-include(CheckTypeSize)
-
-function(CheckCompilerCXX11Flag)
-    if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-        if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
-            message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
-        endif()
-    elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-        # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
-        # Apple Clang is a different compiler than upstream Clang which havs different version numbers.
-        # https://gist.github.com/yamaya/2924292
-        if(APPLE)  # cmake < 3.0 compiler id "Clang" on Mac OS X
-            if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.1)
-                message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.")
-            endif()
-        else()
-            if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3)
-                message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
-            endif()
-        endif()
-    endif()
-endfunction()
-
-CheckCompilerCXX11Flag()
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-
-# Common gpu architectures: Kepler, Maxwell
-foreach(capability 30 35 50)
-      list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
-endforeach()
-
-if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0")
-      list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52")
-endif()
-
-# Modern gpu architectures: Pascal
-if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
-      list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
-endif()
-
-set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
diff --git a/go/cmake/golang.cmake b/go/cmake/golang.cmake
deleted file mode 100644
index d38d06de2348821b21109f7dc708314da81111c5..0000000000000000000000000000000000000000
--- a/go/cmake/golang.cmake
+++ /dev/null
@@ -1,50 +0,0 @@
-set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
-file(MAKE_DIRECTORY ${GOPATH})
-set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle")
-file(MAKE_DIRECTORY ${PADDLE_IN_GOPATH})
-
-function(GO_LIBRARY NAME BUILD_TYPE)
-  if(BUILD_TYPE STREQUAL "STATIC")
-    set(BUILD_MODE -buildmode=c-archive)
-    set(LIB_NAME "lib${NAME}.a")
-  else()
-    set(BUILD_MODE -buildmode=c-shared)
-    if(APPLE)
-      set(LIB_NAME "lib${NAME}.dylib")
-    else()
-      set(LIB_NAME "lib${NAME}.so")
-    endif()
-  endif()
-
-  file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
-  file(RELATIVE_PATH rel ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
-
-  # find Paddle directory.
-  get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
-  get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)
-  get_filename_component(PADDLE_DIR ${PARENT_DIR} DIRECTORY)
-
-  # automatically get all dependencies specified in the source code
-  # for given target.
-  add_custom_target(goGet env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get -d ${rel}/...)
-
-  # make a symlink that references Paddle inside $GOPATH, so go get
-  # will use the local changes in Paddle rather than checkout Paddle
-  # in github.
-  add_custom_target(copyPaddle
-    COMMAND ln -sf ${PADDLE_DIR} ${PADDLE_IN_GOPATH})
-  add_dependencies(goGet copyPaddle)
-
-  add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
-    -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
-    ${CMAKE_GO_FLAGS} ${GO_SOURCE}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-
-  add_custom_target(${NAME} ALL DEPENDS ${OUTPUT_DIR}/.timestamp ${ARGN})
-  add_dependencies(${NAME} goGet)
-
-  if(NOT BUILD_TYPE STREQUAL "STATIC")
-    install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME} DESTINATION bin)
-  endif()
-endfunction(GO_LIBRARY)
diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go
index d1f3d7d76c438670faf6677b01e790c5ebe1f2cb..54fa254863156455f66fa87de9077042a45f9735 100644
--- a/go/cmd/master/master.go
+++ b/go/cmd/master/master.go
@@ -5,89 +5,65 @@ import (
 	"net"
 	"net/http"
 	"net/rpc"
-	"os"
-	"path/filepath"
 	"strconv"
 	"strings"
 	"time"
 
 	"github.com/namsral/flag"
+	log "github.com/sirupsen/logrus"
 
 	"github.com/PaddlePaddle/Paddle/go/master"
-	"github.com/PaddlePaddle/recordio"
+	"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
 )
 
 func main() {
 	port := flag.Int("port", 8080, "port of the master server.")
-	dataset := flag.String("training_dataset", "", "dataset: comma separated path to RecordIO paths, supports golb patterns.")
-	faultTolerance := flag.Bool("fault_tolerance", false, "enable fault tolerance (requires etcd).")
+	ttlSec := flag.Int("ttl", 60, "etcd lease TTL in seconds.")
+	endpoints := flag.String("endpoints", "http://127.0.0.1:2379", "comma separated etcd endpoints. If empty, fault tolerance will not be enabled.")
 	taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timout duration.")
 	taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timtout count for each task before it being declared failed task.")
 	chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.")
 	flag.Parse()
 
-	if *dataset == "" {
-		panic("no dataset specified.")
+	if *endpoints == "" {
+		log.Warningln("-endpoints not set, fault tolerance not be enabled.")
 	}
 
-	if *faultTolerance {
-		panic("fault tolernance not implemented.")
-	}
-
-	var chunks []master.Chunk
-	var paths []string
-	ss := strings.Split(*dataset, ",")
-	fmt.Println(ss)
-	for _, s := range ss {
-		match, err := filepath.Glob(s)
+	var store master.Store
+	if *endpoints != "" {
+		eps := strings.Split(*endpoints, ",")
+		ip, err := networkhelper.GetExternalIP()
 		if err != nil {
-			panic(err)
+			log.Fatal(err)
 		}
-		paths = append(paths, match...)
-	}
 
-	if len(paths) == 0 {
-		panic("no valid datset specified.")
-	}
-
-	idx := 0
-	for _, path := range paths {
-		f, err := os.Open(path)
+		addr := fmt.Sprintf("%s:%d", ip, *port)
+		store, err = master.NewEtcdClient(eps, addr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, *ttlSec)
 		if err != nil {
-			panic(err)
+			log.Fatal(err)
 		}
+	} else {
+		store = &master.InMemStore{}
+	}
 
-		index, err := recordio.LoadIndex(f)
-		if err != nil {
-			panic(err)
-		}
-		f.Close()
-
-		count := index.NumChunks()
-		for i := 0; i < count; i++ {
-			chunk := master.Chunk{
-				Idx:   idx,
-				Path:  path,
-				Index: *index.ChunkIndex(i),
-			}
-			chunks = append(chunks, chunk)
-		}
+	s, err := master.NewService(store, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
+	if err != nil {
+		log.Fatal(err)
 	}
 
-	s := master.NewService(chunks, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
-	err := rpc.Register(s)
+	err = rpc.Register(s)
 	if err != nil {
-		panic(err)
+		log.Fatal(err)
 	}
 
 	rpc.HandleHTTP()
 	l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
 	if err != nil {
-		panic(err)
+		log.Fatal(err)
 	}
 
 	err = http.Serve(l, nil)
 	if err != nil {
-		panic(err)
+		log.Fatal(err)
 	}
 }
diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go
index f0be251c2471cc9ddc069f040417b5181a78c058..6c85b1804bb9c5f3a8bc46bb3f54cc62c56cca70 100644
--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
@@ -5,18 +5,36 @@ import (
 	"net/http"
 	"net/rpc"
 	"strconv"
+	"time"
 
 	"github.com/namsral/flag"
 
 	"github.com/PaddlePaddle/Paddle/go/pserver"
+	log "github.com/sirupsen/logrus"
 )
 
 func main() {
 	port := flag.Int("port", 0, "port of the pserver")
+	etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379",
+		"comma separated endpoint string for pserver to connect to etcd")
+	etcdTimeout := flag.Int("etcd-timeout", 5, "timeout for etcd calls")
+	numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job")
+	logLevel := flag.String("log-level", "info",
+		"log level, possible values: debug, info, warning, error, fatal, panic")
 	flag.Parse()
 
-	s := pserver.NewService()
-	err := rpc.Register(s)
+	level, err := log.ParseLevel(*logLevel)
+	if err != nil {
+		panic(err)
+	}
+	log.SetLevel(level)
+
+	timeout := time.Second * time.Duration((*etcdTimeout))
+	s, err := pserver.NewService(*etcdEndpoint, *numPservers, timeout)
+	if err != nil {
+		panic(err)
+	}
+	err = rpc.Register(s)
 	if err != nil {
 		panic(err)
 	}
@@ -27,7 +45,9 @@ func main() {
 		panic(err)
 	}
 
+	log.Infof("start pserver at port %d", *port)
 	err = http.Serve(l, nil)
+
 	if err != nil {
 		panic(err)
 	}
diff --git a/go/pserver/internal/connection/conn.go b/go/connection/conn.go
similarity index 81%
rename from go/pserver/internal/connection/conn.go
rename to go/connection/conn.go
index 1c04f117254054741b7d45fb16462b5ce84a2aea..977e8cc123707dbcf055bb77399adbc232c575a0 100644
--- a/go/pserver/internal/connection/conn.go
+++ b/go/connection/conn.go
@@ -4,6 +4,8 @@ import (
 	"errors"
 	"net/rpc"
 	"sync"
+
+	log "github.com/sirupsen/logrus"
 )
 
 // TODO(helin): add TCP re-connect logic
@@ -21,6 +23,18 @@ func New() *Conn {
 	return c
 }
 
+// Close closes the connection.
+func (c *Conn) Close() error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	if c.client == nil {
+		return nil
+	}
+
+	return c.client.Close()
+}
+
 // Connect connects the connection to a address.
 func (c *Conn) Connect(addr string) error {
 	c.mu.Lock()
@@ -50,12 +64,20 @@ func (c *Conn) Connect(addr string) error {
 			c.waitConn = nil
 		}
 	} else {
+		err := client.Close()
+		if err != nil {
+			log.Errorln(err)
+		}
+
 		return errors.New("client already set from a concurrent goroutine")
 	}
 
 	return nil
 }
 
+// TODO(helin): refactor Call to be able to perform given retry
+// policy.
+
 // Call make a RPC call.
 //
 // Call will be blocked until the connection to remote RPC service
diff --git a/go/master/c/CMakeLists.txt b/go/master/c/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..acce698051ec7217d60a40b3d9cdc98fb1499653
--- /dev/null
+++ b/go/master/c/CMakeLists.txt
@@ -0,0 +1,21 @@
+cmake_minimum_required(VERSION 3.0)
+
+get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
+get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PARENT_DIR}/cmake")
+
+project(cxx_go C Go)
+
+include(golang)
+include(flags)
+
+set(MASTER_LIB_NAME "paddle_master")
+go_library(${MASTER_LIB_NAME} SHARED)
+
+if(PROJ_ROOT)
+  add_custom_command(OUTPUT ${PROJ_ROOT}/python/paddle/v2/master/lib${MASTER_LIB_NAME}.so
+    COMMAND rm ${CMAKE_CURRENT_BINARY_DIR}/lib${MASTER_LIB_NAME}.h
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/lib${MASTER_LIB_NAME}.so ${PROJ_ROOT}/python/paddle/v2/master/
+    DEPENDS ${MASTER_LIB_NAME})
+  add_custom_target(paddle_master_shared ALL DEPENDS ${PROJ_ROOT}/python/paddle/v2/master/lib${MASTER_LIB_NAME}.so)
+endif(PROJ_ROOT)
diff --git a/go/master/c/client.go b/go/master/c/client.go
new file mode 100644
index 0000000000000000000000000000000000000000..b186474dc33138aeb02a2ffe34418b379b7a2db0
--- /dev/null
+++ b/go/master/c/client.go
@@ -0,0 +1,110 @@
+package main
+
+/*
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#define PADDLE_MASTER_OK    0
+#define PADDLE_MASTER_ERROR -1
+
+typedef int paddle_master_client;
+*/
+import "C"
+
+import (
+	"sync"
+	"unsafe"
+
+	"github.com/PaddlePaddle/Paddle/go/master"
+	log "github.com/sirupsen/logrus"
+)
+
+var nullPtr = unsafe.Pointer(uintptr(0))
+var mu sync.Mutex
+var handleMap = make(map[C.paddle_master_client]*master.Client)
+var curHandle C.paddle_master_client
+
+func add(c *master.Client) C.paddle_master_client {
+	mu.Lock()
+	defer mu.Unlock()
+	client := curHandle
+	curHandle++
+	handleMap[client] = c
+	return client
+}
+
+func get(client C.paddle_master_client) *master.Client {
+	mu.Lock()
+	defer mu.Unlock()
+	return handleMap[client]
+}
+
+func remove(client C.paddle_master_client) *master.Client {
+	mu.Lock()
+	defer mu.Unlock()
+	h := handleMap[client]
+	delete(handleMap, client)
+	return h
+}
+
+type addresser string
+
+func (a addresser) Address() string {
+	return string(a)
+}
+
+//export paddle_new_master_client
+func paddle_new_master_client(addr *C.char, bufSize int) C.paddle_master_client {
+	a := C.GoString(addr)
+	c := master.NewClient(addresser(a), bufSize)
+	return add(c)
+}
+
+//export paddle_release_master_client
+func paddle_release_master_client(client C.paddle_master_client) {
+	remove(client)
+}
+
+//export paddle_set_dataset
+func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int) C.int {
+	c := get(client)
+	var paths []string
+	for i := 0; i < int(size); i++ {
+		ptr := (**C.char)(unsafe.Pointer(uintptr(unsafe.Pointer(path)) + uintptr(i)*unsafe.Sizeof(*path)))
+		str := C.GoString(*ptr)
+		paths = append(paths, str)
+	}
+	err := c.SetDataset(paths)
+	if err != nil {
+		log.Errorln(err)
+		return C.PADDLE_MASTER_ERROR
+	}
+
+	return C.PADDLE_MASTER_OK
+}
+
+//export paddle_next_record
+func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
+	c := get(client)
+	r := c.NextRecord()
+	if len(r) == 0 {
+		*record = (*C.uchar)(nullPtr)
+		return 0
+	}
+
+	size := C.size_t(len(r))
+	*record = (*C.uchar)(C.malloc(size))
+	C.memcpy(unsafe.Pointer(*record), unsafe.Pointer(&r[0]), size)
+	return C.int(size)
+}
+
+//export mem_free
+func mem_free(p unsafe.Pointer) {
+	// Releases memory through C.free. "free" would be a better
+	// name, but using it makes any call into this library from
+	// Python ctypes hang.
+	C.free(p)
+}
+
+func main() {}
diff --git a/go/master/client.go b/go/master/client.go
new file mode 100644
index 0000000000000000000000000000000000000000..8451820c1963dd5a4eff0c3ab7763eb6a8e05ba4
--- /dev/null
+++ b/go/master/client.go
@@ -0,0 +1,137 @@
+package master
+
+import (
+	"os"
+	"time"
+
+	"github.com/PaddlePaddle/Paddle/go/connection"
+	"github.com/PaddlePaddle/recordio"
+	log "github.com/sirupsen/logrus"
+)
+
+// Addresser provide the address of the master server.
+type Addresser interface {
+	Address() string
+}
+
+// Client is the client of the master server.
+type Client struct {
+	conn *connection.Conn
+	ch   chan []byte
+}
+
+// NewClient creates a new Client.
+//
+// bufSize is the record buffer size. NextRecord will read from this
+// buffer.
+func NewClient(addr Addresser, bufSize int) *Client {
+	c := &Client{}
+	c.conn = connection.New()
+	c.ch = make(chan []byte, bufSize)
+	go c.monitorMaster(addr)
+	go c.getRecords()
+	return c
+}
+
+func (c *Client) getRecords() {
+	for {
+		t, err := c.getTask()
+		if err != nil {
+			// TODO(helin): back off before retrying; for now
+			// the next getTask call is issued immediately.
+			log.Errorln(err)
+			continue
+		}
+
+		for _, chunk := range t.Chunks {
+			f, err := os.Open(chunk.Path)
+			if err != nil {
+				log.Errorln(err)
+				continue
+			}
+
+			s := recordio.NewRangeScanner(f, &chunk.Index, -1, -1)
+			for s.Scan() {
+				c.ch <- s.Record()
+			}
+
+			if err := s.Err(); err != nil {
+				log.Errorln(err, chunk.Path)
+			}
+
+			err = f.Close()
+			if err != nil {
+				log.Errorln(err)
+			}
+		}
+
+		// We treat a task as finished whenever the last data
+		// instance of the task is read. This is not exactly
+		// correct, but a reasonable approximation.
+		c.taskFinished(t.ID)
+	}
+}
+
+func (c *Client) monitorMaster(addr Addresser) {
+	lastMaster := ""
+	monitor := func() {
+		// get the latest address of the master server,
+		// connect to the new address once address changed.
+		curMaster := addr.Address()
+		if curMaster != lastMaster {
+			if curMaster == "" {
+				err := c.conn.Close()
+				if err != nil {
+					log.Errorln(err)
+				}
+			} else {
+				err := c.conn.Connect(curMaster)
+				if err != nil {
+					log.Errorln(err)
+
+					// connect to addr failed, set
+					// to last known addr in order
+					// to retry next time.
+					curMaster = lastMaster
+				}
+
+			}
+		}
+
+		lastMaster = curMaster
+	}
+
+	monitor()
+	ticker := time.NewTicker(10 * time.Second)
+	for range ticker.C {
+		monitor()
+	}
+}
+
+// SetDataset set dataset for the master server to dispatch.
+//
+// SetDataset can be call multiple times from different nodes. But
+// only the first call will be honored.
+func (c *Client) SetDataset(globPaths []string) error {
+	return c.conn.Call("Service.SetDataset", globPaths, nil)
+}
+
+// getTask gets a new task from the master server.
+func (c *Client) getTask() (Task, error) {
+	var t Task
+	err := c.conn.Call("Service.GetTask", 0, &t)
+	return t, err
+}
+
+// taskFinished tells the master server a task is finished.
+func (c *Client) taskFinished(taskID int) error {
+	return c.conn.Call("Service.TaskFinished", taskID, nil)
+}
+
+// NextRecord returns next record in the dataset.
+//
+// NextRecord will block until the next record is available. It is
+// thread-safe.
+func (c *Client) NextRecord() []byte {
+	return <-c.ch
+}
diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go
new file mode 100644
index 0000000000000000000000000000000000000000..251225780ae3077f90655b4e874d03b4f3794525
--- /dev/null
+++ b/go/master/client_internal_test.go
@@ -0,0 +1,125 @@
+package master
+
+import (
+	"fmt"
+	"net"
+	"net/http"
+	"net/rpc"
+	"os"
+	"strconv"
+	"strings"
+	"testing"
+	"time"
+
+	log "github.com/sirupsen/logrus"
+
+	"github.com/PaddlePaddle/Paddle/go/connection"
+	"github.com/PaddlePaddle/recordio"
+)
+
+const (
+	totalTask    = 20
+	chunkPerTask = 10
+)
+
+func init() {
+	log.SetLevel(log.ErrorLevel)
+}
+
+type TestAddresser string
+
+func (a TestAddresser) Address() string {
+	return string(a)
+}
+
+func TestGetFinishTask(t *testing.T) {
+	const path = "/tmp/master_client_test_0"
+
+	l, err := net.Listen("tcp", ":0")
+	if err != nil {
+		panic(err)
+	}
+
+	ss := strings.Split(l.Addr().String(), ":")
+	p, err := strconv.Atoi(ss[len(ss)-1])
+	if err != nil {
+		panic(err)
+	}
+
+	go func(l net.Listener) {
+		s, err := NewService(&InMemStore{}, chunkPerTask, time.Second, 1)
+		if err != nil {
+			panic(err)
+		}
+
+		server := rpc.NewServer()
+		err = server.Register(s)
+		if err != nil {
+			panic(err)
+		}
+
+		mux := http.NewServeMux()
+		mux.Handle(rpc.DefaultRPCPath, server)
+		err = http.Serve(l, mux)
+		if err != nil {
+			panic(err)
+		}
+	}(l)
+
+	f, err := os.Create(path)
+	if err != nil {
+		panic(err)
+	}
+
+	for i := 0; i < totalTask*chunkPerTask; i++ {
+		w := recordio.NewWriter(f, -1, -1)
+		w.Write(nil)
+		// call Close to force RecordIO writing a chunk.
+		w.Close()
+	}
+	f.Close()
+
+	// Manually intialize client to avoid calling c.getRecords()
+	c := &Client{}
+	c.conn = connection.New()
+	go c.monitorMaster(TestAddresser(fmt.Sprintf(":%d", p)))
+	c.SetDataset([]string{path})
+
+	checkOnePass := func(i int) {
+		var tasks []Task
+		for idx := 0; idx < totalTask; idx++ {
+			task, err := c.getTask()
+			if err != nil {
+				t.Fatalf("Error: %v, pass: %d\n", err, i)
+			}
+			tasks = append(tasks, task)
+		}
+
+		_, err = c.getTask()
+		if err == nil {
+			t.Fatalf("Should get error, pass: %d\n", i)
+		}
+
+		err = c.taskFinished(tasks[0].ID)
+		if err != nil {
+			t.Fatalf("Error: %v, pass: %d\n", err, i)
+		}
+		tasks = tasks[1:]
+		task, err := c.getTask()
+		if err != nil {
+			t.Fatal(err)
+		}
+		tasks = append(tasks, task)
+
+		for _, task := range tasks {
+			err = c.taskFinished(task.ID)
+			if err != nil {
+				t.Fatalf("Error: %v, pass: %d\n", err, i)
+			}
+		}
+	}
+
+	for i := 0; i < 10; i++ {
+		checkOnePass(i)
+	}
+}
diff --git a/go/master/client_test.go b/go/master/client_test.go
new file mode 100644
index 0000000000000000000000000000000000000000..85a86761c2e5897e3e89cbebfd32f7666c4a9f7f
--- /dev/null
+++ b/go/master/client_test.go
@@ -0,0 +1,83 @@
+package master_test
+
+import (
+	"fmt"
+	"net"
+	"net/http"
+	"net/rpc"
+	"os"
+	"strconv"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/PaddlePaddle/Paddle/go/master"
+	"github.com/PaddlePaddle/recordio"
+)
+
+func TestNextRecord(t *testing.T) {
+	const (
+		path  = "/tmp/master_client_TestFull"
+		total = 50
+	)
+
+	l, err := net.Listen("tcp", ":0")
+	if err != nil {
+		panic(err)
+	}
+
+	ss := strings.Split(l.Addr().String(), ":")
+	p, err := strconv.Atoi(ss[len(ss)-1])
+	if err != nil {
+		panic(err)
+	}
+
+	go func(l net.Listener) {
+		s, err := master.NewService(&master.InMemStore{}, 10, time.Second, 1)
+		if err != nil {
+			panic(err)
+		}
+
+		server := rpc.NewServer()
+		err = server.Register(s)
+		if err != nil {
+			panic(err)
+		}
+
+		mux := http.NewServeMux()
+		mux.Handle(rpc.DefaultRPCPath, server)
+		err = http.Serve(l, mux)
+		if err != nil {
+			panic(err)
+		}
+	}(l)
+
+	f, err := os.Create(path)
+	if err != nil {
+		panic(err)
+	}
+
+	w := recordio.NewWriter(f, -1, -1)
+	for i := 0; i < total; i++ {
+		w.Write([]byte{byte(i)})
+	}
+	w.Close()
+	f.Close()
+
+	c := master.NewClient(master.TestAddresser(fmt.Sprintf(":%d", p)), 10)
+	c.SetDataset([]string{path})
+
+	for pass := 0; pass < 50; pass++ {
+		received := make(map[byte]bool)
+		for i := 0; i < total; i++ {
+			r := c.NextRecord()
+			if len(r) != 1 {
+				t.Fatal("Length should be 1.", r)
+			}
+			if received[r[0]] {
+				t.Fatal("Received duplicate.", received, r)
+			}
+			received[r[0]] = true
+		}
+	}
+}
diff --git a/go/master/etcd_client.go b/go/master/etcd_client.go
new file mode 100644
index 0000000000000000000000000000000000000000..b7293a759896f113d630d57d14b4b4ac8963f54a
--- /dev/null
+++ b/go/master/etcd_client.go
@@ -0,0 +1,144 @@
+package master
+
+import (
+	"context"
+	"time"
+
+	"github.com/coreos/etcd/clientv3"
+	"github.com/coreos/etcd/clientv3/concurrency"
+	log "github.com/sirupsen/logrus"
+)
+
+const (
+	// DefaultLockPath is the default etcd master lock path.
+	DefaultLockPath = "/master/lock"
+	// DefaultStatePath is the default etcd key for master state.
+	DefaultStatePath = "/master/state"
+	// DefaultAddrPath is the default etcd key for master address.
+	DefaultAddrPath = "/master/addr"
+)
+
+// EtcdClient is the etcd client that master uses for fault tolerance
+// and service registry.
+type EtcdClient struct {
+	lockPath  string
+	statePath string
+	client    *clientv3.Client
+	lock      *concurrency.Mutex
+}
+
+// NewEtcdClient connects to etcd, blocks until it holds the lock at lockPath, writes addr to addrPath, and returns the client.
+func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) {
+	log.Debugf("Connecting to etcd at %v", endpoints)
+	// TODO(helin): gracefully shutdown etcd store. Because the
+	// etcd store holds an etcd lock, even though the lock will
+	// expire when the lease times out, we need to implement
+	// graceful shutdown to release the lock.
+	cli, err := clientv3.New(clientv3.Config{
+		Endpoints:   endpoints,
+		DialTimeout: dialTimeout,
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	sess, err := concurrency.NewSession(cli, concurrency.WithTTL(ttlSec))
+	if err != nil {
+		return nil, err
+	}
+
+	lock := concurrency.NewMutex(sess, lockPath)
+	// It's fine for the lock to get stuck, in this case we have
+	// multiple master servers running (only configured to have
+	// one master running, but split-brain problem may cause
+	// multiple master servers running), and the cluster management
+	// software will kill one of them.
+	log.Debugf("Trying to acquire lock at %s.", lockPath)
+	err = lock.Lock(context.TODO())
+	if err != nil {
+		return nil, err
+	}
+	log.Debugf("Successfully acquired lock at %s.", lockPath)
+
+	put := clientv3.OpPut(addrPath, string(addr))
+	resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
+	if err != nil {
+		return nil, err
+	}
+	// Succeeded is false when the IsOwner guard failed, i.e. the put was not applied.
+	if !resp.Succeeded {
+		log.Fatal("No longer owns the master lock. Exiting.")
+	}
+
+	e := &EtcdClient{
+		lockPath:  lockPath,
+		statePath: statePath,
+		client:    cli,
+		lock:      lock,
+	}
+
+	return e, nil
+}
+
+// Save writes state to statePath if the lock is still owned; otherwise it re-locks (5s timeout, exiting the process on failure) and retries.
+func (e *EtcdClient) Save(state []byte) error {
+	ctx := context.TODO()
+	put := clientv3.OpPut(e.statePath, string(state))
+	resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit()
+	if err != nil {
+		return err
+	}
+
+	if !resp.Succeeded {
+		log.Errorln("No longer owns the lock, trying to lock again")
+		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		err := e.lock.Lock(ctx)
+		cancel()
+		if err != nil {
+			// We lost the master lock and can not acquire
+			// it back, it means some other master is
+			// already started. We don't want cluster
+			// management system to kill the master server
+			// who is holding the lock and running
+			// correctly. So the most feasible solution is
+			// to kill current master server. The current
+			// state is not saved, but the trainer's RPC
+			// call will fail, so the trainer will retry.
+			log.Fatalf("Could not acquire the lock at %s: %v. Exiting.", e.lockPath, err)
+		}
+		log.Infof("Successfully acquired lock at %s.", e.lockPath)
+		return e.Save(state)
+	}
+
+	return nil
+}
+
+// Load reads the state at statePath while owning the lock (re-locking and retrying if the lock was lost); it returns nil, nil when no state exists.
+func (e *EtcdClient) Load() ([]byte, error) {
+	ctx := context.TODO()
+	get := clientv3.OpGet(e.statePath)
+
+	resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(get).Commit()
+	if err != nil {
+		return nil, err
+	}
+
+	if !resp.Succeeded {
+		log.Errorln("No longer owns the lock, trying to lock and load again.")
+		err = e.lock.Lock(context.Background())
+		if err != nil {
+			return nil, err
+		}
+
+		return e.Load()
+	}
+
+	kvs := resp.Responses[0].GetResponseRange().Kvs
+	if len(kvs) == 0 {
+		// No state is stored under statePath.
+		return nil, nil
+	}
+
+	state := kvs[0].Value
+	return state, nil
+}
diff --git a/go/master/inmem_store.go b/go/master/inmem_store.go
new file mode 100644
index 0000000000000000000000000000000000000000..bcd549b20e46381783bad11caa08cb7f4ba40add
--- /dev/null
+++ b/go/master/inmem_store.go
@@ -0,0 +1,28 @@
+package master
+
+import "sync"
+
+// InMemStore is an in-memory implementation of the Store interface.
+//
+// The state is kept only in process memory, so it is lost if the program crashes.
+type InMemStore struct {
+	mu  sync.Mutex // guards buf
+	buf []byte     // slice passed to the latest Save; nil before the first Save
+}
+
+// Save stores state in memory; the slice is retained as-is, not copied.
+func (m *InMemStore) Save(state []byte) error {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	m.buf = state
+	return nil
+}
+
+// Load returns the slice given to the latest Save (nil if none), without copying.
+func (m *InMemStore) Load() ([]byte, error) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	return m.buf, nil
+}
diff --git a/go/master/service.go b/go/master/service.go
index ab17a62f3854c1e32d731037fcc9857260d03781..58e68e744859933aa74cac231356d4ff9dfb8d7b 100644
--- a/go/master/service.go
+++ b/go/master/service.go
@@ -1,39 +1,68 @@
 package master
 
 import (
+	"bytes"
+	"compress/gzip"
+	"encoding/gob"
 	"errors"
-	"log"
+	"os"
+	"path/filepath"
 	"sync"
 	"time"
 
+	log "github.com/sirupsen/logrus"
+
 	"github.com/PaddlePaddle/recordio"
 )
 
 const (
-	targetTaskCount = 300
+	dialTimeout = 5 * time.Second
 )
 
-// errors
-var (
-	ErrNoMoreTask          = errors.New("no more task for current pass")
-	ErrPendingTaskNotFound = errors.New("pending task not found")
-)
+// Store is the interface for save and load the master state.
+type Store interface {
+	Save([]byte) error
+	Load() ([]byte, error)
+}
+
+// Chunk is a chunk of data consisted of several data instances.
+type Chunk struct {
+	Path  string
+	Index recordio.Index // chunk index
+}
+
+// Task is the basic unit of data instances assigned to trainers.
+type Task struct {
+	ID     int
+	Chunks []Chunk
+}
+
+type taskEntry struct {
+	Epoch      int
+	NumTimeout int
+	Task       Task
+}
+
+type taskQueues struct {
+	Todo    []taskEntry
+	Pending map[int]taskEntry // map from task ID to task entry
+	Done    []taskEntry
+	Failed  []Task
+}
 
 // Service is the master server service.
 type Service struct {
-	timeoutDur time.Duration
-	timeoutMax int
+	chunksPerTask int
+	timeoutDur    time.Duration
+	timeoutMax    int
+	ready         chan struct{}
+	store         Store
 
 	mu         sync.Mutex
+	initDone   bool
 	taskQueues taskQueues
 }
 
-// Recover recovers service state from etcd.
-func Recover() (*Service, error) {
-	// TODO(helin): recover from snapshot state from etcd.
-	return nil, nil
-}
-
 func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 	id := 0
 	if chunksPerTask <= 0 {
@@ -55,7 +84,6 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 
 	if len(cur.Task.Chunks) > 0 {
 		cur.Task.ID = id
-		id++
 		result = append(result, cur)
 	}
 
@@ -63,55 +91,251 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 }
 
 // NewService creates a new service.
-func NewService(chunks []Chunk, chunksPerTask int, timeoutDur time.Duration, timeoutMax int) *Service {
+func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, timeoutMax int) (*Service, error) {
 	s := &Service{}
+	s.chunksPerTask = chunksPerTask
 	s.timeoutDur = timeoutDur
 	s.timeoutMax = timeoutMax
 	s.taskQueues = taskQueues{}
 	s.taskQueues.Pending = make(map[int]taskEntry)
-	s.taskQueues.Todo = partition(chunks, chunksPerTask)
-	return s
-}
+	s.ready = make(chan struct{})
+	s.store = store
+	recovered, err := s.recover()
+	if err != nil {
+		return nil, err
+	}
 
-// Chunk is a chunk of data consisted of several data instances.
-type Chunk struct {
-	Idx   int // index of the chunk within the file
-	Path  string
-	Index recordio.Index // block index
+	if recovered {
+		// Recovered. Now the state is already initialized,
+		// and the master is ready.
+		s.initDone = true
+		close(s.ready)
+		log.Info("Master recovered from saved state.")
+	}
+
+	return s, nil
 }
 
-// Task is the basic unit of data instances assigned to trainers.
-type Task struct {
-	ID     int
-	Chunks []Chunk
+// recover restores s.taskQueues from the snapshot held by s.store and reports whether one was found.
+func (s *Service) recover() (bool, error) {
+	state, err := s.store.Load()
+	if err != nil {
+		return false, err
+	}
+
+	if state == nil {
+		log.Infoln("No state exists, not recovered.")
+		return false, nil
+	}
+
+	log.Infof("Loaded snapshot of size: %d bytes.", len(state))
+	gr, err := gzip.NewReader(bytes.NewReader(state))
+	if err != nil {
+		return false, err
+	}
+
+	dec := gob.NewDecoder(gr)
+	var tqs taskQueues
+	err = dec.Decode(&tqs)
+	if err != nil {
+		return false, err
+	}
+
+	err = gr.Close()
+	if err != nil {
+		// Only the close failed; the state was decoded
+		// successfully, so just log the error.
+		log.Errorln(err)
+	}
+
+	s.taskQueues = tqs
+	return true, nil
 }
 
-type taskEntry struct {
-	Epoch      int
-	NumTimeout int
-	Task       Task
+// snapshot gob-encodes and gzips s.taskQueues and saves the result to s.store; it *must* be called with s.mu being held.
+func (s *Service) snapshot() error {
+	// TODO(helin): etcd request has a size limit, so the snapshot
+	// size is limited by the max request size. We should either
+	// divide the snapshot into smaller chunks and save under
+	// different keys, or configure the request size to be big
+	// enough:
+	// https://github.com/coreos/etcd/blob/2f84f3d8d8ed8f9537ab6ffa44a3a1c7eddfa9b1/embed/config.go#L44
+	var buf bytes.Buffer
+	gw := gzip.NewWriter(&buf)
+	enc := gob.NewEncoder(gw)
+	err := enc.Encode(s.taskQueues)
+	if err != nil {
+		return err
+	}
+	err = gw.Close()
+	if err != nil {
+		return err
+	}
+
+	state := buf.Bytes()
+	log.Infof("Saving snapshot of size: %d bytes.", len(state))
+	return s.store.Save(state)
 }
 
-type taskQueues struct {
-	Todo    []taskEntry
-	Pending map[int]taskEntry // map from task ID to task entry
-	Done    []taskEntry
-	Failed  []Task
+func readChunks(globPaths []string) ([]Chunk, error) {
+	var chunks []Chunk
+	var paths []string
+
+	for _, s := range globPaths {
+		match, err := filepath.Glob(s)
+		if err != nil {
+			return nil, err
+		}
+		paths = append(paths, match...)
+	}
+
+	if len(paths) == 0 {
+		return nil, errors.New("no valid dataset specified")
+	}
+
+	for _, path := range paths {
+		f, err := os.Open(path)
+		if err != nil {
+			return nil, err
+		}
+
+		index, err := recordio.LoadIndex(f)
+		if err != nil {
+			return nil, err
+		}
+		err = f.Close()
+		if err != nil {
+			return nil, err
+		}
+
+		count := index.NumChunks()
+		for i := 0; i < count; i++ {
+			chunk := Chunk{
+				Path:  path,
+				Index: *index.ChunkIndex(i),
+			}
+			chunks = append(chunks, chunk)
+		}
+	}
+
+	return chunks, nil
 }
 
-// *must* be called with s.mu being held.
-func (s *Service) snapshot() error {
-	// TODO(helin): snapshot state on etcd.
+// SetDataset partitions the chunks of the files matching globPaths
+// into tasks, snapshots the state, and marks the service ready.
+// SetDataset can be called multiple times, but only the first
+// successful call is honored; later calls return nil and do nothing.
+func (s *Service) SetDataset(globPaths []string, dummy *int) error {
+	if len(globPaths) == 0 {
+		return errors.New("no dataset specified")
+	}
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if s.initDone {
+		// Already initialized. All trainers will call
+		// SetDataset, but we only handle the first one. Treat
+		// other calls as successful but do nothing.
+		return nil
+	}
+
+	chunks, err := readChunks(globPaths)
+	if err != nil {
+		return err
+	}
+
+	s.taskQueues.Todo = partition(chunks, s.chunksPerTask)
+
+	err = s.snapshot()
+	if err != nil {
+		log.Errorln(err)
+		return err
+	}
+
+	close(s.ready) // unblocks GetTask and TaskFinished, which wait on s.ready
+	s.initDone = true
 	return nil
 }
 
+func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
+	return func() {
+		s.mu.Lock()
+		defer s.mu.Unlock()
+
+		t, ok := s.taskQueues.Pending[taskID]
+		if !ok {
+			return
+		}
+
+		if t.Epoch != epoch {
+			// new epoch, task launched after the
+			// schedule of this timeout check.
+			return
+		}
+
+		defer func() {
+			err := s.snapshot()
+			if err != nil {
+				log.Errorln(err)
+			}
+		}()
+
+		delete(s.taskQueues.Pending, t.Task.ID)
+
+		t.NumTimeout++
+		if t.NumTimeout > s.timeoutMax {
+			log.Warningf("Task %v timed out %d times, discard.", t.Task, t.NumTimeout)
+			s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task)
+			return
+		}
+
+		log.Warningf("Task %v timed out %d times, retry.", t.Task, t.NumTimeout)
+		s.taskQueues.Todo = append(s.taskQueues.Todo, t)
+	}
+}
+
+// logFields returns the current length of each task queue as log fields; must be called with s.mu held.
+func (s *Service) logFields() log.Fields {
+	return log.Fields{
+		"todoLen":    len(s.taskQueues.Todo),
+		"pendingLen": len(s.taskQueues.Pending),
+		"doneLen":    len(s.taskQueues.Done),
+		"failedLen":  len(s.taskQueues.Failed),
+	}
+}
+
 // GetTask gets a new task from the service.
 func (s *Service) GetTask(dummy int, task *Task) error {
+	select {
+	case <-s.ready:
+	}
+
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
 	if len(s.taskQueues.Todo) == 0 {
-		return ErrNoMoreTask
+		if len(s.taskQueues.Done) == 0 {
+			if len(s.taskQueues.Pending) == 0 {
+				err := errors.New("all task failed")
+				log.WithFields(s.logFields()).Warningln("All tasks failed.")
+				return err
+			}
+
+			// TODO(helin): client need to retry in this
+			// error case. Gotcha: RPC client can't
+			// compare returned error with predefined
+			// errors like io.EOF, because the error
+			// instance deserialized from RPC is a
+			// different instance than the error defined
+			// in package. So we need to figure out a way
+			// for client to check this error correctly.
+			err := errors.New("no more available task")
+			log.WithFields(s.logFields()).Warningln("No more available task.")
+			return err
+		}
+		s.taskQueues.Todo = s.taskQueues.Done
+		s.taskQueues.Done = nil
+		log.WithFields(s.logFields()).Infoln("No more todo task, but trainer is requesting task to do. Move all done task to todo.")
 	}
 
 	t := s.taskQueues.Todo[0]
@@ -123,56 +347,45 @@ func (s *Service) GetTask(dummy int, task *Task) error {
 		return err
 	}
 
-	time.AfterFunc(s.timeoutDur, func(taskID int, epoch int) func() {
-		return func() {
-			s.mu.Lock()
-			defer s.mu.Unlock()
-
-			t, ok := s.taskQueues.Pending[taskID]
-			if !ok {
-				return
-			}
+	*task = t.Task
+	log.WithFields(s.logFields()).Infof("Task #%d dispatched.", task.ID)
 
-			if t.Epoch != epoch {
-				// new epoch, task launched after the
-				// schedule of this timeout check.
-				return
-			}
-
-			defer func() {
-				err := s.snapshot()
-				if err != nil {
-					log.Println(err)
-				}
-			}()
-
-			delete(s.taskQueues.Pending, t.Task.ID)
-
-			t.NumTimeout++
-			if t.NumTimeout > s.timeoutMax {
-				s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task)
-				return
-			}
-
-			s.taskQueues.Todo = append(s.taskQueues.Todo, t)
-		}
-	}(t.Task.ID, t.Epoch))
+	time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.ID, t.Epoch))
 	return nil
 }
 
 // TaskFinished tell the service that a task is finished.
 func (s *Service) TaskFinished(taskID int, dummy *int) error {
+	// Wait until the service is initialized: s.ready is closed once
+	// the dataset is set or the state is recovered.
+	<-s.ready
+
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
 	t, ok := s.taskQueues.Pending[taskID]
 	if !ok {
-		return ErrPendingTaskNotFound
+		err := errors.New("pending task not found")
+		log.WithFields(s.logFields()).Warningf("Pending task #%d not found.", taskID)
+		return err
 	}
 
 	// task finished, reset timeout
 	t.NumTimeout = 0
 	s.taskQueues.Done = append(s.taskQueues.Done, t)
 	delete(s.taskQueues.Pending, taskID)
-	return s.snapshot()
+
+	log.WithFields(s.logFields()).Infof("Task #%d finished.", taskID)
+	// Nothing left to hand out or wait for: recycle Done into Todo.
+	if len(s.taskQueues.Pending) == 0 && len(s.taskQueues.Todo) == 0 {
+		log.WithFields(s.logFields()).Infoln("No more todo and pending task, start a new pass.")
+		s.taskQueues.Todo = append(s.taskQueues.Todo, s.taskQueues.Done...)
+		s.taskQueues.Done = nil
+	}
+	// Persist the new queue state; a failed snapshot is logged and returned.
+	err := s.snapshot()
+	if err != nil {
+		log.Errorln(err)
+	}
+	return err
 }
diff --git a/go/pserver/cclient/CMakeLists.txt b/go/pserver/cclient/CMakeLists.txt
index c017d7465611373309c6c60141fed864f5ccfb5d..d2c339d68866bd5c91403227e97af2c97bb30eeb 100644
--- a/go/pserver/cclient/CMakeLists.txt
+++ b/go/pserver/cclient/CMakeLists.txt
@@ -1,13 +1,3 @@
-cmake_minimum_required(VERSION 3.0)
+go_library(paddle_pserver_cclient STATIC)
 
-get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
-get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PARENT_DIR}/cmake")
-
-project(cxx_go C Go)
-
-include(golang)
-include(flags)
-
-go_library(client STATIC)
 add_subdirectory(test)
diff --git a/go/pserver/cclient/cclient.go b/go/pserver/cclient/cclient.go
index 0b4aa79806b72f4608230d2216d1741389913d95..bbaf43d9f1434a278568bc110a709718b9b8c222 100644
--- a/go/pserver/cclient/cclient.go
+++ b/go/pserver/cclient/cclient.go
@@ -1,7 +1,6 @@
 package main
 
 /*
-#include <stdlib.h>
 #include <string.h>
 typedef enum {
   PADDLE_ELEMENT_TYPE_INT32   = 0,
@@ -19,39 +18,27 @@ typedef struct {
   int                 content_len;
 } paddle_parameter, paddle_gradient;
 
-static inline void paddle_release_param(paddle_parameter* param) {
-  if (param != NULL) {
-    if (param->name != NULL) {
-      free(param->name);
-    }
-
-    if (param->content != NULL) {
-      free(param->content);
-    }
-
-    free(param);
-  }
-}
-
-typedef int client;
+typedef int paddle_pserver_client;
+#define PSERVER_ERROR -1
+#define PSERVER_OK 0
 */
 import "C"
 
 import (
-	"log"
 	"strings"
 	"sync"
 	"unsafe"
 
 	"github.com/PaddlePaddle/Paddle/go/pserver"
+	log "github.com/sirupsen/logrus"
 )
 
 var nullPtr = unsafe.Pointer(uintptr(0))
 var mu sync.Mutex
-var handleMap = make(map[C.client]*pserver.Client)
-var curHandle C.client
+var handleMap = make(map[C.paddle_pserver_client]*pserver.Client)
+var curHandle C.paddle_pserver_client
 
-func add(c *pserver.Client) C.client {
+func add(c *pserver.Client) C.paddle_pserver_client {
 	mu.Lock()
 	defer mu.Unlock()
 	client := curHandle
@@ -60,13 +47,13 @@ func add(c *pserver.Client) C.client {
 	return client
 }
 
-func get(client C.client) *pserver.Client {
+func get(client C.paddle_pserver_client) *pserver.Client {
 	mu.Lock()
 	defer mu.Unlock()
 	return handleMap[client]
 }
 
-func remove(client C.client) *pserver.Client {
+func remove(client C.paddle_pserver_client) *pserver.Client {
 	mu.Lock()
 	defer mu.Unlock()
 	h := handleMap[client]
@@ -100,7 +87,7 @@ func (l lister) List() []pserver.Server {
 }
 
 //export paddle_new_pserver_client
-func paddle_new_pserver_client(addrs *C.char, selected int) C.client {
+func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_client {
 	a := C.GoString(addrs)
 	as := strings.Split(a, ",")
 	servers := make([]pserver.Server, len(as))
@@ -113,27 +100,27 @@ func paddle_new_pserver_client(addrs *C.char, selected int) C.client {
 }
 
 //export paddle_new_etcd_pserver_client
-func paddle_new_etcd_pserver_client(etcd_addr *C.char) C.client {
+func paddle_new_etcd_pserver_client(etcd_addr *C.char) C.paddle_pserver_client {
 	// TODO(helin): fault tolerant pserver client using etcd.
 	panic("not implemented.")
 }
 
 //export paddle_pserver_client_release
-func paddle_pserver_client_release(client C.client) {
+func paddle_pserver_client_release(client C.paddle_pserver_client) {
 	remove(client)
 }
 
 //export paddle_begin_init_params
-func paddle_begin_init_params(client C.client) C.int {
+func paddle_begin_init_params(client C.paddle_pserver_client) C.int {
 	c := get(client)
 	if selected := c.BeginInitParams(); selected {
 		return 1
 	}
-	return 0
+	return C.PSERVER_OK
 }
 
 //export paddle_init_param
-func paddle_init_param(client C.client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int {
+func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int {
 	et := pserver.ElementType(param.element_type)
 	name := C.GoString(param.name)
 	content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len))
@@ -143,31 +130,41 @@ func paddle_init_param(client C.client, param C.paddle_parameter, param_config u
 	}
 	c := get(client)
 	err := c.InitParam(pc)
+
 	if err != nil {
-		log.Println(err)
-		return -1
+		if err.Error() == pserver.AlreadyInitialized {
+			log.Warningf("parameter %s already initialized, treat paddle_init_param as sucessful.", name)
+			return C.PSERVER_OK
+		}
+		log.Errorln(err)
+		return C.PSERVER_ERROR
 	}
 
-	return 0
+	return C.PSERVER_OK
 }
 
 //export paddle_finish_init_params
-func paddle_finish_init_params(client C.client) C.int {
+func paddle_finish_init_params(client C.paddle_pserver_client) C.int {
 	c := get(client)
 	err := c.FinishInitParams()
 	if err != nil {
-		log.Println(err)
-		return -1
+		if err.Error() == pserver.AlreadyInitialized {
+			log.Warningln("parameters already initialized, treat paddle_finish_init_params as sucessful.")
+			return C.PSERVER_OK
+		}
+
+		log.Errorln(err)
+		return C.PSERVER_ERROR
 	}
 
-	return 0
+	return C.PSERVER_OK
 }
 
 //export paddle_send_grads
-func paddle_send_grads(client C.client, grads *C.paddle_gradient, total C.int) C.int {
+func paddle_send_grads(client C.paddle_pserver_client, grads **C.paddle_gradient, total C.int) C.int {
 	var gs []pserver.Gradient
 	for i := 0; i < int(total); i++ {
-		grad := (*C.paddle_gradient)(unsafe.Pointer((uintptr(unsafe.Pointer(grads)) + uintptr(i)*unsafe.Sizeof(*grads))))
+		grad := *(**C.paddle_gradient)(unsafe.Pointer((uintptr(unsafe.Pointer(grads)) + uintptr(i)*unsafe.Sizeof(*grads))))
 		et := pserver.ElementType(grad.element_type)
 		name := C.GoString(grad.name)
 		content := cArrayToSlice(unsafe.Pointer(grad.content), int(grad.content_len))
@@ -177,84 +174,82 @@ func paddle_send_grads(client C.client, grads *C.paddle_gradient, total C.int) C
 	c := get(client)
 	err := c.SendGrads(gs)
 	if err != nil {
-		log.Println(err)
-		return -1
+		log.Errorln(err)
+		return C.PSERVER_ERROR
 	}
 
-	return 0
+	return C.PSERVER_OK
 }
 
 //export paddle_get_params
-func paddle_get_params(client C.client, names **C.char, dst **C.paddle_parameter, total C.int) C.int {
+func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter, total C.int) C.int {
 	var ns []string
 	for i := 0; i < int(total); i++ {
-		name := *(**C.char)(unsafe.Pointer((uintptr(unsafe.Pointer(names)) + uintptr(i)*unsafe.Sizeof(*names))))
-		ns = append(ns, C.GoString(name))
+		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
+		ns = append(ns, C.GoString(param.name)) // NOTE(review): param is dereferenced here, before the nil check further down
 	}
 	c := get(client)
 	ps, err := c.GetParams(ns)
 	if err != nil {
-		log.Println(err)
-		return -1
+		log.Errorln(err)
+		return C.PSERVER_ERROR
 	}
 
-	for i := 0; i < int(total); i++ {
-		if i >= len(ps) {
-			break
+	if len(ps) != len(ns) {
+		pn := make([]string, len(ps))
+		for i, p := range ps {
+			pn[i] = p.Name
 		}
+		log.Errorf("pserver returned wrong number of parameters. Requested: %s, returned: %s.", strings.Join(ns, ", "), strings.Join(pn, ", "))
+		return C.PSERVER_ERROR
+	}
+	// The reply must list exactly the requested parameters, in the requested order.
+	for i := range ps {
+		if ns[i] != ps[i].Name {
+			pn := make([]string, len(ps))
+			for i, p := range ps {
+				pn[i] = p.Name
+			}
+			log.Errorf("pserver returned wrong parameters, or not in requested order. Requested: %s, returned: %s.", strings.Join(ns, ", "), strings.Join(pn, ", "))
+			return C.PSERVER_ERROR
+		}
+	}
 
+	for i := 0; i < int(total); i++ {
 		p := ps[i]
 		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
-		nameReady := false
-		contentAllocated := false
 
 		if unsafe.Pointer(param) == nullPtr {
-			param = (*C.paddle_parameter)(C.calloc(1, C.size_t(unsafe.Sizeof(*param))))
-		} else {
-			if unsafe.Pointer(param.name) != nullPtr {
-				if n := C.GoString(param.name); n != p.Name {
-					log.Println("Warning: the pre-allocated parameter name does not match the parameter name, it will be freed.", n, p.Name)
-					C.free(unsafe.Pointer(param.name))
-				} else {
-					nameReady = true
-				}
-			}
+			log.Errorln("must pre-allocate parameter.")
+			return C.PSERVER_ERROR
+		}
 
-			if unsafe.Pointer(param.content) != nullPtr {
-				if int(param.content_len) == len(p.Content) {
-					contentAllocated = true
-				} else {
-					log.Println("Warning: the pre-allocated content len does not match parameter content len, the pre-allocated content will be freed.", param.content_len, len(p.Content))
-					C.free(unsafe.Pointer(param.content))
-				}
+		if unsafe.Pointer(param.content) != nullPtr { // NOTE(review): a NULL content is not rejected, yet memcpy below still writes to it
+			if int(param.content_len) != len(p.Content) {
+				log.Errorf("the pre-allocated content len does not match parameter content len. Pre-allocated len: %d, returned len: %d", param.content_len, len(p.Content))
+				return C.PSERVER_ERROR
 			}
 		}
 
-		if !nameReady {
-			param.name = C.CString(p.Name)
-		}
-		if !contentAllocated {
-			param.content = (*C.uchar)(C.malloc(C.size_t(len(p.Content))))
-		}
 		C.memcpy(unsafe.Pointer(param.content), unsafe.Pointer(&p.Content[0]), C.size_t(len(p.Content)))
 		param.content_len = C.int(len(p.Content))
 		param.element_type = C.paddle_element_type(p.ElementType)
 	}
 
-	return 0
+	return C.PSERVER_OK
 }
 
 //export paddle_save_model
-func paddle_save_model(client C.client, path *C.char) C.int {
+func paddle_save_model(client C.paddle_pserver_client, path *C.char) C.int {
 	p := C.GoString(path)
 	c := get(client)
 	err := c.Save(p)
 	if err != nil {
-		log.Println(err)
-		return -1
+		log.Errorln(err)
+		return C.PSERVER_ERROR
 	}
 
-	return 0
+	return C.PSERVER_OK
 }
 
 func main() {} // Required but ignored
diff --git a/go/pserver/cclient/test/CMakeLists.txt b/go/pserver/cclient/test/CMakeLists.txt
index 16f84648c1de3a8fdb4595c00bdb7608a152ded2..916e4e99a24ea7f76f1935fc7d281cd158ac5061 100644
--- a/go/pserver/cclient/test/CMakeLists.txt
+++ b/go/pserver/cclient/test/CMakeLists.txt
@@ -1,11 +1,3 @@
-cmake_minimum_required(VERSION 3.0)
 
-include_directories(${CMAKE_BINARY_DIR})
-
-add_executable(main main.c)
-add_dependencies(main client)
-
-if(APPLE)
-  set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
-endif()
-target_link_libraries(main ${CMAKE_BINARY_DIR}/libclient.a)
+cc_library(main SRCS main.c DEPS paddle_pserver_cclient)
+cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient)
diff --git a/go/pserver/cclient/test/main.c b/go/pserver/cclient/test/main.c
index f75a2110b947520dfec1265e56eaf2ba7ac3b51b..03f749d4e46c4890c6dcfa25af572dab4a053c86 100644
--- a/go/pserver/cclient/test/main.c
+++ b/go/pserver/cclient/test/main.c
@@ -1,68 +1,91 @@
 #include <stdio.h>
+#include <stdlib.h>
 
-#include "libclient.h"
+#include "libpaddle_pserver_cclient.h"
 
-void fail() {
-  // TODO(helin): fix: gtest using cmake is not working, using this
-  // hacky way for now.
-  printf("test failed.\n");
+// TODO(helin): Fix: gtest using cmake is not working, using this
+// hacky way for now.
+#define fail()                                          \
+  fprintf(stderr, "info: %s:%d: ", __FILE__, __LINE__); \
   exit(-1);
+
+void sendGrads(paddle_pserver_client c) {
+  unsigned char grad_a[2000] = {2};
+  unsigned char grad_b[3000] = {3};
+  paddle_gradient grad1 = {
+      "param_a", PADDLE_ELEMENT_TYPE_FLOAT32, grad_a, 2000};
+  paddle_gradient grad2 = {
+      "param_b", PADDLE_ELEMENT_TYPE_FLOAT32, grad_b, 3000};
+  paddle_gradient* grads[2] = {&grad1, &grad2};
+  if (paddle_send_grads(c, grads, 2)) {
+    fail();
+  }
+}
+
+void getParams(paddle_pserver_client c) {  // fetches param_a/param_b into stack buffers; calls fail() on error
+  paddle_parameter param_a;
+  paddle_parameter param_b;
+  char name_a[] = "param_a";
+  char name_b[] = "param_b";
+  // Must pre-allocate the parameter content (and set the name) before calling paddle_get_params.
+  unsigned char content_a[2000] = {0};
+  unsigned char content_b[3000] = {0};
+  param_a.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+  param_a.name = name_a;
+  param_a.content = content_a;
+  param_a.content_len = 2000;
+  param_b.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+  param_b.name = name_b;
+  param_b.content = content_b;
+  param_b.content_len = 3000;
+
+  paddle_parameter* params[2] = {&param_a, &param_b};
+  if (paddle_get_params(c, params, 2)) {
+    fail();
+  }
 }
 
 int main() {
   char addr[] = "localhost:3000";
-  client c = paddle_new_pserver_client(addr, 1);
+  paddle_pserver_client c = paddle_new_pserver_client(addr, 1);
 retry:
   if (paddle_begin_init_params(c)) {
     paddle_parameter param;
     char name_a[] = "param_a";
     char name_b[] = "param_b";
-    unsigned char content[] = {0x00, 0x11, 0x22};
+    unsigned char content_a[2000] = {1};
+    unsigned char content_b[3000] = {0};
     param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
     param.name = name_a;
-    param.content = content;
-    param.content_len = 3;
-    if (paddle_init_param(c, param, NULL, 0) != 0) {
+    param.content = content_a;
+    param.content_len = 2000;
+    int error = paddle_init_param(c, param, NULL, 0);
+    if (error != 0) {
       goto retry;
     }
-    param.element_type = PADDLE_ELEMENT_TYPE_INT32;
+
+    param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
     param.name = name_b;
-    param.content = content;
-    param.content_len = 3;
-    if (paddle_init_param(c, param, NULL, 0) != 0) {
+    param.content = content_b;
+    param.content_len = 3000;
+    error = paddle_init_param(c, param, NULL, 0);
+    if (error != 0) {
       goto retry;
     }
-    if (paddle_finish_init_params(c) != 0) {
+
+    error = paddle_finish_init_params(c);
+    if (error != 0) {
       goto retry;
     }
-  } else {
-    fail();
-  }
-
-  unsigned char content[] = {0x00, 0x11, 0x22};
-  paddle_gradient grads[2] = {
-      {"param_a", PADDLE_ELEMENT_TYPE_INT32, content, 3},
-      {"param_b", PADDLE_ELEMENT_TYPE_FLOAT32, content, 3}};
-
-  if (!paddle_send_grads(c, grads, 2)) {
-    fail();
   }
 
-  paddle_parameter* params[2] = {NULL, NULL};
-  char* names[] = {"param_a", "param_b"};
-  if (!paddle_get_params(c, names, params, 2)) {
-    fail();
+  int i;
+  for (i = 0; i < 100; i++) {
+    sendGrads(c);
+    getParams(c);
   }
 
-  // get parameters again by reusing the allocated parameter buffers.
-  if (!paddle_get_params(c, names, params, 2)) {
-    fail();
-  }
-
-  paddle_release_param(params[0]);
-  paddle_release_param(params[1]);
-
-  if (!paddle_save_model(c, "/tmp/")) {
+  if (paddle_save_model(c, "/tmp/")) {
     fail();
   }
 
diff --git a/go/pserver/cclient/test/test_cclient.c b/go/pserver/cclient/test/test_cclient.c
new file mode 100644
index 0000000000000000000000000000000000000000..0f9c2ef80114d4c5cd887117952f5b7b5d9355f6
--- /dev/null
+++ b/go/pserver/cclient/test/test_cclient.c
@@ -0,0 +1,129 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "libpaddle_pserver_cclient.h"
+
+typedef float real;
+
+// Reports a test failure on stdout and terminates the process with a
+// non-zero exit status.
+void fail() {
+  // TODO(helin): fix: gtest using cmake is not working, using this
+  // hacky way for now.
+  printf("test failed.\n");
+  exit(-1);
+}
+
+// Prints the name, byte length, element type and content of param to stdout.
+// The content is always read as an array of `real`, regardless of
+// element_type. A NULL param is reported instead of being dereferenced.
+void print_parameter(paddle_gradient* param) {
+  if (param == NULL) {
+    printf("param is NULL!!\n");
+    return;
+  }
+
+  printf("==== parameter ====\n");
+  printf("name: %s\n", param->name);
+  printf("content_len: %d\n", param->content_len);
+  printf("content_type: %d\n", param->element_type);
+  int i;
+  for (i = 0; i < param->content_len / (int)sizeof(real); ++i) {
+    printf("%f ", ((float*)param->content)[i]);
+  }
+  printf("\n\n");
+}
+
+// End-to-end check of the C client against a pserver listening on
+// localhost:3000: initialize two parameters, read them back, send one
+// gradient per parameter, read the updated parameters and save the model.
+int main() {
+  char addr[] = "localhost:3000";
+  paddle_pserver_client c = paddle_new_pserver_client(addr, 1);
+
+  char* names[] = {"param_a", "param_b"};
+
+  real param_content1[] = {0.1, 0.2, 0.3};
+  real param_content2[] = {0.4, 0.5, 0.6};
+  paddle_parameter** params =
+      (paddle_parameter**)malloc(sizeof(paddle_parameter*) * 2);
+  params[0] = (paddle_parameter*)malloc(sizeof(paddle_parameter));
+  params[0]->name = names[0];
+  params[0]->content = (unsigned char*)param_content1;
+  params[0]->content_len = 3 * sizeof(real);
+  params[0]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+
+  params[1] = (paddle_parameter*)malloc(sizeof(paddle_parameter));
+  params[1]->name = names[1];
+  params[1]->content = (unsigned char*)param_content2;
+  params[1]->content_len = 3 * sizeof(real);
+  // The content is an array of float, so it must be tagged as FLOAT32.
+  params[1]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+
+  // The retry label comes after the allocations above so that a failed
+  // initialization attempt reuses these buffers instead of leaking a fresh
+  // set on every retry.
+retry:
+  printf("init parameter to pserver:\n");
+  if (paddle_begin_init_params(c)) {
+    if (paddle_init_param(c, *params[0], NULL, 0) != 0) {
+      goto retry;
+    }
+    if (paddle_init_param(c, *params[1], NULL, 0) != 0) {
+      goto retry;
+    }
+    if (paddle_finish_init_params(c) != 0) {
+      goto retry;
+    }
+  } else {
+    fail();
+  }
+
+  printf("get inited parameters from pserver:\n");
+  // Fetch into the pre-allocated parameter structs.
+  if (paddle_get_params(c, params, 2) != 0) {
+    fail();
+  }
+  print_parameter(params[0]);
+  print_parameter(params[1]);
+
+  printf("send gradient to pserver:\n");
+  real gradient_content1[] = {0.01, 0.02, 0.03};
+  real gradient_content2[] = {0.04, 0.05, 0.06};
+
+  paddle_gradient** grads =
+      (paddle_gradient**)malloc(sizeof(paddle_gradient*) * 2);
+  grads[0] = (paddle_gradient*)malloc(sizeof(paddle_gradient));
+  grads[0]->name = names[0];
+  grads[0]->content = (unsigned char*)gradient_content1;
+  grads[0]->content_len = 3 * sizeof(real);
+  grads[0]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+
+  grads[1] = (paddle_gradient*)malloc(sizeof(paddle_gradient));
+  grads[1]->name = names[1];
+  grads[1]->content = (unsigned char*)gradient_content2;
+  grads[1]->content_len = 3 * sizeof(real);
+  grads[1]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+
+  printf("print gradient sent to pserver:\n");
+  print_parameter(grads[0]);
+  print_parameter(grads[1]);
+
+  if (paddle_send_grads(c, grads, 2) != 0) {
+    fail();
+  }
+
+  printf("get updated parameters from pserver:\n");
+  // get parameters again by reusing the allocated parameter buffers.
+  if (paddle_get_params(c, params, 2) != 0) {
+    fail();
+  }
+  print_parameter(params[0]);
+  print_parameter(params[1]);
+
+  if (paddle_save_model(c, "/tmp/") != 0) {
+    fail();
+  }
+
+  return 0;
+}
diff --git a/go/pserver/cclient/test/test_mnist.py b/go/pserver/cclient/test/test_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3a3af55e2812fa0c965d22ddaba198f43f3c4ad
--- /dev/null
+++ b/go/pserver/cclient/test/test_mnist.py
@@ -0,0 +1,140 @@
+import paddle.v2 as paddle
+import gzip
+
+
+def softmax_regression(img):
+    """Single fully-connected softmax layer mapping img to 10 class scores."""
+    predict = paddle.layer.fc(input=img,
+                              size=10,
+                              act=paddle.activation.Softmax())
+    return predict
+
+
+def multilayer_perceptron(img):
+    """Two ReLU hidden layers (128, 64 units) and a 10-way softmax output."""
+    # The first fully-connected layer
+    hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu())
+    # The second fully-connected layer and the according activation function
+    hidden2 = paddle.layer.fc(input=hidden1,
+                              size=64,
+                              act=paddle.activation.Relu())
+    # The third fully-connected layer, note that the hidden size should be 10,
+    # which is the number of unique digits
+    predict = paddle.layer.fc(input=hidden2,
+                              size=10,
+                              act=paddle.activation.Softmax())
+    return predict
+
+
+def convolutional_neural_network(img):
+    """Two conv+pool stages, a 128-unit Tanh layer and a 10-way softmax."""
+    # first conv layer
+    conv_pool_1 = paddle.networks.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        num_channel=1,
+        pool_size=2,
+        pool_stride=2,
+        act=paddle.activation.Tanh())
+    # second conv layer
+    conv_pool_2 = paddle.networks.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        num_channel=20,
+        pool_size=2,
+        pool_stride=2,
+        act=paddle.activation.Tanh())
+    # The first fully-connected layer
+    fc1 = paddle.layer.fc(input=conv_pool_2,
+                          size=128,
+                          act=paddle.activation.Tanh())
+    # The softmax layer, note that the hidden size should be 10,
+    # which is the number of unique digits
+    predict = paddle.layer.fc(input=fc1,
+                              size=10,
+                              act=paddle.activation.Softmax())
+    return predict
+
+
+def main():
+    """Train an MNIST classifier through the pserver at localhost:3000.
+
+    Prints progress during training, reports the pass with the lowest test
+    cost, then runs inference on the first 100 test images.
+    """
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    # define network topology
+    images = paddle.layer.data(
+        name='pixel', type=paddle.data_type.dense_vector(784))
+    label = paddle.layer.data(
+        name='label', type=paddle.data_type.integer_value(10))
+
+    # Here we can build the prediction network in different ways. Please
+    # choose one by uncommenting the corresponding line.
+    predict = softmax_regression(images)
+    #predict = multilayer_perceptron(images)
+    #predict = convolutional_neural_network(images)
+
+    cost = paddle.layer.classification_cost(input=predict, label=label)
+    parameters = paddle.parameters.create(cost)
+
+    optimizer = paddle.optimizer.Momentum(
+        learning_rate=0.1 / 128.0,
+        momentum=0.9,
+        regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
+
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer,
+                                 is_local=False,
+                                 pserver_spec="localhost:3000")
+
+    # (pass_id, test cost, classification error) for every finished pass
+    lists = []
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 1000 == 0:
+                print("Pass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics))
+
+        elif isinstance(event, paddle.event.EndPass):
+            result = trainer.test(reader=paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=128))
+            print("Test with Pass %d, Cost %f, %s\n" % (
+                event.pass_id, result.cost, result.metrics))
+            lists.append((event.pass_id, result.cost,
+                          result.metrics['classification_error_evaluator']))
+
+    trainer.train(
+        reader=paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.mnist.train(), buf_size=8192),
+            batch_size=128),
+        event_handler=event_handler,
+        num_passes=100)
+
+    # find the best pass: the one with the lowest test cost
+    best = min(lists, key=lambda record: float(record[1]))
+    print('Best pass is %s, testing Avgcost is %s' % (best[0], best[1]))
+    print('The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100))
+
+    test_creator = paddle.dataset.mnist.test()
+    test_data = []
+    for item in test_creator():
+        test_data.append((item[0], ))
+        if len(test_data) == 100:
+            break
+
+    # output is a softmax layer. It returns probabilities.
+    # Shape should be (100, 10)
+    probs = paddle.infer(
+        output_layer=predict, parameters=parameters, input=test_data)
+    print(probs.shape)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/go/pserver/cclient/test/test_train.py b/go/pserver/cclient/test/test_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f8d5d793bdeb687c9d234005d9e2eae760cc3a7
--- /dev/null
+++ b/go/pserver/cclient/test/test_train.py
@@ -0,0 +1,64 @@
+import paddle.v2 as paddle
+import paddle.v2.dataset.uci_housing as uci_housing
+
+
+def main():
+    """Train a linear regression model on uci_housing.
+
+    Parameters are updated remotely through the pserver at localhost:3000.
+    """
+    # init
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    # network config
+    x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+    y_predict = paddle.layer.fc(input=x,
+                                param_attr=paddle.attr.Param(name='w'),
+                                size=1,
+                                act=paddle.activation.Linear(),
+                                bias_attr=paddle.attr.Param(name='b'))
+    y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
+    cost = paddle.layer.mse_cost(input=y_predict, label=y)
+
+    # create parameters
+    parameters = paddle.parameters.create(cost)
+
+    # create optimizer
+    optimizer = paddle.optimizer.Momentum(momentum=0)
+
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer,
+                                 is_local=False,
+                                 pserver_spec="localhost:3000")
+
+    # event_handler to print training and testing info
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                print("Pass %d, Batch %d, Cost %f" % (
+                    event.pass_id, event.batch_id, event.cost))
+
+        if isinstance(event, paddle.event.EndPass):
+            if (event.pass_id + 1) % 10 == 0:
+                result = trainer.test(
+                    reader=paddle.batch(
+                        uci_housing.test(), batch_size=2),
+                    feeding={'x': 0,
+                             'y': 1})
+                print("Test %d, %.2f" % (event.pass_id, result.cost))
+
+    # training
+    trainer.train(
+        reader=paddle.batch(
+            paddle.reader.shuffle(
+                uci_housing.train(), buf_size=500),
+            batch_size=2),
+        feeding={'x': 0,
+                 'y': 1},
+        event_handler=event_handler,
+        num_passes=30)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/go/pserver/client.go b/go/pserver/client.go
index f8bd0aa59f30ec7e2b2d318929af96135d3128ed..dda915977282d4880ddcc8c18ef6fd80ede9e01b 100644
--- a/go/pserver/client.go
+++ b/go/pserver/client.go
@@ -2,11 +2,11 @@ package pserver
 
 import (
 	"hash/fnv"
-	"log"
 	"sort"
 	"time"
 
-	"github.com/PaddlePaddle/Paddle/go/pserver/internal/connection"
+	"github.com/PaddlePaddle/Paddle/go/connection"
+	log "github.com/sirupsen/logrus"
 )
 
 // TODO(helin): add RPC call retry logic
@@ -47,7 +47,7 @@ func NewClient(l Lister, pserverNum int, sel Selector) *Client {
 // monitorPservers monitors pserver addresses, and updates connection
 // when the address changes.
 func (c *Client) monitorPservers(l Lister, pserverNum int) {
-	knownServers := make([]Server, pserverNum)
+	lastServers := make([]Server, pserverNum)
 	ticker := time.NewTicker(10 * time.Second)
 	monitor := func() {
 		curServers := make([]Server, pserverNum)
@@ -56,25 +56,37 @@ func (c *Client) monitorPservers(l Lister, pserverNum int) {
 			curServers[l.Index] = l
 		}
 
-		for i := range knownServers {
-			if knownServers[i].Addr != curServers[i].Addr {
-				err := c.pservers[i].Connect(curServers[i].Addr)
-				if err != nil {
-					log.Println(err)
+		for i := range lastServers {
+			if lastServers[i].Addr == curServers[i].Addr {
+				continue
+			}
 
-					// connect to addr failed, set
-					// to last known addr in order
-					// to retry next time.
-					curServers[i].Addr = knownServers[i].Addr
+			if curServers[i].Addr == "" {
+				err := c.pservers[i].Close()
+				if err != nil {
+					log.Errorln(err)
 				}
+
+				continue
 			}
+
+			err := c.pservers[i].Connect(curServers[i].Addr)
+			if err != nil {
+				log.Errorln(err)
+
+				// connect to addr failed, set
+				// to last known addr in order
+				// to retry next time.
+				curServers[i].Addr = lastServers[i].Addr
+			}
+
 		}
 
-		knownServers = curServers
+		lastServers = curServers
 	}
 
 	monitor()
-	for _ = range ticker.C {
+	for range ticker.C {
 		monitor()
 	}
 }
@@ -93,16 +105,14 @@ func (c *Client) BeginInitParams() bool {
 
 // InitParam initializes the parameter on parameter servers.
 func (c *Client) InitParam(paramWithConfigs ParameterWithConfig) error {
-	var dummy int
-	return c.pservers[c.partition(paramWithConfigs.Param.Name)].Call("Service.InitParam", paramWithConfigs, &dummy)
+	return c.pservers[c.partition(paramWithConfigs.Param.Name)].Call("Service.InitParam", paramWithConfigs, nil)
 }
 
 // FinishInitParams tells parameter servers client has sent all
 // parameters to parameter servers as initialization.
 func (c *Client) FinishInitParams() error {
 	for _, p := range c.pservers {
-		var dummy int
-		err := p.Call("Service.FinishInitParams", dummy, &dummy)
+		err := p.Call("Service.FinishInitParams", 0, nil)
 		if err != nil {
 			return err
 		}
@@ -116,8 +126,7 @@ func (c *Client) SendGrads(grads []Gradient) error {
 	errCh := make(chan error, len(grads))
 	for _, g := range grads {
 		go func(g Gradient) {
-			var dummy int
-			err := c.pservers[c.partition(g.Name)].Call("Service.SendGrad", g, &dummy)
+			err := c.pservers[c.partition(g.Name)].Call("Service.SendGrad", g, nil)
 			errCh <- err
 		}(g)
 	}
@@ -196,8 +205,7 @@ func (c *Client) Save(path string) error {
 	errCh := make(chan error, len(c.pservers))
 
 	for _, p := range c.pservers {
-		var dummy int
-		err := p.Call("Service.Save", path, &dummy)
+		err := p.Call("Service.Save", path, nil)
 		errCh <- err
 	}
 
diff --git a/go/pserver/client_test.go b/go/pserver/client_test.go
index a9a0948a51a31a1c7393f716e3dfc436dbf919af..6ecf1fa08a02ed2ce04fae0903cebd46a7b768a4 100644
--- a/go/pserver/client_test.go
+++ b/go/pserver/client_test.go
@@ -7,6 +7,7 @@ import (
 	"strconv"
 	"strings"
 	"testing"
+	"time"
 
 	"github.com/PaddlePaddle/Paddle/go/pserver"
 )
@@ -30,9 +31,12 @@ func init() {
 		port[i] = p
 
 		go func(l net.Listener) {
-			s := pserver.NewService()
+			s, err := pserver.NewService("", 1, time.Second*5) // "" endpoints: etcd is bypassed, the count is unused
+			if err != nil {
+				panic(err)
+			}
 			server := rpc.NewServer()
-			err := server.Register(s)
+			err = server.Register(s)
 			if err != nil {
 				panic(err)
 			}
@@ -117,7 +121,7 @@ func TestClientFull(t *testing.T) {
 
 	for i := range params {
 		if names[i] != params[i].Name {
-			t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", names[i], params[i])
+			t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", names[i], params[i].Name)
 		}
 	}
 }
diff --git a/go/pserver/optimizer.c b/go/pserver/optimizer.c
index b8da3ec9592053e3efe00e69d73a8ae259a30a2f..f16ba2cbf8e168a434fdcdb4f1e0ba1e98d91c6b 100644
--- a/go/pserver/optimizer.c
+++ b/go/pserver/optimizer.c
@@ -32,7 +32,13 @@ int update_SGD(void* optimizer,
                const void* gradient,
                int num_bytes) {
   SGD_optimizer* o = (SGD_optimizer*)optimizer;
-  // TODO
+  float* parameter = (float*)buffer;
+  const float* grad = (const float*)gradient;
+  // Signed element count, so the loop compares int with int.
+  int i, count = num_bytes / (int)sizeof(float);
+  for (i = 0; i < count; ++i) {
+    parameter[i] -= o->learning_rate * grad[i];
+  }
   return 0;
 }
 
diff --git a/go/pserver/service.go b/go/pserver/service.go
index d5787b9708bb15629a6e6290ffc97ee9885bc8b8..f966595fdccbf23e23f94a857503ce05815164ef 100644
--- a/go/pserver/service.go
+++ b/go/pserver/service.go
@@ -1,16 +1,27 @@
 package pserver
 
 import (
+	"context"
 	"errors"
 	"fmt"
+	"strconv"
+	"strings"
 	"sync"
+	"time"
+
+	"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
+	"github.com/coreos/etcd/clientv3"
+	"github.com/coreos/etcd/clientv3/concurrency"
+	log "github.com/sirupsen/logrus"
 )
 
 // ElementType is the type of elements of a Parameter.
 type ElementType int
 
-var ErrAlreadyInitialized = errors.New("pserver already initialized")
-var ErrUninitialized = errors.New("pserver not fully initialized")
+const (
+	AlreadyInitialized = "pserver already initialized"
+	Uninitialized      = "pserver not fully initialized"
+)
 
 // Supported element types
 const (
@@ -22,6 +33,9 @@ const (
 	Float64
 )
 
+// PsDesired is etcd path for store desired pserver count
+const PsDesired = "/ps_desired"
+
 // Parameter is a piece of data to sync with the parameter server.
 type Parameter struct {
 	Name        string
@@ -45,21 +59,161 @@ type Service struct {
 	mu       sync.Mutex
 	opt      *optimizer
 	paramMap map[string]Parameter
+
+	etcdEndpoints string
+	etcdClient    *clientv3.Client
+	// etcdTimeout is also used as retry intervals.
+	etcdTimeout time.Duration
+	// desired number of pservers in the job.
+	// assume desired will not change during one training job.
+	desired int
+	// FIXME: ensure GetExternalIP gets the correct ip for trainers to connect.
+	externalIP string
 }
 
-// NewService creates a new service.
-func NewService() *Service {
-	s := &Service{opt: newOptimizer(sgd, 0.01)}
+// NewService creates a new service. An empty endpoints string bypasses etcd;
+// otherwise it blocks, retrying every timeout, until registered in etcd.
+func NewService(endpoints string, numPservers int, timeout time.Duration) (*Service, error) {
+	s := &Service{opt: newOptimizer(sgd, 0.005)}
 	s.paramMap = make(map[string]Parameter)
 	s.initialized = make(chan struct{})
-	return s
+	s.etcdEndpoints = endpoints
+	s.etcdTimeout = timeout
+
+	var err error
+	s.externalIP, err = networkhelper.GetExternalIP()
+	if err != nil {
+		return nil, err
+	}
+
+	if endpoints != "" {
+		// initialize connection to etcd, retrying until it succeeds
+		ep := strings.Split(s.etcdEndpoints, ",")
+		for {
+			cli, err := clientv3.New(clientv3.Config{
+				Endpoints:   ep,
+				DialTimeout: s.etcdTimeout,
+			})
+			if err != nil {
+				log.Errorf("connect to etcd error: %v", err)
+				time.Sleep(s.etcdTimeout)
+				continue
+			}
+			s.etcdClient = cli
+			log.Debugf("inited client to %s", s.etcdEndpoints)
+			break
+		}
+		// init /ps_desired using transaction, for multiple pservers may want to write
+		// it at the same time.
+		for {
+			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+			_, err := s.initDesiredPsercers(ctx, numPservers)
+			cancel()
+			if err != nil {
+				log.Warn(err)
+				time.Sleep(s.etcdTimeout)
+				continue
+			}
+			break
+		}
+		// TODO: when implementing extending or reducing pservers, /ps_desired is
+		// changed, then we need to watch /ps_desired node for events. For now, just
+		// write once when init and read from it.
+		// wait and set s.desired init value
+		for {
+			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+			resp, err := s.etcdClient.Get(ctx, PsDesired)
+			cancel()
+			if err != nil {
+				log.Errorf("getting %s error: %v", PsDesired, err)
+				time.Sleep(s.etcdTimeout)
+				continue
+			}
+			if len(resp.Kvs) != 0 {
+				s.desired, err = strconv.Atoi(string(resp.Kvs[0].Value))
+				if err == nil {
+					break
+				}
+				log.Errorf("value of %s invalid %v\n", PsDesired, err)
+			}
+			// NOTE: the key is missing or its value is invalid; sleep before
+			// polling again instead of spinning on etcd until it changes.
+			time.Sleep(s.etcdTimeout)
+		}
+		// try register pserver node on etcd
+		for {
+			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+			_, err := s.registerPserverEtcd(ctx)
+			cancel()
+			if err != nil {
+				log.Warn(err)
+				time.Sleep(s.etcdTimeout)
+				continue
+			}
+			break
+		}
+	} // if endpoints != ""
+	// Bypass etcd registration if no endpoints specified
+	return s, nil
+}
+
+func (s *Service) initDesiredPsercers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) {
+	return concurrency.NewSTM(s.etcdClient, func(c concurrency.STM) error {
+		dsStr := c.Get(PsDesired)
+		if dsStr == "" { // only write numPservers when no value is stored yet
+			c.Put(PsDesired, strconv.Itoa(numPservers))
+		}
+		return nil
+	}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
+}
+
+// registerPserverEtcd claims the first unused /ps/<index> key in a transaction.
+func (s *Service) registerPserverEtcd(ctx context.Context) (*clientv3.TxnResponse, error) {
+	return concurrency.NewSTM(s.etcdClient, func(c concurrency.STM) error {
+		registered := false
+		for i := 0; i < s.desired; i++ {
+			psKey := "/ps/" + strconv.Itoa(i)
+			log.Debugf("checking %s", psKey)
+			ps := c.Get(psKey)
+			log.Debugf("got value (%s) for key: %s", ps, psKey)
+
+			if ps == "" {
+				resp, err := s.etcdClient.Grant(context.TODO(), 5)
+				if err != nil {
+					return err // let the caller log and retry instead of exiting
+				}
+				// find the first id and write info
+				c.Put(psKey, s.externalIP, clientv3.WithLease(resp.ID))
+				log.Debugf("set pserver node %s with value %s", psKey, s.externalIP)
+				ch, kaerr := s.etcdClient.KeepAlive(context.TODO(), resp.ID)
+				if kaerr != nil {
+					log.Errorf("keepalive etcd node error: %v", kaerr)
+					return kaerr
+				}
+
+				// Drain keepalive responses until etcd closes the channel.
+				go func(ch <-chan *clientv3.LeaseKeepAliveResponse) {
+					for ka := range ch {
+						log.Debugf("keepalive: %d\n", ka.TTL)
+					}
+				}(ch)
+				log.Debug("register finished")
+				registered = true
+				break
+			}
+		}
+		if registered {
+			return nil
+		}
+		return errors.New("not registerd, may due to already have enough pservers")
+	}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
+}
 
 // InitParam initializes a parameter.
 func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) error {
 	select {
 	case <-s.initialized:
-		return ErrAlreadyInitialized
+		return errors.New(AlreadyInitialized)
 	default:
 	}
 
@@ -80,7 +234,7 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) er
 func (s *Service) FinishInitParams(dummy0 int, dummy1 *int) error {
 	select {
 	case <-s.initialized:
-		return ErrAlreadyInitialized
+		return errors.New(AlreadyInitialized)
 	default:
 	}
 
@@ -94,7 +248,7 @@ func (s *Service) SendGrad(g Gradient, dummy *int) error {
 	select {
 	case <-s.initialized:
 	default:
-		return ErrUninitialized
+		return errors.New(Uninitialized)
 	}
 
 	s.mu.Lock()
diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go
index 4c9fac4536e09013916aadb26af3a86a5a775b4f..f317535592165b921491120888badd30c6795c12 100644
--- a/go/pserver/service_test.go
+++ b/go/pserver/service_test.go
@@ -10,13 +10,15 @@ import (
 )
 
 func TestFull(t *testing.T) {
-	s := pserver.NewService()
+	s, err := pserver.NewService("", 1, time.Second*5) // "" endpoints: etcd is bypassed
+	if err != nil {
+		t.Fatal(err) // s is nil here, so the test cannot continue
+	}
 	var p pserver.Parameter
 	p.Name = "param_a"
 	p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
 	p.ElementType = pserver.Int32
-	var dummy int
-	err := s.InitParam(pserver.ParameterWithConfig{p, nil}, &dummy)
+	err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: nil}, nil)
 	if err != nil {
 		t.FailNow()
 	}
@@ -25,12 +27,12 @@ func TestFull(t *testing.T) {
 	p1.Name = "param_b"
 	p1.Content = []byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
 	p1.ElementType = pserver.Float32
-	err = s.InitParam(pserver.ParameterWithConfig{p1, nil}, &dummy)
+	err = s.InitParam(pserver.ParameterWithConfig{Param: p1, Config: nil}, nil)
 	if err != nil {
 		t.FailNow()
 	}
 
-	err = s.FinishInitParams(0, &dummy)
+	err = s.FinishInitParams(0, nil)
 	if err != nil {
 		t.FailNow()
 	}
@@ -46,11 +48,11 @@ func TestFull(t *testing.T) {
 	}
 
 	g1, g2 := pserver.Gradient(p1), pserver.Gradient(p)
-	err = s.SendGrad(g1, &dummy)
+	err = s.SendGrad(g1, nil)
 	if err != nil {
 		t.FailNow()
 	}
-	err = s.SendGrad(g2, &dummy)
+	err = s.SendGrad(g2, nil)
 
 	if err != nil {
 		t.FailNow()
@@ -73,38 +75,43 @@ func TestFull(t *testing.T) {
 }
 
 func TestMultipleInit(t *testing.T) {
-	s := pserver.NewService()
-	var dummy int
-	err := s.FinishInitParams(0, &dummy)
+	s, err := pserver.NewService("", 1, time.Second*5) // "" endpoints: etcd is bypassed
+	if err != nil {
+		t.Fatal(err) // s is nil here, so the test cannot continue
+	}
+	err = s.FinishInitParams(0, nil)
 	if err != nil {
 		t.FailNow()
 	}
 
-	err = s.FinishInitParams(0, &dummy)
-	if err != pserver.ErrAlreadyInitialized {
+	err = s.FinishInitParams(0, nil) // the second call must be rejected
+	if err == nil || err.Error() != pserver.AlreadyInitialized {
 		t.FailNow()
 	}
 }
 
 func TestUninitialized(t *testing.T) {
-	s := pserver.NewService()
-	var dummy int
-	err := s.SendGrad(pserver.Gradient{}, &dummy)
-	if err != pserver.ErrUninitialized {
+	s, err := pserver.NewService("", 1, time.Second*5) // "" endpoints: etcd is bypassed
+	err = s.SendGrad(pserver.Gradient{}, nil) // sent before FinishInitParams
+	if err == nil || err.Error() != pserver.Uninitialized {
 		t.FailNow()
 	}
 }
 
 func TestBlockUntilInitialized(t *testing.T) {
-	s := pserver.NewService()
+	s, err := pserver.NewService("", 1, time.Second*5) // "" endpoints: etcd is bypassed
+	if err != nil {
+		t.Fatal(err) // s is nil here, so the test cannot continue
+	}
 	ch := make(chan struct{}, 2)
+	errCh := make(chan error, 2)
 	var wg sync.WaitGroup
 	wg.Add(1)
 	go func() {
 		var param pserver.Parameter
 		err := s.GetParam("param_a", &param)
 		if err != nil {
-			t.FailNow()
+			errCh <- err
 		}
 		wg.Done()
 		ch <- struct{}{}
@@ -112,10 +119,9 @@ func TestBlockUntilInitialized(t *testing.T) {
 
 	wg.Add(1)
 	go func() {
-		var dummy int
-		err := s.Save("", &dummy)
+		err := s.Save("", nil)
 		if err != nil {
-			t.FailNow()
+			errCh <- err
 		}
 		wg.Done()
 		ch <- struct{}{}
@@ -127,6 +133,8 @@ func TestBlockUntilInitialized(t *testing.T) {
 	case <-ch:
 		// some function returned before initialization is completed.
 		t.FailNow()
+	case <-errCh:
+		t.FailNow()
 	default:
 	}
 
@@ -134,13 +142,12 @@ func TestBlockUntilInitialized(t *testing.T) {
 	p.Name = "param_a"
 	p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
 	p.ElementType = pserver.Int32
-	var dummy int
-	err := s.InitParam(pserver.ParameterWithConfig{p, nil}, &dummy)
+	err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: nil}, nil)
 	if err != nil {
 		t.FailNow()
 	}
 
-	err = s.FinishInitParams(0, &dummy)
+	err = s.FinishInitParams(0, nil)
 	if err != nil {
 		t.FailNow()
 	}
diff --git a/go/utils/networkhelper/helper.go b/go/utils/networkhelper/helper.go
new file mode 100644
index 0000000000000000000000000000000000000000..fbeaea8f5e7d93309befbd23063e474a4c6df46e
--- /dev/null
+++ b/go/utils/networkhelper/helper.go
@@ -0,0 +1,50 @@
+package networkhelper
+
+import (
+	"errors"
+	"net"
+)
+
+// GetExternalIP returns the first IPv4 address found on a network interface
+// that is up and is not a loopback device. An error is returned when the
+// interfaces or their addresses cannot be listed, or when no such address
+// exists.
+func GetExternalIP() (string, error) {
+	nics, err := net.Interfaces()
+	if err != nil {
+		return "", err
+	}
+	for _, nic := range nics {
+		isUp := nic.Flags&net.FlagUp != 0
+		isLoopback := nic.Flags&net.FlagLoopback != 0
+		if !isUp || isLoopback {
+			continue
+		}
+		addrs, err := nic.Addrs()
+		if err != nil {
+			return "", err
+		}
+		for _, addr := range addrs {
+			if ipv4 := usableIPv4(addr); ipv4 != nil {
+				return ipv4.String(), nil
+			}
+		}
+	}
+	return "", errors.New("are you connected to the network?")
+}
+
+// usableIPv4 returns the non-loopback IPv4 address carried by addr, or nil
+// when addr holds no IP, a loopback IP, or an address with no IPv4 form.
+func usableIPv4(addr net.Addr) net.IP {
+	var ip net.IP
+	switch v := addr.(type) {
+	case *net.IPNet:
+		ip = v.IP
+	case *net.IPAddr:
+		ip = v.IP
+	}
+	if ip == nil || ip.IsLoopback() {
+		return nil
+	}
+	return ip.To4()
+}
diff --git a/go/utils/networkhelper/helper_test.go b/go/utils/networkhelper/helper_test.go
new file mode 100644
index 0000000000000000000000000000000000000000..4208f9e358fc4345b73a2b8a9211b8889c1190d8
--- /dev/null
+++ b/go/utils/networkhelper/helper_test.go
@@ -0,0 +1,10 @@
+package networkhelper
+
+import "testing"
+
+// TestGetIP checks that GetExternalIP succeeds on the host running the test.
+func TestGetIP(t *testing.T) {
+	if _, err := GetExternalIP(); err != nil {
+		t.Errorf("GetExternalIP returns error : %v\n", err)
+	}
+}
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 47ca1833967ee705d6558b1dad06a6335b30f03a..573bd937a351a6f308974e14f3bc92cbe1b541bc 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -8,6 +8,7 @@ add_subdirectory(gserver)
 add_subdirectory(pserver)
 add_subdirectory(trainer)
 add_subdirectory(scripts)
+add_subdirectory(optimizer)
 add_subdirectory(strings)
 
 # Do not build go directory until go cmake is working smoothly.
@@ -19,8 +20,8 @@ find_package(Boost QUIET)
 
 if(Boost_FOUND)
   include_directories(${Boost_INCLUDE_DIRS})
-  include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-  add_subdirectory(majel)
+  add_subdirectory(platform)
+  add_subdirectory(framework)
 endif()
 
 if(WITH_C_API)
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index 071bc36c2ded51ba977350aeae15f6d244cea5be..f2315e31cc06d8b5fea7a9fd203a697bac603a90 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -16,7 +16,7 @@ set(API_HEADER
     Internal.h)
 
 add_library(paddle_api STATIC ${API_SOURCES})
-add_dependencies(paddle_api gen_proto_cpp)
+add_dependencies(paddle_api gen_proto_cpp paddle_trainer_lib)
 
 INCLUDE(${SWIG_USE_FILE})
 INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle)
@@ -45,7 +45,7 @@ SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS
 )
 
 IF(APPLE)
-    SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load")
+    SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load -framework CoreFoundation -framework Security")
 ELSE(APPLE)
     SET(START_GROUP "-Xlinker -start-group")
     SET(END_GROUP "-Xlinker -end-group")
diff --git a/paddle/api/Paddle.i b/paddle/api/Paddle.i
index 068ba286c07d8854a1a7c7042224a679b50b4957..3237e73745dca58bed923b20851f0f0039a3487c 100644
--- a/paddle/api/Paddle.i
+++ b/paddle/api/Paddle.i
@@ -179,6 +179,7 @@ namespace std {
 %newobject ParameterOptimizer::needSpecialTraversal;
 %newobject ParameterUpdater::createLocalUpdater;
 %newobject ParameterUpdater::createRemoteUpdater;
+%newobject ParameterUpdater::createNewRemoteUpdater;
 
 %feature("director") UpdateCallback;
 %feature("autodoc", 1); // To generate method stub, for code hint in ide
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index da0f157abd68c73c45f498cf9ef2726aac67c95b..5fb3d1c73bc56e921f13aafd27c25224e259b3fe 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -841,6 +841,9 @@ public:
   static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
                                                int passCount,
                                                bool useSparseUpdater);
+  static ParameterUpdater* createNewRemoteUpdater(
+      OptimizationConfig* config,
+      const std::string pserverSpec) throw(UnsupportError);
   ~ParameterUpdater();
 
   /**
diff --git a/paddle/api/ParameterUpdater.cpp b/paddle/api/ParameterUpdater.cpp
index 79921ea6e787f3c0ebecaad6a9a54bac92211320..1aaefdfb8107a2eaa0432211fd7df4f5f12d537f 100644
--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
@@ -15,6 +15,9 @@ limitations under the License. */
 #include "PaddleAPI.h"
 
 #include "PaddleAPIPrivate.h"
+#ifndef PADDLE_WITHOUT_GOLANG
+#include "paddle/trainer/NewRemoteParameterUpdater.h"
+#endif
 #include "paddle/trainer/RemoteParameterUpdater.h"
 #include "paddle/trainer/ThreadParameterUpdater.h"
 
@@ -28,6 +31,19 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater(
   return updater;
 }
 
+ParameterUpdater *ParameterUpdater::createNewRemoteUpdater(
+    OptimizationConfig *config,
+    const std::string pserverSpec) throw(UnsupportError) {
+#ifndef PADDLE_WITHOUT_GOLANG
+  auto updater = new ParameterUpdater();
+  updater->m->updater.reset(new paddle::NewRemoteParameterUpdater(
+      config->m->getConfig(), pserverSpec));
+  return updater;
+#else
+  throw UnsupportError();
+#endif
+}
+
 ParameterUpdater *ParameterUpdater::createRemoteUpdater(
     OptimizationConfig *config, int passCount, bool useSparseUpdater) {
   auto updater = new ParameterUpdater();
diff --git a/paddle/framework/.clang-format b/paddle/framework/.clang-format
new file mode 100644
index 0000000000000000000000000000000000000000..29282dc87e2c499988c17d90d47d44cd5cf7f115
--- /dev/null
+++ b/paddle/framework/.clang-format
@@ -0,0 +1,5 @@
+---
+Language:        Cpp
+BasedOnStyle:  Google
+Standard:  Cpp11 
+...
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e3c3155aa902c941058ea1b15488360df6c06175
--- /dev/null
+++ b/paddle/framework/CMakeLists.txt
@@ -0,0 +1,6 @@
+cc_library(ddim SRCS ddim.cc)
+cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
+
+nv_test(dim_test SRCS dim_test.cu DEPS ddim)
+
+cc_test(variable_test SRCS variable_test.cc)
diff --git a/paddle/majel/ddim.cc b/paddle/framework/ddim.cc
similarity index 94%
rename from paddle/majel/ddim.cc
rename to paddle/framework/ddim.cc
index f32408ed53074234873ec0ea8ee7f4e449e5e908..3f949a6595ea326b97ac567daf9b35a68c8cf7f8 100644
--- a/paddle/majel/ddim.cc
+++ b/paddle/framework/ddim.cc
@@ -1,6 +1,7 @@
-#include "paddle/majel/ddim.h"
+#include "paddle/framework/ddim.h"
 
-namespace majel {
+namespace paddle {
+namespace framework {
 
 ///@cond HIDDEN
 
@@ -66,7 +67,7 @@ DDim make_ddim(const std::vector<int>& dims) {
 ///@cond HIDDEN
 // XXX For some reason, putting this in an anonymous namespace causes errors
 class DynamicMutableIndexer : public boost::static_visitor<int&> {
-public:
+ public:
   DynamicMutableIndexer(int idx) : idx_(idx) {}
 
   template <int D>
@@ -74,12 +75,12 @@ public:
     return dim[idx_];
   }
 
-private:
+ private:
   int idx_;
 };
 
 class DynamicConstIndexer : public boost::static_visitor<int> {
-public:
+ public:
   DynamicConstIndexer(int idx) : idx_(idx) {}
 
   template <int D>
@@ -87,7 +88,7 @@ public:
     return dim[idx_];
   }
 
-private:
+ private:
   int idx_;
 };
 
@@ -213,10 +214,11 @@ struct DDimPrinter : boost::static_visitor<void> {
 
 ///\endcond
 
-std::ostream& operator<<(std::ostream& os, const majel::DDim& ddim) {
+std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
   DDimPrinter printer(os);
   boost::apply_visitor(printer, ddim);
   return os;
 }
 
-}  // namespace majel
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/majel/ddim.h b/paddle/framework/ddim.h
similarity index 79%
rename from paddle/majel/ddim.h
rename to paddle/framework/ddim.h
index 7be756f8c098ba5aa3a5ff4380c90f4b90a55bb7..223c4180bee45e21547364441476b27051daca56 100644
--- a/paddle/majel/ddim.h
+++ b/paddle/framework/ddim.h
@@ -5,20 +5,14 @@
 #include <stdexcept>
 #include <vector>
 
-#include "paddle/majel/dim.h"
+#include "paddle/framework/dim.h"
 
-namespace majel {
+namespace paddle {
+namespace framework {
 
 namespace {
-typedef boost::variant<Dim<1>,
-                       Dim<2>,
-                       Dim<3>,
-                       Dim<4>,
-                       Dim<5>,
-                       Dim<6>,
-                       Dim<7>,
-                       Dim<8>,
-                       Dim<9>>
+typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>,
+                       Dim<8>, Dim<9>>
     DDimVar;
 }
 
@@ -95,14 +89,15 @@ ssize_t product(const DDim& ddim);
 
 int arity(const DDim& ddim);
 
-std::ostream& operator<<(std::ostream&, const majel::DDim&);
+std::ostream& operator<<(std::ostream&, const DDim&);
 
-}  // namespace majel
+}  // namespace framework
+}  // namespace paddle
 
 namespace boost {
 
 template <typename T>
-T get(const majel::DDim& in) {
+T get(const paddle::framework::DDim& in) {
   return boost::get<T>(in.var);
 }
 
diff --git a/paddle/majel/ddim_test.cc b/paddle/framework/ddim_test.cc
similarity index 58%
rename from paddle/majel/ddim_test.cc
rename to paddle/framework/ddim_test.cc
index a5b8a7c4d26740c1c4169547e76a0cf5558facc0..36eef02370e0196c2af2c05f49176b70ce69235a 100644
--- a/paddle/majel/ddim_test.cc
+++ b/paddle/framework/ddim_test.cc
@@ -1,21 +1,19 @@
-//#include <stdexcept>
-//#include <unittest/unittest.h>
 #include <sstream>
 #include <vector>
 
 #include "gtest/gtest.h"
-#include "paddle/majel/ddim.h"
+#include "paddle/framework/ddim.h"
 
 TEST(DDim, Equality) {
   // construct a DDim from an initialization list
-  majel::DDim ddim = majel::make_ddim({9, 1, 5});
+  paddle::framework::DDim ddim = paddle::framework::make_ddim({9, 1, 5});
   EXPECT_EQ(ddim[0], 9);
   EXPECT_EQ(ddim[1], 1);
   EXPECT_EQ(ddim[2], 5);
 
   // construct a DDim from a vector
   std::vector<int> vec({9, 1, 5});
-  majel::DDim vddim = majel::make_ddim(vec);
+  paddle::framework::DDim vddim = paddle::framework::make_ddim(vec);
   EXPECT_EQ(ddim[0], 9);
   EXPECT_EQ(ddim[1], 1);
   EXPECT_EQ(ddim[2], 5);
@@ -23,43 +21,43 @@ TEST(DDim, Equality) {
   // mutate a DDim
   ddim[1] = 2;
   EXPECT_EQ(ddim[1], 2);
-  majel::set(ddim, 0, 6);
-  EXPECT_EQ(majel::get(ddim, 0), 6);
+  paddle::framework::set(ddim, 0, 6);
+  EXPECT_EQ(paddle::framework::get(ddim, 0), 6);
 
   // vectorize a DDim
-  std::vector<int> res_vec = majel::vectorize(vddim);
+  std::vector<int> res_vec = paddle::framework::vectorize(vddim);
   EXPECT_EQ(res_vec[0], 9);
   EXPECT_EQ(res_vec[1], 1);
   EXPECT_EQ(res_vec[2], 5);
-  majel::Dim<3> d(3, 2, 1);
-  res_vec = majel::vectorize(majel::DDim(d));
+  paddle::framework::Dim<3> d(3, 2, 1);
+  res_vec = paddle::framework::vectorize(paddle::framework::DDim(d));
   EXPECT_EQ(res_vec[0], 3);
   EXPECT_EQ(res_vec[1], 2);
   EXPECT_EQ(res_vec[2], 1);
 
   // add two DDims
-  majel::DDim ddim_sum = ddim + vddim;
+  paddle::framework::DDim ddim_sum = ddim + vddim;
   EXPECT_EQ(ddim_sum[0], 15);
   EXPECT_EQ(ddim_sum[1], 3);
   EXPECT_EQ(ddim_sum[2], 10);
 
   // multiply two DDims
-  majel::DDim ddim_mul = ddim * vddim;
+  paddle::framework::DDim ddim_mul = ddim * vddim;
   EXPECT_EQ(ddim_mul[0], 54);
   EXPECT_EQ(ddim_mul[1], 2);
   EXPECT_EQ(ddim_mul[2], 25);
 
   // arity of a DDim
-  EXPECT_EQ(majel::arity(ddim), 3);
+  EXPECT_EQ(paddle::framework::arity(ddim), 3);
 
   // product of a DDim
-  EXPECT_EQ(majel::product(vddim), 45);
+  EXPECT_EQ(paddle::framework::product(vddim), 45);
 }
 
 TEST(DDim, Print) {
   // print a DDim
   std::stringstream ss;
-  majel::DDim ddim = majel::make_ddim({2, 3, 4});
+  paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 3, 4});
   ss << ddim;
   EXPECT_EQ("2, 3, 4", ss.str());
 }
diff --git a/paddle/majel/dim.h b/paddle/framework/dim.h
similarity index 96%
rename from paddle/majel/dim.h
rename to paddle/framework/dim.h
index c4b0c6aea683384d4657dd5db6f419b9e1108704..bcde291d12d429a3f2cd41fa6d0ee606c7c9c92f 100644
--- a/paddle/majel/dim.h
+++ b/paddle/framework/dim.h
@@ -5,10 +5,11 @@
 #include <stdexcept>
 #include <type_traits>
 
-#include "paddle/majel/detail/cuda_assert.h"
-#include "paddle/majel/detail/hostdevice.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/hostdevice.h"
 
-namespace majel {
+namespace paddle {
+namespace framework {
 
 // Statically sized, statically indexed dimension
 template <int i>
@@ -74,7 +75,7 @@ struct Dim<1> {
       throw std::invalid_argument("Index out of range.");
     }
 #else
-    MAJEL_ASSERT(idx < size.head);
+    PADDLE_ASSERT(idx < size.head);
 #endif
   }
 
@@ -131,7 +132,7 @@ HOSTDEVICE int& indexer(Dim<D>& dim, int idx) {
     throw std::invalid_argument("Tried to access a negative dimension");
   }
 #else
-  MAJEL_ASSERT(idx >= 0);
+  PADDLE_ASSERT(idx >= 0);
 #endif
   if (idx == 0) {
     return dim.head;
@@ -146,7 +147,7 @@ HOSTDEVICE int& indexer<1>(Dim<1>& dim, int idx) {
     throw std::invalid_argument("Invalid index");
   }
 #else
-  MAJEL_ASSERT(idx == 0);
+  PADDLE_ASSERT(idx == 0);
 #endif
   return dim.head;
 }
@@ -158,7 +159,7 @@ HOSTDEVICE int indexer(const Dim<D>& dim, int idx) {
     throw std::invalid_argument("Tried to access a negative dimension");
   }
 #else
-  MAJEL_ASSERT(idx >= 0);
+  PADDLE_ASSERT(idx >= 0);
 #endif
   if (idx == 0) {
     return dim.head;
@@ -173,7 +174,7 @@ HOSTDEVICE int indexer<1>(const Dim<1>& dim, int idx) {
     throw std::invalid_argument("Invalid index");
   }
 #else
-  MAJEL_ASSERT(idx == 0);
+  PADDLE_ASSERT(idx == 0);
 #endif
   return dim.head;
 }
@@ -411,7 +412,7 @@ HOSTDEVICE Dim<sizeof...(Args)> make_dim(Args... idxes) {
 // XXX For some reason, overloading fails to resolve this correctly
 template <int i>
 typename std::enable_if<(i > 1), std::ostream&>::type operator<<(
-    std::ostream& os, const majel::Dim<i>& d) {
+    std::ostream& os, const Dim<i>& d) {
   os << d.head << ", " << d.tail;
   return os;
 }
@@ -420,7 +421,7 @@ typename std::enable_if<(i > 1), std::ostream&>::type operator<<(
 // XXX I wish this could be an overload instead of a template
 template <int i>
 typename std::enable_if<(i == 1), std::ostream&>::type operator<<(
-    std::ostream& os, const majel::Dim<i>& d) {
+    std::ostream& os, const Dim<i>& d) {
   os << d.head;
   return os;
 }
@@ -448,4 +449,5 @@ HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) {
   return result;
 }
 
-}  // namespace majel
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/dim_test.cu b/paddle/framework/dim_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..809bf04826637195425a32c054c94e00ef940df9
--- /dev/null
+++ b/paddle/framework/dim_test.cu
@@ -0,0 +1,128 @@
+#include <thrust/device_vector.h>
+#include <sstream>
+
+#include "paddle/framework/dim.h"
+#include "gtest/gtest.h"
+
+__global__ void test(paddle::framework::Dim<2>* o) {
+    o[0] = paddle::framework::make_dim(5, 6);
+}
+
+__global__ void dyn_idx_gpu(int* o) {
+    auto d = paddle::framework::make_dim(5, 6);
+    o[0] = d[1];
+}
+
+TEST(Dim, Equality) {
+    // construct a Dim on the CPU
+    auto a = paddle::framework::make_dim(3, 4);
+    EXPECT_EQ(paddle::framework::get<0>(a), 3);
+    EXPECT_EQ(paddle::framework::get<1>(a), 4);
+
+    // construct a Dim on the GPU
+    thrust::device_vector<paddle::framework::Dim<2>> t(2);
+    test<<<1,1>>>(thrust::raw_pointer_cast(t.data()));
+    a = t[0];
+    EXPECT_EQ(paddle::framework::get<0>(a), 5);
+    EXPECT_EQ(paddle::framework::get<1>(a), 6);
+
+    // linearization
+    auto b = paddle::framework::make_dim(7, 8);
+    EXPECT_EQ(paddle::framework::linearize(a, b), 83);
+
+    // product
+    EXPECT_EQ(paddle::framework::product(a), 30);
+
+    // mutate a Dim
+    paddle::framework::get<1>(b) = 10;
+    EXPECT_EQ(paddle::framework::get<0>(b), 7);
+    EXPECT_EQ(paddle::framework::get<1>(b), 10);
+
+    // dynamic access
+    paddle::framework::get(b, 0) = 8;
+    b[1] = 11;
+    EXPECT_EQ(paddle::framework::get<0>(b), 8);
+    EXPECT_EQ(paddle::framework::get<1>(b), 11);
+    EXPECT_EQ(paddle::framework::get(b, 0), 8);
+    EXPECT_EQ(b[1], 11);
+
+    // dynamic access on GPU
+    thrust::device_vector<int> r(1);
+    dyn_idx_gpu<<<1,1>>>(thrust::raw_pointer_cast(r.data()));
+    int res = r[0];
+    EXPECT_EQ(res, 6);
+
+    // ex_prefix_mul
+    paddle::framework::Dim<3> c = paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5));
+    EXPECT_EQ(paddle::framework::get<0>(c), 1);
+    EXPECT_EQ(paddle::framework::get<1>(c), 3);
+    EXPECT_EQ(paddle::framework::get<2>(c), 12);
+
+    // contiguous_strides
+    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(10, 1, 10));
+    EXPECT_EQ(paddle::framework::get<0>(c), 1);
+    EXPECT_EQ(paddle::framework::get<1>(c), 0);
+    EXPECT_EQ(paddle::framework::get<2>(c), 10);
+    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(10, 10, 1));
+    EXPECT_EQ(paddle::framework::get<0>(c), 1);
+    EXPECT_EQ(paddle::framework::get<1>(c), 10);
+    EXPECT_EQ(paddle::framework::get<2>(c), 0);
+    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(1, 10, 10));
+    EXPECT_EQ(paddle::framework::get<0>(c), 0);
+    EXPECT_EQ(paddle::framework::get<1>(c), 1);
+    EXPECT_EQ(paddle::framework::get<2>(c), 10);
+    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(2, 3, 4));
+    EXPECT_EQ(paddle::framework::get<0>(c), 1);
+    EXPECT_EQ(paddle::framework::get<1>(c), 2);
+    EXPECT_EQ(paddle::framework::get<2>(c), 6);
+
+    // generate from an index
+    auto size = paddle::framework::make_dim(4, 5, 2);
+    c = paddle::framework::Dim<3>(14, size);
+    EXPECT_EQ(paddle::framework::get<0>(c), 2);
+    EXPECT_EQ(paddle::framework::get<1>(c), 3);
+    EXPECT_EQ(paddle::framework::get<2>(c), 0);
+    c = paddle::framework::Dim<3>(25, size);
+    EXPECT_EQ(paddle::framework::get<0>(c), 1);
+    EXPECT_EQ(paddle::framework::get<1>(c), 1);
+    EXPECT_EQ(paddle::framework::get<2>(c), 1);
+}
+
+TEST(Dim, Bool) {
+    auto a = paddle::framework::make_dim(3, 4);
+    auto b = paddle::framework::make_dim(5, 6);
+    auto c = paddle::framework::make_dim(3, 4);
+
+    // in_bounds check
+    EXPECT_TRUE(paddle::framework::contained(a, b));
+    EXPECT_FALSE(paddle::framework::contained(b, a));
+
+    // comparison
+    EXPECT_TRUE(a == a);
+    EXPECT_FALSE(a == b);
+    EXPECT_TRUE(a == c);
+
+    // contiguous check
+    int x = 4, y = 5, z = 2;
+    paddle::framework::Dim<3> sizef(x, y, z);
+    paddle::framework::Dim<3> stridea(1, x, x*y);
+    paddle::framework::Dim<3> strideb(2, 2*x, 2*x*y);
+    paddle::framework::Dim<3> stridec(1, x, 2*x*y);
+    EXPECT_TRUE(paddle::framework::contiguous(sizef, stridea));
+    EXPECT_FALSE(paddle::framework::contiguous(sizef, strideb));
+    EXPECT_FALSE(paddle::framework::contiguous(sizef, stridec));
+}
+
+TEST(Dim, Print) {
+    {
+        std::stringstream ss;
+        auto a = paddle::framework::make_dim(2, 3);
+        ss << a;
+        EXPECT_EQ(ss.str(), "2, 3");
+    }
+    {
+        std::stringstream ss;
+        ss << paddle::framework::make_dim(8);
+        EXPECT_EQ(ss.str(), "8");
+    }
+}
diff --git a/paddle/majel/README.md b/paddle/framework/tensor.md
similarity index 100%
rename from paddle/majel/README.md
rename to paddle/framework/tensor.md
diff --git a/paddle/framework/variable.h b/paddle/framework/variable.h
new file mode 100644
index 0000000000000000000000000000000000000000..72c4a7a2a1d1cf93a784f24e687727ee8481484c
--- /dev/null
+++ b/paddle/framework/variable.h
@@ -0,0 +1,71 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+#pragma once
+
+#include <memory>
+#include <typeindex>
+#include <typeinfo>
+
+#include "paddle/platform/assert.h"
+
+namespace paddle {
+namespace framework {
+
+class Variable {
+ public:
+  template <typename T>
+  const T& Get() const {
+    PADDLE_ASSERT(IsType<T>());
+    return *static_cast<const T*>(holder_->Ptr());
+  }
+
+  template <typename T>
+  T* GetMutable() {
+    if (!IsType<T>()) {
+      holder_.reset(new PlaceholderImpl<T>(new T()));
+    }
+    return static_cast<T*>(holder_->Ptr());
+  }
+
+  template <typename T>
+  bool IsType() const {
+    return holder_ != nullptr &&
+           std::type_index(typeid(T)) == std::type_index(holder_->Type());
+  }
+
+ private:
+  struct Placeholder {
+    virtual ~Placeholder() {}
+    virtual const std::type_info& Type() const = 0;
+    virtual void* Ptr() const = 0;
+  };
+
+  // Placeholder hides type T, so it doesn't appear as a template
+  // parameter of Variable.
+  template <typename T>
+  struct PlaceholderImpl : public Placeholder {
+    PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {}
+
+    virtual const std::type_info& Type() const { return type_; }
+    virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); }
+
+    std::unique_ptr<T> ptr_;
+    const std::type_info& type_;
+  };
+
+  std::unique_ptr<Placeholder>
+      holder_;  // actually points to a PlaceholderImpl<T> object.
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/variable.md b/paddle/framework/variable.md
new file mode 100644
index 0000000000000000000000000000000000000000..f44d5ea46e7ce98dd443d684ad42308496bc4179
--- /dev/null
+++ b/paddle/framework/variable.md
@@ -0,0 +1,52 @@
+# Design Doc: Variable
+
+
+A variable is also known as a *blob* in MXNet and Caffe2.  It is the input and output type of operators, where a neural network is a graph of operators.
+
+## Requirements: Lazy Memory Allocation
+
+For the flexibility of a DL system, a variable should be able to contain any typed value -- a tensor in most cases, but it could also be some integer IDs or a scope of other variables in the case of an RNN.
+
+To use the minimum amount of memory, we'd like a variable to allocate memory only when it has to; that is, lazy memory allocation.  Let's take the following example:
+
+```cpp
+Variable vr, v1, v2;
+
+Tensor* t1 = new Tensor();
+Tensor* t2 = new Tensor();
+
+Randomize(
+  /* malloc */ v1.GetMutable<Tensor>().mutable_data<float16>(DDim(100,200)),
+  /* size */ t1.Size());
+  
+Randomize(
+  /* malloc */ v2.GetMutable<Tensor>().mutable_data<float16>(DDim(200,300)),
+  /* size */ t2.Size());
+  
+Mult(
+  /*result*/ vr.GetMutable<Tensor>().mutable_data<v1.Type()>(SizeOfMult(v1, v2)),
+  /*input1*/ v1.Get<Tensor>().data(),
+  /*input2*/ v2.Get<Tensor>().data());
+```
+     
+We see that a variable holds nothing until `Variable::GetMutable<Tensor>()` allocates a tensor and puts it in the variable.  Similarly, a tensor does not get its memory until `Tensor::mutable_data()` is called.
+
+This syntax performs lazy memory allocation at the point where we call `Randomize` and `Mult`, the functions that mutate the variable, so it saves us some lines of C++ code.
+
+
+## Implementation: Type Hiding
+
+To make memory allocation lazy, we cannot assume that we know the type held by a variable at definition time.  In other words, `class Variable` cannot be a template `template <typename T> class Variable`.
+
+Because we don't know the type `T`, we cannot save a `T*` as a data member of `Variable`.  Instead, we save an interface object `Placeholder`, which can return the pointer to the saved object via `Placeholder::Ptr()` as a `void*`.
+
+But anyway, `Variable` needs to know `T` so that it can `delete<T>(ptr)` and so that `Variable::Get` can check the expected type against the saved object's type.
+
+We save `T` in `PlaceholderImpl`, the implementation of `Placeholder`.  Please be aware that `PlaceholderImpl` is a class template and `T` is passed in as a template parameter.
+
+Because `PlaceholderImpl` knows `T`, it can save and return `typeid(T)` for the type comparison in `Variable::Get` and `Variable::GetMutable`.
+
+
+## Conclusion
+
+The type-hiding technique utilizes C++ class templates, interfaces and derivation, and C++ RTTI (typeid).  This combination saves us from defining something like `caffe2::TypeMeta`, which takes hundreds of lines of C++ code.
diff --git a/paddle/framework/variable_test.cc b/paddle/framework/variable_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aea03bcf5719dacc01d2d78b52b33e8a0b29b5e5
--- /dev/null
+++ b/paddle/framework/variable_test.cc
@@ -0,0 +1,40 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#include <memory>
+#include <string>
+
+#include "gtest/gtest.h"
+#include "paddle/framework/variable.h"
+
+TEST(Variable, GetMutable) {
+  using paddle::framework::Variable;
+
+  struct Tensor {
+    int content_;
+  };
+
+  std::unique_ptr<Variable> v(new Variable());
+
+  Tensor* t = v->GetMutable<Tensor>();
+  t->content_ = 1234;
+
+  const Tensor& tt = v->Get<Tensor>();
+  EXPECT_EQ(1234, tt.content_);
+
+  std::string* s = v->GetMutable<std::string>();
+  *s = "hello";
+
+  const std::string& ss = v->Get<std::string>();
+  EXPECT_EQ("hello", ss);
+}
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 1f54ac1231c6ac2e19b25bb336292194c63c11e9..5e170714cf5b183fcf6e76d34746333397e6b060 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -14,8 +14,8 @@ add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
 add_dependencies(paddle_function ${external_project_dependencies})
 add_dependencies(paddle_function gen_proto_cpp)
 
-if(WITH_GPU)
 if(WITH_TESTING)
+if(WITH_GPU)
     # TODO:
     # file(GLOB test_files . *OpTest.cpp)
     # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
@@ -30,6 +30,8 @@ if(WITH_TESTING)
     add_simple_unittest(CosSimOpTest)
     add_simple_unittest(RowConvOpTest)
 endif()
+
+add_simple_unittest(ConvOpTest)
 endif()
 
 add_style_check_target(paddle_function ${h_files})
diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp
index 1b25172ca5c0c4e64db01806fb8239af7e06d90d..9e9dd20e6f3abe3bd087e434d7b64eec5bfadcfb 100644
--- a/paddle/function/ContextProjectionOpTest.cpp
+++ b/paddle/function/ContextProjectionOpTest.cpp
@@ -28,7 +28,7 @@ void testMatrixProjectionForward(int context_start,
                std::max(0, (int)(context_start + context_length - 1));
   if (pad == 0) is_padding = false;
 
-  FunctionCompare test(
+  CpuGpuFuncCompare test(
       "ContextProjectionForward",
       FuncConfig()
           .set("context_length", context_length)
@@ -60,7 +60,7 @@ void testMatrixProjectionBackward(int context_start,
                std::max(0, (int)(context_start + context_length - 1));
   if (pad == 0) is_padding = false;
 
-  FunctionCompare test(
+  CpuGpuFuncCompare test(
       "ContextProjectionBackward",
       FuncConfig()
           .set("context_length", context_length)
diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..bb4f48364b9b454af7d37fe4d3c340666e53285c
--- /dev/null
+++ b/paddle/function/ConvOp.h
@@ -0,0 +1,144 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief Based on the ConvFunctionBase class, the forward calculation,
+ *        backward input calculation and backward filter calculation
+ *        of convolution operations can be implemented.
+ *
+ * Arguments of forward and backward calculation:
+ *   1. Forward calculation of convolution.
+ *      inputs = {INPUT, FILTER}, outputs = {OUTPUT}
+ *      The first and second input arguments are input image and filter data.
+ *      The output argument is output image.
+ *
+ *   2. Backward input calculation of convolution.
+ *      inputs = {OUTPUT_GRAD, FILTER}, outputs = {INPUT_GRAD}
+ *      The first and second input arguments are output grad image
+ *      and filter data.
+ *      The output argument is input grad image.
+ *
+ *   3. Backward filter calculation of convolution.
+ *      inputs = {OUTPUT_GRAD, INPUT}, outputs = {FILTER_GRAD}
+ *      The first and second input arguments are output grad image
+ *      and input image.
+ *      The output argument is filter grad.
+ *
+ * Arguments format of input, filter and output:
+ *   1. Input image, output image, input image gradient, output image gradient
+ *      are all NCHW format, where N is batch size, C is the number of channels,
+ *      and H and W are the height and width of the image or image gradient.
+ *
+ *   2. The format of the filter data is MCHW, where M is the number of output
+ *      image channels, C is the number of input image channels,
+ *      and H and W are the height and width of the filter.
+ *
+ *      If `groups` is greater than 1, the filter's data format should be GMCHW,
+ *      where G is the `groups`, and G * M is the number of output image
+ *      channels, G * C is the number of input image channels,
+ *      and H and W are the height and width of the filter.
+ */
+class ConvFunctionBase : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    // function arguments
+    strides_ = config.get<std::vector<size_t>>("strides");
+    paddings_ = config.get<std::vector<size_t>>("paddings");
+    groups_ = config.get<size_t>("groups");
+
+    // number of inputs and outputs
+    numInputs_ = 2;
+    numOutputs_ = 1;
+  }
+
+  // input can be INPUT and INPUT_GRAD
+  // filter can be FILTER and FILTER_GRAD
+  // output can be OUTPUT and OUTPUT_GRAD
+  void checkShape(const TensorShape& input,
+                  const TensorShape& filter,
+                  const TensorShape& output) {
+    // inputs and outputs arguments should be 4-dimensional.
+    CHECK_EQ(input.ndims(), (size_t)4);
+    CHECK_EQ(output.ndims(), (size_t)4);
+    // The batchSize of the input needs to be equal to
+    // the batchSize of the output.
+    CHECK_EQ(input[0], output[0]);
+
+    if (filter.ndims() == (size_t)4) {
+      // If the filter is 4-dimensional, grouped convolution is not supported.
+      CHECK_EQ(groups_, (size_t)1);
+      // The input and output channel dimensions are the second and first
+      // dimensions of the filter shape.
+      CHECK_EQ(input[1], filter[1]);
+      CHECK_EQ(output[1], filter[0]);
+    } else {
+      // filter argument should be 5-dimensional.
+      CHECK_EQ(filter.ndims(), (size_t)5);
+      // The first dimension of the filter is the size of the group
+      CHECK_EQ(filter[0], groups_);
+      // The input and output channel dimensions are the third and second
+      // dimensions of the filter shape.
+      CHECK_EQ(input[1], filter[2] * groups_);
+      CHECK_EQ(output[1], filter[1] * groups_);
+    }
+  }
+
+protected:
+  size_t getFilterHeight(const TensorShape& filter) const {
+    return filter[filter.ndims() - 2];
+  }
+
+  size_t getFilterWidth(const TensorShape& filter) const {
+    return filter[filter.ndims() - 1];
+  }
+
+  std::vector<size_t> strides_;
+  std::vector<size_t> paddings_;
+
+  /// Group size, refer to grouped convolution in
+  /// Alex Krizhevsky's paper: when group=2, the first half of the
+  /// filters are only connected to the first half of the input channels,
+  /// and the second half only connected to the second half.
+  size_t groups_;
+
+  inline int strideH() const { return strides_[0]; }
+
+  inline int strideW() const { return strides_[1]; }
+
+  inline int paddingH() const { return paddings_[0]; }
+
+  inline int paddingW() const { return paddings_[1]; }
+
+  // A temporary memory in convolution calculation.
+  MemoryHandlePtr memory_;
+
+  template <DeviceType Device>
+  void resizeBuffer(size_t newSize) {
+    if (!memory_ || newSize * sizeof(real) > memory_->getAllocSize()) {
+      if (Device == DEVICE_TYPE_CPU) {
+        memory_ = std::make_shared<CpuMemoryHandle>(newSize * sizeof(real));
+      } else {
+        memory_ = std::make_shared<GpuMemoryHandle>(newSize * sizeof(real));
+      }
+    }
+  }
+};
+
+}  // namespace paddle
diff --git a/paddle/function/ConvOpTest.cpp b/paddle/function/ConvOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dfa2f784610b0dd60340e0ebc6a066437f3715eb
--- /dev/null
+++ b/paddle/function/ConvOpTest.cpp
@@ -0,0 +1,210 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+#include "Function.h"
+#include "FunctionTest.h"
+
+namespace paddle {
+
+enum TestType {
+  kForwardTest,         // inputs: input, filter;  output: output
+  kBackwardInputTest,   // inputs: output, filter; output: input (ADD_TO)
+  kBackwardFilterTest,  // inputs: output, input;  output: filter
+};
+
+// Compares two convolution Functions on square inputs and square filters.
+template <DeviceType DType1, DeviceType DType2>
+class ConvolutionTest {
+public:
+  ConvolutionTest(const std::string& conv1,
+                  const std::string& conv2,
+                  TestType type,
+                  std::string algo = "auto") {
+    for (size_t batch : {1, 32}) {
+      for (size_t imgSize : {7, 14, 54}) {
+        for (size_t kernel : {1, 3, 5}) {
+          for (size_t inChannels : {3, 64}) {
+            for (size_t outChannels : {3, 64, 128}) {
+              if (inChannels < outChannels) break;
+              for (size_t stride : {1, 2}) {
+                for (size_t padding : {0, 1}) {
+                  if (padding >= kernel) break;
+                  const size_t outSize =
+                      (imgSize + 2 * padding - kernel) / stride + 1;
+                  VLOG(3) << " batchSize=" << batch
+                          << " inputChannels=" << inChannels
+                          << " inputHeight=" << imgSize
+                          << " inputWidth=" << imgSize
+                          << " outputChannels=" << outChannels
+                          << " filterHeight=" << kernel
+                          << " filterWidth=" << kernel
+                          << " outputHeight=" << outSize
+                          << " outputWidth=" << outSize
+                          << " stride=" << stride << " padding=" << padding;
+
+                  std::vector<size_t> strides = {stride, stride};
+                  std::vector<size_t> paddings = {padding, padding};
+                  FuncConfig config;
+                  config.set("paddings", paddings)
+                      .set("strides", strides)
+                      .set("groups", (size_t)1)
+                      .set("algo", algo);
+                  Compare2Function<DType1, DType2> test(conv1, conv2, config);
+
+                  TensorShape input{batch, inChannels, imgSize, imgSize};
+                  TensorShape filter{outChannels, inChannels, kernel, kernel};
+                  TensorShape output{batch, outChannels, outSize, outSize};
+                  switch (type) {
+                    case kForwardTest:
+                      test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
+                      test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                      test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output));
+                      test.run();
+                      break;
+                    case kBackwardInputTest:
+                      test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
+                      test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                      test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input),
+                                      ADD_TO);
+                      test.run();
+                      break;
+                    case kBackwardFilterTest:
+                      test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
+                      test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
+                      test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                      test.run();
+                      break;
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+// Counterpart of ConvolutionTest for inputs and filters whose height and
+// width are not equal. Only stride 1 without padding is exercised, with a
+// single batch size and a single input/output channel combination.
+template <DeviceType DType1, DeviceType DType2>
+class ConvolutionTest2 {
+public:
+  ConvolutionTest2(const std::string& conv1,
+                   const std::string& conv2,
+                   TestType type,
+                   std::string algo = "auto") {
+    for (size_t batch : {16}) {
+      for (size_t inHeight : {7, 31}) {
+        for (size_t inWidth : {10, 54}) {
+          for (size_t kernelH : {1, 5}) {
+            for (size_t kernelW : {3, 7}) {
+              for (size_t inChannels : {7}) {
+                for (size_t outChannels : {32}) {
+                  const size_t stride = 1;
+                  const size_t padding = 0;
+                  const size_t outHeight =
+                      (inHeight + 2 * padding - kernelH) / stride + 1;
+                  const size_t outWidth =
+                      (inWidth + 2 * padding - kernelW) / stride + 1;
+                  VLOG(3) << " batchSize=" << batch
+                          << " inputChannels=" << inChannels
+                          << " inputHeight=" << inHeight
+                          << " inputWidth=" << inWidth
+                          << " outputChannels=" << outChannels
+                          << " filterHeight=" << kernelH
+                          << " filterWidth=" << kernelW
+                          << " outputHeight=" << outHeight
+                          << " outputWidth=" << outWidth
+                          << " stride=" << stride << " padding=" << padding;
+
+                  std::vector<size_t> strides = {stride, stride};
+                  std::vector<size_t> paddings = {padding, padding};
+                  FuncConfig config;
+                  config.set("paddings", paddings)
+                      .set("strides", strides)
+                      .set("groups", (size_t)1)
+                      .set("algo", algo);
+                  Compare2Function<DType1, DType2> test(conv1, conv2, config);
+
+                  TensorShape input{batch, inChannels, inHeight, inWidth};
+                  TensorShape filter{outChannels, inChannels, kernelH, kernelW};
+                  TensorShape output{batch, outChannels, outHeight, outWidth};
+
+                  // The role of each shape depends on the pass under test.
+                  switch (type) {
+                    case kForwardTest:
+                      test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
+                      test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                      test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output));
+                      test.run();
+                      break;
+                    case kBackwardInputTest:
+                      test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
+                      test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                      test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input),
+                                      ADD_TO);
+                      test.run();
+                      break;
+                    case kBackwardFilterTest:
+                      test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
+                      test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
+                      test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                      test.run();
+                      break;
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+TEST(Forward, GEMM) {  // NaiveConv vs. GemmConv, both on CPU
+  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> squareShapes(
+      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest);
+  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> rectShapes(
+      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest);
+}
+
+#ifndef PADDLE_ONLY_CPU
+TEST(Forward, GEMM2) {  // GemmConv on CPU vs. GemmConv on GPU
+  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> squareShapes(
+      "GemmConv-CPU", "GemmConv-GPU", kForwardTest);
+  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> rectShapes(
+      "GemmConv-CPU", "GemmConv-GPU", kForwardTest);
+}
+
+TEST(BackwardInput, GEMM) {  // GemmConvGradInput on CPU vs. on GPU
+  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> squareShapes(
+      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest);
+  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> rectShapes(
+      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest);
+}
+
+TEST(BackwardFilter, GEMM) {  // GemmConvGradFilter on CPU vs. on GPU
+  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> squareShapes(
+      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest);
+  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> rectShapes(
+      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest);
+}
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/CosSimOpTest.cpp b/paddle/function/CosSimOpTest.cpp
index 48c815f027161b48c17ce654ab819156fd856199..f6c0041101f50f8f47d45e0fe0fe1064e0f9cb69 100644
--- a/paddle/function/CosSimOpTest.cpp
+++ b/paddle/function/CosSimOpTest.cpp
@@ -22,7 +22,7 @@ void testCosSimForward(size_t height_x,
                        size_t height_y,
                        size_t width,
                        real scale) {
-  FunctionCompare test("CosSimForward", FuncConfig().set("scale", scale));
+  CpuGpuFuncCompare test("CosSimForward", FuncConfig().set("scale", scale));
   // prepare input arguments
   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}));
   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}));
@@ -36,7 +36,7 @@ void testCosSimBackward(size_t height_x,
                         size_t height_y,
                         size_t width,
                         real scale) {
-  FunctionCompare test("CosSimBackward", FuncConfig().set("scale", scale));
+  CpuGpuFuncCompare test("CosSimBackward", FuncConfig().set("scale", scale));
   // prepare input arguments
   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
diff --git a/paddle/function/CrossMapNormalOpTest.cpp b/paddle/function/CrossMapNormalOpTest.cpp
index 51f5da81bfc9ae870ac9949ba74da01a9449a04d..ed17b17da616db9d52318f21c133458d698b0dd8 100644
--- a/paddle/function/CrossMapNormalOpTest.cpp
+++ b/paddle/function/CrossMapNormalOpTest.cpp
@@ -28,11 +28,11 @@ TEST(CrossMapNormal, real) {
                     << " size=" << size;
 
             // init Test object
-            FunctionCompare test("CrossMapNormal",
-                                 FuncConfig()
-                                     .set("size", size)
-                                     .set("scale", (real)1.5)
-                                     .set("pow", (real)0.5));
+            CpuGpuFuncCompare test("CrossMapNormal",
+                                   FuncConfig()
+                                       .set("size", size)
+                                       .set("scale", (real)1.5)
+                                       .set("pow", (real)0.5));
             // prepare input arguments
             TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
             test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
@@ -57,11 +57,11 @@ TEST(CrossMapNormalGrad, real) {
                     << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                     << " size=" << size;
 
-            FunctionCompare test("CrossMapNormalGrad",
-                                 FuncConfig()
-                                     .set("size", size)
-                                     .set("scale", (real)1.5)
-                                     .set("pow", (real)0.5));
+            CpuGpuFuncCompare test("CrossMapNormalGrad",
+                                   FuncConfig()
+                                       .set("size", size)
+                                       .set("scale", (real)1.5)
+                                       .set("pow", (real)0.5));
             TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
             test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
             test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index 0cfafdb27f55a3e6617d31a968d2a05fc77f5b46..ba446bf92da264fafa1fb47a2c30da9cb13176ce 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -22,14 +22,62 @@ namespace paddle {
 
 typedef std::shared_ptr<BufferArg> BufferArgPtr;
 
+namespace test {
+// Maps a DeviceType to the memory handle class used to allocate for it.
+template <DeviceType DType>
+struct Allocator;
+
+template <>
+struct Allocator<DEVICE_TYPE_CPU> {
+  typedef CpuMemoryHandle type;
+};
+template <>
+struct Allocator<DEVICE_TYPE_GPU> {
+  typedef GpuMemoryHandle type;
+};
+
+/**
+ * \brief Copies the data of one BufferArg into another.
+ *
+ * arg1 belongs to DType1 and arg2 to DType2. Both must have the same value
+ * type, and arg2 must hold at least as many elements as arg1.
+ * VALUE_TYPE_INT32 data is copied through IVector; any other value type is
+ * treated as real and copied through Vector.
+ */
+template <DeviceType DType1, DeviceType DType2>
+class CopyArgument {
+public:
+  void operator()(const BufferArg& arg1, BufferArg& arg2) {
+    CHECK_EQ(arg1.valueType(), arg2.valueType());
+    const size_t srcSize = arg1.shape().getElements();
+    const size_t dstSize = arg2.shape().getElements();
+    CHECK_LE(srcSize, dstSize);
+
+    // create() receives false for a CPU buffer and true otherwise.
+    const bool srcFlag = DType1 != DEVICE_TYPE_CPU;
+    const bool dstFlag = DType2 != DEVICE_TYPE_CPU;
+    if (arg1.valueType() == VALUE_TYPE_INT32) {
+      IVectorPtr src = IVector::create((int*)arg1.data(), srcSize, srcFlag);
+      IVectorPtr dst = IVector::create((int*)arg2.data(), dstSize, dstFlag);
+      dst->copyFrom(*src);
+    } else {
+      VectorPtr src = Vector::create((real*)arg1.data(), srcSize, srcFlag);
+      VectorPtr dst = Vector::create((real*)arg2.data(), dstSize, dstFlag);
+      dst->copyFrom(*src);
+    }
+  }
+};
+}  // namespace test
+
 /**
- * \brief A class for comparing CPU and GPU implementations of Function.
- *
+ * \brief A class for comparing two Functions of different implementations.
+ *        For example, it can be used to check that the CPU and GPU
+ *        implementations of a function produce consistent results.
  *
  * Use case:
  *  // Initializes a test object, the corresponding cpu and gpu Function
  *  // are constructed according to FunctionName and FuncConfig.
- *  FunctionCompare test(FunctionName, FuncConfig);
+ *  CpuGpuFuncCompare test(FunctionName, FuncConfig);
  *  // Prepare inputs and outputs arguments.
  *  // Here the input and output can not contain real data,
  *  // only contains the argument type and shape.
@@ -45,28 +93,38 @@ typedef std::shared_ptr<BufferArg> BufferArgPtr;
  *  // Compares CPU and GPU calculation results for consistency.
  *  test.run();
  */
-class FunctionCompare {
+template <DeviceType DType1, DeviceType DType2>
+class Compare2Function {
 public:
-  FunctionCompare(const std::string& name, const FuncConfig& config)
-      : cpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-CPU")),
-        gpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) {
-    cpuFunc_->init(config);
-    gpuFunc_->init(config);
+  typedef typename test::Allocator<DType1>::type Allocator1;
+  typedef typename test::Allocator<DType2>::type Allocator2;
+  typedef typename Tensor<real, DType1>::Vector Vector1;
+  typedef typename Tensor<real, DType2>::Vector Vector2;
+  typedef typename Tensor<real, DType1>::SparseMatrix SparseMatrix1;
+  typedef typename Tensor<real, DType2>::SparseMatrix SparseMatrix2;
+
+  Compare2Function(const std::string& name1,
+                   const std::string& name2,
+                   const FuncConfig& config)
+      : function1_(FunctionBase::funcRegistrar_.createByType(name1)),
+        function2_(FunctionBase::funcRegistrar_.createByType(name2)) {
+    function1_->init(config);
+    function2_->init(config);
   }
 
-  ~FunctionCompare() {}
+  ~Compare2Function() {}
 
   // input need only contains shape, do not contains data.
   void addInputs(const BufferArg& input) {
     size_t size =
         input.shape().getElements() * sizeOfValuType(input.valueType());
-    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
+    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
 
-    cpuInputs_.emplace_back(std::make_shared<BufferArg>(
-        cpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
-    gpuInputs_.emplace_back(std::make_shared<BufferArg>(
-        gpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
+    func1Inputs_.emplace_back(std::make_shared<BufferArg>(
+        func1Memory_.back()->getBuf(), input.valueType(), input.shape()));
+    func2Inputs_.emplace_back(std::make_shared<BufferArg>(
+        func2Memory_.back()->getBuf(), input.valueType(), input.shape()));
   }
 
   // assume one copy of sequence is shared by different SequenceArgs
@@ -75,62 +133,57 @@ public:
     size_t batchSize = input.shape()[0];
     size_t numSeqs = batchSize / 10 + 1;
     size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32);
-    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(sizeId));
-    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(sizeId));
-    cpuSeq_ = std::make_shared<SequenceIdArg>(cpuMemory_.back()->getBuf(),
-                                              TensorShape{numSeqs + 1});
-    gpuSeq_ = std::make_shared<SequenceIdArg>(gpuMemory_.back()->getBuf(),
-                                              TensorShape{numSeqs + 1});
+    func1Memory_.emplace_back(std::make_shared<Allocator1>(sizeId));
+    func2Memory_.emplace_back(std::make_shared<Allocator2>(sizeId));
+    seq1_ = std::make_shared<SequenceIdArg>(func1Memory_.back()->getBuf(),
+                                            TensorShape{numSeqs + 1});
+    seq2_ = std::make_shared<SequenceIdArg>(func2Memory_.back()->getBuf(),
+                                            TensorShape{numSeqs + 1});
     /// init sequence Id
-    initArg(*cpuSeq_, batchSize);
+    initArg(*seq1_, batchSize);
 
-    // todo(tianbing), delete it
-    CHECK_EQ(cpuSeq_->shape().getElements(), cpuSeq_->numSeqs() + 1);
-
-    CpuIVector cpuSeq(cpuSeq_->shape().getElements(), (int*)cpuSeq_->data());
-    GpuIVector gpuSeq(gpuSeq_->shape().getElements(), (int*)gpuSeq_->data());
-    gpuSeq.copyFrom(cpuSeq);
+    copyArg_(*seq1_, *seq2_);
   }
 
   void addInputs(const SequenceArg& input) {
     CHECK_EQ(input.shape().ndims(), 2UL);
     size_t batchSize = input.shape()[0];
-    if (!cpuSeq_ || !gpuSeq_) {  // sequence not exist
+    if (!seq1_ || !seq2_) {  // sequence not exist
       addSequence(SequenceIdArg(TensorShape{batchSize}));
     }
 
     size_t size =
         input.shape().getElements() * sizeOfValuType(input.valueType());
-    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
+    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
 
     /// SequenceArg
-    cpuInputs_.emplace_back(
-        std::make_shared<SequenceArg>(cpuMemory_.back()->getBuf(),
+    func1Inputs_.emplace_back(
+        std::make_shared<SequenceArg>(func1Memory_.back()->getBuf(),
                                       input.valueType(),
                                       input.shape(),
-                                      *cpuSeq_));
-    gpuInputs_.emplace_back(
-        std::make_shared<SequenceArg>(gpuMemory_.back()->getBuf(),
+                                      *seq1_));
+    func2Inputs_.emplace_back(
+        std::make_shared<SequenceArg>(func2Memory_.back()->getBuf(),
                                       input.valueType(),
                                       input.shape(),
-                                      *gpuSeq_));
+                                      *seq2_));
   }
 
   // output need only contains shape, do not contains data.
   void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
     size_t size =
         output.shape().getElements() * sizeOfValuType(output.valueType());
-    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
+    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
 
-    cpuOutputs_.emplace_back(
-        std::make_shared<BufferArg>(cpuMemory_.back()->getBuf(),
+    func1Outputs_.emplace_back(
+        std::make_shared<BufferArg>(func1Memory_.back()->getBuf(),
                                     output.valueType(),
                                     output.shape(),
                                     argType));
-    gpuOutputs_.emplace_back(
-        std::make_shared<BufferArg>(gpuMemory_.back()->getBuf(),
+    func2Outputs_.emplace_back(
+        std::make_shared<BufferArg>(func2Memory_.back()->getBuf(),
                                     output.valueType(),
                                     output.shape(),
                                     argType));
@@ -138,14 +191,14 @@ public:
 
   /// add and init output sparse matrix
   void addOutputs(const SparseMatrixArg& output, ArgType argType = ASSIGN_TO) {
-    cpuSparse_ = std::make_shared<CpuSparseMatrix>(
+    sparse1_ = std::make_shared<SparseMatrix1>(
         output.shape()[0],
         output.shape()[1],
         output.nnz(),
         static_cast<SparseValueType>(output.dataType()),
         static_cast<SparseFormat>(output.dataFormat()));
 
-    gpuSparse_ = std::make_shared<GpuSparseMatrix>(
+    sparse2_ = std::make_shared<SparseMatrix2>(
         output.shape()[0],
         output.shape()[1],
         output.nnz(),
@@ -154,52 +207,52 @@ public:
 
     /// init sparse matrix
     hl_stream_t stream(HPPL_STREAM_1);
-    cpuSparse_->randomizeUniform();
-    gpuSparse_->copyFrom(*cpuSparse_, stream);
+    sparse1_->randomizeUniform();
+    sparse2_->copyFrom(*sparse1_, stream);
     hl_stream_synchronize(stream);
 
-    cpuOutputs_.emplace_back(
-        std::make_shared<SparseMatrixArg>(*cpuSparse_, argType));
-    gpuOutputs_.emplace_back(
-        std::make_shared<SparseMatrixArg>(*gpuSparse_, argType));
+    func1Outputs_.emplace_back(
+        std::make_shared<SparseMatrixArg>(*sparse1_, argType));
+    func2Outputs_.emplace_back(
+        std::make_shared<SparseMatrixArg>(*sparse2_, argType));
   }
 
   void addOutputs(const SequenceArg& output, ArgType argType = ASSIGN_TO) {
     CHECK_EQ(output.shape().ndims(), 2UL);
     size_t batchSize = output.shape()[0];
 
-    if (!cpuSeq_ || !gpuSeq_) {  // sequence not exist
+    if (!seq1_ || !seq2_) {  // sequence not exist
       addSequence(SequenceIdArg(TensorShape{batchSize}));
     }
     size_t size =
         output.shape().getElements() * sizeOfValuType(output.valueType());
-    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
+    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
 
     /// SequenceArg
-    cpuOutputs_.emplace_back(
-        std::make_shared<SequenceArg>(cpuMemory_.back()->getBuf(),
+    func1Outputs_.emplace_back(
+        std::make_shared<SequenceArg>(func1Memory_.back()->getBuf(),
                                       output.valueType(),
                                       output.shape(),
-                                      *cpuSeq_,
+                                      *seq1_,
                                       argType));
-    gpuOutputs_.emplace_back(
-        std::make_shared<SequenceArg>(gpuMemory_.back()->getBuf(),
+    func2Outputs_.emplace_back(
+        std::make_shared<SequenceArg>(func2Memory_.back()->getBuf(),
                                       output.valueType(),
                                       output.shape(),
-                                      *gpuSeq_,
+                                      *seq2_,
                                       argType));
   }
 
   void addInputs(const SparseMatrixArg& input) {
-    cpuSparse_ = std::make_shared<CpuSparseMatrix>(
+    sparse1_ = std::make_shared<SparseMatrix1>(
         input.shape()[0],
         input.shape()[1],
         input.nnz(),
         static_cast<SparseValueType>(input.dataType()),
         static_cast<SparseFormat>(input.dataFormat()));
 
-    gpuSparse_ = std::make_shared<GpuSparseMatrix>(
+    sparse2_ = std::make_shared<SparseMatrix2>(
         input.shape()[0],
         input.shape()[1],
         input.nnz(),
@@ -208,12 +261,12 @@ public:
 
     /// init sparse matrix
     hl_stream_t stream(HPPL_STREAM_1);
-    cpuSparse_->randomizeUniform();
-    gpuSparse_->copyFrom(*cpuSparse_, stream);
+    sparse1_->randomizeUniform();
+    sparse2_->copyFrom(*sparse1_, stream);
     hl_stream_synchronize(stream);
 
-    cpuInputs_.emplace_back(std::make_shared<SparseMatrixArg>(*cpuSparse_));
-    gpuInputs_.emplace_back(std::make_shared<SparseMatrixArg>(*gpuSparse_));
+    func1Inputs_.emplace_back(std::make_shared<SparseMatrixArg>(*sparse1_));
+    func2Inputs_.emplace_back(std::make_shared<SparseMatrixArg>(*sparse2_));
   }
 
   void run() {
@@ -236,27 +289,27 @@ public:
       function->calc(inArgs, outArgs);
     };
 
-    callFunction(cpuFunc_.get(), cpuInputs_, cpuOutputs_);
-    callFunction(gpuFunc_.get(), gpuInputs_, gpuOutputs_);
+    callFunction(function1_.get(), func1Inputs_, func1Outputs_);
+    callFunction(function2_.get(), func2Inputs_, func2Outputs_);
 
     // check outputs
     compareOutputs();
   }
 
-  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpuFunc_; }
+  std::shared_ptr<FunctionBase> getFunction1() const { return function1_; }
 
-  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpuFunc_; }
+  std::shared_ptr<FunctionBase> getFunction2() const { return function2_; }
 
 protected:
   // only init cpu argument, gpu argument copy from cpu argument.
   void initArg(BufferArg& arg) {
-    CpuVector vector(arg.shape().getElements(), (real*)arg.data());
+    Vector1 vector(arg.shape().getElements(), (real*)arg.data());
     vector.uniform(0.001, 1);
   }
 
   void initArg(SequenceArg& arg) {
     /// init only matrix
-    CpuVector vector(arg.shape().getElements(), (real*)arg.data());
+    Vector1 vector(arg.shape().getElements(), (real*)arg.data());
     vector.uniform(0.001, 1);
   }
 
@@ -276,73 +329,72 @@ protected:
   }
 
   void initInputs() {
-    for (size_t i = 0; i < cpuInputs_.size(); i++) {
-      if (cpuInputs_[i]->isSparseArg()) {
+    for (size_t i = 0; i < func1Inputs_.size(); i++) {
+      if (func1Inputs_[i]->isSparseArg()) {
         continue;  /// sparse matrix already init
       }
 
-      if (cpuInputs_[i]->isSequenceArg()) {
-        initArg(dynamic_cast<SequenceArg&>(*cpuInputs_[i]));
+      if (func1Inputs_[i]->isSequenceArg()) {
+        initArg(dynamic_cast<SequenceArg&>(*func1Inputs_[i]));
       } else {
-        initArg(*cpuInputs_[i]);
+        initArg(*func1Inputs_[i]);
       }
-      // TODO: Need a BufferCopy used to copy from one BufferArg to another.
-      CpuVector cpuVector(cpuInputs_[i]->shape().getElements(),
-                          (real*)cpuInputs_[i]->data());
-      GpuVector gpuVector(gpuInputs_[i]->shape().getElements(),
-                          (real*)gpuInputs_[i]->data());
 
-      gpuVector.copyFrom(cpuVector);
+      copyArg_(*func1Inputs_[i], *func2Inputs_[i]);
     }
   }
 
   void initOutputs() {
-    for (size_t i = 0; i < cpuOutputs_.size(); i++) {
-      if (cpuOutputs_[i]->isSparseArg()) {
+    for (size_t i = 0; i < func1Outputs_.size(); i++) {
+      if (func1Outputs_[i]->isSparseArg()) {
         continue;  /// sparse matrix already init
       }
 
-      if (cpuOutputs_[i]->isSequenceArg()) {
-        initArg(dynamic_cast<SequenceArg&>(*cpuOutputs_[i]));
+      if (func1Outputs_[i]->isSequenceArg()) {
+        initArg(dynamic_cast<SequenceArg&>(*func1Outputs_[i]));
       } else {
-        initArg(*cpuOutputs_[i]);
+        initArg(*func1Outputs_[i]);
       }
 
-      // TODO: Need a BufferCopy used to copy from one BufferArg to another.
-      CpuVector cpuVector(cpuOutputs_[i]->shape().getElements(),
-                          (real*)cpuOutputs_[i]->data());
-      GpuVector gpuVector(gpuOutputs_[i]->shape().getElements(),
-                          (real*)gpuOutputs_[i]->data());
-
-      gpuVector.copyFrom(cpuVector);
+      copyArg_(*func1Outputs_[i], *func2Outputs_[i]);
     }
   }
 
   void compareOutputs() {
-    for (size_t i = 0; i < cpuOutputs_.size(); i++) {
+    for (size_t i = 0; i < func1Outputs_.size(); i++) {
       // TODO, Need a BufferCheck used to compare the two buffers.
-      const auto cpu = cpuOutputs_[i];
-      const auto gpu = gpuOutputs_[i];
+      const auto cpu = func1Outputs_[i];
+      const auto gpu = func2Outputs_[i];
       CHECK_EQ(cpu->numElements(), gpu->numElements());
-      CpuVector cpuVector(cpu->numElements(), (real*)cpu->data());
-      GpuVector gpuVector(gpu->numElements(), (real*)gpu->data());
+      Vector1 cpuVector(cpu->numElements(), (real*)cpu->data());
+      Vector2 gpuVector(gpu->numElements(), (real*)gpu->data());
       autotest::TensorCheckErr(cpuVector, gpuVector);
     }
   }
 
 protected:
-  std::shared_ptr<FunctionBase> cpuFunc_;
-  std::shared_ptr<FunctionBase> gpuFunc_;
-  std::vector<CpuMemHandlePtr> cpuMemory_;
-  std::vector<GpuMemHandlePtr> gpuMemory_;
-  std::vector<BufferArgPtr> cpuInputs_;
-  std::vector<BufferArgPtr> cpuOutputs_;
-  std::vector<BufferArgPtr> gpuInputs_;
-  std::vector<BufferArgPtr> gpuOutputs_;
-  std::shared_ptr<CpuSparseMatrix> cpuSparse_;
-  std::shared_ptr<GpuSparseMatrix> gpuSparse_;
-  std::shared_ptr<SequenceIdArg> cpuSeq_;
-  std::shared_ptr<SequenceIdArg> gpuSeq_;
+  std::shared_ptr<FunctionBase> function1_;
+  std::shared_ptr<FunctionBase> function2_;
+  std::vector<std::shared_ptr<Allocator1>> func1Memory_;
+  std::vector<std::shared_ptr<Allocator2>> func2Memory_;
+  std::vector<BufferArgPtr> func1Inputs_;
+  std::vector<BufferArgPtr> func1Outputs_;
+  std::vector<BufferArgPtr> func2Inputs_;
+  std::vector<BufferArgPtr> func2Outputs_;
+  std::shared_ptr<SparseMatrix1> sparse1_;
+  std::shared_ptr<SparseMatrix2> sparse2_;
+  std::shared_ptr<SequenceIdArg> seq1_;
+  std::shared_ptr<SequenceIdArg> seq2_;
+  test::CopyArgument<DType1, DType2> copyArg_;
+};
+
+class CpuGpuFuncCompare  // CPU vs. GPU shorthand for Compare2Function
+    : public Compare2Function<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> {
+public:
+  CpuGpuFuncCompare(const std::string& name, const FuncConfig& config)
+      : Compare2Function(name + "-CPU", name + "-GPU", config) {}
+
+  ~CpuGpuFuncCompare() {}
+};
 
 }  // namespace paddle
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a40e5d9d2e76605525f0956445fc43c693933cf8
--- /dev/null
+++ b/paddle/function/GemmConvOp.cpp
@@ -0,0 +1,410 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "GemmConvOp.h"
+#include "GemmFunctor.h"
+#include "paddle/math/MemoryHandle.h"
+
+namespace paddle {
+
+/*
+ * CPU im2col. imData = [input_channels, input_height, input_width] is
+ * unfolded into colData = [input_channels, filter_height, filter_width,
+ * output_height, output_width]; taps outside the image are written as T(0).
+ */
+template <class T>
+class Im2ColFunctor<DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* imData,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int outputHeight,
+                  int outputWidth,
+                  T* colData) {
+    int channelsCol = inputChannels * filterHeight * filterWidth;
+
+    for (int c = 0; c < channelsCol; ++c) {  // c = one row of colData
+      int wOffset = c % filterWidth;  // column inside the filter
+      int hOffset = (c / filterWidth) % filterHeight;  // row inside filter
+      int c_im = c / filterWidth / filterHeight;  // source image channel
+      for (int h = 0; h < outputHeight; ++h) {
+        for (int w = 0; w < outputWidth; ++w) {
+          int imRowIdx = h * strideHeight + hOffset;  // padded coordinates
+          int imColIdx = w * strideWidth + wOffset;
+          if ((imRowIdx - paddingHeight) < 0 ||
+              (imRowIdx - paddingHeight) >= inputHeight ||
+              (imColIdx - paddingWidth) < 0 ||
+              (imColIdx - paddingWidth) >= inputWidth) {
+            colData[(c * outputHeight + h) * outputWidth + w] = T(0);
+          } else {
+            imRowIdx += c_im * inputHeight - paddingHeight;  // flat row
+            imColIdx -= paddingWidth;  // unpadded column
+            colData[(c * outputHeight + h) * outputWidth + w] =
+                imData[imRowIdx * inputWidth + imColIdx];
+          }
+        }
+      }
+    }
+  }
+};
+
+template <class T>
+class Col2ImFunctor<DEVICE_TYPE_CPU, T> {  // CPU col2im; adds into imData
+public:
+  void operator()(const T* colData,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int outputHeight,
+                  int outputWidth,
+                  T* imData) {
+    int channelsCol = inputChannels * filterHeight * filterWidth;
+
+    for (int c = 0; c < channelsCol; ++c) {  // c = one row of colData
+      int wOffset = c % filterWidth;  // column inside the filter
+      int hOffset = (c / filterWidth) % filterHeight;  // row inside filter
+      int c_im = c / filterWidth / filterHeight;  // target image channel
+      for (int h = 0; h < outputHeight; ++h) {
+        for (int w = 0; w < outputWidth; ++w) {
+          int imRowIdx = h * strideHeight + hOffset;  // padded coordinates
+          int imColIdx = w * strideWidth + wOffset;
+          if ((imRowIdx - paddingHeight) >= 0 &&
+              (imRowIdx - paddingHeight) < inputHeight &&
+              (imColIdx - paddingWidth) >= 0 &&
+              (imColIdx - paddingWidth) < inputWidth) {
+            imRowIdx += c_im * inputHeight - paddingHeight;  // flat row
+            imColIdx -= paddingWidth;  // unpadded column
+            imData[imRowIdx * inputWidth + imColIdx] +=  // accumulate only
+                colData[(c * outputHeight + h) * outputWidth + w];
+          }
+        }
+      }
+    }
+  }
+};
+
+/*
+ * \brief Forward convolution: per sample and group, im2col then one GEMM.
+ */
+template <DeviceType Device>
+class GemmConvFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  virtual void check(const BufferArgs& inputs,
+                     const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    // TODO(hedaoyuan): Need to define some index macros,
+    // to avoid using 0 and 1.
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    real beta;  // GEMM beta: 1 accumulates (ADD_TO), 0 overwrites
+    if (outputs[0].getArgType() == ADD_TO) {
+      beta = 1.0;
+    } else {
+      beta = 0.0;
+    }
+
+    size_t batchSize = input[0];  // input is indexed as [N, C, H, W]
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+
+    real* inputData = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* outputData = outputs[0].data<real>();
+
+    size_t size = inputChannels / groups_ * filterHeight * filterWidth *
+                  outputHeight * outputWidth;  // one group's column matrix
+    resizeBuffer<Device>(size);
+    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+
+    Im2ColFunctor<Device, real> im2col;
+    GemmFunctor<Device, real> gemm;
+    size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth;
+    size_t outputOffset =
+        (outputChannels / groups_) * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+
+    for (size_t i = 0; i < batchSize; i++) {
+      for (size_t g = 0; g < groups_; g++) {
+        im2col(inputData + g * inputOffset,
+               inputChannels / groups_,
+               inputHeight,
+               inputWidth,
+               filterHeight,
+               filterWidth,
+               strideH(),
+               strideW(),
+               paddingH(),
+               paddingW(),
+               outputHeight,
+               outputWidth,
+               colData);
+
+        int M = outputChannels / groups_;  // rows of filter and output
+        int N = outputHeight * outputWidth;  // columns of col and output
+        int K = inputChannels / groups_ * filterHeight * filterWidth;
+        gemm(CblasNoTrans,  // output(MxN) = filter(MxK) * col(KxN)
+             CblasNoTrans,
+             M,
+             N,
+             K,
+             1.0f,
+             filterData + g * filterOffset,
+             K,
+             colData,
+             N,
+             beta,
+             outputData + g * outputOffset,
+             N);
+      }
+      inputData += inputChannels * inputHeight * inputWidth;  // next sample
+      outputData += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+
+/*
+ * \brief Backward input calculation: one GEMM into colData, then col2im.
+ */
+template <DeviceType Device>
+class GemmConvGradInputFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  virtual void check(const BufferArgs& inputs,
+                     const BufferArgs& outputs) override {
+    const TensorShape& output = inputs[0].shape();  // output gradient
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& input = outputs[0].shape();  // input gradient
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    // Since the implementation of Col2ImFunctor is ADD_TO,
+    // this function only supports ADD_TO mode.
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& input = outputs[0].shape();
+
+    size_t batchSize = input[0];  // input is indexed as [N, C, H, W]
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+
+    real* outputGrad = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* inputGrad = outputs[0].data<real>();
+
+    size_t size = inputChannels / groups_ * filterHeight * filterWidth *
+                  outputHeight * outputWidth;  // one group's column matrix
+    resizeBuffer<Device>(size);
+    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+
+    Col2ImFunctor<Device, real> col2im;
+    GemmFunctor<Device, real> gemm;
+    size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth;
+    size_t outputOffset =
+        (outputChannels / groups_) * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+
+    for (size_t i = 0; i < batchSize; i++) {
+      for (size_t g = 0; g < groups_; g++) {
+        int K = outputChannels / groups_;
+        int N = outputHeight * outputWidth;
+        int M = inputChannels / groups_ * filterHeight * filterWidth;
+        gemm(CblasTrans,  // col(MxN) = filter(KxM)^T * outputGrad(KxN)
+             CblasNoTrans,
+             M,
+             N,
+             K,
+             1.0f,
+             filterData + g * filterOffset,
+             M,
+             outputGrad + g * outputOffset,
+             N,
+             0.0f,  // colData is scratch, so it is overwritten
+             colData,
+             N);
+
+        col2im(colData,
+               inputChannels / groups_,
+               inputHeight,
+               inputWidth,
+               filterHeight,
+               filterWidth,
+               strideH(),
+               strideW(),
+               paddingH(),
+               paddingW(),
+               outputHeight,
+               outputWidth,
+               inputGrad + g * inputOffset);
+      }
+      inputGrad += inputChannels * inputHeight * inputWidth;  // next sample
+      outputGrad += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+
+/*
+ * \brief Backward filter calculation: im2col, then GEMM summed over batch.
+ */
+template <DeviceType Device>
+class GemmConvGradFilterFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  virtual void check(const BufferArgs& inputs,
+                     const BufferArgs& outputs) override {
+    const TensorShape& output = inputs[0].shape();  // output gradient
+    const TensorShape& input = inputs[1].shape();
+    const TensorShape& filter = outputs[0].shape();  // filter gradient
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& input = inputs[1].shape();
+    const TensorShape& filter = outputs[0].shape();
+
+    real beta;  // GEMM beta for sample 0: 1 for ADD_TO, otherwise 0
+    if (outputs[0].getArgType() == ADD_TO) {
+      beta = 1.0;
+    } else {
+      beta = 0.0;
+    }
+
+    size_t batchSize = input[0];  // input is indexed as [N, C, H, W]
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+
+    real* outputGrad = inputs[0].data<real>();
+    real* inputData = inputs[1].data<real>();
+    real* filterGrad = outputs[0].data<real>();
+
+    size_t size = inputChannels / groups_ * filterHeight * filterWidth *
+                  outputHeight * outputWidth;  // one group's column matrix
+    resizeBuffer<Device>(size);
+    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+
+    Im2ColFunctor<Device, real> im2col;
+    GemmFunctor<Device, real> gemm;
+    size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth;
+    size_t outputOffset =
+        (outputChannels / groups_) * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+    for (size_t i = 0; i < batchSize; i++) {
+      for (size_t g = 0; g < groups_; g++) {
+        im2col(inputData + g * inputOffset,
+               inputChannels / groups_,
+               inputHeight,
+               inputWidth,
+               filterHeight,
+               filterWidth,
+               strideH(),
+               strideW(),
+               paddingH(),
+               paddingW(),
+               outputHeight,
+               outputWidth,
+               colData);
+
+        int M = outputChannels / groups_;
+        int K = outputHeight * outputWidth;
+        int N = inputChannels / groups_ * filterHeight * filterWidth;
+        gemm(CblasNoTrans,  // filterGrad(MxN) = outputGrad(MxK) * col(NxK)^T
+             CblasTrans,
+             M,
+             N,
+             K,
+             1.0f,
+             outputGrad + g * outputOffset,
+             K,
+             colData,
+             K,
+             i == 0 ? beta : 1.0f,  // samples after the first always add
+             filterGrad + g * filterOffset,
+             N);
+      }
+      inputData += inputChannels * inputHeight * inputWidth;  // next sample
+      outputGrad += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+
+REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction);
+REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction);
+REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction);
+#ifndef PADDLE_ONLY_CPU  // GPU variants skipped when this is defined
+REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction);
+REGISTER_TYPED_FUNC(GemmConvGradInput, GPU, GemmConvGradInputFunction);
+REGISTER_TYPED_FUNC(GemmConvGradFilter, GPU, GemmConvGradFilterFunction);
+#endif  // PADDLE_ONLY_CPU
+
+}  // namespace paddle
diff --git a/paddle/function/GemmConvOp.h b/paddle/function/GemmConvOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f11cce597a07ce2a54f518be30b657c26ab7516
--- /dev/null
+++ b/paddle/function/GemmConvOp.h
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "ConvOp.h"
+
+namespace paddle {
+
+/*
+ * im2col interface; operator() is only declared here (defined per device).
+ * imData = [input_channels, input_height, input_width], colData =
+ * [input_channels, filter_height, filter_width, output_height, output_width]
+ */
+template <DeviceType Device, class T>
+class Im2ColFunctor {
+public:
+  void operator()(const T* imData,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int outputHeight,
+                  int outputWidth,
+                  T* colData);
+};
+
+template <DeviceType Device, class T>
+class Col2ImFunctor {  // col2im interface; operator() is declared only
+public:
+  void operator()(const T* colData,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int outputHeight,
+                  int outputWidth,
+                  T* imData);
+};
+
+}  // namespace paddle
diff --git a/paddle/function/GemmConvOpGpu.cu b/paddle/function/GemmConvOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2a1795ff0fb5643ea436c94fe893fe866056fccb
--- /dev/null
+++ b/paddle/function/GemmConvOpGpu.cu
@@ -0,0 +1,186 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ConvOp.h"
+#include "GemmConvOp.h"
+
+namespace paddle {
+
+template<class T>
+__global__  // im2col: each thread copies one filter window's taps
+void im2col(const T* data_im, int numOuts, int height, int width,
+            int blockH, int blockW,
+            int strideH, int strideW,
+            int paddingH, int paddingW,
+            int height_col, int width_col,
+            T* data_col) {
+  int index =
+    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < numOuts) {  // one thread per (channel, h_out, w_out)
+    int w_out = index % width_col;
+    index /= width_col;
+    int h_out = index % height_col;
+    int channel_in = index / height_col;
+    int channel_out = channel_in * blockH * blockW;  // first col row
+    int h_in = h_out * strideH;  // window origin, padded coordinates
+    int w_in = w_out * strideW;
+
+    data_col += (channel_out * height_col + h_out) * width_col + w_out;
+    for (int i = 0; i < blockH; ++i) {
+      for (int j = 0; j < blockW; ++j) {
+        int rIdx = int(h_in+i);
+        int cIdx = int(w_in+j);
+        if ((rIdx-(int)paddingH) >= (int)height ||
+            (rIdx-(int)paddingH) < 0 ||
+            (cIdx-(int)paddingW) >= (int)width ||
+            (cIdx-(int)paddingW) < 0) {
+          *data_col = 0;  // tap lies in the padding
+        } else {
+          rIdx = rIdx + channel_in*height - paddingH;  // flat image row
+          cIdx = cIdx - paddingW;
+          *data_col = data_im[rIdx* width + cIdx];
+        }
+        data_col += height_col * width_col;  // next row of data_col
+      }
+    }
+  }
+}
+
+template <class T>
+class Im2ColFunctor<DEVICE_TYPE_GPU, T> {  // launches the im2col kernel
+public:
+  void operator()(const T* imData,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int outputHeight,
+                  int outputWidth,
+                  T* colData) {
+    int numKernels = inputChannels * outputHeight * outputWidth;
+    int blocks = (numKernels + 1024 -1) / 1024;  // ceil(numKernels / 1024)
+    int blockX = 512;
+    int blockY = (blocks + 512 - 1) / 512;  // ceil(blocks / 512)
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);  // surplus threads exit on the bound check
+    im2col<T><<< grid, threads, 0, STREAM_DEFAULT >>>
+        (imData, numKernels, inputHeight, inputWidth, filterHeight, filterWidth,
+         strideHeight, strideWidth, paddingHeight, paddingWidth,
+         outputHeight, outputWidth, colData);
+    CHECK_SYNC("Im2ColFunctor GPU failed");
+  }
+};
+
+template<class T>
+__global__  // col2im: each thread sums every tap of one image element
+void col2im(size_t n, const T* data_col, size_t height,
+            size_t width, size_t channels,
+            size_t blockH, size_t blockW,
+            size_t strideH, size_t strideW,
+            size_t paddingH, size_t paddingW,
+            size_t height_col, size_t width_col,
+            T* data_im) {
+  size_t index =
+    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < n) {  // height/width here are the padded image extents
+    T val = 0;
+    int w = int(index % width);
+    int h = int((index / width) % height);
+    int c = int(index / (width * height));
+    if ((w - (int)paddingW) >= 0 &&  // all bounds tests in signed int
+        (w - (int)paddingW) < (int)(width - 2 * paddingW) &&
+        (h - (int)paddingH) >= 0 &&
+        (h - (int)paddingH) < (int)(height - 2 * paddingH)) {
+      // range of output columns/rows whose window covers (h, w)
+      int w_col_start =
+        (w < (int)blockW) ? 0 : (w - (int)blockW) / (int)strideW + 1;
+      int w_col_end =
+        min((int)(w / (int)strideW + 1), (int)(width_col));
+      int h_col_start =
+        (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
+      int h_col_end = min((int)(h / (int)strideH + 1), (int)(height_col));
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          // data_col row for this (channel, filter row, filter col) tap
+          int c_col = int(c * blockH* blockW) +
+            (h - h_col * (int)strideH) * (int)blockW +
+            (w - w_col * (int)strideW);
+          val += data_col[(c_col * height_col + h_col) * width_col + w_col];
+        }
+      }
+      h -= (int)paddingH;  // back to unpadded image coordinates
+      w -= (int)paddingW;
+      data_im[c*((width-2*paddingW) * (height-2*paddingH)) +
+              h*(width-2*paddingW) + w] += val;
+    }
+  }
+}
+
+template <class T>
+class Col2ImFunctor<DEVICE_TYPE_GPU, T> {  // launches the col2im kernel
+public:
+  void operator()(const T* colData,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int outputHeight,
+                  int outputWidth,
+                  T* imData) {
+    size_t numKernels = inputChannels * (inputHeight + 2*paddingHeight)
+        * (inputWidth + 2*paddingWidth);
+
+    size_t blocks = (numKernels + 1024 -1) / 1024;
+    size_t blockX = 512;
+    size_t blockY = (blocks+512-1)/512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+
+    // To avoid atomic operations, launch one thread per element of the
+    // padded image; each thread adds up its own colData contributions.
+    col2im<T><<< grid, threads, 0, STREAM_DEFAULT >>>
+             (numKernels,
+              colData,
+              inputHeight + 2*paddingHeight,
+              inputWidth + 2*paddingWidth,
+              inputChannels,
+              filterHeight,
+              filterWidth,
+              strideHeight,
+              strideWidth,
+              paddingHeight,
+              paddingWidth,
+              outputHeight,
+              outputWidth,
+              imData);
+    CHECK_SYNC("Col2ImFunctor GPU failed");
+  }
+};
+
+template class Im2ColFunctor<DEVICE_TYPE_GPU, float>;  // instantiated here
+template class Im2ColFunctor<DEVICE_TYPE_GPU, double>;
+template class Col2ImFunctor<DEVICE_TYPE_GPU, float>;
+template class Col2ImFunctor<DEVICE_TYPE_GPU, double>;
+
+}  // namespace paddle
diff --git a/paddle/function/GemmFunctor.h b/paddle/function/GemmFunctor.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5db5cf5e7a855d89b262fe8cf42aa2c55f419f1
--- /dev/null
+++ b/paddle/function/GemmFunctor.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/math/MathFunctions.h"
+
+namespace paddle {
+
+// TODO(hedaoyuan): Since the hl_matrix_mul interface does not conform to the
+// cblas_dgemm interface's parameter format, it is necessary to introduce
+// GemmFunctor as a new interface. Later, when considering the implementation
+// of MatMulFunction, we need to consider the reconstruction of hl_matrix_mul
+// interface.
+template <DeviceType Device, class T>
+class GemmFunctor {  // primary template: operator() is declared only
+public:
+  void operator()(const CBLAS_TRANSPOSE transA,
+                  const CBLAS_TRANSPOSE TransB,
+                  const int M,
+                  const int N,
+                  const int K,
+                  const T alpha,
+                  const T* A,
+                  const int lda,
+                  const T* B,
+                  const int ldb,
+                  const T beta,
+                  T* C,
+                  const int ldc);
+};
+
+template <class T>
+class GemmFunctor<DEVICE_TYPE_CPU, T> {  // CPU: forwards to gemm<T>
+public:
+  void operator()(const CBLAS_TRANSPOSE transA,
+                  const CBLAS_TRANSPOSE TransB,
+                  const int M,
+                  const int N,
+                  const int K,
+                  const T alpha,
+                  const T* A,
+                  const int lda,
+                  const T* B,
+                  const int ldb,
+                  const T beta,
+                  T* C,
+                  const int ldc) {
+    gemm<T>(transA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
+  }
+};
+
+template <class T>
+class GemmFunctor<DEVICE_TYPE_GPU, T> {  // GPU: forwards to hl_matrix_mul
+public:
+  void operator()(const CBLAS_TRANSPOSE transA,
+                  const CBLAS_TRANSPOSE TransB,
+                  const int M,
+                  const int N,
+                  const int K,
+                  const T alpha,
+                  const T* A,
+                  const int lda,
+                  const T* B,
+                  const int ldb,
+                  const T beta,
+                  T* C,
+                  const int ldc) {
+    hl_matrix_mul(const_cast<T*>(A),  // callee takes non-const pointers
+                  transA == CblasNoTrans ? HPPL_OP_N : HPPL_OP_T,
+                  const_cast<T*>(B),
+                  TransB == CblasNoTrans ? HPPL_OP_N : HPPL_OP_T,
+                  C,
+                  M,
+                  N,
+                  K,
+                  alpha,
+                  beta,
+                  lda,
+                  ldb,
+                  ldc);
+  }
+};
+
+}  // namespace paddle
diff --git a/paddle/function/MulOpTest.cpp b/paddle/function/MulOpTest.cpp
index 8753057ebf73c99336b2f5d9c610e4aaf293f845..d31eb0c74f25f5c2ef910264bdf2779e16b1a004 100644
--- a/paddle/function/MulOpTest.cpp
+++ b/paddle/function/MulOpTest.cpp
@@ -35,7 +35,7 @@ void testFuncDDDMatrix(
   size_t heightC = dimM;
   size_t widthC = dimN;
   // init Test object
-  FunctionCompare test(
+  CpuGpuFuncCompare test(
       "MulOp", FuncConfig().set("aTrans", transa).set("bTrans", transb));
   // prepare input arguments
   /// matrix A : HA * WA
@@ -81,8 +81,8 @@ void testFuncDSparseDMatrix(
     size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
   real scaleT = 1.0;
   // init Test object
-  FunctionCompare test("MulOp",
-                       FuncConfig().set("aTrans", false).set("bTrans", false));
+  CpuGpuFuncCompare test(
+      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
   // prepare input arguments
   /// sparse matrix A : M * K
   test.addInputs(SparseMatrixArg(
@@ -126,8 +126,8 @@ void testFuncDDSparseMatrix(
     size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
   real scaleT = 1.0;
   // init Test object
-  FunctionCompare test("MulOp",
-                       FuncConfig().set("aTrans", false).set("bTrans", false));
+  CpuGpuFuncCompare test(
+      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
   // prepare input arguments
   /// matrix A : M * K
   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
@@ -172,8 +172,8 @@ void testFuncSparseDDMatrix(
     size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
   real scaleT = 1.0;
   // init Test object
-  FunctionCompare test("MulOp",
-                       FuncConfig().set("aTrans", false).set("bTrans", false));
+  CpuGpuFuncCompare test(
+      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
   // prepare input arguments
   /// matrix A : M * K
   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
diff --git a/paddle/function/NaiveConvOp.cpp b/paddle/function/NaiveConvOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4348f0f775e9442c50a3c45b9a8e6dad5c6b198d
--- /dev/null
+++ b/paddle/function/NaiveConvOp.cpp
@@ -0,0 +1,142 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ConvOp.h"
+
+namespace paddle {
+
+/*
+ * The three arguments are stored in memory in row major order.
+ * inputData  = [batchSize, inputChannels, inputHeight, inputWidth]
+ * filterData = [outputChannels, inputChannels, filterHeight, filterWidth]
+ * outputData = [batchSize, outputChannels, outputHeight, outputWidth]
+ */
+template <class T>
+class NaiveConvFunctor {
+public:
+  // Reference convolution written as a plain loop nest over dense buffers
+  // (row major). Taps that fall into the padding border are read as zero.
+  void operator()(const T* inputData,
+                  size_t batchSize,
+                  size_t inputChannels,
+                  size_t inputHeight,
+                  size_t inputWidth,
+                  const T* filterData,
+                  size_t filterHeight,
+                  size_t filterWidth,
+                  T* outputData,
+                  size_t outputChannels,
+                  size_t outputHeight,
+                  size_t outputWidth,
+                  size_t paddingH,
+                  size_t paddingW,
+                  size_t strideH,
+                  size_t strideW) {
+    const size_t inPlane = inputHeight * inputWidth;
+    const size_t kernelPlane = filterHeight * filterWidth;
+    const size_t outPlane = outputHeight * outputWidth;
+    for (size_t n = 0; n < batchSize; ++n) {
+      for (size_t oc = 0; oc < outputChannels; ++oc) {
+        for (size_t oh = 0; oh < outputHeight; ++oh) {
+          for (size_t ow = 0; ow < outputWidth; ++ow) {
+            // Input coordinate under the first filter tap; it is negative
+            // while the window still overlaps the top/left padding.
+            const int rowBase = (oh * strideH) - paddingH;
+            const int colBase = (ow * strideW) - paddingW;
+            T acc = (T)0;
+            for (size_t ic = 0; ic < inputChannels; ++ic) {
+              for (size_t kh = 0; kh < filterHeight; ++kh) {
+                for (size_t kw = 0; kw < filterWidth; ++kw) {
+                  const int row = rowBase + kh;
+                  const int col = colBase + kw;
+                  const bool inside = row >= 0 && row < (int)inputHeight &&
+                                      col >= 0 && col < (int)inputWidth;
+                  const T pixel =
+                      inside ? inputData[(n * inputChannels + ic) * inPlane +
+                                         row * inputWidth + col]
+                             : (T)0;
+                  const T weight =
+                      filterData[(oc * inputChannels + ic) * kernelPlane +
+                                 kh * filterWidth + kw];
+                  acc += pixel * weight;
+                }
+              }
+            }
+
+            outputData[(n * outputChannels + oc) * outPlane +
+                       oh * outputWidth + ow] = acc;
+          }
+        }
+      }
+    }
+  }
+};
+
+template <DeviceType Device>
+class NaiveConvFunction : public ConvFunctionBase {
+public:
+  // Forwards the configuration unchanged to ConvFunctionBase::init().
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  // Hands the input, filter and output shapes to checkShape().
+  virtual void check(const BufferArgs& inputs,
+                     const BufferArgs& outputs) override {
+    const TensorShape& imageShape = inputs[0].shape();
+    const TensorShape& kernelShape = inputs[1].shape();
+    const TensorShape& resultShape = outputs[0].shape();
+    checkShape(imageShape, kernelShape, resultShape);
+  }
+
+  // Convolves inputs[0] with the filter in inputs[1] by calling
+  // NaiveConvFunctor<real>. outputs[0] must be an ASSIGN_TO argument; the
+  // functor overwrites it.
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    check(inputs, outputs);
+
+    // Image shape is read as [batch, channels, height, width].
+    const size_t batch = inputs[0].shape()[0];
+    const size_t inChannels = inputs[0].shape()[1];
+    const size_t inHeight = inputs[0].shape()[2];
+    const size_t inWidth = inputs[0].shape()[3];
+    // Only the spatial extent of the filter is read from its shape.
+    const size_t kernelHeight = inputs[1].shape()[2];
+    const size_t kernelWidth = inputs[1].shape()[3];
+    // shape()[0] of the output is not read; the image batch size is reused.
+    const size_t outChannels = outputs[0].shape()[1];
+    const size_t outHeight = outputs[0].shape()[2];
+    const size_t outWidth = outputs[0].shape()[3];
+
+    real* src = inputs[0].data<real>();
+    real* weights = inputs[1].data<real>();
+    real* dst = outputs[0].data<real>();
+
+    NaiveConvFunctor<real> convolve;
+    convolve(src, batch, inChannels, inHeight, inWidth,  // image
+             weights, kernelHeight, kernelWidth,         // filter
+             dst, outChannels, outHeight, outWidth,      // output
+             paddingH(),
+             paddingW(),
+             strideH(),
+             strideW());
+  }
+};
+
+REGISTER_TYPED_FUNC(NaiveConv, CPU, NaiveConvFunction);
+
+}  // namespace paddle
diff --git a/paddle/function/PadOpTest.cpp b/paddle/function/PadOpTest.cpp
index f77ac2a8c49c83f2d6c64c2a30b6a2f2eb09ac10..e286f4e5b8a42348b9d23fd4c3ad44194ca1f299 100644
--- a/paddle/function/PadOpTest.cpp
+++ b/paddle/function/PadOpTest.cpp
@@ -25,7 +25,7 @@ TEST(Pad, real) {
           VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
                   << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
           for (bool test_grad : {false, true}) {
-            FunctionCompare compare(
+            CpuGpuFuncCompare compare(
                 test_grad ? "PadGrad" : "Pad",
                 FuncConfig()
                     .set<std::vector<uint32_t>>("channel", {2, 3})
diff --git a/paddle/function/RowConvOpTest.cpp b/paddle/function/RowConvOpTest.cpp
index 1c95d3ff2cccbf33f4c5f91f6daf340871a8f7b0..f52d18b0491ec444e2fe89fe8fb5c1baa128823e 100644
--- a/paddle/function/RowConvOpTest.cpp
+++ b/paddle/function/RowConvOpTest.cpp
@@ -18,7 +18,7 @@ limitations under the License. */
 namespace paddle {
 
 void testRowConvFw(size_t batchSize, size_t dim, size_t contextLength) {
-  FunctionCompare test("RowConv", FuncConfig());
+  CpuGpuFuncCompare test("RowConv", FuncConfig());
 
   test.addSequence(SequenceIdArg(TensorShape{batchSize}));
   test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
@@ -31,7 +31,7 @@ void testRowConvFw(size_t batchSize, size_t dim, size_t contextLength) {
 }
 
 void testRowConvBw(size_t batchSize, size_t dim, size_t contextLength) {
-  FunctionCompare test("RowConvGrad", FuncConfig());
+  CpuGpuFuncCompare test("RowConvGrad", FuncConfig());
 
   test.addSequence(SequenceIdArg(TensorShape{batchSize}));
   test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
diff --git a/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp b/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9b825db574cf8bac2cf7b7538d0583a8adc2c158
--- /dev/null
+++ b/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp
@@ -0,0 +1,308 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Evaluator.h"
+#include "paddle/gserver/layers/DetectionUtil.h"
+
+using std::map;
+using std::vector;
+using std::pair;
+using std::make_pair;
+
+namespace paddle {
+
+/**
+ * @brief detection map Evaluator
+ *
+ * The config file api is detection_map_evaluator.
+ */
+class DetectionMAPEvaluator : public Evaluator {
+public:
+  DetectionMAPEvaluator()
+      : evaluateDifficult_(false), cpuOutput_(nullptr), cpuLabel_(nullptr) {}
+
+  virtual void start() {
+    Evaluator::start();
+    allTruePos_.clear();
+    allFalsePos_.clear();
+    numPos_.clear();
+  }
+
+  /**
+   * @brief Accumulate the matching statistics of one batch.
+   *
+   * arguments[0] holds the detections, read as rows of 7 reals whose first
+   * element is the index of the image inside the batch. arguments[1] holds
+   * the ground truth, read as rows of 6 reals whose first element is the
+   * class id; its sequenceStartPositions delimit the rows of each image.
+   *
+   * @return always 0; the metric itself is computed by calcMAP().
+   */
+  virtual real evalImp(std::vector<Argument>& arguments) {
+    overlapThreshold_ = config_.overlap_threshold();
+    backgroundId_ = config_.background_id();
+    evaluateDifficult_ = config_.evaluate_difficult();
+    apType_ = config_.ap_type();
+
+    MatrixPtr detectTmpValue = arguments[0].value;
+    Matrix::resizeOrCreate(cpuOutput_,
+                           detectTmpValue->getHeight(),
+                           detectTmpValue->getWidth(),
+                           false,
+                           false);
+
+    MatrixPtr labelTmpValue = arguments[1].value;
+    Matrix::resizeOrCreate(cpuLabel_,
+                           labelTmpValue->getHeight(),
+                           labelTmpValue->getWidth(),
+                           false,
+                           false);
+
+    cpuOutput_->copyFrom(*detectTmpValue);
+    cpuLabel_->copyFrom(*labelTmpValue);
+
+    Argument label = arguments[1];
+    CHECK(label.sequenceStartPositions)
+        << "The label of detection_map must carry sequence positions.";
+    const int* labelIndex = label.sequenceStartPositions->getData(false);
+    size_t batchSize = label.getNumSequences();
+
+    vector<map<size_t, vector<NormalizedBBox>>> allGTBBoxes;
+    vector<map<size_t, vector<pair<real, NormalizedBBox>>>> allDetectBBoxes;
+
+    // Ground truth: group the boxes of every image by class id.
+    for (size_t n = 0; n < batchSize; ++n) {
+      map<size_t, vector<NormalizedBBox>> bboxes;
+      for (int i = labelIndex[n]; i < labelIndex[n + 1]; ++i) {
+        vector<NormalizedBBox> bbox;
+        getBBoxFromLabelData(cpuLabel_->getData() + i * 6, 1, bbox);
+        int c = cpuLabel_->getData()[i * 6];
+        bboxes[c].push_back(bbox[0]);
+      }
+      allGTBBoxes.push_back(bboxes);
+    }
+
+    // Detections: consecutive rows with the same image index belong to one
+    // image. The row index is checked before the row is read, so nothing
+    // past the last row is touched (including when there are no rows).
+    size_t n = 0;
+    const size_t numDetections = cpuOutput_->getHeight();
+    const real* cpuOutputData = cpuOutput_->getData();
+    for (size_t imgId = 0; imgId < batchSize; ++imgId) {
+      map<size_t, vector<pair<real, NormalizedBBox>>> bboxes;
+      while (n < numDetections &&
+             static_cast<size_t>((cpuOutputData + n * 7)[0]) == imgId) {
+        vector<real> detLabel;
+        vector<real> detScore;
+        vector<NormalizedBBox> detBBox;
+        getBBoxFromDetectData(
+            cpuOutputData + n * 7, 1, detLabel, detScore, detBBox);
+        bboxes[detLabel[0]].push_back(make_pair(detScore[0], detBBox[0]));
+        ++n;
+      }
+      allDetectBBoxes.push_back(bboxes);
+    }
+
+    // Count the ground-truth boxes per class; "difficult" boxes only count
+    // when evaluate_difficult is enabled.
+    for (const auto& imageGT : allGTBBoxes) {
+      for (const auto& classGT : imageGT) {
+        size_t count = 0;
+        if (evaluateDifficult_) {
+          count = classGT.second.size();
+        } else {
+          for (const NormalizedBBox& box : classGT.second)
+            if (!box.isDifficult) ++count;
+        }
+        // operator[] value-initializes a missing entry to 0.
+        numPos_[classGT.first] += count;
+      }
+    }
+
+    calcTFPos(batchSize, allGTBBoxes, allDetectBBoxes);
+
+    return 0;
+  }
+
+  virtual void printStats(std::ostream& os) const {
+    real mAP = calcMAP();
+    os << "Detection mAP=" << mAP;
+  }
+
+  virtual void distributeEval(ParameterClient2* client) {
+    LOG(FATAL) << "Distribute detection evaluation not implemented.";
+  }
+
+protected:
+  /**
+   * @brief Classify every detection as true or false positive.
+   *
+   * Per image and class, detections are visited in descending score order
+   * and matched to the ground-truth box with the largest Jaccard overlap.
+   * The match is a true positive if the overlap exceeds overlapThreshold_
+   * and the box was not matched before, otherwise a false positive.
+   * Matches to "difficult" boxes are skipped unless evaluateDifficult_.
+   * batchSize is unused; the size of allDetectBBoxes drives the loop.
+   */
+  void calcTFPos(const size_t batchSize,
+                 const vector<map<size_t, vector<NormalizedBBox>>>& allGTBBoxes,
+                 const vector<map<size_t, vector<pair<real, NormalizedBBox>>>>&
+                     allDetectBBoxes) {
+    for (size_t n = 0; n < allDetectBBoxes.size(); ++n) {
+      const map<size_t, vector<NormalizedBBox>>& imageGT = allGTBBoxes[n];
+      for (const auto& classDetections : allDetectBBoxes[n]) {
+        const size_t label = classDetections.first;
+        // Copied because the detections get sorted below.
+        vector<pair<real, NormalizedBBox>> predBBoxes = classDetections.second;
+        const auto gtIt = imageGT.find(label);
+        if (gtIt == imageGT.end()) {
+          // No ground truth of this class in the image: all false positives.
+          for (size_t i = 0; i < predBBoxes.size(); ++i)
+            recordDetection(label, predBBoxes[i].first, false);
+          continue;
+        }
+        vector<NormalizedBBox> gtBBoxes = gtIt->second;
+        vector<bool> visited(gtBBoxes.size(), false);
+        // Sort detections in descending order of score.
+        std::sort(predBBoxes.begin(),
+                  predBBoxes.end(),
+                  sortScorePairDescend<NormalizedBBox>);
+        for (size_t i = 0; i < predBBoxes.size(); ++i) {
+          real maxOverlap = -1.0;
+          size_t maxIdx = 0;
+          for (size_t j = 0; j < gtBBoxes.size(); ++j) {
+            real overlap = jaccardOverlap(predBBoxes[i].second, gtBBoxes[j]);
+            if (overlap > maxOverlap) {
+              maxOverlap = overlap;
+              maxIdx = j;
+            }
+          }
+          if (maxOverlap > overlapThreshold_) {
+            if (evaluateDifficult_ || !gtBBoxes[maxIdx].isDifficult) {
+              // Only the first detection matched to a box is a true positive.
+              recordDetection(label, predBBoxes[i].first, !visited[maxIdx]);
+              visited[maxIdx] = true;
+            }
+          } else {
+            recordDetection(label, predBBoxes[i].first, false);
+          }
+        }
+      }
+    }
+  }
+
+  // Appends one detection to the true/false positive lists of its class.
+  void recordDetection(size_t label, real score, bool isTruePos) {
+    allTruePos_[label].push_back(make_pair(score, isTruePos ? 1 : 0));
+    allFalsePos_[label].push_back(make_pair(score, isTruePos ? 0 : 1));
+  }
+
+  // Mean over classes of the average precision, multiplied by 100. Classes
+  // with no counted ground truth or no recorded detection are skipped.
+  // apType_ selects "11point" (VOC2007 style) or "Integral".
+  real calcMAP() const {
+    real mAP = 0.0;
+    size_t count = 0;
+    for (const auto& classNumPos : numPos_) {
+      const size_t label = classNumPos.first;
+      const size_t labelNumPos = classNumPos.second;
+      const auto truePosIt = allTruePos_.find(label);
+      if (labelNumPos == 0 || truePosIt == allTruePos_.end()) continue;
+      // Compute average precision.
+      vector<size_t> tpCumSum;
+      getAccumulation(truePosIt->second, &tpCumSum);
+      vector<size_t> fpCumSum;
+      getAccumulation(allFalsePos_.find(label)->second, &fpCumSum);
+      std::vector<real> precision, recall;
+      size_t num = tpCumSum.size();
+      // Compute Precision.
+      for (size_t i = 0; i < num; ++i) {
+        CHECK_LE(tpCumSum[i], labelNumPos);
+        precision.push_back(static_cast<real>(tpCumSum[i]) /
+                            static_cast<real>(tpCumSum[i] + fpCumSum[i]));
+        recall.push_back(static_cast<real>(tpCumSum[i]) / labelNumPos);
+      }
+      // VOC2007 style
+      if (apType_ == "11point") {
+        vector<real> maxPrecisions(11, 0.0);
+        int startIdx = num - 1;
+        for (int j = 10; j >= 0; --j)
+          for (int i = startIdx; i >= 0; --i) {
+            if (recall[i] < j / 10.) {
+              startIdx = i;
+              if (j > 0) maxPrecisions[j - 1] = maxPrecisions[j];
+              break;
+            } else {
+              if (maxPrecisions[j] < precision[i])
+                maxPrecisions[j] = precision[i];
+            }
+          }
+        for (int j = 10; j >= 0; --j) mAP += maxPrecisions[j] / 11;
+        ++count;
+      } else if (apType_ == "Integral") {
+        // Natural integral
+        real averagePrecisions = 0.;
+        real prevRecall = 0.;
+        for (size_t i = 0; i < num; ++i) {
+          if (fabs(recall[i] - prevRecall) > 1e-6)
+            averagePrecisions += precision[i] * fabs(recall[i] - prevRecall);
+          prevRecall = recall[i];
+        }
+        mAP += averagePrecisions;
+        ++count;
+      } else {
+        LOG(FATAL) << "Unkown ap version: " << apType_;
+      }
+    }
+    if (count != 0) mAP /= count;
+    return mAP * 100;
+  }
+
+  // Sorts the pairs by descending score (stable), then writes the running
+  // sum of their second elements into accuVec. inPairs is a private copy.
+  void getAccumulation(vector<pair<real, size_t>> inPairs,
+                       vector<size_t>* accuVec) const {
+    std::stable_sort(
+        inPairs.begin(), inPairs.end(), sortScorePairDescend<size_t>);
+    accuVec->clear();
+    size_t sum = 0;
+    for (size_t i = 0; i < inPairs.size(); ++i) {
+      sum += inPairs[i].second;
+      accuVec->push_back(sum);
+    }
+  }
+
+  std::string getTypeImpl() const { return "detection_map"; }
+
+  real getValueImpl() const { return calcMAP(); }
+
+private:
+  real overlapThreshold_;  // overlap threshold when determining whether matched
+  bool evaluateDifficult_;  // whether evaluate difficult ground truth
+  size_t backgroundId_;     // class index of background
+  std::string apType_;      // how to calculate mAP (Integral or 11point)
+
+  MatrixPtr cpuOutput_;
+  MatrixPtr cpuLabel_;
+
+  map<size_t, size_t> numPos_;  // counts of true objects each classification
+  map<size_t, vector<pair<real, size_t>>>
+      allTruePos_;  // true positive prediction
+  map<size_t, vector<pair<real, size_t>>>
+      allFalsePos_;  // false positive prediction
+};
+
+REGISTER_EVALUATOR(detection_map, DetectionMAPEvaluator);
+
+}  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
index 3159026e6b92355ba7480b09535388c969a504e2..8ef5e9d0c116dd088b5c5c318dfb47c245b471fa 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -166,11 +166,21 @@ MultiGradientMachine::MultiGradientMachine(const ModelConfig& config,
 
   outArgStream_ = HPPL_STREAM_1;
 
+  start();
+}
+
+void MultiGradientMachine::start() {  // calls start() on every threads_ entry
   for (auto& thread : threads_) {
     thread->start();
   }
 }
 
+void MultiGradientMachine::finish() {  // calls stop() on every threads_ entry
+  for (auto& thread : threads_) {
+    thread->stop();
+  }
+}
+
 std::vector<const std::vector<ParameterPtr>*>
 MultiGradientMachine::getSlaveParameters() {
   std::vector<const std::vector<ParameterPtr>*> vec;
@@ -326,12 +336,6 @@ void MultiGradientMachine::onPassEnd() {
   }
 }
 
-void MultiGradientMachine::finish() {
-  for (auto& thread : threads_) {
-    thread->stop();
-  }
-}
-
 Evaluator* MultiGradientMachine::makeEvaluator() const {
   return threads_[0]->getGradientMachine()->makeEvaluator();
 }
@@ -445,7 +449,7 @@ TrainerThread::TrainerThread(const ModelConfig& config,
 
   gradStream_ = HPPL_STREAM_2;
   valueStream_ = HPPL_STREAM_3;
-  stopping_ = false;
+  stopping_ = true;
   updateCounter_ = 0;
   parameterUpdated_ = false;
 }
@@ -453,6 +457,10 @@ TrainerThread::TrainerThread(const ModelConfig& config,
 TrainerThread::~TrainerThread() { stop(); }
 
 void TrainerThread::start() {
+  if (!stopping_) return;
+
+  stopping_ = false;
+
   gradientMachine_->start();
 
   computeThread_.reset(new std::thread([this]() { computeThread(); }));
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/gserver/gradientmachines/MultiGradientMachine.h
index 70203bbb97fe79d72fbc6bd2b5d427cb1de7b61f..5e7622f929fd57de6e38855528a752b5586c4cd1 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.h
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.h
@@ -176,6 +176,10 @@ public:
 
   explicit MultiGradientMachine(const ModelConfig& config, bool useGpu);
 
+  virtual void start();
+
+  virtual void finish();
+
   virtual void prefetch(const std::vector<Argument>& inArgs);
 
   virtual void forward(const std::vector<Argument>& inArgs,
@@ -193,8 +197,6 @@ public:
 
   virtual void onPassEnd();
 
-  virtual void finish();
-
   virtual Evaluator* makeEvaluator() const;
 
   virtual void eval(Evaluator* evaluator) const;
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index 4512aacc81f86bf87fc9ea30adcf081327663f16..2e839f640503b8f4e390fc87d9d59960dbc37f6e 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -241,11 +241,14 @@ void NeuralNetwork::forward(const std::vector<Argument>& inArgs,
     dataLayers_[i]->setData(inArgs[i]);
   }
 
+  gLayerStackTrace.set_stage(true);
+
   {
     for (auto& layer : layers_) {
       REGISTER_TIMER_INFO("ForwardTimer", layer->getName().c_str());
       gLayerStackTrace.push(layer->getName());
       layer->forward(passType);
+      gLayerStackTrace.pop(layer->getName());
     }
   }
 
@@ -254,9 +257,6 @@ void NeuralNetwork::forward(const std::vector<Argument>& inArgs,
   for (auto& layer : outputLayers_) {
     outArgs->push_back(layer->getOutput());
   }
-  if (passType == PASS_TEST) {
-    gLayerStackTrace.clear();
-  }
 }
 
 void NeuralNetwork::resetState() {
@@ -283,9 +283,10 @@ void NeuralNetwork::getState(MachineState& machineState) {
 }
 
 void NeuralNetwork::backward(const UpdateCallback& callback) {
-  gLayerStackTrace.pop("");  // tell layer trace is during backward.
+  gLayerStackTrace.set_stage(false);
   FOR_EACH_R(layer, layers_) {
     REGISTER_TIMER_INFO("BackwardTimer", (*layer)->getName().c_str());
+    gLayerStackTrace.push((*layer)->getName());
     if ((*layer)->needGradient()) {
       (*layer)->backward(callback);
     }
@@ -308,35 +309,35 @@ public:
   void addEvaluator(std::unique_ptr<Evaluator>&& evaluator) {
     evaluators_.emplace_back(std::move(evaluator));
   }
-  virtual void start() {
+  void start() override {
     for (auto& evaluator : evaluators_) {
       evaluator->start();
     }
   }
 
-  virtual void finish() {
+  void finish() override {
     for (auto& evaluator : evaluators_) {
       evaluator->finish();
     }
   }
 
-  virtual void eval(const NeuralNetwork& nn) {
+  void eval(const NeuralNetwork& nn) override {
     for (auto& evaluator : evaluators_) {
       evaluator->eval(nn);
     }
   }
-  virtual real evalImp(std::vector<Argument>& arguments) {
+  real evalImp(std::vector<Argument>& arguments) override {
     (void)arguments;
     return -1;
   }
-  virtual void printStats(std::ostream& os) const {
+  void printStats(std::ostream& os) const override {
     for (auto& evaluator : evaluators_) {
       evaluator->printStats(os);
       os << ' ';
     }
   }
 
-  virtual void distributeEval(ParameterClient2* client) {
+  void distributeEval(ParameterClient2* client) override {
     for (auto& evaluator : evaluators_) {
       evaluator->distributeEval(client);
     }
@@ -351,7 +352,7 @@ public:
    * @brief getNames will return all inside evaluators' names.
    * @param names [out]: return names.
    */
-  void getNames(std::vector<std::string>* names) {
+  void getNames(std::vector<std::string>* names) override {
     for (auto& eval : evaluators_) {
       eval->getNames(names);
     }
@@ -360,7 +361,7 @@ public:
   /**
    * @brief getValue could get all inside evaluators' value.
    */
-  real getValue(const std::string& name, Error* err) const {
+  real getValue(const std::string& name, Error* err) const override {
     return this->getMethodHelper<real>(
         name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
           return eval->getValue(name, err);
@@ -370,7 +371,7 @@ public:
   /**
    * @brief getType could get all inside evaluators' type.
    */
-  std::string getType(const std::string& name, Error* err) const {
+  std::string getType(const std::string& name, Error* err) const override {
     return this->getMethodHelper<std::string>(
         name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
           return eval->getType(name, err);
@@ -395,6 +396,30 @@ private:
   }
 };
 
+class SubnetEvaluator : public CombinedEvaluator {
+public:
+  // Wraps `evaluator`, which eval() runs on the sub-network of `layerName`.
+  SubnetEvaluator(const std::string& layerName,
+                  std::unique_ptr<Evaluator>&& evaluator)
+      : layerName_(layerName) { addEvaluator(std::move(evaluator)); }
+  // Fatal if `nn` has no layer layerName_ or the layer has no sub-network.
+  virtual void eval(const NeuralNetwork& nn) override {
+    const LayerPtr& owner = nn.getLayer(layerName_);
+    CHECK(owner) << "Nonexisted layer: " << layerName_ << " in submodel "
+                 << nn.getName();
+    bool visited = false;
+    owner->accessSubNetwork([this, &visited](NeuralNetwork& subnet) {
+      subnet.eval(evaluators_[0].get());
+      visited = true;
+    });
+    CHECK(visited) << "There is no subnetwork for layer " << layerName_
+                   << " in submodel " << nn.getName();
+  }
+
+protected:
+  std::string layerName_;
+};
+
 Evaluator* NeuralNetwork::makeEvaluator() const {
   CombinedEvaluator* combinedEvaluator = new CombinedEvaluator();
   auto subModelConfig = std::find_if(config_.sub_models().begin(),
@@ -421,6 +446,15 @@ Evaluator* NeuralNetwork::makeEvaluator() const {
         combinedEvaluator->addEvaluator(std::move(evaluator));
       }
     }
+    for (auto& layer : layers_) {
+      layer->accessSubNetwork(
+          [layer, combinedEvaluator](NeuralNetwork& subnet) {
+            std::unique_ptr<Evaluator> subEvaluator(new SubnetEvaluator(
+                layer->getName(),
+                std::unique_ptr<Evaluator>(subnet.makeEvaluator())));
+            combinedEvaluator->addEvaluator(std::move(subEvaluator));
+          });
+    }
   } else {
     for (const EvaluatorConfig& evalConfig : config_.evaluators()) {
       std::unique_ptr<Evaluator> evaluator(Evaluator::create(evalConfig));
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h
index e7b6c438407e7eab6eab1f6ed496f35caa9f2177..12810f642519b7965fc1b7d751290445e3350dd5 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.h
@@ -129,6 +129,8 @@ public:
   static NeuralNetwork* newNeuralNetwork(const std::string& name = "",
                                          NeuralNetwork* rootNetwork = nullptr);
 
+  const std::string& getName() const { return subModelName_; }
+
 protected:
   /**
    * The constructor of NeuralNetwork.
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
index 01158d1dce8d711c67b1ecf29bb644e42ccf6ff5..9a972466d66ba1417b2c31e66dc375b3da229aa8 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -208,13 +208,13 @@ void RecurrentGradientMachine::init(
                    });
   CHECK(subModelConfig != config.sub_models().end());
   reversed_ = subModelConfig->reversed();
+  generating_ = subModelConfig->has_generator();
 
   inFrameLines_.resize(subModelConfig->in_links_size());
   for (size_t i = 0; i < inFrameLines_.size(); ++i) {
     inFrameLines_[i].linkName = subModelConfig->in_links(i).link_name();
     inFrameLines_[i].inLayer =
         rootNetwork_->getLayer(subModelConfig->in_links(i).layer_name());
-    inFrameLines_[i].hasSubseq = subModelConfig->in_links(i).has_subseq();
   }
 
   outFrameLines_.resize(subModelConfig->out_links_size());
@@ -241,11 +241,8 @@ void RecurrentGradientMachine::init(
           rootNetwork_->getLayer(memoryConfig.boot_layer_name());
 
       LayerConfig scatterConfig = *agentConfig;
-      memoryFrameLines_[i].is_sequence = memoryConfig.is_sequence();
       memoryFrameLines_[i].rootAgent.reset(
-          memoryConfig.is_sequence()
-              ? new SequenceScatterAgentLayer(scatterConfig)
-              : new ScatterAgentLayer(scatterConfig));
+          new ScatterAgentLayer(scatterConfig));
       memoryFrameLines_[i].rootAgent->init(LayerMap(), parameterMap_);
 
       memoryFrameLines_[i].bootLayer = memoryFrameLines_[i].rootAgent;
@@ -267,9 +264,7 @@ void RecurrentGradientMachine::init(
     if (subModelConfig->has_generator()) {
       memoryFrameLines_[i].scatterAgents.resize(2);
       for (auto& agent : memoryFrameLines_[i].scatterAgents) {
-        agent.reset(memoryConfig.is_sequence()
-                        ? new SequenceScatterAgentLayer(*agentConfig)
-                        : new ScatterAgentLayer(*agentConfig));
+        agent.reset(new ScatterAgentLayer(*agentConfig));
         agent->init(LayerMap(), parameterMap_);
       }
     }
@@ -293,12 +288,6 @@ void RecurrentGradientMachine::init(
       parameterIds_.push_back(para->getID());
     }
   }
-
-  if (subModelConfig->evaluator_names_size() > 0) {
-    evaluator_.reset(frames_[0]->makeEvaluator());
-  }
-
-  targetInfoInlinkId_ = subModelConfig->target_inlinkid();
 }
 
 void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) {
@@ -376,108 +365,102 @@ void RecurrentGradientMachine::prefetch(const std::vector<Argument>& inArgs) {
   LOG(FATAL) << "should not use this function";
 }
 
-void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
-                                       std::vector<Argument>* outArgs,
-                                       PassType passType) {
-  if (inFrameLines_.empty() && passType == PASS_TEST) {
-    generateSequence();
-    return;
-  }  // else forward..
-
-  const Argument& input = inFrameLines_[0].inLayer->getOutput();
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  size_t numSequences = input.getNumSequences();
-  const int* starts = input.sequenceStartPositions->getData(false);
-  bool hasSubseq = input.hasSubseq();
-
-  // In case of !hasSubseq or targetInfoInlinkId_ == -1, all inlinks share the
-  // same inframe info
-  bool shareInlinkInfo = !hasSubseq || targetInfoInlinkId_ == -1;
-
-  // Defaultly, share info with the first inlink
-  if (shareInlinkInfo) {
-    targetInfoInlinkId_ = 0;
-  }
-
-  // check hasSubseq in both config and input are the same
-  CHECK_EQ(hasSubseq, inFrameLines_[0].hasSubseq);
-
-  CHECK_EQ(starts[numSequences], batchSize);
-  CHECK(input.sequenceStartPositions);
-
-  // check other inputs has same sequence length and start
-  for (size_t i = 1; i < inFrameLines_.size(); ++i) {
-    const Argument& input1 = inFrameLines_[i].inLayer->getOutput();
-    CHECK_EQ((size_t)input1.getNumSequences(), numSequences);
-    // check all inputs should have same hasSubseq flag
-    CHECK_EQ(input.hasSubseq(), inFrameLines_[0].hasSubseq);
-
-    // if shareInlinkInfo, checks:
-    // 1. all inlinks have same number of total tokens
-    // 2. all inlinks have same number of tokens for each sentence of each
-    //    sample. If hasSubseq, one sample has multiple sentence, else, one
-    //    sample is one sentence
-    if (shareInlinkInfo) {
-      CHECK_EQ(input1.getBatchSize(), batchSize);
-      CHECK(std::equal(starts,
-                       starts + numSequences + 1,
-                       input1.sequenceStartPositions->getData(false)));
+void RecurrentGradientMachine::checkInputConsistency(
+    int inlinkId, const std::vector<Argument::SeqInfo>& seqInfo) {
+  if (commonSeqInfo_.empty()) {  // nothing recorded yet: use this input
+    commonSeqInfo_.resize(seqInfo.size());
+    for (size_t i = 0; i < seqInfo.size(); ++i) {
+      commonSeqInfo_[i].topLevelLength = seqInfo[i].topLevelLength;
+      commonSeqInfo_[i].seqId = seqInfo[i].seqId;
+    }
+  } else {  // otherwise this input must agree with the recorded one
+    CHECK_EQ(commonSeqInfo_.size(), seqInfo.size())
+        << " RecurrentGroup " << subModelName_ << " input " << inlinkId
+        << " has mismatched number of sequences";
+    for (size_t i = 0; i < seqInfo.size(); ++i) {
+      CHECK_EQ(commonSeqInfo_[i].topLevelLength, seqInfo[i].topLevelLength)
+          << " RecurrentGroup " << subModelName_ << " input " << inlinkId
+          << " has mismatched sequence length";
+      CHECK_EQ(commonSeqInfo_[i].seqId, seqInfo[i].seqId)  // same id per slot
+          << " RecurrentGroup " << subModelName_ << " input " << inlinkId
+          << " has mismatched sequence length";
     }
   }
+}
 
-  if (hasSubseq) {
-    CHECK(input.subSequenceStartPositions);
-    size_t numSubSequences = input.getNumSubSequences();
-    const int* subStarts = input.subSequenceStartPositions->getData(false);
-    CHECK_EQ(subStarts[numSubSequences], batchSize);
-    // if hasSubseq, check other inputs has same sub-sequence and sub-start
-    for (size_t i = 1; i < inFrameLines_.size(); ++i) {
-      const Argument& input1 = inFrameLines_[i].inLayer->getOutput();
-      CHECK_EQ((size_t)input1.getNumSubSequences(), numSubSequences);
-      if (shareInlinkInfo) {
-        CHECK(std::equal(subStarts,
-                         subStarts + numSubSequences + 1,
-                         input1.subSequenceStartPositions->getData(false)));
-      }
+void RecurrentGradientMachine::calcNumSequencesAtEachStep() {
+  int numSequences = commonSeqInfo_.size();
+  numSeqs_.resize(maxSequenceLength_);  // one entry per time step
+  for (int i = 0; i < numSequences; ++i) {
+    for (int j = 0; j < commonSeqInfo_[i].topLevelLength; ++j) {
+      numSeqs_[j] = i + 1;  // last writer wins: 1 + largest i covering j
     }
   }
+}
 
+void RecurrentGradientMachine::reorganizeInput(PassType passType) {
   info_.clear();
   info_.resize(inFrameLines_.size());
 
+  commonSeqInfo_.clear();
   seqInfos_.clear();
   seqInfos_.resize(inFrameLines_.size());
 
+  for (size_t i = 0; i < inFrameLines_.size(); i++) {
+    const Argument& input = inFrameLines_[i].inLayer->getOutput();
+    if (!input.hasSeq()) {
+      continue;
+    }
+    input.getSeqInfo(&seqInfos_[i]);
+    checkInputConsistency(i, seqInfos_[i]);
+  }
+  CHECK(!commonSeqInfo_.empty())
+      << "At least one input needs to be sequence or subsequence";
+  maxSequenceLength_ = commonSeqInfo_[0].topLevelLength;
+
+  calcNumSequencesAtEachStep();
+
+  for (size_t i = 0; i < inFrameLines_.size(); ++i) {
+    const Argument& input = inFrameLines_[i].inLayer->getOutput();
+    if (!input.hasSeq()) {
+      seqInfos_[i] = commonSeqInfo_;
+    }
+    createInFrameInfo(i, input, passType);
+  }
+
   {
     AsyncGpuBlock asyncGpuBlock;
-    // if shareInlinkInfo, only calculate info of the first inlink
-    // else, calculate info for each inlink
-    if (shareInlinkInfo) {
-      input.getSeqInfo(&seqInfos_[0]);
-      maxSequenceLength_ = seqInfos_[0][0].topLevelLength;
-      createInFrameInfo(0, input, passType);
-    } else {
-      for (size_t i = 0; i < inFrameLines_.size(); i++) {
-        const Argument& input1 = inFrameLines_[i].inLayer->getOutput();
-        input1.getSeqInfo(&seqInfos_[i]);
-        maxSequenceLength_ = seqInfos_[i][0].topLevelLength;
-        createInFrameInfo(i, input1, passType);
-      }
-    }
 
     // inFrameLine select rows in real layer one time
     for (size_t i = 0; i < inFrameLines_.size(); i++) {
-      int curInlinkId = shareInlinkInfo ? 0 : i;
       selectRowsOneTime(inFrameLines_[i].inLayer,
-                        info_[curInlinkId].allIds,
+                        info_[i].allIds,
                         &(inFrameLines_[i].outArg),
                         passType);
     }
   }
-  resizeOrCreateFrames(maxSequenceLength_);
-  resizeBootFrame(numSequences);
+}
 
+void RecurrentGradientMachine::reorganizeOutput(PassType passType) {
+  calcSequenceStartPositions();  // refresh sequenceStartPositions_
+  for (size_t i = 0; i < outFrameLines_.size(); ++i) {
+    Info info;  // scratch ids for this out-link, rebuilt on every call
+    auto& outFrameLine = outFrameLines_[i];
+    ICpuGpuVectorPtr sequenceStartPositions;
+    ICpuGpuVectorPtr subSequenceStartPositions;
+    createOutFrameInfo(
+        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
+    auto gatherAgent =
+        dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
+    CHECK_NOTNULL(gatherAgent);  // fails if the agent is not a gather agent
+    gatherAgent->copyIdAndSequenceInfo(sequenceStartPositions,
+                                       subSequenceStartPositions,
+                                       info.allIds,
+                                       info.idIndex);
+  }
+}
+
+void RecurrentGradientMachine::connectFrames(PassType passType) {
   for (auto& memoryFrameLine : memoryFrameLines_) {
     if (memoryFrameLine.rootAgent) {
       auto scatterAgent =
@@ -487,8 +470,9 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
                                           memoryFrameLine.outArg,
                                           memoryFrameLine.allIds,
                                           /* idIndex */ 0,
-                                          memoryFrameLine.allIds->getSize());
-      if (memoryFrameLine.is_sequence) {  // memoryConfig is sequence
+                                          memoryFrameLine.allIds->getSize(),
+                                          /* handleBackward */ true);
+      if (memoryFrameLine.sequenceStartPositions) {
         int size = memoryFrameLine.sequenceStartPositions->getSize();
         scatterAgent->setSequenceStartPositions(
             memoryFrameLine.sequenceStartPositions,
@@ -501,28 +485,26 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
   for (auto& outFrameLine : outFrameLines_) {
     auto gatherAgent =
         dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
-    CHECK_NOTNULL(gatherAgent);
-    gatherAgent->copyIdAndSequenceInfo(input,
-                                       info_[targetInfoInlinkId_].allIds,
-                                       info_[targetInfoInlinkId_].idIndex);
+    gatherAgent->clearRealLayers();
   }
-
   for (int i = 0; i < maxSequenceLength_; ++i) {
-    int idSize = 0;
     // connect in_links
     for (size_t j = 0; j < inFrameLines_.size(); ++j) {
-      Info& info = info_[shareInlinkInfo ? 0 : j];
+      Info& info = info_[j];
       // idSize denotes the sum number of tokens in each length i
-      idSize = info.idIndex[i + 1] - info.idIndex[i];
+      int idIndex = info.idIndex.empty() ? 0 : info.idIndex[i];
+      int idSize = info.idIndex.empty() ? numSeqs_[i]
+                                        : info.idIndex[i + 1] - info.idIndex[i];
       InFrameLine inFrameLine = inFrameLines_[j];
       auto scatterAgent =
           dynamic_cast<ScatterAgentLayer*>(inFrameLine.agents[i].get());
       scatterAgent->setRealLayerAndOutput(inFrameLine.inLayer,
                                           inFrameLine.outArg,
                                           info.allIds,
-                                          info.idIndex[i],
-                                          idSize);
-      if (hasSubseq) {
+                                          idIndex,
+                                          idSize,
+                                          i == 0);
+      if (info.sequenceStartPositions) {
         // size: the length of subsequence
         int size = info.seqStartPosIndex[i + 1] - info.seqStartPosIndex[i];
         scatterAgent->setSequenceStartPositions(
@@ -536,11 +518,6 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
           dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
       gatherAgent->addRealLayer(outFrameLine.frames[i]);
     }
-    // connect memory links
-    // Adopt info_[0].idIndex because seq which has_subseq=True
-    // doesn't support Memory with !hasSubseq bootlayer;
-    // And inlinks that !hasSubSeq must have same inlink length.
-    idSize = info_[0].idIndex[i + 1] - info_[0].idIndex[i];
     for (auto& memoryFrameLine : memoryFrameLines_) {
       NeuralNetwork::connect(
           memoryFrameLine.agents[i],
@@ -548,6 +525,28 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
           numSeqs_[i] /*height of agent*/);
     }
   }
+}
+
+void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
+                                       std::vector<Argument>* outArgs,
+                                       PassType passType) {
+  /* inArgs and outArgs are not used.
+     The inputs are inFrameLines_[i].inLayer.
+     The outputs are outFrameLines_[i].agentLayer
+   */
+
+  if (generating_) {
+    generateSequence();
+    return;
+  }  // else forward..
+
+  reorganizeInput(passType);
+  int numSequences = commonSeqInfo_.size();
+
+  resizeOrCreateFrames(maxSequenceLength_);
+  resizeBootFrame(numSequences);
+
+  connectFrames(passType);
 
   REGISTER_TIMER_INFO("RecurrentFwTime", "RecurrentFwTime");
   // forward
@@ -558,19 +557,15 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
     const std::vector<Argument> inArgs;
     std::vector<Argument> outArgs;
     frames_[i]->forward(inArgs, &outArgs, passType);
-    if (hasSubseq) {
-      for (auto& outFrameLine : outFrameLines_) {
-        CHECK(outFrameLine.frames[i]->getOutput().sequenceStartPositions)
-            << "In hierachical RNN, all out links should be from sequences.";
-      }
-    }
-  }
-  if (evaluator_ && passType == PASS_TEST) {
-    this->eval(evaluator_.get());
   }
+
+  reorganizeOutput(passType);
 }
 
 void RecurrentGradientMachine::backward(const UpdateCallback& callback) {
+  if (generating_) {
+    return;
+  }
   REGISTER_TIMER_INFO("RecurrentBwTime", "RecurrentBwTime");
   AsyncGpuBlock asyncGpuBlock;
   for (int i = maxSequenceLength_ - 1; i >= 0; --i) {
@@ -579,11 +574,6 @@ void RecurrentGradientMachine::backward(const UpdateCallback& callback) {
   for (auto& memoryFrameLine : memoryFrameLines_) {
     memoryFrameLine.bootLayer->backward(nullptr);
   }
-
-  // call printers here so the gradient can be printed
-  if (evaluator_) {
-    this->eval(evaluator_.get());
-  }
 }
 
 void RecurrentGradientMachine::forwardBackward(
@@ -597,9 +587,9 @@ void RecurrentGradientMachine::forwardBackward(
 void RecurrentGradientMachine::eval(Evaluator* evaluator) const {
   // call printers frame by frame
   for (int i = 0; i < maxSequenceLength_; ++i) {
-    LOG(INFO) << "Recurrent Layer Group eval frame " << i << " begin";
+    VLOG(2) << "Recurrent Layer Group eval frame " << i << " begin";
     evaluator->eval(*(frames_[i].get()));
-    LOG(INFO) << "Recurrent Layer Group eval frame " << i << " end";
+    VLOG(2) << "Recurrent Layer Group eval frame " << i << " end";
   }
 }
 
@@ -634,76 +624,228 @@ void RecurrentGradientMachine::removeBeamSearchStatisticsCallbacks() {
     this->beamSearchStatistics_ = nullptr;
   }
 }
+
+namespace {
+void lenToStarts(std::vector<int>& starts) {  // lengths -> offsets, in place
+  int pos = 0;
+  starts.back() = 0;  // last slot is a placeholder, not a length
+  for (auto& start : starts) {
+    int tmp = start;
+    start = pos;  // offset = sum of all preceding lengths
+    pos += tmp;
+  }
+  starts.back() = pos;  // final entry holds the total length
+}
+}  // namespace
+
+void RecurrentGradientMachine::calcSequenceStartPositions() {
+  std::vector<int> starts(commonSeqInfo_.size() + 1);  // +1 for end offset
+  for (auto& seqInfo : commonSeqInfo_) {
+    starts[seqInfo.seqId] = seqInfo.topLevelLength;  // length, by seqId
+  }
+  lenToStarts(starts);  // turn the lengths into start offsets in place
+  ICpuGpuVector::resizeOrCreate(sequenceStartPositions_, starts.size(), false);
+  std::copy(starts.begin(),
+            starts.end(),
+            sequenceStartPositions_->getMutableData(false));
+}
+
+void RecurrentGradientMachine::checkOutputConsistency(
+    OutFrameLine& outFrameLine) {
+  bool hasSeq = outFrameLine.frames[0]->getOutput().hasSeq();  // reference
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    LayerPtr frame = outFrameLine.frames[i];
+    CHECK_EQ(hasSeq, frame->getOutput().hasSeq());  // same kind at each step
+    int numSequences = frame->getOutput().getNumSequences();
+    CHECK_EQ(numSeqs_[i], numSequences);  // expected count for step i
+  }
+}
+
+void RecurrentGradientMachine::createOutFrameInfo(
+    OutFrameLine& outFrameLine,
+    Info& info,
+    ICpuGpuVectorPtr& sequenceStartPositions,
+    ICpuGpuVectorPtr& subSequenceStartPositions) {
+  checkOutputConsistency(outFrameLine);  // CHECK-fails on inconsistent frames
+
+  if (!outFrameLine.frames[0]->getOutput().hasSeq()) {  // dispatch on step 0
+    createOutFrameInfo_seq(
+        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
+  } else {  // each step's output already carries sequence info
+    createOutFrameInfo_subseq(
+        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
+  }
+}
+
+void RecurrentGradientMachine::createOutFrameInfo_seq(
+    OutFrameLine& outFrameLine,
+    Info& info,
+    ICpuGpuVectorPtr& sequenceStartPositions,
+    ICpuGpuVectorPtr& subSequenceStartPositions) {  // not written here
+  std::vector<int> allIds;
+  info.idIndex.resize(1, 0);  // first idIndex = 0
+
+  const int* starts = sequenceStartPositions_->getData(false);
+
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    LayerPtr frame = outFrameLine.frames[i];
+    size_t numSequences = frame->getOutput().getNumSequences();
+    for (size_t j = 0; j < numSequences; ++j) {
+      int seqStart = starts[commonSeqInfo_[j].seqId];  // offset by seqId
+      int seqLength = commonSeqInfo_[j].topLevelLength;
+      allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i)
+                                 : (seqStart + i));
+    }
+    info.idIndex.push_back(allIds.size());  // end of step i in allIds
+  }
+  sequenceStartPositions = sequenceStartPositions_;  // reuse member vector
+  copyScattedId(allIds, &info.allIds, allIds.size());
+  CHECK_EQ(info.idIndex.size(), static_cast<size_t>(maxSequenceLength_ + 1));
+}
+
+void RecurrentGradientMachine::createOutFrameInfo_subseq(
+    OutFrameLine& outFrameLine,
+    Info& info,
+    ICpuGpuVectorPtr& sequenceStartPositions,
+    ICpuGpuVectorPtr& subSequenceStartPositions) {
+  size_t numSequences = commonSeqInfo_.size();  // shadowed in loops below
+  std::vector<int> allIds;
+  info.idIndex.resize(1, 0);  // first idIndex = 0
+
+  const int* starts = sequenceStartPositions_->getData(false);
+  std::vector<int> subStarts(starts[numSequences] + 1);  // +1: end offset
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    LayerPtr frame = outFrameLine.frames[i];
+    size_t numSequences = frame->getOutput().getNumSequences();
+    const int* seqStarts =
+        frame->getOutput().sequenceStartPositions->getData(false);
+    for (size_t j = 0; j < numSequences; ++j) {
+      subStarts[starts[commonSeqInfo_[j].seqId] + i] =
+          seqStarts[j + 1] - seqStarts[j];  // length of frame i's j-th seq
+    }
+  }
+  lenToStarts(subStarts);  // lengths -> start offsets, in place
+
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    LayerPtr frame = outFrameLine.frames[i];
+    size_t numSequences = frame->getOutput().getNumSequences();
+    for (size_t j = 0; j < numSequences; ++j) {
+      int pos = starts[commonSeqInfo_[j].seqId] + i;
+      int subSeqStart = subStarts[pos];
+      int subSeqEnd = subStarts[pos + 1];
+      for (int k = subSeqStart; k < subSeqEnd; ++k) {
+        allIds.push_back(k);
+      }
+    }
+    info.idIndex.push_back(allIds.size());  // end of step i in allIds
+  }
+
+  ICpuGpuVector::resizeOrCreate(
+      subSequenceStartPositions, subStarts.size(), false);
+  int* cpuSubSequenceStartPositions =
+      subSequenceStartPositions->getMutableData(false);
+  std::copy(subStarts.begin(), subStarts.end(), cpuSubSequenceStartPositions);
+  ICpuGpuVector::resizeOrCreate(
+      sequenceStartPositions, numSequences + 1, false);
+  int* cpuSequenceStartPositions =
+      sequenceStartPositions->getMutableData(false);
+  for (size_t i = 0; i <= numSequences; ++i) {
+    cpuSequenceStartPositions[i] = subStarts[starts[i]];  // via first subseq
+  }
+  copyScattedId(allIds, &info.allIds, allIds.size());
+  CHECK_EQ(info.idIndex.size(), static_cast<size_t>(maxSequenceLength_ + 1));
+}
+
 /* create scattered id infomation for all realLayer of inFrameLines one time.
  * If hasSubseq, will also create scattered sequenceStartPositions infomation
  * for all realLayer of inFrameLines one time.
  */
-
 void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
                                                  const Argument& input,
                                                  PassType passType) {
-  bool hasSubseq = input.hasSubseq();
-  // numSequences: # samples(sequences) in a batch
-  size_t numSequences = input.getNumSequences();
+  if (!input.hasSeq()) {
+    createInFrameInfo_nonseq(inlinkId, input, passType);
+  } else if (!input.hasSubseq()) {
+    createInFrameInfo_seq(inlinkId, input, passType);
+  } else {
+    createInFrameInfo_subseq(inlinkId, input, passType);
+  }
+}
+
+void RecurrentGradientMachine::createInFrameInfo_nonseq(int inlinkId,
+                                                        const Argument& input,
+                                                        PassType passType) {
   std::vector<int> allIds;
 
   auto& seqInfo = seqInfos_[inlinkId];
-
-  numSeqs_.clear();
   Info* inlinkInfo = &info_[inlinkId];
   inlinkInfo->idIndex.clear();
-  inlinkInfo->idIndex.push_back(0);  // first idIndex = 0
+  for (size_t i = 0; i < seqInfo.size(); ++i) {
+    allIds.push_back(seqInfo[i].seqId);
+  }
+  // copy and check scatterId
+  copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
+}
 
+void RecurrentGradientMachine::createInFrameInfo_seq(int inlinkId,
+                                                     const Argument& input,
+                                                     PassType passType) {
+  std::vector<int> allIds;
+  auto& seqInfo = seqInfos_[inlinkId];
+  Info* inlinkInfo = &info_[inlinkId];
+  inlinkInfo->idIndex.resize(1, 0);  // first idIndex = 0
+
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    for (int j = 0; j < numSeqs_[i]; ++j) {  // sequences present at step i
+      int seqLength = seqInfo[j].topLevelLength;
+      int seqStart = seqInfo[j].seqStart;
+      allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i)
+                                 : (seqStart + i));
+    }
+    inlinkInfo->idIndex.push_back(allIds.size());  // end of step i in allIds
+  }
+
+  // copy and check scatterId
+  copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
+  CHECK_EQ(inlinkInfo->idIndex.size(),
+           static_cast<size_t>(maxSequenceLength_ + 1));
+}
+void RecurrentGradientMachine::createInFrameInfo_subseq(int inlinkId,
+                                                        const Argument& input,
+                                                        PassType passType) {
+  std::vector<int> allIds;
+
+  auto& seqInfo = seqInfos_[inlinkId];
+
+  Info* inlinkInfo = &info_[inlinkId];
+  inlinkInfo->idIndex.resize(1, 0);  // first idIndex = 0
   std::vector<int> sequenceStartPositions;
   const int* subSequenceStartPositions = nullptr;
 
-  if (hasSubseq) {  // for sequenceScatterAgentLayer
-    subSequenceStartPositions = input.subSequenceStartPositions->getData(false);
-    inlinkInfo->seqStartPosIndex.clear();
-    inlinkInfo->seqStartPosIndex.push_back(0);  // first seqStartPosIndex = 0
-  }
-  // maxSequenceLength_: max topLevelLength in allsamples
+  subSequenceStartPositions = input.subSequenceStartPositions->getData(false);
+  inlinkInfo->seqStartPosIndex.clear();
+  inlinkInfo->seqStartPosIndex.push_back(0);  // first seqStartPosIndex = 0
   for (int i = 0; i < maxSequenceLength_; ++i) {
-    if (hasSubseq) {
-      sequenceStartPositions.push_back(0);  // first element = 0
-    }
-    int numSeqs = 0;
-    for (size_t j = 0; j < numSequences; ++j) {
-      int seqLength = seqInfo[j].topLevelLength;
-      if (i >= seqLength) {
-        break;
-      }
-      ++numSeqs;
-      if (hasSubseq) {
-        int subSeqStart = subSequenceStartPositions[seqInfo[j].subSeqStart + i];
-        int subSeqEnd =
-            subSequenceStartPositions[seqInfo[j].subSeqStart + i + 1];
-        for (int k = subSeqStart; k < subSeqEnd; ++k) {
-          allIds.push_back(k);
-        }
-        sequenceStartPositions.push_back(sequenceStartPositions.back() +
-                                         subSeqEnd - subSeqStart);
-      } else {
-        int seqStart = seqInfo[j].seqStart;
-        allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i)
-                                   : (seqStart + i));
+    sequenceStartPositions.push_back(0);  // first element = 0
+    for (int j = 0; j < numSeqs_[i]; ++j) {
+      int subSeqStart = subSequenceStartPositions[seqInfo[j].subSeqStart + i];
+      int subSeqEnd = subSequenceStartPositions[seqInfo[j].subSeqStart + i + 1];
+      for (int k = subSeqStart; k < subSeqEnd; ++k) {
+        allIds.push_back(k);
       }
+      sequenceStartPositions.push_back(sequenceStartPositions.back() +
+                                       subSeqEnd - subSeqStart);
     }
     inlinkInfo->idIndex.push_back(allIds.size());
-    numSeqs_.push_back(numSeqs);
-    if (hasSubseq) {
-      inlinkInfo->seqStartPosIndex.push_back(sequenceStartPositions.size());
-    }
-  }
-  if (hasSubseq) {
-    // inFrameLine create sequenceStartPositions one time
-    CHECK_EQ(
-        sequenceStartPositions.size(),
-        static_cast<size_t>(maxSequenceLength_ + input.getNumSubSequences()));
-    CHECK_EQ(inlinkInfo->seqStartPosIndex.size(),
-             static_cast<size_t>(maxSequenceLength_ + 1));
-    createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions);
+    inlinkInfo->seqStartPosIndex.push_back(sequenceStartPositions.size());
   }
+  // inFrameLine create sequenceStartPositions one time
+  CHECK_EQ(
+      sequenceStartPositions.size(),
+      static_cast<size_t>(maxSequenceLength_ + input.getNumSubSequences()));
+  CHECK_EQ(inlinkInfo->seqStartPosIndex.size(),
+           static_cast<size_t>(maxSequenceLength_ + 1));
+  createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions);
 
   // copy and check scatterId
   copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
@@ -717,11 +859,11 @@ void RecurrentGradientMachine::createMemoryFrameInfo(
   const Argument& input = (*memoryFrameLine).rootLayer->getOutput();
   size_t numSequences = input.getNumSequences();
   std::vector<int> allIds;
-  bool seqFlag = (*memoryFrameLine).is_sequence;
+  bool seqFlag = input.hasSeq();
+  CHECK(!input.hasSubseq())
+      << "Subsequence boot layer for memory is not supported";
 
   if (seqFlag) {  // for sequenceScatterAgentLayer
-    CHECK(input.sequenceStartPositions)
-        << "boot layer must be a sequence when is_sequence = true";
     std::vector<int> sequenceStartPositions;
     sequenceStartPositions.push_back(0);  // first element = 0
     const int* starts = input.sequenceStartPositions->getData(false);
@@ -804,8 +946,7 @@ size_t RecurrentGradientMachine::getGenBatchSize() {
   for (auto& memoryFrameLine : memoryFrameLines_) {
     if (!memoryFrameLine.rootLayer) continue;
     Argument& bootArg = memoryFrameLine.rootLayer->getOutput();
-    size_t batchSize = memoryFrameLine.is_sequence ? bootArg.getNumSequences()
-                                                   : bootArg.getBatchSize();
+    size_t batchSize = bootArg.getNumSequences();
     if (numSequences) {
       CHECK_EQ(numSequences, batchSize);
     } else {
@@ -845,12 +986,7 @@ void RecurrentGradientMachine::generateSequence() {
     if (memoryFrameLine.rootAgent) {
       auto scatterAgent =
           dynamic_cast<ScatterAgentLayer*>(memoryFrameLine.rootAgent.get());
-      bool seqFlag = memoryFrameLine.is_sequence;
-      scatterAgent->setRealLayer(memoryFrameLine.rootLayer, ids, seqFlag);
-      if (seqFlag) {
-        CHECK(memoryFrameLine.rootLayer->getOutput().sequenceStartPositions)
-            << "boot layer must be a sequence when is_sequence = true";
-      }
+      scatterAgent->setRealLayer(memoryFrameLine.rootLayer, ids);
     }
     NeuralNetwork::connect(
         memoryFrameLine.agents[0], memoryFrameLine.bootLayer, ids.size());
@@ -858,6 +994,7 @@ void RecurrentGradientMachine::generateSequence() {
 
   // boot layer forward
   AsyncGpuBlock asyncGpuBlock;
+
   for (auto& memoryFrameLine : memoryFrameLines_) {
     memoryFrameLine.bootLayer->forward(PASS_TEST);
   }
@@ -930,8 +1067,7 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) {
         auto scatterAgent = dynamic_cast<ScatterAgentLayer*>(
             memoryFrameLine.scatterAgents[machineCur].get());
         scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev],
-                                   scatterIds,
-                                   memoryFrameLine.is_sequence);
+                                   scatterIds);
         scatterAgent->forward(PASS_TEST);
         NeuralNetwork::connect(memoryFrameLine.agents[machineCur],
                                memoryFrameLine.scatterAgents[machineCur]);
@@ -949,10 +1085,6 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) {
 
     copyDataOutlinkFrame(machineCur);
 
-    // call value printer
-    if (evaluator_) {
-      evaluator_->eval(*(frames_[machineCur].get()));
-    }
     // check eos
     const IVectorPtr& eosVec =
         eosFrameLine_->layers[machineCur]->getOutput().ids;
@@ -1003,8 +1135,7 @@ void RecurrentGradientMachine::connectPrevFrame(int stepId,
     auto scatterAgent = dynamic_cast<ScatterAgentLayer*>(
         memoryFrameLine.scatterAgents[machineCur].get());
     scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev],
-                               isOutIds ? topIds_ : machineIds_,
-                               memoryFrameLine.is_sequence);
+                               isOutIds ? topIds_ : machineIds_);
     scatterAgent->forward(PASS_TEST);
     NeuralNetwork::connect(memoryFrameLine.agents[machineCur],
                            memoryFrameLine.scatterAgents[machineCur]);
@@ -1178,11 +1309,10 @@ void RecurrentGradientMachine::fillGenOutputs() {
 
   batchMachineIdVec_.clear();
   generator_.ids.clear();
+  int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
+  starts[0] = 0;
   if (numResults > 1) {
     real* probs = generator_.outArg.in->getData();
-    int* starts =
-        generator_.outArg.sequenceStartPositions->getMutableData(false);
-    starts[0] = 0;
     for (size_t i = 0; i < finalPaths_.size(); ++i) {
       for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
         Path& path = finalPaths_[i][j];
@@ -1205,7 +1335,10 @@ void RecurrentGradientMachine::fillGenOutputs() {
   } else {
     for (size_t i = 0; i < finalPaths_.size(); ++i) {
       CHECK(!finalPaths_[i].empty());
-      generator_.ids = finalPaths_[i][0].ids;
+      generator_.ids.insert(generator_.ids.end(),  // append: match starts[]
+                            finalPaths_[i][0].ids.begin(),
+                            finalPaths_[i][0].ids.end());
+      starts[i + 1] = starts[i] + finalPaths_[i][0].ids.size();
     }
   }
 }
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
index c2bc52709ab42bbe21dcc3951f23f2e0b5e6793d..f245620cf668bb341df99cf498105cbd996a6b24 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -284,6 +284,16 @@ public:
   }
 
 protected:
+  std::vector<Argument::SeqInfo> commonSeqInfo_;
+  ICpuGpuVectorPtr sequenceStartPositions_;
+  void calcSequenceStartPositions();
+  void checkInputConsistency(int inlinkId,
+                             const std::vector<Argument::SeqInfo>& seqInfo);
+  void reorganizeInput(PassType passType);
+  void reorganizeOutput(PassType passType);
+  void connectFrames(PassType passType);
+  void calcNumSequencesAtEachStep();
+
   void resizeOrCreateFrames(int numFrames);
   void resizeBootFrame(int numSequences);
 
@@ -295,8 +305,7 @@ protected:
     std::string linkName;
     LayerPtr inLayer;
     std::vector<LayerPtr> agents;  // Scatter Agents to reform batch input
-    bool hasSubseq;
-    Argument outArg;  // scatter output argument
+    Argument outArg;               // scatter output argument
   };
   std::vector<InFrameLine> inFrameLines_;
 
@@ -318,7 +327,6 @@ protected:
     std::vector<LayerPtr> agents;
     std::vector<LayerPtr> scatterAgents;  // scatter agent used by beam search
     Argument outArg;                      // scatter output argument
-    bool is_sequence;
     // Different memoryFrameLine have different element as follows
     IVectorPtr allIds;  // scattered id of realLayer
     ICpuGpuVectorPtr
@@ -330,22 +338,27 @@ protected:
   // and all outFrameLines(outlinks) share the info with one inFrameLine,
   // which is assigned by targetInfoInlinkId_.
   struct Info {
-    IVectorPtr allIds;         // scattered id of realLayer
-    std::vector<int> idIndex;  // index of allIds
+    // The original positions in the original batch
+    IVectorPtr allIds;  // scattered id of realLayer [batchSize]
+
+    // index into allIds for each step [maxSequenceLength_ + 1]
+    // step i owns allIds[idIndex[i] .. idIndex[i + 1]); idIndex[0] is 0
+    std::vector<int> idIndex;
+
     ICpuGpuVectorPtr
         sequenceStartPositions;         // scattered sequenceStartPositions
     std::vector<int> seqStartPosIndex;  // index of sequenceStartPositions
   };
-  std::vector<Info> info_;
+  std::vector<Info> info_;  // for input
 
   // numSeqs_[i] is the number sequences which is longer than i (for sequence
   // data) or has more than i subsequences (for subsequence data)
+  // Equivalently, numSeqs_[i] is the number of sequences at step i;
   std::vector<int> numSeqs_;
 
   std::vector<std::vector<Argument::SeqInfo>> seqInfos_;
 
-  // the id of inlink which share info with outlinks
-  int targetInfoInlinkId_;
+  void checkOutputConsistency(OutFrameLine& outFrameLine);
 
   /* create scattered id infomation for all realLayer of inFrameLines one time.
    *  If hasSubseq, will also create scattered sequenceStartPositions infomation
@@ -354,6 +367,28 @@ protected:
   void createInFrameInfo(int inlinks_id,
                          const Argument& input,
                          PassType passType);
+  void createInFrameInfo_nonseq(int inlinks_id,
+                                const Argument& input,
+                                PassType passType);
+  void createInFrameInfo_seq(int inlinks_id,
+                             const Argument& input,
+                             PassType passType);
+  void createInFrameInfo_subseq(int inlinks_id,
+                                const Argument& input,
+                                PassType passType);
+
+  void createOutFrameInfo(OutFrameLine& outFrameLine,
+                          Info& info,
+                          ICpuGpuVectorPtr& sequenceStartPositions,
+                          ICpuGpuVectorPtr& subSequenceStartPositions);
+  void createOutFrameInfo_seq(OutFrameLine& outFrameLine,
+                              Info& info,
+                              ICpuGpuVectorPtr& sequenceStartPositions,
+                              ICpuGpuVectorPtr& subSequenceStartPositions);
+  void createOutFrameInfo_subseq(OutFrameLine& outFrameLine,
+                                 Info& info,
+                                 ICpuGpuVectorPtr& sequenceStartPositions,
+                                 ICpuGpuVectorPtr& subSequenceStartPositions);
 
   void createMemoryFrameInfo(MemoryFrameLine* memoryFrameLine,
                              PassType passType);
@@ -379,6 +414,7 @@ protected:
     std::vector<int> ids;  // store generated sequences
     Argument outArg;       // final output argument
   };
+  bool generating_;
   Generator generator_;
 
   std::vector<std::unique_ptr<NeuralNetwork>> frames_;
@@ -386,17 +422,13 @@ protected:
   NeuralNetwork* rootNetwork_;
   bool reversed_;
 
-  // if hasSubseq: max number of sentences(subseq)in batchsize samples
-  // else: max number of tokens in batchsize samples(sentences)
-  int maxSequenceLength_;
+  int maxSequenceLength_;  // Max top-level length
   bool useGpu_;
   bool stopBeamSearch_;
 
   std::vector<int>
       parameterIds_;  // parameters actually used by this Layer Group
 
-  std::unique_ptr<Evaluator> evaluator_;  // frame printers in this layer group
-
   // store final argument of outFrameLines_
   std::vector<Argument> dataArgs_;
   // store each frame's output argument of outFrameLines_
diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp
index 7b1b99b135e35e5fe41dbb3d053a96e3e31e5cf1..15e7411b5fde0fa3a532394cf7d0e8477ef052d0 100644
--- a/paddle/gserver/layers/AgentLayer.cpp
+++ b/paddle/gserver/layers/AgentLayer.cpp
@@ -36,14 +36,23 @@ void AgentLayer::forward(PassType passType) {
   Layer::forward(passType);
 
   Argument& realOutput = realLayer_->getOutput();
-  int realHeight = realOutput.getBatchSize();
-  CHECK_LE(numSamples_, realHeight);
+  int realNumSequences = realOutput.getNumSequences();
+  CHECK_LE(numSamples_, realNumSequences);
 
   // get Arguments from real layers
-  if (numSamples_ > 0 && numSamples_ < realHeight) {
-    if (realOutput.ids) {
-      output_.ids =
-          IVector::create(realOutput.ids->getData(), numSamples_, useGpu_);
+  if (numSamples_ > 0 && numSamples_ < realNumSequences) {
+    if (realOutput.hasSeq()) {
+      int numRows =
+          realOutput.sequenceStartPositions->getData(false)[numSamples_];
+      output_.subArgFrom(realOutput,
+                         /* offset */ 0,
+                         numRows,
+                         getSize(),
+                         useGpu_,
+                         /* trans */ false,
+                         /* seqFlag */ true,
+                         /* seqStart */ 0,
+                         /* seqSize */ numSamples_ + 1);
     } else {
       output_.subArgFrom(
           realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_);
@@ -53,34 +62,6 @@ void AgentLayer::forward(PassType passType) {
   }
 }
 
-void SequenceAgentLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  Argument& realOutput = realLayer_->getOutput();
-  int realNumSequences = realOutput.getNumSequences();
-  CHECK_LE(numSamples_, realNumSequences);
-
-  // get Arguments from real layers
-  if (numSamples_ > 0 && numSamples_ < realNumSequences) {
-    int numRows =
-        realOutput.sequenceStartPositions->getData(false)[numSamples_];
-    CHECK(!realOutput.ids) << "Not supported";
-    output_.subArgFrom(realOutput,
-                       /* offset */ 0,
-                       numRows,
-                       getSize(),
-                       useGpu_,
-                       /* trans */ false,
-                       /* seqFlag */ true,
-                       /* seqStart */ 0,
-                       /* seqSize */ numSamples_ + 1);
-  } else {
-    output_ = realOutput;
-  }
-}
-
-REGISTER_LAYER(sequence_agent, SequenceAgentLayer);
-
 bool GatherAgentLayer::init(const LayerMap& layerMap,
                             const ParameterMap& parameterMap) {
   CHECK_EQ(config_.inputs_size(), 0);
@@ -91,18 +72,26 @@ bool GatherAgentLayer::init(const LayerMap& layerMap,
   return true;
 }
 
-void GatherAgentLayer::copyIdAndSequenceInfo(const Argument& input,
-                                             const IVectorPtr& ids,
-                                             const std::vector<int>& idIndex) {
-  output_.sequenceStartPositions = input.sequenceStartPositions;
-  output_.subSequenceStartPositions = input.subSequenceStartPositions;
-  realLayers_.clear();
+void GatherAgentLayer::copyIdAndSequenceInfo(
+    ICpuGpuVectorPtr sequenceStartPositions,
+    ICpuGpuVectorPtr subSequenceStartPositions,
+    const IVectorPtr& ids,
+    const std::vector<int>& idIndex) {
+  output_.sequenceStartPositions = sequenceStartPositions;
+  output_.subSequenceStartPositions = subSequenceStartPositions;
   allIds_ = ids;
   idIndex_ = idIndex;
 }
 
 void GatherAgentLayer::forward(PassType passType) {
   Layer::forward(passType);
+  forwardIds(passType);
+  forwardValue(passType);
+}
+
+void GatherAgentLayer::forwardValue(PassType passType) {
+  MatrixPtr valueReal = realLayers_[0]->getOutputValue();
+  if (!valueReal) return;
 
   int height = allIds_->getSize();
   int width = this->getSize();
@@ -120,6 +109,40 @@ void GatherAgentLayer::forward(PassType passType) {
   }
 }
 
+namespace {
+
+// Scatter copy: writes srcVec[k] into destVec at position indexVec[k] for
+// every k. srcVec and indexVec must have the same size (CHECKed).
+void copyElements(const IVector& srcVec,
+                  const IVector& indexVec,
+                  IVector& destVec) {
+  CHECK_EQ(srcVec.getSize(), indexVec.getSize());
+  const int* values = srcVec.getData();
+  const int* positions = indexVec.getData();
+  int* out = destVec.getData();
+  const int count = indexVec.getSize();
+  for (int k = 0; k < count; ++k) out[positions[k]] = values[k];
+}
+
+}  // namespace
+
+// Gathers the id outputs of all real layers into output_.ids: the k-th id of
+// real layer i is written to position allIds_[idIndex_[i] + k].
+void GatherAgentLayer::forwardIds(PassType passType) {
+  if (!realLayers_[0]->getOutputLabel()) return;  // first layer has no ids
+
+  IVector::resizeOrCreate(output_.ids, allIds_->getSize(), useGpu_);
+  IVectorPtr gathered = output_.ids;
+  idsVec_.resize(idIndex_.size());
+
+  for (size_t step = 0; step < realLayers_.size(); ++step) {
+    const IVectorPtr& stepIds = realLayers_[step]->getOutputLabel();
+    int* dstPos = allIds_->getData() + idIndex_[step];
+    idsVec_[step] = IVector::create(dstPos, stepIds->getSize(), useGpu_);
+    execViaCpu(&copyElements, *stepIds, *idsVec_[step], *gathered);
+  }
+}
+
 void GatherAgentLayer::backward(const UpdateCallback& callback) {
   (void)callback;
   const MatrixPtr& outputGrad = getOutputGrad();
@@ -147,21 +170,22 @@ void ScatterAgentLayer::forward(PassType passType) {
   CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId());
 
   int width = this->getSize();
-  if (realOutArg_.value || realOutArg_.ids) {
-    output_.subArgFrom(
-        realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_);
-  } else {  // used in generation
-    if (realLayer_->getOutput().ids) {
-      IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_);
-      output_.ids->selectFrom(*realLayer_->getOutput().ids, *ids_);
-    }
-    if (realLayer_->getOutput().value) {
-      int height = ids_->getSize();
-      resetOutput(height, width);
-
-      const MatrixPtr& outV = getOutputValue();
-      const MatrixPtr& realV = realLayer_->getOutputValue();
-      outV->selectRows(*realV, *ids_);
+  if (selectionMode_) {
+    forwardWithSelection(passType);
+  } else {
+    if (realOutArg_.hasSeq()) {
+      output_.subArgFrom(realOutArg_,
+                         /* offset */ idIndex_,
+                         idSize_,
+                         width,
+                         useGpu_,
+                         /* trans */ false,
+                         /* seqFlag */ true,
+                         /* seqStart */ seqStartPosIndex_,
+                         /* seqSize */ numSequences_);
+    } else {
+      output_.subArgFrom(
+          realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_);
     }
   }
 }
@@ -169,12 +193,14 @@ void ScatterAgentLayer::forward(PassType passType) {
 void ScatterAgentLayer::backward(const UpdateCallback& callback) {
   (void)callback;
 
+  CHECK(!selectionMode_);
+
   const MatrixPtr& outputGrad = realOutArg_.grad;
   const MatrixPtr& realGrad = realLayer_->getOutputGrad();
   if (realGrad) {
     // for agent in inFrameLines and memoryFrameLines,
     // only first scatterAgentLayer should do addToRows in backward
-    if (idIndex_ == 0) {
+    if (handleBackward_) {
       outputGrad->addToRows(*realGrad, *ids_);
     }
   }
@@ -183,42 +209,7 @@ void ScatterAgentLayer::backward(const UpdateCallback& callback) {
 REGISTER_LAYER(gather_agent, GatherAgentLayer);
 REGISTER_LAYER(scatter_agent, ScatterAgentLayer);
 
-void SequenceGatherAgentLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  int height = 0;
-  int* starts = output_.subSequenceStartPositions->getMutableData(false);
-  IVectorPtr idReal = realLayers_[0]->getOutputLabel();
-  if (idReal) {
-    // Gather generator.idsVec
-    // if is beam search generation result. Get first result.
-    if (idReal->getData()[idReal->getSize() - 1] == -1) {
-      for (size_t i = 0; i < realLayers_.size(); ++i) {
-        // The first element stores first result size
-        idReal = realLayers_[i]->getOutputLabel();
-        idReal->subVecFrom(*idReal, 1, idReal->getData()[0]);
-      }
-    }
-    for (size_t i = 0; i < realLayers_.size(); ++i) {
-      CHECK(realLayers_[i]->getOutputLabel());
-      starts[i] = height;
-      height += realLayers_[i]->getOutputLabel()->getSize();
-    }
-    starts[realLayers_.size()] = height;
-    output_.sequenceStartPositions->getMutableData(false)[1] = height;
-
-    IVector::resizeOrCreate(output_.ids, height, false);
-    for (size_t i = 0; i < realLayers_.size(); ++i) {
-      output_.ids->subVec(starts[i], starts[i + 1] - starts[i])
-          ->copyFrom(*realLayers_[i]->getOutputLabel());
-    }
-  } else {
-    // Gather output.value, same as GatherAgentLayer
-    CHECK(output_.subSequenceStartPositions);
-    GatherAgentLayer::forward(passType);
-  }
-}
-
-void SequenceScatterAgentLayer::forward(PassType passType) {
+void ScatterAgentLayer::forwardWithSelection(PassType passType) {
   Layer::forward(passType);
   CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId());
 
@@ -229,18 +220,21 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
   AsyncGpuBlock asyncGpuBlock;
   REGISTER_TIMER_INFO("SequenceAgentLayerForward", getName().c_str());
 
-  if (realOutArg_.value || realOutArg_.ids) {
-    CHECK(realOutArg_.sequenceStartPositions);
-    output_.subArgFrom(realOutArg_,
-                       /* offset */ idIndex_,
-                       idSize_,
-                       width,
-                       useGpu_,
-                       /* trans */ false,
-                       /* seqFlag */ true,
-                       /* seqStart */ seqStartPosIndex_,
-                       /* seqSize */ numSequences_);
+  if (!input.hasSeq()) {
+    if (realLayer_->getOutput().ids) {
+      IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_);
+      output_.ids->selectFrom(*realLayer_->getOutput().ids, *ids_);
+    }
+    if (realLayer_->getOutput().value) {
+      int height = ids_->getSize();
+      resetOutput(height, width);
+
+      const MatrixPtr& outV = getOutputValue();
+      const MatrixPtr& realV = realLayer_->getOutputValue();
+      outV->selectRows(*realV, *ids_);
+    }
   } else {
+    // Putting the generation logic here is really an ugly hack!
     // used in generation
     int height = 0;
     size_t numSequences = ids_->getSize();
@@ -284,7 +278,4 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
   }
 }
 
-REGISTER_LAYER(sequence_gather_agent, SequenceGatherAgentLayer);
-REGISTER_LAYER(sequence_scatter_agent, SequenceScatterAgentLayer);
-
 }  // namespace paddle
diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h
index b6dac7ae6fec2d61c60c9548d466233efe9febd5..29681b29c6a9a10715548839f2d365eb4a0c7381 100644
--- a/paddle/gserver/layers/AgentLayer.h
+++ b/paddle/gserver/layers/AgentLayer.h
@@ -49,18 +49,6 @@ public:
   void backward(const UpdateCallback& callback = nullptr) override {}
 };
 
-/**
- * like AgentLayer, but use first *numSamples* sequences
- */
-class SequenceAgentLayer : public AgentLayer {
-public:
-  explicit SequenceAgentLayer(const LayerConfig& config) : AgentLayer(config) {}
-  ~SequenceAgentLayer() {}
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override {}
-};
-
 /**
  * Like AgentLayer, but it can gather many real layers. Each real
  * layer give a few rows of a sequence, after gather all real layers,
@@ -83,7 +71,10 @@ public:
             const ParameterMap& parameterMap) override;
 
   // call before addRealLayer
-  void copyIdAndSequenceInfo(const Argument& input,
+  void clearRealLayers() { realLayers_.clear(); }
+
+  void copyIdAndSequenceInfo(ICpuGpuVectorPtr sequenceStartPositions,
+                             ICpuGpuVectorPtr subSequenceStartPositions,
                              const IVectorPtr& allIds,
                              const std::vector<int>& idIndex);
 
@@ -92,24 +83,8 @@ public:
 
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback) override;
-};
-
-/**
- * Like GatherAgentLayer, but select a few sequence in real layer.
- * *ids* in addRealLayer() are the ids of selected sequence.
- * It's used to reorder sequence output.
- */
-class SequenceGatherAgentLayer : public GatherAgentLayer {
-public:
-  explicit SequenceGatherAgentLayer(const LayerConfig& config)
-      : GatherAgentLayer(config) {}
-  virtual ~SequenceGatherAgentLayer() {}
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback) {
-    // same as GatherAgentLayer
-    GatherAgentLayer::backward(callback);
-  }
+  void forwardValue(PassType passType);
+  void forwardIds(PassType passType);
 };
 
 /**
@@ -129,6 +104,14 @@ protected:
   int idSize_;
   int seqStartPosIndex_;
   int numSequences_;  // number of sequences in this scatterAgentLayer
+  bool handleBackward_;
+
+  // Used to store the expanded cpuStartPositions or subSequenceStartPositions
+  // of the real layer.
+  ICpuGpuVectorPtr inputStartPos_;
+
+  // true for setRealLayer, false for setRealLayerAndOutput
+  bool selectionMode_;
 
 public:
   explicit ScatterAgentLayer(const LayerConfig& config) : Layer(config) {}
@@ -147,20 +130,17 @@ public:
    *                        false(default) in ScatterAgentLayer, and
    *                        true in SequenceScatterAgentLayer.
    */
-  void setRealLayer(LayerPtr layer,
-                    const std::vector<int>& ids,
-                    bool copyId = false) {
+  void setRealLayer(LayerPtr layer, const std::vector<int>& ids) {
     realLayer_ = layer;
     IVector::resizeOrCreate(ids_, ids.size(), useGpu_);
     ids_->copyFrom(ids.data(), ids.size());
-    if (copyId) {
-      if (useGpu_) {
-        IVector::resizeOrCreate(cpuIds_, ids.size(), false);
-        cpuIds_->copyFrom(ids.data(), ids.size());
-      } else {
-        cpuIds_ = ids_;
-      }
+    if (useGpu_) {
+      IVector::resizeOrCreate(cpuIds_, ids.size(), false);
+      cpuIds_->copyFrom(ids.data(), ids.size());
+    } else {
+      cpuIds_ = ids_;
     }
+    selectionMode_ = true;
   }
 
   // set real layer and output, [idIndex, idIndex + idSize) of *ids*
@@ -169,12 +149,15 @@ public:
                              const Argument& outArg,
                              const IVectorPtr& ids,
                              int idIndex,
-                             int idSize) {
+                             int idSize,
+                             bool handleBackward) {
     realLayer_ = layer;
     realOutArg_ = outArg;
     ids_ = ids;
     idIndex_ = idIndex;
     idSize_ = idSize;
+    handleBackward_ = handleBackward;
+    selectionMode_ = false;
   }
 
   void setSequenceStartPositions(const ICpuGpuVectorPtr& sequenceStartPositions,
@@ -187,28 +170,8 @@ public:
 
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback) override;
-};
 
-/**
- * Like ScatterAgentLayer, but select a few sequence in real layer.
- * *ids* in setRealLayer() or setRealLayerAndOutput() are the ids of
- * selected sequence. It's used to reorder sequence input.
- */
-class SequenceScatterAgentLayer : public ScatterAgentLayer {
-protected:
-  // use to store expanded cpuStartPositions or subSequenceStartPositions
-  // of real layer.
-  ICpuGpuVectorPtr inputStartPos_;
-
-public:
-  explicit SequenceScatterAgentLayer(const LayerConfig& config)
-      : ScatterAgentLayer(config) {}
-  virtual ~SequenceScatterAgentLayer() {}
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback) {
-    ScatterAgentLayer::backward(callback);
-  }
+  void forwardWithSelection(PassType passType);
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ConvBaseLayer.cpp b/paddle/gserver/layers/ConvBaseLayer.cpp
index 7b234dc2a6663dc677affcae7dc6306c104c1250..e161d89c38a290000a2cbdb2905e56901ae4c144 100644
--- a/paddle/gserver/layers/ConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ConvBaseLayer.cpp
@@ -118,11 +118,7 @@ size_t ConvBaseLayer::calOutputSize() {
     layerSize = outH[0] * outW[0] * size_t(numFilters_);
   };
 
-  if (isDeconv_) {
-    setLayerSize(outputH_, outputW_, imgSizeH_, imgSizeW_);
-  } else {
-    setLayerSize(imgSizeH_, imgSizeW_, outputH_, outputW_);
-  }
+  setLayerSize(imgSizeH_, imgSizeW_, outputH_, outputW_);
 
   return layerSize;
 }
diff --git a/paddle/gserver/layers/CudnnConvBaseLayer.cpp b/paddle/gserver/layers/CudnnConvBaseLayer.cpp
index 24363bb8b09cc354c25abe512257be68566c10e1..c056bbe4d1d354751d4f85f8d0743cf30486c087 100644
--- a/paddle/gserver/layers/CudnnConvBaseLayer.cpp
+++ b/paddle/gserver/layers/CudnnConvBaseLayer.cpp
@@ -70,14 +70,8 @@ void CudnnConvBaseLayer::forward(PassType passType) {
   if (biases_) {
     REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str());
     int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-    int outH, outW;
-    if (isDeconv_) {
-      outH = imgSizeH_[0];
-      outW = imgSizeW_[0];
-    } else {
-      outH = outputH_[0];
-      outW = outputW_[0];
-    }
+    int outH = outputH_[0];
+    int outW = outputW_[0];
 
     hl_tensor_reshape(outputDesc_,
                       batchSize,
diff --git a/paddle/gserver/layers/DetectionUtil.cpp b/paddle/gserver/layers/DetectionUtil.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3e61adc66e60c54250e4f323452aa13045310879
--- /dev/null
+++ b/paddle/gserver/layers/DetectionUtil.cpp
@@ -0,0 +1,576 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DetectionUtil.h"
+
+namespace paddle {
+
+// Copies inMatrix into outMatrix, permuting every sample from NCHW to NHWC.
+// Sample n is written at offset n * (outTotalSize / batchSize) + outOffset.
+// Returns channels * height * width; other permute modes are fatal.
+size_t appendWithPermute(const Matrix& inMatrix,
+                         size_t height,
+                         size_t width,
+                         size_t outTotalSize,
+                         size_t outOffset,
+                         size_t batchSize,
+                         Matrix& outMatrix,
+                         PermMode permMode) {
+  CHECK_EQ(inMatrix.useGpu(), outMatrix.useGpu());
+  const bool gpu = inMatrix.useGpu();
+  if (permMode == kNCHWToNHWC) {
+    // channels is inferred from the total element count of the input.
+    const size_t pixels = height * width;
+    const size_t channels = inMatrix.getElementCnt() / (pixels * batchSize);
+    const size_t sampleSize = channels * pixels;
+    const size_t outStride = outTotalSize / batchSize;
+    real* srcBase = const_cast<real*>(inMatrix.getData());
+    real* dstBase = const_cast<real*>(outMatrix.getData()) + outOffset;
+    for (size_t n = 0; n < batchSize; ++n) {
+      // View the n-th input sample as (channels x pixels) and its output
+      // slot as (pixels x channels), then transpose one into the other.
+      real* src = srcBase + n * sampleSize;
+      real* dst = dstBase + n * outStride;
+      const MatrixPtr nchw = Matrix::create(src, channels, pixels, false, gpu);
+      MatrixPtr nhwc = Matrix::create(dst, pixels, channels, false, gpu);
+      nchw->transpose(nhwc, false);
+    }
+    return sampleSize;
+  } else {
+    LOG(FATAL) << "Unkown permute mode";
+  }
+}
+
+// Copies inMatrix into outMatrix, permuting every sample from NHWC to NCHW.
+// Sample n is read at offset n * (inTotalSize / batchSize) + inOffset.
+// Returns channels * height * width; other permute modes are fatal.
+size_t decomposeWithPermute(const Matrix& inMatrix,
+                            size_t height,
+                            size_t width,
+                            size_t inTotalSize,
+                            size_t inOffset,
+                            size_t batchSize,
+                            Matrix& outMatrix,
+                            PermMode permMode) {
+  CHECK_EQ(inMatrix.useGpu(), outMatrix.useGpu());
+  const bool gpu = inMatrix.useGpu();
+  if (permMode == kNHWCToNCHW) {
+    // channels is inferred from the total element count of the output.
+    const size_t pixels = height * width;
+    const size_t channels = outMatrix.getElementCnt() / (pixels * batchSize);
+    const size_t sampleSize = channels * pixels;
+    const size_t inStride = inTotalSize / batchSize;
+    real* srcBase = const_cast<real*>(inMatrix.getData()) + inOffset;
+    real* dstBase = const_cast<real*>(outMatrix.getData());
+    for (size_t n = 0; n < batchSize; ++n) {
+      // View the n-th input slot as (pixels x channels) and the n-th output
+      // sample as (channels x pixels), then transpose one into the other.
+      real* src = srcBase + n * inStride;
+      real* dst = dstBase + n * sampleSize;
+      const MatrixPtr nhwc = Matrix::create(src, pixels, channels, false, gpu);
+      MatrixPtr nchw = Matrix::create(dst, channels, pixels, false, gpu);
+      nhwc->transpose(nchw, false);
+    }
+    return sampleSize;
+  } else {
+    LOG(FATAL) << "Unkown permute mode";
+  }
+}
+
+// Jaccard overlap (intersection over union) of two boxes.
+// Returns 0 when the boxes do not intersect.
+real jaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2) {
+  const bool disjoint = bbox2.xMin > bbox1.xMax || bbox2.xMax < bbox1.xMin ||
+                        bbox2.yMin > bbox1.yMax || bbox2.yMax < bbox1.yMin;
+  if (disjoint) return 0.0;
+
+  // Corners of the intersection rectangle.
+  const real x0 = std::max(bbox1.xMin, bbox2.xMin);
+  const real y0 = std::max(bbox1.yMin, bbox2.yMin);
+  const real x1 = std::min(bbox1.xMax, bbox2.xMax);
+  const real y1 = std::min(bbox1.yMax, bbox2.yMax);
+  const real interW = x1 - x0;
+  const real interH = y1 - y0;
+  const real interArea = interW * interH;
+
+  // Union = sum of both areas minus the doubly counted intersection.
+  const real unionArea = bbox1.getArea() + bbox2.getArea() - interArea;
+  return interArea / unionArea;
+}
+
+// Encodes gtBBox relative to priorBBox. Center offsets are divided by the
+// prior width/height, sizes become the log of the absolute size ratio, and
+// each of the four terms is divided by the matching entry of priorBBoxVar.
+// outVec is cleared and then filled with the four encoded values.
+void encodeBBoxWithVar(const NormalizedBBox& priorBBox,
+                       const vector<real>& priorBBoxVar,
+                       const NormalizedBBox& gtBBox,
+                       vector<real>& outVec) {
+  const real priorW = priorBBox.getWidth();
+  const real priorH = priorBBox.getHeight();
+  const real priorCx = priorBBox.getCenterX();
+  const real priorCy = priorBBox.getCenterY();
+
+  const real gtW = gtBBox.getWidth();
+  const real gtH = gtBBox.getHeight();
+  const real gtCx = gtBBox.getCenterX();
+  const real gtCy = gtBBox.getCenterY();
+
+  outVec.clear();
+  outVec.push_back((gtCx - priorCx) / priorW / priorBBoxVar[0]);
+  outVec.push_back((gtCy - priorCy) / priorH / priorBBoxVar[1]);
+  outVec.push_back(std::log(std::fabs(gtW / priorW)) / priorBBoxVar[2]);
+  outVec.push_back(std::log(std::fabs(gtH / priorH)) / priorBBoxVar[3]);
+}
+
+// Rebuilds a box from priorBBox and the four predicted values in
+// locPredData (center x, center y, width, height terms), each multiplied by
+// the matching entry of priorBBoxVar.
+NormalizedBBox decodeBBoxWithVar(const NormalizedBBox& priorBBox,
+                                 const vector<real>& priorBBoxVar,
+                                 const vector<real>& locPredData) {
+  const real priorW = priorBBox.getWidth();
+  const real priorH = priorBBox.getHeight();
+  const real priorCx = priorBBox.getCenterX();
+  const real priorCy = priorBBox.getCenterY();
+
+  // Centers shift linearly with the prior size; sizes scale exponentially.
+  const real cx = priorBBoxVar[0] * locPredData[0] * priorW + priorCx;
+  const real cy = priorBBoxVar[1] * locPredData[1] * priorH + priorCy;
+  const real w = std::exp(priorBBoxVar[2] * locPredData[2]) * priorW;
+  const real h = std::exp(priorBBoxVar[3] * locPredData[3]) * priorH;
+
+  // Convert center/size back to corner coordinates.
+  NormalizedBBox decoded;
+  decoded.xMin = cx - w / 2;
+  decoded.yMin = cy - h / 2;
+  decoded.xMax = cx + w / 2;
+  decoded.yMax = cy + h / 2;
+  return decoded;
+}
+
+void getBBoxFromPriorData(const real* priorData,  // 8 reals per prior box
+                          const size_t numBBoxes,
+                          vector<NormalizedBBox>& bboxVec) {  // appended to
+  const size_t base = bboxVec.size();
+  bboxVec.resize(base + numBBoxes);
+  for (size_t k = 0; k < numBBoxes; ++k) {
+    NormalizedBBox box;  // only the first four reals of a record are read
+    box.xMin = priorData[k * 8];
+    box.yMin = priorData[k * 8 + 1];
+    box.xMax = priorData[k * 8 + 2];
+    box.yMax = priorData[k * 8 + 3];
+    bboxVec[base + k] = box;
+  }
+}
+
+// Appends one 4-element vector per prior to varVec. priorData holds 8 reals
+// per prior; elements 4..7 of each record are copied, in order.
+void getBBoxVarFromPriorData(const real* priorData,
+                             const size_t num,
+                             vector<vector<real>>& varVec) {
+  const size_t base = varVec.size();
+  varVec.resize(base + num);
+  for (size_t k = 0; k < num; ++k) {
+    const real* rec = priorData + k * 8;
+    // The range constructor copies rec[4], rec[5], rec[6], rec[7].
+    const vector<real> var(rec + 4, rec + 8);
+    varVec[base + k] = var;
+  }
+}
+
+// Appends numBBoxes boxes to bboxVec. labelData holds 6 reals per box:
+// element 0 is not read here, 1..4 are xMin, yMin, xMax, yMax, and element 5
+// is the "difficult" flag (false only when its magnitude is below 1e-6).
+void getBBoxFromLabelData(const real* labelData,
+                          const size_t numBBoxes,
+                          vector<NormalizedBBox>& bboxVec) {
+  const size_t base = bboxVec.size();
+  bboxVec.resize(base + numBBoxes);
+  for (size_t k = 0; k < numBBoxes; ++k) {
+    const real* rec = labelData + k * 6;
+    NormalizedBBox box;
+    box.xMin = rec[1];
+    box.yMin = rec[2];
+    box.xMax = rec[3];
+    box.yMax = rec[4];
+    box.isDifficult = !(std::abs(rec[5] - 0.0) < 1e-6);
+    bboxVec[base + k] = box;
+  }
+}
+
+void getBBoxFromDetectData(const real* detectData,  // 7 reals per detection
+                           const size_t numBBoxes,
+                           vector<real>& labelVec,
+                           vector<real>& scoreVec,
+                           vector<NormalizedBBox>& bboxVec) {
+  const size_t base = bboxVec.size();  // first new slot in all three outputs
+  labelVec.resize(base + numBBoxes);
+  scoreVec.resize(base + numBBoxes);
+  bboxVec.resize(base + numBBoxes);
+  for (size_t k = 0, at = base; k < numBBoxes; ++k, ++at) {
+    labelVec[at] = detectData[k * 7 + 1];  // element 0 is not read here
+    scoreVec[at] = detectData[k * 7 + 2];
+    NormalizedBBox box;
+    box.xMin = detectData[k * 7 + 3];
+    box.yMin = detectData[k * 7 + 4];
+    box.xMax = detectData[k * 7 + 5];
+    box.yMax = detectData[k * 7 + 6];
+    bboxVec[at] = box;
+  }
+}
+
+void matchBBox(const vector<NormalizedBBox>& priorBBoxes,
+               const vector<NormalizedBBox>& gtBBoxes,
+               real overlapThreshold,  // minimum overlap for the last pass
+               vector<int>* matchIndices,  // out: gt index per prior, or -1
+               vector<real>* matchOverlaps) {  // out: overlap per prior
+  map<size_t, map<size_t, real>> overlaps;  // overlaps[prior][gt], if > 1e-6
+  size_t numPriors = priorBBoxes.size();
+  size_t numGTs = gtBBoxes.size();
+  // Matches priors to ground truths: a one-to-one pass, then a threshold pass
+  matchIndices->clear();
+  matchIndices->resize(numPriors, -1);  // -1 marks an unmatched prior
+  matchOverlaps->clear();
+  matchOverlaps->resize(numPriors, 0.0);
+
+  // Store each overlap above 1e-6 and track the best overlap of every prior
+  for (size_t i = 0; i < numPriors; ++i) {
+    for (size_t j = 0; j < numGTs; ++j) {
+      real overlap = jaccardOverlap(priorBBoxes[i], gtBBoxes[j]);
+      if (overlap > 1e-6) {
+        (*matchOverlaps)[i] = std::max((*matchOverlaps)[i], overlap);
+        overlaps[i][j] = overlap;
+      }
+    }
+  }
+  // Bipartite matching: every ground truth is paired with at most one prior
+  vector<int> gtPool;  // indices of the ground truths not paired yet
+  for (size_t i = 0; i < numGTs; ++i) {
+    gtPool.push_back(i);
+  }
+  while (gtPool.size() > 0) {
+    // Find the unmatched prior / pooled gt pair with the largest overlap
+    int maxPriorIdx = -1;
+    int maxGTIdx = -1;
+    real maxOverlap = -1.0;
+    for (map<size_t, map<size_t, real>>::iterator it = overlaps.begin();
+         it != overlaps.end();
+         ++it) {
+      size_t i = it->first;
+      if ((*matchIndices)[i] != -1) {
+        // This prior has already been matched to a ground truth
+        continue;
+      }
+      for (size_t p = 0; p < gtPool.size(); ++p) {
+        int j = gtPool[p];
+        if (it->second.find(j) == it->second.end()) {
+          // No overlap recorded between the i-th prior and j-th ground truth
+          continue;
+        }
+        // Keep the pair with the largest overlap seen so far
+        if (it->second[j] > maxOverlap) {
+          maxPriorIdx = (int)i;
+          maxGTIdx = (int)j;
+          maxOverlap = it->second[j];
+        }
+      }
+    }
+    if (maxPriorIdx == -1) {
+      break;  // no remaining pair overlaps: stop the one-to-one pass
+    } else {
+      (*matchIndices)[maxPriorIdx] = maxGTIdx;
+      (*matchOverlaps)[maxPriorIdx] = maxOverlap;
+      gtPool.erase(std::find(gtPool.begin(), gtPool.end(), maxGTIdx));
+    }
+  }
+
+  // For each prior still unmatched, take its best gt if overlap >= threshold
+  for (map<size_t, map<size_t, real>>::iterator it = overlaps.begin();
+       it != overlaps.end();
+       ++it) {
+    size_t i = it->first;
+    if ((*matchIndices)[i] != -1) {
+      // This prior has already been matched to a ground truth
+      continue;
+    }
+    int maxGTIdx = -1;
+    real maxOverlap = -1;
+    for (size_t j = 0; j < numGTs; ++j) {
+      if (it->second.find(j) == it->second.end()) {
+        // No overlap recorded between the i-th prior and j-th ground truth
+        continue;
+      }
+      // Keep the largest overlap that also reaches overlapThreshold
+      real overlap = it->second[j];
+      if (overlap > maxOverlap && overlap >= overlapThreshold) {
+        maxGTIdx = j;
+        maxOverlap = overlap;
+      }
+    }
+    if (maxGTIdx != -1) {
+      (*matchIndices)[i] = maxGTIdx;
+      (*matchOverlaps)[i] = maxOverlap;
+    }
+  }
+}
+
+pair<size_t, size_t> generateMatchIndices(
+    const Matrix& priorValue,
+    const size_t numPriorBBoxes,
+    const Matrix& gtValue,
+    const int* gtStartPosPtr,
+    const size_t seqNum,
+    const vector<vector<real>>& maxConfScore,
+    const size_t batchSize,
+    const real overlapThreshold,
+    const real negOverlapThreshold,
+    const size_t negPosRatio,
+    vector<vector<int>>* matchIndicesVecPtr,
+    vector<vector<int>>* negIndicesVecPtr) {
+  // Every image is matched against the same decoded set of prior boxes.
+  vector<NormalizedBBox> sharedPriors;
+  getBBoxFromPriorData(priorValue.getData(), numPriorBBoxes, sharedPriors);
+  size_t posSum = 0;
+  size_t negSum = 0;
+  for (size_t img = 0; img < batchSize; ++img) {
+    vector<int> matched(numPriorBBoxes, -1);  // -1: prior has no match
+    vector<real> overlaps(numPriorBBoxes, 0.0);
+    vector<int> hardNegs;
+    // Images with index >= seqNum are treated as having no ground truth.
+    size_t gtCount = 0;
+    if (img < seqNum) gtCount = gtStartPosPtr[img + 1] - gtStartPosPtr[img];
+    if (gtCount == 0) {
+      matchIndicesVecPtr->push_back(matched);
+      negIndicesVecPtr->push_back(hardNegs);
+      continue;
+    }
+
+    // Each ground-truth record occupies 6 consecutive reals in gtValue.
+    vector<NormalizedBBox> gtBoxes;
+    getBBoxFromLabelData(
+        gtValue.getData() + gtStartPosPtr[img] * 6, gtCount, gtBoxes);
+    matchBBox(sharedPriors, gtBoxes, overlapThreshold, &matched, &overlaps);
+
+    // Negative candidates: unmatched priors below negOverlapThreshold.
+    size_t posCount = 0;
+    vector<pair<real, size_t>> pool;
+    for (size_t p = 0; p < matched.size(); ++p) {
+      if (matched[p] != -1) {
+        ++posCount;
+      } else if (overlaps[p] < negOverlapThreshold) {
+        pool.push_back(std::make_pair(maxConfScore[img][p], p));
+      }
+    }
+    std::sort(pool.begin(), pool.end(), sortScorePairDescend<size_t>);
+
+    // Keep at most negPosRatio negatives per positive, best scores first.
+    const size_t negCount =
+        std::min(static_cast<size_t>(posCount * negPosRatio), pool.size());
+    for (size_t k = 0; k < negCount; ++k) hardNegs.push_back(pool[k].second);
+    posSum += posCount;
+    negSum += negCount;
+    matchIndicesVecPtr->push_back(matched);
+    negIndicesVecPtr->push_back(hardNegs);
+  }
+  return std::make_pair(posSum, negSum);
+}
+
+void getMaxConfidenceScores(const real* confData,
+                            const size_t batchSize,
+                            const size_t numPriorBBoxes,
+                            const size_t numClasses,
+                            const size_t backgroundId,
+                            vector<vector<real>>* maxConfScoreVecPtr) {
+  maxConfScoreVecPtr->clear();
+  for (size_t i = 0; i < batchSize; ++i) {
+    vector<real> maxConfScore;
+    maxConfScore.reserve(numPriorBBoxes);
+    for (size_t j = 0; j < numPriorBBoxes; ++j) {
+      // Pointer arithmetic stays in size_t, so large inputs cannot overflow.
+      const real* row = confData + j * numClasses;
+      real maxVal = -FLT_MAX;     // max score over all classes
+      real maxPosVal = -FLT_MAX;  // max score over non-background classes
+      for (size_t c = 0; c < numClasses; ++c) {
+        maxVal = std::max<real>(row[c], maxVal);
+        if (c != backgroundId) maxPosVal = std::max<real>(row[c], maxPosVal);
+      }
+      // Softmax probability of the best non-background class; shifting by
+      // maxVal keeps std::exp from overflowing.
+      real sum = 0.0;
+      for (size_t c = 0; c < numClasses; ++c) sum += std::exp(row[c] - maxVal);
+      maxConfScore.push_back(std::exp(maxPosVal - maxVal) / sum);
+    }
+    confData += numPriorBBoxes * numClasses;
+    maxConfScoreVecPtr->push_back(maxConfScore);
+  }
+}
+
+// Strict "greater than" on the score (pair.first); the payload is ignored.
+template <typename T>
+bool sortScorePairDescend(const pair<real, T>& a, const pair<real, T>& b) {
+  return b.first < a.first;
+}
+
+template <>
+bool sortScorePairDescend(const pair<real, NormalizedBBox>& lhs,
+                          const pair<real, NormalizedBBox>& rhs) {
+  return rhs.first < lhs.first;  // descending by score; bbox is ignored
+}
+
+void applyNMSFast(const vector<NormalizedBBox>& bboxes,
+                  const real* confScoreData,
+                  size_t classIdx,
+                  size_t topK,
+                  real confThreshold,
+                  real nmsThreshold,
+                  size_t numPriorBBoxes,
+                  size_t numClasses,
+                  vector<size_t>* indices) {
+  // Gather (score, prior index) for priors scoring above confThreshold.
+  vector<pair<real, size_t>> scores;
+  for (size_t i = 0; i < numPriorBBoxes; ++i) {
+    const real score = confScoreData[i * numClasses + classIdx];
+    if (score > confThreshold) scores.push_back(std::make_pair(score, i));
+  }
+  // Best score first; stable_sort keeps equal scores in prior-index order.
+  std::stable_sort(scores.begin(), scores.end(), sortScorePairDescend<size_t>);
+  if (topK > 0 && topK < scores.size()) scores.resize(topK);
+
+  // Greedy suppression: visit candidates best-first and keep one only if its
+  // overlap with every box already in *indices is at most nmsThreshold.
+  // A forward scan visits the same candidates in the same order as popping
+  // the front, without the O(n) element shift of erase(begin()) per step.
+  for (size_t s = 0; s < scores.size(); ++s) {
+    const size_t idx = scores[s].second;
+    bool keep = true;
+    for (size_t k = 0; keep && k < indices->size(); ++k) {
+      const real overlap = jaccardOverlap(bboxes[idx], bboxes[(*indices)[k]]);
+      keep = overlap <= nmsThreshold;
+    }
+    if (keep) indices->push_back(idx);
+  }
+}
+
+size_t getDetectionIndices(
+    const real* confData,
+    const size_t numPriorBBoxes,
+    const size_t numClasses,
+    const size_t backgroundId,
+    const size_t batchSize,
+    const size_t confThreshold,
+    const size_t nmsTopK,
+    const real nmsThreshold,
+    const size_t keepTopK,
+    const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
+    vector<map<size_t, vector<size_t>>>* allDetectionIndices) {
+  // NOTE(review): size_t confThreshold truncates fractional thresholds.
+  size_t keptSum = 0;
+  for (size_t img = 0; img < batchSize; ++img) {
+    const vector<NormalizedBBox>& boxes = allDecodedBBoxes[img];
+    const real* imgConf = confData + img * numPriorBBoxes * numClasses;
+    map<size_t, vector<size_t>> perClass;
+    size_t found = 0;
+    for (size_t cls = 0; cls < numClasses; ++cls) {
+      if (cls == backgroundId) continue;  // background is never detected
+      applyNMSFast(boxes,
+                   imgConf,
+                   cls,
+                   nmsTopK,
+                   confThreshold,
+                   nmsThreshold,
+                   numPriorBBoxes,
+                   numClasses,
+                   &(perClass[cls]));
+      found += perClass[cls].size();
+    }
+    if (keepTopK == 0 || found <= keepTopK) {
+      allDetectionIndices->push_back(perClass);
+      keptSum += found;
+      continue;
+    }
+    // Over budget: rank every survivor by score and keep the keepTopK best.
+    vector<pair<real, pair<size_t, size_t>>> ranked;
+    for (map<size_t, vector<size_t>>::const_iterator it = perClass.begin();
+         it != perClass.end();
+         ++it) {
+      for (size_t k = 0; k < it->second.size(); ++k) {
+        const size_t prior = it->second[k];
+        ranked.push_back(
+            std::make_pair(imgConf[prior * numClasses + it->first],
+                           std::make_pair(it->first, prior)));
+      }
+    }
+    std::sort(ranked.begin(),
+              ranked.end(),
+              sortScorePairDescend<pair<size_t, size_t>>);
+    ranked.resize(keepTopK);
+    map<size_t, vector<size_t>> trimmed;
+    for (size_t k = 0; k < ranked.size(); ++k)
+      trimmed[ranked[k].second.first].push_back(ranked[k].second.second);
+    allDetectionIndices->push_back(trimmed);
+    keptSum += keepTopK;
+  }
+  return keptSum;
+}
+
+void getDetectionOutput(const real* confData,
+                        const size_t numKept,
+                        const size_t numPriorBBoxes,
+                        const size_t numClasses,
+                        const size_t batchSize,
+                        const vector<map<size_t, vector<size_t>>>& allIndices,
+                        const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
+                        Matrix& out) {
+  // Rows are staged in a numKept x 7 buffer and then copied into out.
+  MatrixPtr staging;
+  Matrix::resizeOrCreate(staging, numKept, 7, false, false);
+  real* row = staging->getData();  // next output row to fill
+  for (size_t img = 0; img < batchSize; ++img) {
+    for (map<size_t, vector<size_t>>::const_iterator it =
+             allIndices[img].begin();
+         it != allIndices[img].end(); ++it) {
+      const size_t label = it->first;
+      const vector<size_t>& priors = it->second;
+      const vector<NormalizedBBox>& boxes = allDecodedBBoxes[img];
+      for (size_t k = 0; k < priors.size(); ++k) {
+        const size_t p = priors[k];
+        const real* conf = confData + (img * numPriorBBoxes + p) * numClasses;
+        const NormalizedBBox clipped = clipBBox(boxes[p]);
+        row[0] = img;          // image index
+        row[1] = label;        // class label
+        row[2] = conf[label];  // confidence score of that class
+        row[3] = clipped.xMin;
+        row[4] = clipped.yMin;
+        row[5] = clipped.xMax;
+        row[6] = clipped.yMax;
+        row += 7;
+      }
+    }
+  }
+  out.copyFrom(staging->getData(), numKept * 7);
+}
+
+NormalizedBBox clipBBox(const NormalizedBBox& bbox) {
+  // Clamp coordinates to [0, 1]; copying bbox first preserves isDifficult.
+  const real lo = static_cast<real>(0.0), hi = static_cast<real>(1.0);
+  NormalizedBBox clippedBBox = bbox;
+  clippedBBox.xMin = std::max(std::min(bbox.xMin, hi), lo);
+  clippedBBox.yMin = std::max(std::min(bbox.yMin, hi), lo);
+  clippedBBox.xMax = std::max(std::min(bbox.xMax, hi), lo);
+  clippedBBox.yMax = std::max(std::min(bbox.yMax, hi), lo);
+  return clippedBBox;
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/DetectionUtil.h b/paddle/gserver/layers/DetectionUtil.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe4f9f075e4cf011c97f68f49598a828d62327b3
--- /dev/null
+++ b/paddle/gserver/layers/DetectionUtil.h
@@ -0,0 +1,307 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <float.h>
+#include <algorithm>
+#include <vector>
+#include "paddle/math/Matrix.h"
+
+using std::vector;
+using std::pair;
+using std::map;
+
+namespace paddle {
+
+template <typename T>
+struct BBoxBase {  // axis-aligned box described by its min and max corners
+  BBoxBase(T xMin, T yMin, T xMax, T yMax)
+      : xMin(xMin), yMin(yMin), xMax(xMax), yMax(yMax), isDifficult(false) {}
+  // Value-initialize every member so a default box never holds garbage.
+  BBoxBase() : xMin(), yMin(), xMax(), yMax(), isDifficult(false) {}
+
+  T getWidth() const { return xMax - xMin; }
+
+  T getHeight() const { return yMax - yMin; }
+
+  T getCenterX() const { return (xMin + xMax) / 2; }
+
+  T getCenterY() const { return (yMin + yMax) / 2; }
+
+  T getArea() const { return getWidth() * getHeight(); }
+
+  // corner coordinates of the bounding box
+  T xMin;
+  T yMin;
+  T xMax;
+  T yMax;
+  // whether difficult object (e.g. object with heavy occlusion is difficult)
+  bool isDifficult;
+};
+
+struct NormalizedBBox : BBoxBase<real> {  // BBoxBase with real coordinates
+  NormalizedBBox() : BBoxBase<real>() {}  // delegates to BBoxBase's default
+};
+
+enum PermMode { kNCHWToNHWC, kNHWCToNCHW };  // layout permutation direction
+
+/**
+ * @brief First permute input matrix then append to output matrix
+ */
+size_t appendWithPermute(const Matrix& inMatrix,
+                         size_t height,
+                         size_t width,
+                         size_t outTotalSize,
+                         size_t outOffset,
+                         size_t batchSize,
+                         Matrix& outMatrix,
+                         PermMode permMode);
+
+/**
+ * @brief First permute input matrix then decompose to output
+ */
+size_t decomposeWithPermute(const Matrix& inMatrix,
+                            size_t height,
+                            size_t width,
+                            size_t totalSize,
+                            size_t offset,
+                            size_t batchSize,
+                            Matrix& outMatrix,
+                            PermMode permMode);
+
+/**
+ * @brief Compute jaccard overlap between two bboxes.
+ * @param bbox1 The first bbox
+ * @param bbox2 The second bbox
+ */
+real jaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2);
+
+/**
+ * @brief Compute offset parameters between prior bbox and ground truth bbox
+ * and variances of prior bbox are considered
+ * @param priorBBox Input prior bbox
+ * @param priorBBoxVar Variance parameters of prior bbox
+ * @param gtBBox Groundtruth bbox
+ * @param outVec Output vector
+ */
+void encodeBBoxWithVar(const NormalizedBBox& priorBBox,
+                       const vector<real>& priorBBoxVar,
+                       const NormalizedBBox& gtBBox,
+                       vector<real>& outVec);
+
+/**
+ * @brief Decode prior bbox with offset parameters
+ * and variances of prior bbox are considered
+ * @param priorBBox Prior bbox to be decoded
+ * @param priorBBoxVar Variance parameters of prior bbox
+ * @param locPredData Offset parameters
+ */
+NormalizedBBox decodeBBoxWithVar(const NormalizedBBox& priorBBox,
+                                 const vector<real>& priorBBoxVar,
+                                 const vector<real>& locPredData);
+
+/**
+ * @brief Extract bboxes from prior matrix, the layout is
+ * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ...
+ * @param priorData Matrix of prior value
+ * @param numBBoxes Number of bbox to be extracted
+ * @param bboxVec Append to the vector
+ */
+void getBBoxFromPriorData(const real* priorData,
+                          const size_t numBBoxes,
+                          vector<NormalizedBBox>& bboxVec);
+
+/**
+ * @brief Extract labels, scores and bboxes from detection matrix, the layout is
+ * imageId | label | score | xmin | ymin | xmax | ymax
+ * @param detectData Matrix of detection value
+ * @param numBBoxes Number of bbox to be extracted
+ * @param labelVec Label of bbox
+ * @param scoreVec Score of bbox
+ * @param bboxVec Append to the vector
+ */
+void getBBoxFromDetectData(const real* detectData,
+                           const size_t numBBoxes,
+                           vector<real>& labelVec,
+                           vector<real>& scoreVec,
+                           vector<NormalizedBBox>& bboxVec);
+
+/**
+ * @brief Extract variances from prior matrix, the layout is
+ * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ...
+ * @param priorData Matrix of prior value
+ * @param num Number to be extracted
+ * @param varVec Append to the vector
+ */
+void getBBoxVarFromPriorData(const real* priorData,
+                             const size_t num,
+                             vector<vector<real>>& varVec);
+
+/**
+ * @brief Extract bboxes from label matrix, the layout is
+ * class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ...
+ * @param labelData Matrix of label value
+ * @param numBBoxes Number to be extracted
+ * @param bboxVec Append to the vector
+ */
+void getBBoxFromLabelData(const real* labelData,
+                          const size_t numBBoxes,
+                          vector<NormalizedBBox>& bboxVec);
+
+/**
+* @brief Match prior bbox to groundtruth bbox, the strategy is:
+1. Find the most overlapped bbox pair (prior and groundtruth)
+2. For the remaining prior bboxes find the most overlapped groundtruth bbox
+* @param priorBBoxes prior bbox
+* @param gtBBoxes groundtruth bbox
+* @param overlapThreshold Low boundary of overlap (judge whether matched)
+* @param matchIndices For each prior bbox, groundtruth bbox index if matched
+otherwise -1
+* @param matchOverlaps For each prior bbox, overlap with all groundtruth bboxes
+*/
+void matchBBox(const vector<NormalizedBBox>& priorBBoxes,
+               const vector<NormalizedBBox>& gtBBoxes,
+               real overlapThreshold,
+               vector<int>* matchIndices,
+               vector<real>* matchOverlaps);
+
+/**
+* @brief Generate positive bboxes and negative bboxes,
+|negative bboxes| / |positive bboxes| is at most negPosRatio (per image)
+* @param priorValue Prior value
+* @param numPriorBBoxes Number of prior bbox
+* @param gtValue Groundtruth value
+* @param gtStartPosPtr Since groundtruth value stored as sequence type,
+this parameter indicates start position of each record
+* @param seqNum Number of sequence
+* @param maxConfScore Classification score for prior bbox, used to mine
+negative examples
+* @param batchSize Image number
+* @param overlapThreshold Low boundary of overlap
+* @param negOverlapThreshold Upper boundary of overlap (judge negative example)
+* @param negPosRatio Control number of negative bboxes
+* @param matchIndicesVecPtr Save indices of matched prior bbox
+* @param negIndicesVecPtr Save indices of negative prior bbox
+*/
+pair<size_t, size_t> generateMatchIndices(
+    const Matrix& priorValue,
+    const size_t numPriorBBoxes,
+    const Matrix& gtValue,
+    const int* gtStartPosPtr,
+    const size_t seqNum,
+    const vector<vector<real>>& maxConfScore,
+    const size_t batchSize,
+    const real overlapThreshold,
+    const real negOverlapThreshold,
+    const size_t negPosRatio,
+    vector<vector<int>>* matchIndicesVecPtr,
+    vector<vector<int>>* negIndicesVecPtr);
+
+/**
+ * @brief Get max confidence score for each prior bbox
+ * @param confData Confidence scores, layout is
+ * class1 score | class2 score | ... | classN score ...
+ * @param batchSize Image number
+ * @param numPriorBBoxes Prior bbox number
+ * @param numClasses Classes number
+ * @param backgroundId Background id
+ * @param maxConfScoreVecPtr Output
+ */
+void getMaxConfidenceScores(const real* confData,
+                            const size_t batchSize,
+                            const size_t numPriorBBoxes,
+                            const size_t numClasses,
+                            const size_t backgroundId,
+                            vector<vector<real>>* maxConfScoreVecPtr);
+
+template <typename T>
+bool sortScorePairDescend(const pair<real, T>& pair1,
+                          const pair<real, T>& pair2);
+
+template <>
+bool sortScorePairDescend(const pair<real, NormalizedBBox>& pair1,
+                          const pair<real, NormalizedBBox>& pair2);
+
+/**
+ * @brief Do NMS for bboxes to remove duplicated bboxes
+ * @param bboxes BBoxes to apply NMS
+ * @param confScoreData Confidence scores
+ * @param classIdx Class to do NMS
+ * @param topK Number to keep
+ * @param confThreshold Low boundary of confidence score
+ * @param nmsThreshold Threshold of overlap
+ * @param numPriorBBoxes Total number of prior bboxes
+ * @param numClasses Total class number
+ * @param indices Indices of high quality bboxes
+ */
+void applyNMSFast(const vector<NormalizedBBox>& bboxes,
+                  const real* confScoreData,
+                  size_t classIdx,
+                  size_t topK,
+                  real confThreshold,
+                  real nmsThreshold,
+                  size_t numPriorBBoxes,
+                  size_t numClasses,
+                  vector<size_t>* indices);
+
+/**
+ * @brief Get detection results which satisfy requirements
+ * @param numPriorBBoxes Prior bbox number
+ * @param numClasses Class number
+ * @param backgroundId Background class
+ * @param batchSize Image number
+ * @param confThreshold Threshold of class confidence
+ * @param nmsTopK Used in NMS operation to keep top k bbox
+ * @param nmsThreshold Used in NMS, threshold of overlap
+ * @param keepTopK How many bboxes are kept per image
+ * @param allDecodedBBoxes Decoded bboxes for all images
+ * @param allDetectionIndices Save detection bbox indices
+ */
+size_t getDetectionIndices(
+    const real* confData,
+    const size_t numPriorBBoxes,
+    const size_t numClasses,
+    const size_t backgroundId,
+    const size_t batchSize,
+    const size_t confThreshold,
+    const size_t nmsTopK,
+    const real nmsThreshold,
+    const size_t keepTopK,
+    const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
+    vector<map<size_t, vector<size_t>>>* allDetectionIndices);
+
+/**
+ * @brief Get detection results
+ * @param confData Confidence scores
+ * @param numPriorBBoxes Prior bbox number
+ * @param numClasses Class number
+ * @param batchSize Image number
+ * @param allIndices Indices of predicted bboxes
+ * @param allDecodedBBoxes BBoxes decoded
+ * @param out Output matrix
+ * image number | label | confidence score | xMin | yMin | xMax | yMax
+ */
+void getDetectionOutput(const real* confData,
+                        const size_t numKept,
+                        const size_t numPriorBBoxes,
+                        const size_t numClasses,
+                        const size_t batchSize,
+                        const vector<map<size_t, vector<size_t>>>& allIndices,
+                        const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
+                        Matrix& out);
+
+NormalizedBBox clipBBox(const NormalizedBBox& bbox);
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.cpp b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
index fdcf994cdb47f2409b045a1337332e2f4c304fbc..77736e78f9349c0393e1e53ac700817a70893e53 100644
--- a/paddle/gserver/layers/ExpandConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
@@ -22,26 +22,8 @@ bool ExpandConvBaseLayer::init(const LayerMap &layerMap,
   /* Initialize the basic convolutional parent class */
   ConvBaseLayer::init(layerMap, parameterMap);
 
-  /* The class fields channels_ and numFilters_ are the same as in the config
-   * i.e., channels_ is the for the input and numFilters_ is for the output
-   *
-   * But in order for the variables in convTrans having the same semantic
-   * meaning as in conv, we need to swap channels_ and numFilters here for
-   * convTrans, and in other functions too.
-   * */
-
-  /* Initialize the projection */
   for (auto &inputConfig : config_.inputs()) {
     const ConvConfig &conf = inputConfig.conv_conf();
-    int numFilters = isDeconv_ ? conf.channels() : numFilters_;
-    subM_.push_back(numFilters / conf.groups());
-    subN_.push_back(conf.output_x() *
-                    (conf.has_output_y() ? conf.output_y() : conf.output_x()));
-    int channel = isDeconv_ ? numFilters_ : conf.channels();
-    subK_.push_back(
-        channel * conf.filter_size() *
-        (conf.has_filter_size_y() ? conf.filter_size_y() : conf.filter_size()) /
-        conf.groups());
     /* Consistent caffe mode for multiple input */
     caffeMode_ = conf.caffe_mode();
   }
@@ -54,17 +36,9 @@ bool ExpandConvBaseLayer::init(const LayerMap &layerMap,
 size_t ExpandConvBaseLayer::getOutputSize() {
   CHECK_NE(inputLayers_.size(), 0UL);
   size_t layerSize = ConvBaseLayer::calOutputSize();
-  subN_.clear();
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    subN_.push_back(outputH_[i] * outputW_[i]);
-  }
   return layerSize;
 }
 
-void ExpandConvBaseLayer::resetExpandInput(size_t height, size_t width) {
-  Matrix::resizeOrCreate(expandInput_, height, width, false, useGpu_);
-}
-
 void ExpandConvBaseLayer::addSharedBias() {
   size_t mapW = getOutputSize() / numFilters_;
   size_t mapH = getOutputValue()->getElementCnt() / mapW;
@@ -101,173 +75,6 @@ void ExpandConvBaseLayer::addUnsharedBias() {
   outValue->addBias(*bias, 1.0f);
 }
 
-void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image,
-                                         size_t startIdx,
-                                         int inIdx) {
-  int channel = isDeconv_ ? numFilters_ : channels_[inIdx];
-
-  resetExpandInput(subK_[inIdx] * groups_[inIdx], subN_[inIdx]);
-
-  CHECK_EQ(image->getWidth(),
-           static_cast<size_t>(imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel));
-
-  real *imgData = image->getData() + startIdx * image->getWidth();
-  MatrixPtr imageTmp =
-      Matrix::create(imgData,
-                     1,
-                     imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel,
-                     false,
-                     useGpu_);
-  expandInput_->convExpand(*imageTmp,
-                           imgSizeH_[inIdx],
-                           imgSizeW_[inIdx],
-                           channel,
-                           filterSizeY_[inIdx],
-                           filterSize_[inIdx],
-                           strideY_[inIdx],
-                           stride_[inIdx],
-                           paddingY_[inIdx],
-                           padding_[inIdx],
-                           outputH_[inIdx],
-                           outputW_[inIdx]);
-  imageTmp->clear();
-}
-
-void ExpandConvBaseLayer::expandFwdOnce(MatrixPtr image,
-                                        MatrixPtr out,
-                                        int inIdx,
-                                        int startIdx) {
-  int subM = subM_[inIdx];
-  int subN = subN_[inIdx];
-  int subK = subK_[inIdx];
-
-  expandOneFrame(image, startIdx, inIdx);
-
-  int numFilters = isDeconv_ ? channels_[inIdx] : numFilters_;
-
-  real *outData = out->getData() + startIdx * subN * numFilters;
-
-  real *wgtData = weights_[inIdx]->getW()->getData();
-  real *expInData = expandInput_->getData();
-  for (int g = 0; g < groups_[inIdx]; ++g) {
-    MatrixPtr A =
-        Matrix::create(wgtData, subM, subK, false, useGpu_);  // mark transpose
-    MatrixPtr B = Matrix::create(expInData, subK, subN, false, useGpu_);
-    MatrixPtr C = Matrix::create(outData, subM, subN, false, useGpu_);
-    C->mul(*A, *B, 1, 1);
-
-    A->clear();
-    B->clear();
-    C->clear();
-    wgtData += subK * subM;
-    expInData += subK * subN;
-    outData += subM * subN;
-  }
-}
-
-void ExpandConvBaseLayer::bpropActs(MatrixPtr out,
-                                    MatrixPtr image,
-                                    int inpIdx) {
-  int channel = isDeconv_ ? numFilters_ : channels_[inpIdx];
-
-  int subM = subM_[inpIdx];
-  int subN = subN_[inpIdx];
-  int subK = subK_[inpIdx];
-  size_t batchSize = image->getHeight();
-
-  /* reset the expand-grad memory */
-  resetExpandInput(subK * groups_[inpIdx], subN);
-
-  real *localGradData = out->getData();
-  real *tgtGradData = image->getData();
-  for (size_t n = 0; n < batchSize; n++) {
-    real *wgtData = weights_[inpIdx]->getW()->getData();
-    real *expandInData = expandInput_->getData();
-
-    for (int g = 0; g < groups_[inpIdx]; g++) {
-      // create temporary matrix
-      MatrixPtr C = Matrix::create(expandInData, subK, subN, false, useGpu_);
-      MatrixPtr B = Matrix::create(localGradData, subM, subN, false, useGpu_);
-      MatrixPtr A = Matrix::create(wgtData, subM, subK, true, useGpu_);
-      C->mul(*A, *B);  // mul
-
-      // clear the temporary matrix
-      A->clear();
-      B->clear();
-      C->clear();
-
-      expandInData += subK * subN;
-      localGradData += subM * subN;
-      wgtData += subK * subM;
-    }
-
-    // shrink one frame outGrad
-    MatrixPtr oneGradTmp = Matrix::create(
-        expandInput_->getData(), subK * groups_[inpIdx], subN, false, useGpu_);
-    MatrixPtr vTmp =
-        Matrix::create(tgtGradData,
-                       1,
-                       imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channel,
-                       false,
-                       useGpu_);
-    vTmp->convShrink(*oneGradTmp,
-                     imgSizeH_[inpIdx],
-                     imgSizeW_[inpIdx],
-                     channel,
-                     filterSizeY_[inpIdx],
-                     filterSize_[inpIdx],
-                     strideY_[inpIdx],
-                     stride_[inpIdx],
-                     paddingY_[inpIdx],
-                     padding_[inpIdx],
-                     outputH_[inpIdx],
-                     outputW_[inpIdx],
-                     1.0f,
-                     1.0f);
-    vTmp->clear();
-    oneGradTmp->clear();
-
-    // move the data-pointer
-    tgtGradData += imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channel;
-  }
-}
-
-void ExpandConvBaseLayer::bpropWeights(MatrixPtr image,
-                                       MatrixPtr out,
-                                       int inpIdx) {
-  MatrixPtr weightGrad = weights_[inpIdx]->getWGrad();
-
-  int subM = subM_[inpIdx];
-  int subN = subN_[inpIdx];
-  int subK = subK_[inpIdx];
-  size_t batchSize = image->getHeight();
-  resetExpandInput(subK * groups_[inpIdx], subN);
-
-  real *gradData = out->getData();
-
-  for (size_t n = 0; n < batchSize; n++) {  // frame by frame
-    // expand
-    expandOneFrame(image, n, inpIdx);
-    real *wGradData = weightGrad->getData();
-    real *expandInData = expandInput_->getData();
-
-    // expand-mul one-group by one
-    for (int g = 0; g < groups_[inpIdx]; g++) {
-      MatrixPtr A = Matrix::create(expandInData, subK, subN, true, useGpu_);
-      MatrixPtr B = Matrix::create(gradData, subM, subN, false, useGpu_);
-      MatrixPtr C = Matrix::create(wGradData, subM, subK, false, useGpu_);
-      C->mul(*B, *A, 1, 1);
-
-      A->clear();
-      B->clear();
-      C->clear();
-      gradData += subM * subN;
-      wGradData += subK * subM;
-      expandInData += subK * subN;
-    }
-  }
-}
-
 void ExpandConvBaseLayer::bpropSharedBias(MatrixPtr biases, MatrixPtr v) {
   size_t mapW = getOutputSize() / numFilters_;
   size_t mapH = v->getElementCnt() / mapW;
diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.h b/paddle/gserver/layers/ExpandConvBaseLayer.h
index aabcdfc392d3e242df84c820c336d8b32c7cb04f..01c699d2344443a1887ec0b5005125f617cbe279 100644
--- a/paddle/gserver/layers/ExpandConvBaseLayer.h
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.h
@@ -26,19 +26,6 @@ namespace paddle {
  */
 class ExpandConvBaseLayer : public ConvBaseLayer {
 protected:
-  /// For expand convolution.
-  /// subM_ = numFilters_ / groups_.
-  IntV subM_;
-  /// subN_ = outputH_ * outputW_.
-  IntV subN_;
-  /// subK_ = channels_ * filterPixels_ * groups_.
-  IntV subK_;
-
-  /*The expandInput_ and transOutValue_ are used for CPU expand conv calc
-   * Expand one sample at a time. shape:
-   * (numChannels * filterPixels_, outputSizeH * outputSizeW)
-   * */
-  MatrixPtr expandInput_;
   /// The transpose of output, which is an auxiliary matrix.
   MatrixPtr transOutValue_;
 
@@ -52,10 +39,6 @@ public:
             const ParameterMap& parameterMap) override;
 
   size_t getOutputSize();
-  /**
-   * Create or resize expandInput_.
-   */
-  void resetExpandInput(size_t height, size_t width);
 
   /**
    * Add shared bias.
@@ -66,20 +49,9 @@ public:
    * Add unshared bias.
    */
   void addUnsharedBias();
-  /**
-   * Expand one input sample.
-   */
-  void expandOneFrame(MatrixPtr image, size_t startIdx, int inIdx);
-
-  /**
-   * Expand one input sample and perform matrix multiplication.
-   */
-  void expandFwdOnce(MatrixPtr image, MatrixPtr out, int inIdx, int startIdx);
 
   void bpropSharedBias(MatrixPtr biases, MatrixPtr v);
   void bpropBiases(MatrixPtr v);
-  void bpropWeights(MatrixPtr image, MatrixPtr out, int inpIdx);
-  void bpropActs(MatrixPtr image, MatrixPtr out, int inpIdx);
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp
index f9267b81a7d4264f5f43552e3d54a45e4b212e00..914689e66cdb8947e886e17e75829183c1af1a42 100644
--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@@ -18,32 +18,94 @@ limitations under the License. */
 
 namespace paddle {
 
+/*
+ * exconvt (convolution transpose, a.k.a. deconv) is computed by swapping
+ * the forward and backward calculations of exconv.
+ * */
 REGISTER_LAYER(exconv, ExpandConvLayer);
+REGISTER_LAYER(exconvt, ExpandConvLayer);
 
 bool ExpandConvLayer::init(const LayerMap &layerMap,
                            const ParameterMap &parameterMap) {
   /* Initialize the basic convolutional parent class */
   ExpandConvBaseLayer::init(layerMap, parameterMap);
+
+  size_t numInputs = config_.inputs_size();
+  inputShape_.resize(numInputs);
+  filterShape_.resize(numInputs);
+  outputShape_.resize(numInputs);
+  for (int i = 0; i < config_.inputs_size(); i++) {
+    std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
+    std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
+    createFunction(forward_,
+                   !isDeconv_ ? "GemmConv" : "GemmConvGradInput",
+                   FuncConfig()
+                       .set("paddings", paddings)
+                       .set("strides", strides)
+                       .set("groups", (size_t)groups_[i]));
+
+    createFunction(backward_,
+                   !isDeconv_ ? "GemmConvGradInput" : "GemmConv",
+                   FuncConfig()
+                       .set("paddings", paddings)
+                       .set("strides", strides)
+                       .set("groups", (size_t)groups_[i]));
+
+    createFunction(backward_,
+                   "GemmConvGradFilter",
+                   FuncConfig()
+                       .set("paddings", paddings)
+                       .set("strides", strides)
+                       .set("groups", (size_t)groups_[i]));
+  }
   return true;
 }
 
+// i: input-layer index; backward_ holds (grad-input, grad-filter) per input.
+#define BACKWARD_INPUT(i, inputs, outputs) \
+  backward_[2 * (i)]->calc((inputs), (outputs))
+#define BACKWARD_FILTER(i, inputs, outputs) \
+  backward_[2 * (i) + 1]->calc((inputs), (outputs))
+
 void ExpandConvLayer::forward(PassType passType) {
   Layer::forward(passType);
 
-  /* malloc memory for the output_ if necessary */
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight();
   resetOutput(batchSize, getOutputSize());
 
-  MatrixPtr image = nullptr;
-  MatrixPtr outV = getOutputValue();
+  // Calculate the shape of the input, output, and filter.
   for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    LayerPtr prevLayer = getPrev(i);
-    image = prevLayer->getOutputValue();
-    for (size_t off = 0; off < image->getHeight(); off++) {
-      REGISTER_TIMER_INFO("expandFwdOnce", getName().c_str());
-      expandFwdOnce(image, outV, i, off);
-    }
+    inputShape_[i] = TensorShape({(size_t)batchSize,
+                                  (size_t)channels_[i],
+                                  (size_t)imgSizeH_[i],
+                                  (size_t)imgSizeW_[i]});
+    filterShape_[i] =
+        TensorShape({(size_t)groups_[i],
+                     !isDeconv_ ? (size_t)numFilters_ / groups_[i]
+                                : (size_t)channels_[i] / groups_[i],
+                     !isDeconv_ ? (size_t)channels_[i] / groups_[i]
+                                : (size_t)numFilters_ / groups_[i],
+                     (size_t)filterSizeY_[i],
+                     (size_t)filterSize_[i]});
+    outputShape_[i] = TensorShape({(size_t)batchSize,
+                                   (size_t)numFilters_,
+                                   (size_t)outputH_[i],
+                                   (size_t)outputW_[i]});
   }
+
+  // Calculate the output value.
+  for (size_t i = 0; i < inputLayers_.size(); ++i) {
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*getInputValue(i), inputShape_[i]);
+    inputs.addArg(*weights_[i]->getW(), filterShape_[i]);
+    outputs.addArg(*getOutputValue(),
+                   outputShape_[i],
+                   !isDeconv_ && i == 0 ? ASSIGN_TO : ADD_TO);
+
+    forward_[i]->calc(inputs, outputs);
+  }
+
   /* add the bias-vector */
   if (biases_.get()) {
     if (sharedBiases_) {
@@ -67,14 +129,30 @@ void ExpandConvLayer::backward(const UpdateCallback &callback) {
     biases_->getParameterPtr()->incUpdate(callback);
   }
 
+  // Calculate the input grad and filter grad.
   for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    /* First, calculate the input layers error */
-    if (getPrev(i)->getOutputGrad()) {
-      bpropActs(outGrad, getPrev(i)->getOutputGrad(), i);
+    if (getInputGrad(i)) {
+      BufferArgs inputs;
+      BufferArgs outputs;
+      inputs.addArg(*getOutputGrad(), outputShape_[i]);
+      inputs.addArg(*weights_[i]->getW(), filterShape_[i]);
+      outputs.addArg(*getInputGrad(i), inputShape_[i], ADD_TO);
+      BACKWARD_INPUT(i, inputs, outputs);
     }
+
     if (weights_[i]->getWGrad()) {
-      /* Then, calculate the W-gradient for the current layer */
-      bpropWeights(getPrev(i)->getOutputValue(), outGrad, i);
+      BufferArgs inputs;
+      BufferArgs outputs;
+      if (!isDeconv_) {
+        inputs.addArg(*getOutputGrad(), outputShape_[i]);
+        inputs.addArg(*getInputValue(i), inputShape_[i]);
+      } else {
+        inputs.addArg(*getInputValue(i), inputShape_[i]);
+        inputs.addArg(*getOutputGrad(), outputShape_[i]);
+      }
+      outputs.addArg(*weights_[i]->getWGrad(), filterShape_[i], ADD_TO);
+      BACKWARD_FILTER(i, inputs, outputs);
+
       /* Increasing the number of gradient */
       weights_[i]->getParameterPtr()->incUpdate(callback);
     }
diff --git a/paddle/gserver/layers/ExpandConvLayer.h b/paddle/gserver/layers/ExpandConvLayer.h
index 60681690e5dd55b2e9aa4e1f25758db6033665a6..a1f943d1521547af0f82cec7da8a4efe9037cd71 100644
--- a/paddle/gserver/layers/ExpandConvLayer.h
+++ b/paddle/gserver/layers/ExpandConvLayer.h
@@ -40,6 +40,11 @@ public:
 
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback) override;
+
+protected:
+  std::vector<TensorShape> inputShape_;
+  std::vector<TensorShape> filterShape_;
+  std::vector<TensorShape> outputShape_;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandConvTransLayer.cpp b/paddle/gserver/layers/ExpandConvTransLayer.cpp
deleted file mode 100644
index 520586b13889790c94a3e29902a4ea0ee55e8555..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ExpandConvTransLayer.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ExpandConvTransLayer.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-/* The implementation of the convTransLayer is basically a swap of forward and
- * backward of the original convLayer.
- * The variable naming follows the convention of the convLayer.
- * */
-
-namespace paddle {
-
-REGISTER_LAYER(exconvt, ExpandConvTransLayer);
-
-bool ExpandConvTransLayer::init(const LayerMap &layerMap,
-                                const ParameterMap &parameterMap) {
-  /* Initialize the basic convolutional parent class */
-  ExpandConvBaseLayer::init(layerMap, parameterMap);
-
-  return true;
-}
-
-void ExpandConvTransLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  resetOutput(batchSize, getOutputSize());
-
-  MatrixPtr output = nullptr;
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    LayerPtr prevLayer = getPrev(i);
-    output = prevLayer->getOutputValue();
-    REGISTER_TIMER_INFO("shrinkFwd", getName().c_str());
-    bpropActs(output, getOutputValue(), i);
-  }
-
-  /* add the bias-vector */
-  if (biases_.get()) {
-    if (sharedBiases_) {
-      addSharedBias();
-    } else {
-      addUnsharedBias();
-    }
-  }
-
-  /* activation */
-  forwardActivation();
-}
-
-void ExpandConvTransLayer::backward(const UpdateCallback &callback) {
-  backwardActivation();
-
-  MatrixPtr imageGrad = getOutputGrad();
-  if (biases_ && biases_->getWGrad()) {
-    bpropBiases(imageGrad);
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    /* First, calculate the input layers error */
-    for (size_t off = 0; off < imageGrad->getHeight(); off++) {
-      if (getPrev(i)->getOutputGrad()) {
-        expandFwdOnce(imageGrad, getPrev(i)->getOutputGrad(), i, off);
-      }
-    }
-    if (weights_[i]->getWGrad()) {
-      /* Then, calculate the W-gradient for the current layer */
-      bpropWeights(imageGrad, getPrev(i)->getOutputValue(), i);
-      /* Increasing the number of gradient */
-      weights_[i]->getParameterPtr()->incUpdate(callback);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandConvTransLayer.h b/paddle/gserver/layers/ExpandConvTransLayer.h
deleted file mode 100644
index 00b8f241889fdd3f423d75dedd9068aa3674f190..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ExpandConvTransLayer.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "ExpandConvBaseLayer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief A subclass of convolution layer.
- * This layer expands input and use matrix multiplication to
- * calculate convolution transpose (deconv) operation.
- *
- * The config file api is img_conv_layer with flag trans=True.
- */
-class ExpandConvTransLayer : public ExpandConvBaseLayer {
-public:
-  explicit ExpandConvTransLayer(const LayerConfig& config)
-      : ExpandConvBaseLayer(config) {}
-
-  ~ExpandConvTransLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/FeatureMapExpandLayer.cpp b/paddle/gserver/layers/FeatureMapExpandLayer.cpp
index b3850f543af74abbddaac5bb0a32851f2d3297d0..8a2ae6b49fcc13ed22eca2a33c8296827812bff9 100644
--- a/paddle/gserver/layers/FeatureMapExpandLayer.cpp
+++ b/paddle/gserver/layers/FeatureMapExpandLayer.cpp
@@ -40,6 +40,7 @@ namespace paddle {
 class FeatureMapExpandLayer : public Layer {
 private:
   int numFilters_;
+  bool asRowVector_;
 
 public:
   explicit FeatureMapExpandLayer(const LayerConfig& config) : Layer(config) {}
@@ -62,6 +63,7 @@ bool FeatureMapExpandLayer::init(const LayerMap& layerMap,
 
   CHECK_EQ(inputLayers_.size(), 1UL);
   numFilters_ = config_.num_filters();
+  asRowVector_ = config_.user_arg() != "as_col_vec";
   return true;
 }
 
@@ -76,16 +78,30 @@ void FeatureMapExpandLayer::forward(PassType passType) {
 
   {
     AsyncGpuBlock asyncGpuBlock;
-    for (size_t i = 0; i < batchSize; i++) {
-      MatrixPtr outVTmp =
-          Matrix::create(outputV->getData() + i * imgSize * numFilters_,
-                         numFilters_,
-                         imgSize,
-                         false,
-                         useGpu_);
-      MatrixPtr inVTmp = Matrix::create(
-          inputV->getData() + i * imgSize, 1, imgSize, false, useGpu_);
-      outVTmp->addRowVector(*inVTmp);
+    if (asRowVector_) {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outVTmp =
+            Matrix::create(outputV->getData() + i * imgSize * numFilters_,
+                           numFilters_,
+                           imgSize,
+                           false,
+                           useGpu_);
+        MatrixPtr inVTmp = Matrix::create(
+            inputV->getData() + i * imgSize, 1, imgSize, false, useGpu_);
+        outVTmp->addRowVector(*inVTmp);
+      }
+    } else {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outVTmp =
+            Matrix::create(outputV->getData() + i * imgSize * numFilters_,
+                           imgSize,
+                           numFilters_,
+                           false,
+                           useGpu_);
+        MatrixPtr inVTmp = Matrix::create(
+            inputV->getData() + i * imgSize, imgSize, 1, false, useGpu_);
+        outVTmp->addColVector(*inVTmp);
+      }
     }
   }
   /* activation */ {
@@ -102,24 +118,38 @@ void FeatureMapExpandLayer::backward(const UpdateCallback& callback) {
   MatrixPtr outGrad = getOutputGrad();
   size_t batchSize = getInput(0).getBatchSize();
   int imgSize = inGrad->getWidth();
+  /* Do activation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
   {
     AsyncGpuBlock asyncGpuBlock;
-    for (size_t i = 0; i < batchSize; i++) {
-      MatrixPtr outGradTmp =
-          Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
-                         numFilters_,
-                         imgSize,
-                         false,
-                         useGpu_);
-      MatrixPtr inGradTmp = Matrix::create(
-          inGrad->getData() + i * imgSize, 1, imgSize, false, useGpu_);
-      inGradTmp->collectBias(*outGradTmp, 1);
+    if (asRowVector_) {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outGradTmp =
+            Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
+                           numFilters_,
+                           imgSize,
+                           false,
+                           useGpu_);
+        MatrixPtr inGradTmp = Matrix::create(
+            inGrad->getData() + i * imgSize, 1, imgSize, false, useGpu_);
+        inGradTmp->collectBias(*outGradTmp, 1);
+      }
+    } else {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outGradTmp =
+            Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
+                           imgSize,
+                           numFilters_,
+                           false,
+                           useGpu_);
+        MatrixPtr inGradTmp = Matrix::create(
+            inGrad->getData() + i * imgSize, imgSize, 1, false, useGpu_);
+        inGradTmp->sumRows(*outGradTmp, 1, 1);
+      }
     }
   }
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
 }
 
 }  // namespace paddle.
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
index 125aaf947f3c9d976b117667d1d1b7700a029cc6..4b92b5d163ad107c0783beae45f8c936112fcccf 100644
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -191,6 +191,11 @@ void Layer::addOutputArgument(int deviceId) {
 void Layer::copyOutputToOtherDevice() {
   for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
     SetDevice device(outputOtherDevice_[i].deviceId);
+    // If outputOtherDevice_[i].value is a CpuMatrix,
+    // the copyFrom is a synchronous interface.
+    // If outputOtherDevice_[i].value is a GpuMatrix, since subsequent
+    // calculations are all on HPPL_STREAM_DEFAULT,
+    // copyFrom can be an asynchronous interface.
     outputOtherDevice_[i].value->copyFrom(*getOutputValue(),
                                           HPPL_STREAM_DEFAULT);
     outputOtherDevice_[i].sequenceStartPositions =
diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/gserver/layers/PrintLayer.cpp
index de198af111be4200dd1b240f6de9464e3f43b06d..a97fa6bf78fce27a4e0cf329bf3309ba4a439965 100644
--- a/paddle/gserver/layers/PrintLayer.cpp
+++ b/paddle/gserver/layers/PrintLayer.cpp
@@ -22,10 +22,33 @@ public:
 
   void forward(PassType passType) override {
     Layer::forward(passType);
+    std::vector<std::string> vals;
     for (size_t i = 0; i != inputLayers_.size(); ++i) {
-      getInput(i).printValueString(LOG(INFO),
-                                   "layer=" + inputLayers_[i]->getName() + " ");
+      std::ostringstream s;
+      getInput(i).printValueString(s, "");
+      vals.push_back(s.str());
     }
+    // Replace each "%s" in the format with the next input's printed value.
+    const std::string& format = config_.user_arg();
+    std::ostringstream s;
+    size_t pos = 0;
+    size_t i = 0;  // unsigned, to match vals.size() and inputLayers_.size()
+    while (i < vals.size()) {
+      size_t pos1 = format.find("%s", pos);
+      if (pos1 == std::string::npos) break;
+      s << format.substr(pos, pos1 - pos) << vals[i];
+      pos = pos1 + 2;
+      ++i;
+    }
+    // Report a mismatch in either direction: too few or too many "%s".
+    const bool hasExtra = format.find("%s", pos) != std::string::npos;
+    if (i != inputLayers_.size() || hasExtra) {
+      LOG(ERROR) << "Number of value in the format (" << format
+                 << ") is not same as the number of inputs ("
+                 << inputLayers_.size() << ") at " << getName();
+    }
+    s << format.substr(pos);
+    LOG(INFO) << s.str();
   }
 
   void backward(const UpdateCallback& callback) override {}
diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp
index 235d9a9b0f0653df5c0b671092df9e195f08fc48..4179a9e7e0cb58fcb49bff712e62b9f3fea373bd 100644
--- a/paddle/gserver/layers/SequencePoolLayer.cpp
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
@@ -46,6 +46,9 @@ void SequencePoolLayer::forward(PassType passType) {
   Layer::forward(passType);
 
   const Argument& input = getInput(0);
+  CHECK(input.hasSeq() || input.hasSubseq())
+      << "Input should be a sequence or subsequence for layer " << getName();
+
   newBatchSize_ = type_ ? input.getNumSubSequences() : input.getNumSequences();
   size_t dim = getSize();
   // check
diff --git a/paddle/gserver/tests/rnn_data_provider.py b/paddle/gserver/tests/rnn_data_provider.py
index 3afd45c72f4dd071ddca569caac8716fe102299b..913365a5a4037d14fcba1e1546508ba89668e0d6 100644
--- a/paddle/gserver/tests/rnn_data_provider.py
+++ b/paddle/gserver/tests/rnn_data_provider.py
@@ -95,3 +95,22 @@ def process_unequalength_seq(settings, file_name):
         words1 = reduce(lambda x, y: x + y, d[0])
         words2 = reduce(lambda x, y: x + y, d[1])
         yield words1, words2, d[2]
+
+
+###########################################################
+data3 = [
+    [[[1, 2], [4, 5, 2]], [1, 2], 0],
+    [[[0, 2], [2, 5], [0, 1, 2]], [2, 3, 0], 1],
+]
+
+
+# Used for sequence_rnn_mixed_inputs.py and sequence_rnn_matched_inputs.py
+@provider(
+    input_types=[
+        integer_value_sub_sequence(10), integer_value_sequence(10),
+        integer_value(2)
+    ],
+    should_shuffle=False)
+def process_mixed(settings, file_name):
+    for d in data3:
+        yield d
diff --git a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
index ad14a2c927c89c9b480af5ad565c37e8b2e54469..afdacfffd7aecfe2f4762f04a987126381bcea34 100644
--- a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
+++ b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
@@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import *
 define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
                         test_list=None,
                         module='rnn_data_provider',
-                        obj='process_subseq2')
+                        obj='process_subseq')
 
 
 settings(batch_size=2, learning_rate=0.01)
@@ -57,7 +57,7 @@ def outer_step(wid, x):
     last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
 
     # "return last" should also work. But currently RecurrentGradientMachine
-    # does not handle it, and will report error: In hierachical RNN, all out 
+    # does not handle it, and will report error: In hierachical RNN, all out
     # links should be from sequences now.
     return inner_rnn_output
 
diff --git a/paddle/gserver/tests/sequence_rnn_matched_inputs.py b/paddle/gserver/tests/sequence_rnn_matched_inputs.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2635b4400b13517bac716a5a0affeb16c218b09
--- /dev/null
+++ b/paddle/gserver/tests/sequence_rnn_matched_inputs.py
@@ -0,0 +1,85 @@
+# edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/dummy.list',
+    test_list=None,
+    module='rnn_data_provider',
+    obj='process_mixed')
+
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 2
+hidden_dim = 2
+label_dim = 2
+
+data1 = data_layer(name="word1", size=dict_dim)
+data2 = data_layer(name="word2", size=dict_dim)
+label = data_layer(name="label", size=label_dim)
+
+encoding = embedding_layer(input=data2, size=word_dim)
+
+subseq = embedding_layer(input=data1, size=word_dim)
+seq = embedding_layer(input=data2, size=word_dim)
+nonseq = embedding_layer(input=label, size=word_dim)
+
+
+# This hierarchical RNN is designed to be equivalent to the one in
+# sequence_rnn_mixed_inputs.py, with seq and nonseq expanded to match subseq.
+def outer_step(subseq, seq, nonseq, encoding):
+    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
+
+    def inner_step(subseq, seq, nonseq):
+        inner_mem = memory(
+            name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem)
+
+        out = fc_layer(
+            input=[subseq, seq, nonseq, inner_mem],
+            size=hidden_dim,
+            act=TanhActivation(),
+            bias_attr=True,
+            name='inner_rnn_state')
+        return out
+
+    decoder = recurrent_group(
+        step=inner_step, name='inner', input=[subseq, seq, nonseq])
+    last = last_seq(name="outer_rnn_state", input=decoder)
+    context = simple_attention(
+        encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last)
+    return context
+
+
+out = recurrent_group(
+    name="outer",
+    step=outer_step,
+    input=[
+        subseq, expand_layer(
+            seq, expand_as=subseq,
+            expand_level=ExpandLevel.FROM_SEQUENCE), expand_layer(
+                nonseq,
+                expand_as=subseq,
+                expand_level=ExpandLevel.FROM_NO_SEQUENCE),
+        StaticInput(encoding)
+    ])
+
+rep = last_seq(input=out)
+prob = fc_layer(
+    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
+
+outputs(classification_cost(input=prob, label=label))
diff --git a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py
new file mode 100644
index 0000000000000000000000000000000000000000..84a66e294495c01e03dc83b38a531e482bed1292
--- /dev/null
+++ b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py
@@ -0,0 +1,79 @@
+# edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/dummy.list',
+    test_list=None,
+    module='rnn_data_provider',
+    obj='process_mixed')
+
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 2
+hidden_dim = 2
+label_dim = 2
+
+data1 = data_layer(name="word1", size=dict_dim)
+data2 = data_layer(name="word2", size=dict_dim)
+label = data_layer(name="label", size=label_dim)
+
+encoding = embedding_layer(input=data2, size=word_dim)
+
+
+# This hierarchical RNN with mixed-level inputs is designed to be equivalent
+# to the one in sequence_rnn_matched_inputs.py.
+def outer_step(subseq, seq, nonseq, encoding):
+    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
+
+    def inner_step(data1, data2, label):
+        inner_mem = memory(
+            name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem)
+
+        subseq = embedding_layer(input=data1, size=word_dim)
+        seq = embedding_layer(input=data2, size=word_dim)
+        nonseq = embedding_layer(input=label, size=word_dim)
+
+        print_layer(input=[data1, seq, label, inner_mem])
+        out = fc_layer(
+            input=[subseq, seq, nonseq, inner_mem],
+            size=hidden_dim,
+            act=TanhActivation(),
+            bias_attr=True,
+            name='inner_rnn_state')
+        return out
+
+    decoder = recurrent_group(
+        step=inner_step, name='inner',
+        input=[subseq, StaticInput(seq), nonseq])
+    last = last_seq(name="outer_rnn_state", input=decoder)
+    context = simple_attention(
+        encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last)
+    return context
+
+
+out = recurrent_group(
+    name="outer",
+    step=outer_step,
+    input=[data1, data2, StaticInput(label), StaticInput(encoding)])
+
+rep = last_seq(input=out)
+prob = fc_layer(
+    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
+
+outputs(classification_cost(input=prob, label=label))
diff --git a/paddle/gserver/tests/sequence_rnn_multi_input.conf b/paddle/gserver/tests/sequence_rnn_multi_input.conf
index 40d031741573251aa94d2a0f355470c53c51de7e..9fae974f3079c49ad03d6ba34e30190f325414e8 100644
--- a/paddle/gserver/tests/sequence_rnn_multi_input.conf
+++ b/paddle/gserver/tests/sequence_rnn_multi_input.conf
@@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import *
 define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
                         test_list=None,
                         module='rnn_data_provider',
-                        obj='process_seq2')
+                        obj='process_seq')
 
 
 settings(batch_size=2, learning_rate=0.01)
diff --git a/paddle/gserver/tests/test_BatchNorm.cpp b/paddle/gserver/tests/test_BatchNorm.cpp
index d07299bfe3c4147742384a45dc6f1698d9c382f4..83fcfed46cd568d22237eeef9c0215e4e3ad2666 100644
--- a/paddle/gserver/tests/test_BatchNorm.cpp
+++ b/paddle/gserver/tests/test_BatchNorm.cpp
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <vector>
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/gserver/layers/ExpandConvTransLayer.h"
 #include "paddle/trainer/Trainer.h"
 #include "paddle/utils/GlobalConstants.h"
 
diff --git a/paddle/gserver/tests/test_ConvTrans.cpp b/paddle/gserver/tests/test_ConvTrans.cpp
index 40bb1e2d73c81280a9b12114c13de851285c276b..6035a866b4eee4c6a61fa93f3adbf5e1d2d549f7 100644
--- a/paddle/gserver/tests/test_ConvTrans.cpp
+++ b/paddle/gserver/tests/test_ConvTrans.cpp
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <vector>
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/gserver/layers/ExpandConvTransLayer.h"
 #include "paddle/math/MathUtils.h"
 #include "paddle/trainer/Trainer.h"
 #include "paddle/utils/GlobalConstants.h"
diff --git a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp
index 54b72375b743fe025e0ded5fdbce5699a0b4be1a..e7325e0cc3b7195b5fec77c878e3e087cfc643e0 100644
--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <vector>
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/gserver/layers/ExpandConvTransLayer.h"
 #include "paddle/math/MathUtils.h"
 #include "paddle/trainer/Trainer.h"
 #include "paddle/utils/GlobalConstants.h"
diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp
index 4f5fdbb37ce024e18b8d39c5dda74c69bf82166a..93996392d221d531f65caf465decbffdbc2d0384 100644
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ b/paddle/gserver/tests/test_Evaluator.cpp
@@ -138,6 +138,23 @@ void testEvaluatorAll(TestConfig testConf,
   testEvaluator(testConf, testEvaluatorName, batchSize, false);
 }
 
+TEST(Evaluator, detection_map) {
+  TestConfig config;
+  config.evaluatorConfig.set_type("detection_map");
+  config.evaluatorConfig.set_overlap_threshold(0.5);
+  config.evaluatorConfig.set_background_id(0);
+  config.evaluatorConfig.set_ap_type("Integral");
+  config.evaluatorConfig.set_evaluate_difficult(0);
+
+  config.inputDefs.push_back({INPUT_DATA, "output", 7});
+  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "label", 6});
+  config.evaluatorConfig.set_evaluate_difficult(false);
+  testEvaluatorAll(config, "detection_map", 100);
+
+  config.evaluatorConfig.set_evaluate_difficult(true);
+  testEvaluatorAll(config, "detection_map", 100);
+}
+
 TEST(Evaluator, classification_error) {
   TestConfig config;
   config.evaluatorConfig.set_type("classification_error");
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 6adffcf53b7966bd6f3d02970e5f07cc9802f469..297756025bcad79d49ec321414ed2e91f1c0758a 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1598,12 +1598,15 @@ TEST(Layer, FeatureMapExpandLayer) {
                               /* paraSize= */ 0});
   config.layerConfig.add_inputs();
   for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "featmap_expand",
-                  /*batch_size*/ 100,
-                  /* trans= */ false,
-                  useGpu,
-                  /* useWeight */ true);
+    for (auto asRowVec : {false, true}) {
+      config.layerConfig.set_user_arg(asRowVec ? "as_row_vec" : "as_col_vec");
+      testLayerGrad(config,
+                    "featmap_expand",
+                    /*batch_size*/ 100,
+                    /* trans= */ false,
+                    useGpu,
+                    /* useWeight */ true);
+    }
   }
 }
 
diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
index 4a846397e6cf3100f948af46874b0739e32bf4a5..6b19eb0ce520a625ac68582d5c1e11c168127dc7 100644
--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -155,6 +155,15 @@ TEST(RecurrentGradientMachine, rnn_multi_unequalength_input) {
   }
 }
 
+TEST(RecurrentGradientMachine, rnn_mixed_input) {
+  for (bool useGpu : {false, true}) {
+    test("gserver/tests/sequence_rnn_mixed_inputs.py",
+         "gserver/tests/sequence_rnn_matched_inputs.py",
+         1e-6,
+         useGpu);
+  }
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
 
diff --git a/paddle/majel/.gitignore b/paddle/majel/.gitignore
deleted file mode 100644
index 1f5acdebb56971202b63d2485e2ac5042786f13c..0000000000000000000000000000000000000000
--- a/paddle/majel/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-build
-third-party
\ No newline at end of file
diff --git a/paddle/majel/detail/cuda_assert.h b/paddle/majel/detail/cuda_assert.h
deleted file mode 100644
index 9490d0ae3eff01bdb4403de710b7bfd878e87f03..0000000000000000000000000000000000000000
--- a/paddle/majel/detail/cuda_assert.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#pragma once
-
-#define STRINGIFY(x) #x
-#define TOSTRING(x) STRINGIFY(x)
-
-#if defined(__APPLE__) && defined(__CUDA_ARCH__) && !defined(NDEBUG)
-#include <stdio.h>
-#define MAJEL_ASSERT(e)                                                       \
-  do {                                                                        \
-    if (!(e)) {                                                               \
-      printf(                                                                 \
-          "%s:%d Assertion `%s` failed.\n", __FILE__, __LINE__, TOSTRING(e)); \
-      asm("trap;");                                                           \
-    }                                                                         \
-  } while (0)
-
-#define MAJEL_ASSERT_MSG(e, m)                      \
-  do {                                              \
-    if (!(e)) {                                     \
-      printf("%s:%d Assertion `%s` failed (%s).\n", \
-             __FILE__,                              \
-             __LINE__,                              \
-             TOSTRING(e),                           \
-             m);                                    \
-      asm("trap;");                                 \
-    }                                               \
-  } while (0)
-#else
-#include <assert.h>
-#define MAJEL_ASSERT(e) assert(e)
-#define MAJEL_ASSERT_MSG(e, m) assert((e) && (m))
-#endif
diff --git a/paddle/majel/dim_test.cu b/paddle/majel/dim_test.cu
deleted file mode 100644
index a7d81e595bea7fa6326ea350e2702e1ef8f5caa4..0000000000000000000000000000000000000000
--- a/paddle/majel/dim_test.cu
+++ /dev/null
@@ -1,128 +0,0 @@
-#include <thrust/device_vector.h>
-#include <sstream>
-
-#include "paddle/majel/dim.h"
-#include "gtest/gtest.h"
-
-__global__ void test(majel::Dim<2>* o) {
-    o[0] = majel::make_dim(5, 6);
-}
-
-__global__ void dyn_idx_gpu(int* o) {
-    auto d = majel::make_dim(5, 6);
-    o[0] = d[1];
-}
-
-TEST(Dim, Equality) {
-    // construct a Dim on the CPU
-    auto a = majel::make_dim(3, 4);
-    EXPECT_EQ(majel::get<0>(a), 3);
-    EXPECT_EQ(majel::get<1>(a), 4);
-
-    // construct a Dim on the GPU
-    thrust::device_vector<majel::Dim<2>> t(2);
-    test<<<1,1>>>(thrust::raw_pointer_cast(t.data()));
-    a = t[0];
-    EXPECT_EQ(majel::get<0>(a), 5);
-    EXPECT_EQ(majel::get<1>(a), 6);
-
-    // linearization
-    auto b = majel::make_dim(7, 8);
-    EXPECT_EQ(majel::linearize(a, b), 83);
-
-    // product
-    EXPECT_EQ(majel::product(a), 30);
-
-    // mutate a Dim
-    majel::get<1>(b) = 10;
-    EXPECT_EQ(majel::get<0>(b), 7);
-    EXPECT_EQ(majel::get<1>(b), 10);
-
-    // dynamic access
-    majel::get(b, 0) = 8;
-    b[1] = 11;
-    EXPECT_EQ(majel::get<0>(b), 8);
-    EXPECT_EQ(majel::get<1>(b), 11);
-    EXPECT_EQ(majel::get(b, 0), 8);
-    EXPECT_EQ(b[1], 11);
-
-    // dynamic access on GPU
-    thrust::device_vector<int> r(1);
-    dyn_idx_gpu<<<1,1>>>(thrust::raw_pointer_cast(r.data()));
-    int res = r[0];
-    EXPECT_EQ(res, 6);
-
-    // ex_prefix_mul
-    majel::Dim<3> c = majel::ex_prefix_mul(majel::Dim<3>(3, 4, 5));
-    EXPECT_EQ(majel::get<0>(c), 1);
-    EXPECT_EQ(majel::get<1>(c), 3);
-    EXPECT_EQ(majel::get<2>(c), 12);
-
-    // contiguous_strides
-    c = majel::contiguous_strides(majel::Dim<3>(10, 1, 10));
-    EXPECT_EQ(majel::get<0>(c), 1);
-    EXPECT_EQ(majel::get<1>(c), 0);
-    EXPECT_EQ(majel::get<2>(c), 10);
-    c = majel::contiguous_strides(majel::Dim<3>(10, 10, 1));
-    EXPECT_EQ(majel::get<0>(c), 1);
-    EXPECT_EQ(majel::get<1>(c), 10);
-    EXPECT_EQ(majel::get<2>(c), 0);
-    c = majel::contiguous_strides(majel::Dim<3>(1, 10, 10));
-    EXPECT_EQ(majel::get<0>(c), 0);
-    EXPECT_EQ(majel::get<1>(c), 1);
-    EXPECT_EQ(majel::get<2>(c), 10);
-    c = majel::contiguous_strides(majel::Dim<3>(2, 3, 4));
-    EXPECT_EQ(majel::get<0>(c), 1);
-    EXPECT_EQ(majel::get<1>(c), 2);
-    EXPECT_EQ(majel::get<2>(c), 6);
-
-    // generate from an index
-    auto size = majel::make_dim(4, 5, 2);
-    c = majel::Dim<3>(14, size);
-    EXPECT_EQ(majel::get<0>(c), 2);
-    EXPECT_EQ(majel::get<1>(c), 3);
-    EXPECT_EQ(majel::get<2>(c), 0);
-    c = majel::Dim<3>(25, size);
-    EXPECT_EQ(majel::get<0>(c), 1);
-    EXPECT_EQ(majel::get<1>(c), 1);
-    EXPECT_EQ(majel::get<2>(c), 1);
-}
-
-TEST(Dim, Bool) {
-    auto a = majel::make_dim(3, 4);
-    auto b = majel::make_dim(5, 6);
-    auto c = majel::make_dim(3, 4);
-
-    // in_bounds check
-    EXPECT_TRUE(majel::contained(a, b));
-    EXPECT_FALSE(majel::contained(b, a));
-
-    // comparison
-    EXPECT_TRUE(a == a);
-    EXPECT_FALSE(a == b);
-    EXPECT_TRUE(a == c);
-
-    // contiguous check
-    int x = 4, y = 5, z = 2;
-    majel::Dim<3> sizef(x, y, z);
-    majel::Dim<3> stridea(1, x, x*y);
-    majel::Dim<3> strideb(2, 2*x, 2*x*y);
-    majel::Dim<3> stridec(1, x, 2*x*y);
-    EXPECT_TRUE(majel::contiguous(sizef, stridea));
-    EXPECT_FALSE(majel::contiguous(sizef, strideb));
-    EXPECT_FALSE(majel::contiguous(sizef, stridec));
-}
-
-TEST(Dim, Print) {
-    {
-        std::stringstream ss;
-        auto a = majel::make_dim(2, 3);
-        ss << a;
-        EXPECT_EQ(ss.str(), "2, 3");
-    }
-    {
-        std::stringstream ss;
-        ss << majel::make_dim(8);
-        EXPECT_EQ(ss.str(), "8");
-    }
-}
diff --git a/paddle/majel/place.cc b/paddle/majel/place.cc
deleted file mode 100644
index ca50b37843e0ba047f8f8b8d24a3d3c131587382..0000000000000000000000000000000000000000
--- a/paddle/majel/place.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-#include "paddle/majel/place.h"
-
-namespace majel {
-
-namespace detail {
-
-class PlacePrinter : public boost::static_visitor<> {
-private:
-  std::ostream& os_;
-
-public:
-  PlacePrinter(std::ostream& os) : os_(os) {}
-
-  void operator()(const CpuPlace&) { os_ << "CpuPlace"; }
-
-  void operator()(const GpuPlace& p) { os_ << "GpuPlace(" << p.device << ")"; }
-};
-
-}  // namespace detail
-
-static Place the_default_place;
-
-void set_place(const Place& place) { the_default_place = place; }
-
-const Place& get_place() { return the_default_place; }
-
-const GpuPlace default_gpu() { return GpuPlace(0); }
-
-const CpuPlace default_cpu() { return CpuPlace(); }
-
-bool is_gpu_place(const Place& p) {
-  return boost::apply_visitor(IsGpuPlace(), p);
-}
-
-bool is_cpu_place(const Place& p) {
-  return !boost::apply_visitor(IsGpuPlace(), p);
-}
-
-bool places_are_same_class(const Place& p1, const Place& p2) {
-  return is_gpu_place(p1) == is_gpu_place(p2);
-}
-
-std::ostream& operator<<(std::ostream& os, const majel::Place& p) {
-  majel::detail::PlacePrinter printer(os);
-  boost::apply_visitor(printer, p);
-  return os;
-}
-
-}  // namespace majel
diff --git a/paddle/majel/place.h b/paddle/majel/place.h
deleted file mode 100644
index ad3dc3fe0b80ac5dc10a59910c580d7912469cd4..0000000000000000000000000000000000000000
--- a/paddle/majel/place.h
+++ /dev/null
@@ -1,50 +0,0 @@
-#pragma once
-#include <boost/variant.hpp>
-#include <iostream>
-
-namespace majel {
-
-struct CpuPlace {
-  CpuPlace() {}  // WORKAROUND: for some reason, omitting this constructor
-                 // causes errors with boost 1.59 and OSX
-  // needed for variant equality comparison
-  inline bool operator==(const CpuPlace&) const { return true; }
-
-  inline bool operator!=(const CpuPlace&) const { return false; }
-};
-
-struct GpuPlace {
-  GpuPlace(int d) : device(d) {}
-
-  // needed for variant equality comparison
-  inline bool operator==(const GpuPlace& o) const { return device == o.device; }
-
-  inline bool operator!=(const GpuPlace& o) const { return !(*this == o); }
-
-  GpuPlace() : GpuPlace(0) {}
-  int device;
-};
-
-class IsGpuPlace : public boost::static_visitor<bool> {
-public:
-  bool operator()(const CpuPlace&) const { return false; }
-
-  bool operator()(const GpuPlace& gpu) const { return true; }
-};
-
-typedef boost::variant<GpuPlace, CpuPlace> Place;
-
-void set_place(const Place&);
-
-const Place& get_place();
-
-const GpuPlace default_gpu();
-const CpuPlace default_cpu();
-
-bool is_gpu_place(const Place&);
-bool is_cpu_place(const Place&);
-bool places_are_same_class(const Place&, const Place&);
-
-std::ostream& operator<<(std::ostream&, const majel::Place&);
-
-}  // namespace majel
diff --git a/paddle/majel/place_test.cc b/paddle/majel/place_test.cc
deleted file mode 100644
index 6a099ae6b6e4f63a6ce845ab17eaab6e12c2c0b0..0000000000000000000000000000000000000000
--- a/paddle/majel/place_test.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-#include "paddle/majel/place.h"
-#include <sstream>
-#include "gtest/gtest.h"
-
-TEST(Place, Equality) {
-  majel::CpuPlace cpu;
-  majel::GpuPlace g0(0), g1(1), gg0(0);
-
-  EXPECT_EQ(cpu, cpu);
-  EXPECT_EQ(g0, g0);
-  EXPECT_EQ(g1, g1);
-  EXPECT_EQ(g0, gg0);
-
-  EXPECT_NE(g0, g1);
-
-  EXPECT_TRUE(majel::places_are_same_class(g0, gg0));
-  EXPECT_FALSE(majel::places_are_same_class(g0, cpu));
-}
-
-TEST(Place, Default) {
-  EXPECT_TRUE(majel::is_gpu_place(majel::get_place()));
-  EXPECT_TRUE(majel::is_gpu_place(majel::default_gpu()));
-  EXPECT_TRUE(majel::is_cpu_place(majel::default_cpu()));
-
-  majel::set_place(majel::CpuPlace());
-  EXPECT_TRUE(majel::is_cpu_place(majel::get_place()));
-}
-
-TEST(Place, Print) {
-  {
-    std::stringstream ss;
-    ss << majel::GpuPlace(1);
-    EXPECT_EQ("GpuPlace(1)", ss.str());
-  }
-  {
-    std::stringstream ss;
-    ss << majel::CpuPlace();
-    EXPECT_EQ("CpuPlace", ss.str());
-  }
-}
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index c910146164ebfb0737583c72c48ce6dbc5b49939..4431d613f655c1d0c8da13bb5ac9225980c650ad 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -1565,6 +1565,8 @@ void CpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
                     const_cast<real*>(src.getData()),
                     sizeof(real) * elementCnt_,
                     stream);
+    // Synchronize the stream so the copy has finished before this call returns.
+    hl_stream_synchronize(stream);
   } else if (typeid(src) == typeid(CpuMatrix)) {
     memcpy(data_, src.getData(), sizeof(real) * elementCnt_);
   } else {
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 748be850b4c902d1b48c1dafbb0d5ea2bf197e6e..7dfd593225065e18830b2b0c0ce854fe7a2d5178 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -239,7 +239,8 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
-  // asynchronous copy
+  // For GpuMatrix this is an asynchronous copy interface
+  // For CpuMatrix this is a synchronous copy interface
   virtual void copyFrom(const Matrix& src, hl_stream_t stream) {
     LOG(FATAL) << "Not implemented";
   }
diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp
index eaa1cdce305c2f9d7a517e9e8c8606dc1f70780b..eb87ee9bb7936d27c0c32a1a4b35ff49871c0a10 100644
--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
@@ -657,6 +657,8 @@ void CpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) {
                     (void*)src.getData(),
                     sizeof(T) * this->getSize(),
                     stream);
+    // Synchronize the stream so the copy has finished before this call returns.
+    hl_stream_synchronize(stream);
   } else {
     src.copyTo(this);
   }
@@ -908,12 +910,13 @@ const T* CpuGpuVectorT<T>::getData(bool useGpu) const {
 // Operation will change data and need to reset sync_ & syncFlag_.
 #define MUTABLE_VECTOR_OP(OP, useGpu, args...) \
   do {                                         \
-    setSync(useGpu);                           \
     if (useGpu) {                              \
       copyToGpu();                             \
+      setSync(useGpu);                         \
       return gpuVectorT_->OP(args);            \
     } else {                                   \
       copyToCpu();                             \
+      setSync(useGpu);                         \
       return cpuVectorT_->OP(args);            \
     }                                          \
   } while (0)
@@ -1030,7 +1033,7 @@ void CpuGpuVectorT<T>::copyToCpu() {
     case DATA_AT_GPU:
       CHECK(gpuVectorT_);
       this->resizeOrCreate(gpuVectorT_->getSize(), false);
-      cpuVectorT_->copyFrom(*gpuVectorT_, HPPL_STREAM_DEFAULT);
+      cpuVectorT_->copyFrom(*gpuVectorT_);
       setSync(SYNCED);
       break;
     case DATA_AT_CPU:
@@ -1049,7 +1052,7 @@ void CpuGpuVectorT<T>::copyToGpu() {
     case DATA_AT_CPU:
       CHECK(cpuVectorT_);
       this->resizeOrCreate(cpuVectorT_->getSize(), true);
-      gpuVectorT_->copyFrom(*cpuVectorT_, HPPL_STREAM_DEFAULT);
+      gpuVectorT_->copyFrom(*cpuVectorT_);
       setSync(SYNCED);
       break;
     case DATA_AT_GPU:
diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h
index 9af6e30c9e13895ad95653a787ec1c1ad77a248f..80b9775fccf10c57bb48145ef56165ec7c86d8b8 100644
--- a/paddle/math/Vector.h
+++ b/paddle/math/Vector.h
@@ -168,11 +168,11 @@ public:
   virtual void copyFrom(const VectorT<T>& src) = 0;
 
   /**
-   * If use_gpu, this function will push the copy-task to the specifed-stream
-   * and return immediately.
+   * For GpuVector, this function is an asynchronous interface: it pushes
+   * the copy task to the specified stream and returns immediately.
    *
-   * If not use GPU, this function is same as
-   * the copyFrom(const VectorT<T>& src), which use stream HPPL_STREAM_DEFAULT.
+   * For CpuVector, this function is a synchronous interface,
+   * the same as copyFrom(const VectorT<T>& src).
    */
   virtual void copyFrom(const VectorT<T>& src, hl_stream_t stream) = 0;
 
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 5a0dffe086c4e265d17c79dba435b66c0873e3c7..354f58df39365410ff9aec2576c768e58db9e0d2 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -1127,4 +1127,18 @@ TEST(Matrix, MaxOutFwdBwd) {
   }
 }
 
+TEST(CpuMatrix, copyFrom) {
+  const size_t height = 1000;
+  const size_t width = 1000;
+  CpuMatrix cpu(height, width);
+  GpuMatrix gpu(height, width);
+  CpuMatrix copy(height, width);
+
+  cpu.randomizeUniform();
+  gpu.copyFrom(cpu);
+  copy.copyFrom(gpu, HPPL_STREAM_DEFAULT);
+
+  TensorCheckEqual(cpu, copy);
+}
+
 #endif
diff --git a/paddle/memory/README.md b/paddle/memory/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e5f7880e4cad346da5399815f5e76b7b9b99bdea
--- /dev/null
+++ b/paddle/memory/README.md
@@ -0,0 +1,139 @@
+## Design
+
+### Usage
+
+To allocate 4KB CPU memory:
+
+```cpp
+p = memory::Alloc(platform::CPUPlace(), 4*1024);
+```
+
+To allocate 4KB memory on the 3rd GPU:
+
+```cpp
+p = memory::Alloc(platform::GPUPlace(2), 4*1024);
+```
+
+To free memory and check the so-far used amount of memory on a place:
+
+```cpp
+auto pl = platform::GPUPlace(0);
+p = memory::Alloc(pl, 4*1024);
+cout << memory::Used(pl);
+memory::Free(pl, p);
+```
+
+### API
+
+In `paddle/memory/memory.h` we have:
+
+```cpp
+namespace memory {
+template <typename Place> void* Alloc(Place, size_t);
+template <typename Place> void Free(Place, void*);
+template <typename Place> size_t Used(Place);
+}  // namespace memory
+```
+
+These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`:
+
+```cpp
+template<>
+void* Alloc<CPUPlace>(CPUPlace p, size_t size) {
+  return GetCPUBuddyAllocator()->Alloc(size);
+}
+```
+
+and 
+
+```cpp
+template<>
+void* Alloc<GPUPlace>(GPUPlace p, size_t size) {
+  return GetGPUBuddyAllocator(p.id)->Alloc(size);
+}
+```
+
+Similar specializations exist for `Free` and `Used`.
+
+### Implementation
+
+`GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` are singletons.
+
+```cpp
+BuddyAllocator* GetCPUBuddyAllocator() {
+  static BuddyAllocator* a = NULL;
+  if (a == NULL) {
+    a = new BuddyAllocator(new CPUAllocator /*backup allocator*/, ...);
+  }
+  return a;
+}
+
+BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
+  static BuddyAllocator** as = NULL;
+  if (as == NULL) {
+    as = new BuddyAllocator*[platform::NumGPUs()];
+    for (int gpu = 0; gpu < platform::NumGPUs(); gpu++) {
+      as[gpu] = new BuddyAllocator(new GPUAllocator(gpu) /* backup allocator */, ...);
+    }
+  }
+  return as[gpu_id];
+```
+
+#### `BuddyAllocator`
+
+`BuddyAllocator` implements the buddy allocation algorithm.  Its constructor takes only parameters related to the algorithm:
+
+```cpp
+BuddyAllocator::BuddyAllocator(initial_pool_size, max_pool_size) {
+  ...
+}
+```
+
+Please be aware that **`BuddyAllocator` always allocates aligned memory**, aligned on 32 bytes, which can hold a `BuddyAllocator::Block` object:
+
+```cpp
+class BuddyAllocator {
+ private:
+  struct Block {
+    size_t size;
+    Block *left, *right;
+  };
+  ...
+};
+```
+
+Because BuddyAllocator has the meta-data of each block, it can track the used memory -- it records the amount returned by `Alloc` and freed in `Free`.  In contrast, `CPUAllocator` and `GPUAllocator` don't know the size of a freed memory block and cannot do this tracking.
+
+#### System Allocators
+
+The `GPUAllocator` and `CPUAllocator` are called *system allocators*.  They work as the fallback allocators of `BuddyAllocator`.
+
+## Justification
+
+I got inspiration from Majel and Caffe2, though the above design looks different from both.
+
+### Caffe2
+
+In Caffe2, `Tensor<Context>::mutable_data()` allocates the memory.  In particular, [`Tensor<Context>::mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L523) calls [`Tensor<Context>::raw_mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L459), which in turn calls [`Context::New`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L479).
+
+There are two implementations of `Context`:
+
+1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.
+
+1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202).  This looks very similar to class `majel::GPUPlace`, who also has an `int id_` data member.   `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
+
+### Majel
+
+In Majel, there are basically two allocator types:
+
+1. `cpu::SystemAllocator`, which has similar functionality to `caffe2::CPUContext::New/Delete`.
+1. `gpu::SystemAllocator`, which has similar functionality to `caffe2::CUDAContext::New/Delete`.
+
+However, memory allocation is not via these two allocators.  Instead, these two allocators are defined in hidden namespaces.
+
+In Majel there are hidden global variables like:
+
+1. `cpu::SystemAllocator g_cpu_allocator`, and
+1. `vector<gpu::SystemAllocator*> g_gpu_allocators(NUM_GPUS)`.
+
+Programs allocate memory via a BuddyAllocator, which can take the `g_cpu_allocator` or a `g_gpu_allocators[gpu_id]` as its *fallback allocator*, so that if BuddyAllocator cannot find a block in its memory pool, it extends its memory pool by calling the fallback allocator's `New(size_t)`.
diff --git a/paddle/optimizer/CMakeLists.txt b/paddle/optimizer/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4536f62ec7c2c3423d91e309dee993d4212160fe
--- /dev/null
+++ b/paddle/optimizer/CMakeLists.txt
@@ -0,0 +1,18 @@
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
+set(OPITMIZER_SRCS
+    adadelta_optimizer.cc
+    adagrad_optimizer.cc
+    adam_optimizer.cc
+    optimizer.cc
+    parameter_optimizer.cc
+    sgd_optimizer.cc
+  )
+
+add_library(paddle_optimizer STATIC ${OPITMIZER_SRCS})
+add_dependencies(paddle_optimizer gen_proto_cpp)
+
+if(WITH_TESTING)
+  add_simple_unittest(serialization_test)
+  add_simple_unittest(parameter_optimizer_test)
+endif()
diff --git a/paddle/optimizer/adadelta_optimizer.cc b/paddle/optimizer/adadelta_optimizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..465ad5e0d2089121a0f11ab916afe0420cbcfab7
--- /dev/null
+++ b/paddle/optimizer/adadelta_optimizer.cc
@@ -0,0 +1,55 @@
+#include "adadelta_optimizer.h"
+#include <algorithm>
+#include <cmath>
+
+namespace paddle {
+namespace optimizer {
+
+void AdadeltaOptimizer::Update(const Tensor* gradient) {
+  num_sample_passed_ += 1;
+  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
+  Tensor& param = *parameter_;
+  const Tensor& grad = *gradient;
+  Tensor& accum_g = *accum_gradient_;
+  Tensor& accum_d = *accum_delta_;
+  Tensor& update_d = *update_delta_;
+  for (size_t i = 0; i < param.size(); ++i) {
+    accum_g[i] = rho_ * accum_g[i] + (1.0 - rho_) * grad[i] * grad[i];
+
+    update_d[i] = std::sqrt(accum_d[i] + epsilon_) /
+                  std::sqrt(accum_g[i] + epsilon_) * grad[i];
+
+    accum_d[i] = rho_ * accum_d[i] + (1.0 - rho_) * update_d[i] * update_d[i];
+
+    param[i] -= learning_rate * update_d[i] + learning_rate * decay_ * param[i];
+  }
+}
+
+// Returns serialized state; valid until this thread's next SerializeState.
+const char* AdadeltaOptimizer::SerializeState(int* state_len) {
+  static thread_local std::string str;  // must outlive the returned pointer
+  AdadeltaOptimizerState state;  // TODO(zhihong) : add lr_policy serialization
+  state.set_num_sample_passed(num_sample_passed_);
+  TensorToProto(*parameter_, state.mutable_parameter());
+  TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
+  TensorToProto(*accum_delta_, state.mutable_accum_delta());
+  TensorToProto(*update_delta_, state.mutable_update_delta());
+  str = state.SerializeAsString();
+  *state_len = str.size();
+  return str.c_str();
+}
+
+void AdadeltaOptimizer::DeserializeState(const std::string& str) {
+  AdadeltaOptimizerState state;
+  state.ParseFromString(str);
+  // TODO(zhihong) : add lr_policy DeserializeState
+  num_sample_passed_ = state.num_sample_passed();
+
+  ProtoToTensor(state.parameter(), parameter_);
+  ProtoToTensor(state.accum_gradient(), accum_gradient_);
+  ProtoToTensor(state.accum_delta(), accum_delta_);
+  ProtoToTensor(state.update_delta(), update_delta_);
+}
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/adadelta_optimizer.h b/paddle/optimizer/adadelta_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d5eab097f57d049855dd171a1aa6f74c48ae0e7
--- /dev/null
+++ b/paddle/optimizer/adadelta_optimizer.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "parameter_optimizer.h"
+
+namespace paddle {
+namespace optimizer {
+
+class AdadeltaOptimizer : public ParameterOptimizer {
+public:
+  AdadeltaOptimizer(
+      Tensor *parameter, LrPolicy *lr, double rho, double epsilon, double decay)
+      : ParameterOptimizer(parameter, lr),
+        accum_gradient_(new Tensor(parameter->size())),
+        accum_delta_(new Tensor(parameter->size())),
+        update_delta_(new Tensor(parameter->size())),
+        rho_(rho),
+        epsilon_(epsilon),
+        decay_(decay) {}
+
+  ~AdadeltaOptimizer() {
+    if (accum_gradient_) delete accum_gradient_;
+    if (accum_delta_) delete accum_delta_;
+    if (update_delta_) delete update_delta_;
+  }
+  void Update(const Tensor *gradient);
+  const char *SerializeState(int *state_len);
+  void DeserializeState(const std::string &state);
+
+private:
+  Tensor *accum_gradient_;
+  Tensor *accum_delta_;
+  Tensor *update_delta_;
+  double rho_;
+  double epsilon_;
+  double decay_;
+};
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/adagrad_optimizer.cc b/paddle/optimizer/adagrad_optimizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bdaa7877d2bc58c17c51b977852d4b6fec511ed2
--- /dev/null
+++ b/paddle/optimizer/adagrad_optimizer.cc
@@ -0,0 +1,42 @@
+#include <cmath>
+
+#include "adagrad_optimizer.h"
+
+namespace paddle {
+namespace optimizer {
+
+// One Adagrad step: descend along the gradient scaled by 1/sqrt(sum of g^2).
+void AdagradOptimizer::Update(const Tensor* gradient) {
+  double learning_rate = lr_policy_->LearningRate(++num_sample_passed_);
+  Tensor& param = *parameter_;
+  Tensor& accum_g = *accum_gradient_;
+  const Tensor& grad = *gradient;
+  for (size_t i = 0; i < param.size(); ++i) {
+    accum_g[i] += grad[i] * grad[i];
+    param[i] -= learning_rate * grad[i] / std::sqrt(accum_g[i] + epsilon_) +
+                learning_rate * decay_ * param[i];
+  }
+}
+// Returns serialized state; valid until this thread's next SerializeState.
+const char* AdagradOptimizer::SerializeState(int* state_len) {
+  static thread_local std::string str;  // must outlive the returned pointer
+  AdagradOptimizerState state;  // TODO(zhihong) : add lr_policy serialization
+  state.set_num_sample_passed(num_sample_passed_);
+  TensorToProto(*parameter_, state.mutable_parameter());
+  TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
+  str = state.SerializeAsString();
+  *state_len = str.size();
+  return str.c_str();
+}
+
+void AdagradOptimizer::DeserializeState(const std::string& str) {
+  AdagradOptimizerState state;
+  state.ParseFromString(str);
+  // TODO(zhihong) : add lr_policy DeserializeState
+  num_sample_passed_ = state.num_sample_passed();
+  ProtoToTensor(state.parameter(), parameter_);
+  ProtoToTensor(state.accum_gradient(), accum_gradient_);
+}
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/adagrad_optimizer.h b/paddle/optimizer/adagrad_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..15d0a965ad0c6967e73b14b465168fa66eb8fba3
--- /dev/null
+++ b/paddle/optimizer/adagrad_optimizer.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#include "parameter_optimizer.h"
+
+namespace paddle {
+namespace optimizer {
+
+class AdagradOptimizer : public ParameterOptimizer {
+public:
+  AdagradOptimizer(Tensor *parameter,
+                   LrPolicy *lr,
+                   double epsilon,
+                   double decay)
+      : ParameterOptimizer(parameter, lr),
+        accum_gradient_(new Tensor(parameter->size())),
+        epsilon_(epsilon),
+        decay_(decay) {}
+  ~AdagradOptimizer() {
+    if (accum_gradient_) delete accum_gradient_;
+  }
+  void Update(const Tensor *gradient);
+  const char *SerializeState(int *state_len);
+  void DeserializeState(const std::string &state);
+
+private:
+  Tensor *accum_gradient_;
+  double epsilon_;
+  double decay_;
+};
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/adam_optimizer.cc b/paddle/optimizer/adam_optimizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ceab7397d87349c64ca9e5d11990cb38068421be
--- /dev/null
+++ b/paddle/optimizer/adam_optimizer.cc
@@ -0,0 +1,48 @@
+#include "adam_optimizer.h"
+#include <cmath>
+
+namespace paddle {
+namespace optimizer {
+
+void AdamOptimizer::Update(const Tensor *gradient) {
+  num_sample_passed_ += 1;
+  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
+  double coef1 = 1.0 - std::pow(beta_1_, num_sample_passed_);
+  double coef2 = 1.0 - std::pow(beta_2_, num_sample_passed_);
+  learning_rate *= std::sqrt(coef2) / coef1;
+  Tensor &param = *parameter_;
+  const Tensor &grad = *gradient;
+  Tensor &m = *momentums_;
+  Tensor &v = *velocitys_;
+  for (size_t i = 0; i < param.size(); ++i) {
+    m[i] = beta_1_ * m[i] + (1.0 - beta_1_) * grad[i];
+    v[i] = beta_2_ * v[i] + (1.0 - beta_2_) * grad[i] * grad[i];
+    param[i] -=
+        learning_rate * (m[i] / std::sqrt(v[i] + epsilon_) + decay_ * param[i]);
+  }
+}
+
+const char *AdamOptimizer::SerializeState(int *state_len) {
+  static thread_local std::string str;  // must outlive the returned pointer
+  AdamOptimizerState state;  // TODO(zhihong) : add lr_policy serialization
+  state.set_num_sample_passed(num_sample_passed_);
+  TensorToProto(*parameter_, state.mutable_parameter());
+  TensorToProto(*momentums_, state.mutable_momentums());
+  TensorToProto(*velocitys_, state.mutable_velocitys());
+  str = state.SerializeAsString();
+  *state_len = str.size();
+  return str.c_str();
+}
+
+void AdamOptimizer::DeserializeState(const std::string &str) {
+  AdamOptimizerState state;
+  state.ParseFromString(str);
+  // TODO(zhihong) : add lr_policy DeserializeState
+  num_sample_passed_ = state.num_sample_passed();
+
+  ProtoToTensor(state.parameter(), parameter_);
+  ProtoToTensor(state.momentums(), momentums_);
+  ProtoToTensor(state.velocitys(), velocitys_);
+}
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/adam_optimizer.h b/paddle/optimizer/adam_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ea4c8bb8470504282b4d6c12039791ce896e401
--- /dev/null
+++ b/paddle/optimizer/adam_optimizer.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include "parameter_optimizer.h"
+
+namespace paddle {
+namespace optimizer {
+
+class AdamOptimizer : public ParameterOptimizer {
+public:
+  AdamOptimizer(Tensor *parameter,
+                LrPolicy *lr,
+                double beta_1,
+                double beta_2,
+                double epsilon,
+                double decay)
+      : ParameterOptimizer(parameter, lr),
+        momentums_(new Tensor(parameter->size())),
+        velocitys_(new Tensor(parameter->size())),
+        beta_1_(beta_1),
+        beta_2_(beta_2),
+        epsilon_(epsilon),
+        decay_(decay) {}
+  ~AdamOptimizer() {
+    if (momentums_) delete momentums_;
+    if (velocitys_) delete velocitys_;
+  }
+  void Update(const Tensor *gradient);
+  const char *SerializeState(int *state_len);
+  void DeserializeState(const std::string &state);
+
+private:
+  Tensor *momentums_;
+  Tensor *velocitys_;
+  double beta_1_;
+  double beta_2_;
+  double epsilon_;
+  double decay_;
+};
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/lr_policy.h b/paddle/optimizer/lr_policy.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8e33ad37ab4c019a36f63f34babe65cf8c8fb16
--- /dev/null
+++ b/paddle/optimizer/lr_policy.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include <algorithm>
+#include "OptimizerConfig.pb.h"
+
+namespace paddle {
+namespace optimizer {
+
+class LrPolicy {
+public:
+  virtual ~LrPolicy() {}
+  virtual double LearningRate(const uint64_t num_sample_passed) = 0;
+  virtual const char *SerializeState(int *state_len) = 0;
+  virtual void DeserializeState(const std::string &state) = 0;
+};
+
+// constant learning rate policy
+class ConstLr final : public LrPolicy {
+public:
+  ConstLr(double lr) : learning_rate(lr){};
+  double LearningRate(const uint64_t num_sample_passed) {
+    return learning_rate;
+  }
+  const char *SerializeState(int *state_len) { return nullptr; }
+  void DeserializeState(const std::string &state) {}
+
+private:
+  double learning_rate;
+};
+
+class LinearLr final : public LrPolicy {
+public:
+  LinearLr(double lr, double lr_decay_a, double lr_decay_b)
+      : learning_rate(lr), lr_decay_a(lr_decay_a), lr_decay_b(lr_decay_b) {}
+  double LearningRate(const uint64_t num_sample_passed) {
+    return std::max(learning_rate - lr_decay_a * num_sample_passed, lr_decay_b);
+  }
+  const char *SerializeState(int *state_len) {
+    // TODO(zhihong) : add lr_policy serialization
+    return nullptr;
+  }
+  void DeserializeState(const std::string &state) {
+    // TODO(zhihong) : add lr_policy serialization
+  }
+
+private:
+  double learning_rate;
+  double lr_decay_a;
+  double lr_decay_b;
+};
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/optimizer.cc b/paddle/optimizer/optimizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..54662dc37891d3211950453b210db4b475837df4
--- /dev/null
+++ b/paddle/optimizer/optimizer.cc
@@ -0,0 +1,83 @@
+#include "optimizer.h"
+#include <string>
+
+#include "parameter_optimizer.h"
+
+using namespace paddle;
+using namespace paddle::optimizer;
+
+// Maps a paddle_element_type enumerator to a C++ type. The primary template
+// is empty; MATCH_ENUM_TYPE adds specializations exposing a nested `Type`.
+template <paddle_element_type VALUE>
+struct EnumToType {};
+
+// Maps a C++ type back to its paddle_element_type enumerator. The primary
+// template is empty; MATCH_ENUM_TYPE adds specializations exposing `v()`.
+template <class T>
+struct TypeToEnum {};
+
+// Declares the two specializations that pair TYPE with ENUM:
+//   TypeToEnum<TYPE>::v()    returns ENUM
+//   EnumToType<ENUM>::Type   is a typedef for TYPE
+// NOTE(review): TypeToEnum<TYPE>::value is declared with type TYPE (not
+// paddle_element_type) and initialized from ENUM -- confirm this is intended.
+#define MATCH_ENUM_TYPE(TYPE, ENUM)                  \
+  template <>                                        \
+  struct TypeToEnum<TYPE> {                          \
+    static paddle_element_type v() { return ENUM; }; \
+    static constexpr TYPE value = ENUM;              \
+  };                                                 \
+  template <>                                        \
+  struct EnumToType<ENUM> {                          \
+    typedef TYPE Type;                               \
+  }
+
+MATCH_ENUM_TYPE(int32_t, PADDLE_ELEMENT_TYPE_INT32);
+MATCH_ENUM_TYPE(uint32_t, PADDLE_ELEMENT_TYPE_UINT32);
+MATCH_ENUM_TYPE(int64_t, PADDLE_ELEMENT_TYPE_INT64);
+MATCH_ENUM_TYPE(uint64_t, PADDLE_ELEMENT_TYPE_UINT64);
+// TODO(zhihong): only implement below type, need to fix
+MATCH_ENUM_TYPE(float, PADDLE_ELEMENT_TYPE_FLOAT32);
+MATCH_ENUM_TYPE(double, PADDLE_ELEMENT_TYPE_FLOAT64);
+
+struct paddle_optimizer {
+  paddle::optimizer::ParameterOptimizer* impl;
+};
+
+// Builds an optimizer from a serialized OptimizerConfig.
+//
+// param_buffer is wrapped, not copied, so it must outlive the optimizer.
+// num_bytes is the size of that buffer in bytes. When state is non-null, the
+// state_len bytes it points to are handed to DeserializeState.
+// The returned handle must be released with paddle_release_optimizer.
+paddle_optimizer* paddle_create_optimizer(const unsigned char* config_proto,
+                                          const int config_proto_len,
+                                          const paddle_element_type data_type,
+                                          void* param_buffer,
+                                          int num_bytes,
+                                          const char* state,
+                                          const int state_len) {
+  paddle_optimizer* optimizer = new paddle_optimizer;
+  std::string config(config_proto, config_proto + config_proto_len);
+  // TODO(zhihong): data_type is not honoured yet; the buffer is always
+  // interpreted as float. Tensor sizes are element counts, so the byte length
+  // has to be converted before it is handed to Tensor.
+  Tensor* parameter = new Tensor(reinterpret_cast<float*>(param_buffer),
+                                 num_bytes / sizeof(float));
+  optimizer->impl = ParameterOptimizer::Create(config, parameter);
+  if (state != nullptr) {
+    std::string s(state, state + state_len);
+    optimizer->impl->DeserializeState(s);
+  }
+  return optimizer;
+}
+
+// Destroys an optimizer created by paddle_create_optimizer. Passing nullptr is
+// allowed. Always returns PADDLE_SUCCESS.
+int paddle_release_optimizer(paddle_optimizer* o) {
+  if (o != nullptr) {
+    delete o->impl;
+    // The handle itself was allocated with `new` in paddle_create_optimizer.
+    delete o;
+  }
+  return PADDLE_SUCCESS;
+}
+
+// Applies one gradient to the parameter held by the optimizer.
+//
+// grad_buffer is only read during the call; num_bytes is its size in bytes.
+// Returns PADDLE_ERROR for a null handle, null buffer or negative size,
+// PADDLE_SUCCESS otherwise.
+int paddle_update_parameter(paddle_optimizer* o,
+                            const paddle_element_type data_type,
+                            const void* grad_buffer,
+                            int num_bytes) {
+  if (o == nullptr || grad_buffer == nullptr || num_bytes < 0) {
+    return PADDLE_ERROR;
+  }
+  // TODO(zhihong): datatype not work. need to add the runtime datatype
+  auto grad_type = reinterpret_cast<const float*>(grad_buffer);
+  // A stack tensor replaces the heap tensor that was leaked on every call.
+  // NOTE(review): assumes Update() does not keep the pointer after returning;
+  // SGDOptimizer::Update does not -- confirm for the other optimizers.
+  Tensor gradient(const_cast<float*>(grad_type), num_bytes / sizeof(float));
+  o->impl->Update(&gradient);
+  return PADDLE_SUCCESS;
+}
+
+// Exposes the optimizer's parameter buffer through *param_buffer and returns
+// the size reported by ParameterOptimizer::get_weight.
+int paddle_optimizer_get_weights(paddle_optimizer* o, void** param_buffer) {
+  int reported_size = 0;
+  float* weights = o->impl->get_weight(&reported_size);
+  *param_buffer = static_cast<void*>(weights);
+  return reported_size;
+}
+
+// Stores the pointer returned by SerializeState in *state and returns the
+// length that SerializeState wrote (0 if it wrote nothing).
+int paddle_optimizer_get_state(paddle_optimizer* o, const char** state) {
+  int length = 0;
+  const char* serialized = o->impl->SerializeState(&length);
+  *state = serialized;
+  return length;
+}
diff --git a/paddle/optimizer/optimizer.h b/paddle/optimizer/optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..aabf7a458dd30092ed1e522c4d88c6cfe63fcce1
--- /dev/null
+++ b/paddle/optimizer/optimizer.h
@@ -0,0 +1,93 @@
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+
+/**
+ * @brief Optimizer library that is independent of the other modules.
+ * It is intended to be used in two cases:
+ * Case A, the gradient is optimized locally on the trainer.
+ *
+ * Case B, the gradient is optimized on the parameter server.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+  PADDLE_ELEMENT_TYPE_INT32 = 0,
+  PADDLE_ELEMENT_TYPE_UINT32 = 1,
+  PADDLE_ELEMENT_TYPE_INT64 = 2,
+  PADDLE_ELEMENT_TYPE_UINT64 = 3,
+  PADDLE_ELEMENT_TYPE_FLOAT32 = 4,
+  PADDLE_ELEMENT_TYPE_FLOAT64 = 5,
+} paddle_element_type;
+
+/**
+ * @brief execution status code
+ */
+const int32_t PADDLE_SUCCESS = 0;
+const int32_t PADDLE_ERROR = -1;
+
+typedef struct paddle_optimizer paddle_optimizer;
+/**
+ * this group interface called in order :
+ * 1. create optimizer with config
+ * 2. set weights
+ * 3. update_parameter
+ * 4. get_weights
+ * 5. release optimizer
+ */
+
+/**
+ *  @brief create optimizer with proto_config
+ *  @param config_proto, optimizer protobuf, see OptimizerConfig.proto in detail
+ *  @return return optimizer instance
+ */
+paddle_optimizer* paddle_create_optimizer(const unsigned char* config_proto,
+                                          const int config_proto_len,
+                                          const paddle_element_type data_type,
+                                          void* param_buffer,
+                                          int num_bytes,
+                                          const char* state,
+                                          const int state_len);
+
+/**
+ *  @brief release optimizer
+ *  @param optimizer
+ *  @return return exec status
+ */
+int paddle_release_optimizer(paddle_optimizer* o);
+
+/**
+ *  @brief update the parameter held by the optimizer with a gradient
+ *  @param o, optimizer instance
+ *  @param data_type, datatype of gradient and parameter
+ *  @param gradient, calculated by the optimizer caller.
+ *       TODO(zhihong): just pass loss to reduce communication overhead.
+ *                     See the Project Adam (MS'14) paper for details
+ *  @param num_bytes, gradient size
+ *  @return return exec status
+ */
+int paddle_update_parameter(paddle_optimizer* o,
+                            const paddle_element_type data_type,
+                            const void* gradient,
+                            int num_bytes);
+
+/**
+ *  @brief get the parameter buffer held by the optimizer
+ *  @param param_buffer, receives the initialized parameter buffer
+ *  @return return content length
+ */
+int paddle_optimizer_get_weights(paddle_optimizer* o, void** param_buffer);
+
+/**
+ *  @brief get the optimizer training state for saving
+ *  @param state, receives the buffer produced by SerializeState
+ *  @return return state_buffer length
+ */
+int paddle_optimizer_get_state(paddle_optimizer* o, const char** state);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/paddle/optimizer/parameter_optimizer.cc b/paddle/optimizer/parameter_optimizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f6218037925649e741d17f49af972ce2d50f8d3d
--- /dev/null
+++ b/paddle/optimizer/parameter_optimizer.cc
@@ -0,0 +1,74 @@
+#include <glog/logging.h>
+#include "adadelta_optimizer.h"
+#include "adagrad_optimizer.h"
+#include "adam_optimizer.h"
+#include "lr_policy.h"
+#include "sgd_optimizer.h"
+
+#include "parameter_optimizer.h"
+
+namespace paddle {
+namespace optimizer {
+
+// Factory: parses a serialized OptimizerConfig, builds the learning-rate
+// policy it names, then builds the optimizer it names around `parameter`.
+// Aborts (CHECK) when the config cannot be parsed. Unrecognised policy or
+// optimizer selections fall back to ConstLr(0.1) and plain SGD with a warning.
+ParameterOptimizer *ParameterOptimizer::Create(const std::string &config_proto,
+                                               Tensor *parameter) {
+  paddle::OptimizerConfig config;
+  CHECK(config.ParseFromString(config_proto) == true)
+      << "failed parse optimizer config";
+
+  // The policy is created first; it is handed to whichever optimizer is
+  // constructed below.
+  LrPolicy *lr = nullptr;
+  if (config.lr_policy() == OptimizerConfig::Const) {
+    lr = new ConstLr(config.const_lr().learning_rate());
+  } else if (config.lr_policy() == OptimizerConfig::Linear) {
+    const auto &linear = config.linear_lr();
+    lr = new LinearLr(
+        linear.learning_rate(), linear.lr_decay_a(), linear.lr_decay_b());
+  } else {
+    // default
+    LOG(WARNING) << " have not select any LrPolicy. use ConstLr in default";
+    lr = new ConstLr(0.1);
+  }
+
+  const auto kind = config.optimizer();
+  if (kind == OptimizerConfig::SGD) {
+    const auto &sgd = config.sgd();
+    return new SGDOptimizer(
+        parameter, lr, sgd.momentum(), sgd.decay(), sgd.nesterov());
+  }
+  if (kind == OptimizerConfig::Adadelta) {
+    const auto &adadelta = config.adadelta();
+    return new AdadeltaOptimizer(
+        parameter, lr, adadelta.rho(), adadelta.epsilon(), adadelta.decay());
+  }
+  if (kind == OptimizerConfig::Adagrad) {
+    const auto &adagrad = config.adagrad();
+    return new AdagradOptimizer(
+        parameter, lr, adagrad.epsilon(), adagrad.decay());
+  }
+  if (kind == OptimizerConfig::Adam) {
+    const auto &adam = config.adam();
+    return new AdamOptimizer(parameter,
+                             lr,
+                             adam.beta_1(),
+                             adam.beta_2(),
+                             adam.epsilon(),
+                             adam.decay());
+  }
+
+  // default
+  LOG(WARNING)
+      << "have not select any Optimizer. use SGDOptimizer in default";
+  return new SGDOptimizer(parameter, lr, 0.0, 0.0, false);
+}
+
+// Returns the parameter tensor's buffer and writes the tensor's size() (an
+// element count, narrowed to int) to *param_size.
+float *ParameterOptimizer::get_weight(int *param_size) const {
+  const size_t element_count = parameter_->size();
+  *param_size = static_cast<int>(element_count);
+  return parameter_->get_buffer();
+}
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/parameter_optimizer.h b/paddle/optimizer/parameter_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..d89c9abb791f947172078d4dce5b1c366852591b
--- /dev/null
+++ b/paddle/optimizer/parameter_optimizer.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include <glog/logging.h>
+#include <functional>
+#include <string>
+#include "OptimizerConfig.pb.h"
+#include "lr_policy.h"
+#include "serialization.h"
+#include "tensor.h"
+
+namespace paddle {
+namespace optimizer {
+
+// Base class for the optimizers in this directory. It holds the parameter
+// tensor being optimized, the learning-rate policy and a sample counter;
+// subclasses supply Update() and the (de)serialization of their state.
+class ParameterOptimizer {
+public:
+  /**
+   * @brief  stores the parameter tensor and learning-rate policy and starts
+   * the sample counter at zero. Both pointers are deleted by the destructor,
+   * so the optimizer takes ownership of them.
+   */
+  ParameterOptimizer(Tensor *parameter, LrPolicy *lr)
+      : parameter_(parameter), lr_policy_(lr), num_sample_passed_(0) {}
+  virtual ~ParameterOptimizer() {
+    delete parameter_;
+    delete lr_policy_;
+  }
+
+  // Factory: builds the optimizer described by a serialized OptimizerConfig
+  // around `parameter`.
+  static ParameterOptimizer *Create(const std::string &config_proto,
+                                    Tensor *parameter);
+  // Applies one gradient to the parameter tensor.
+  virtual void Update(const Tensor *gradient) = 0;
+  // Returns the parameter buffer and writes its size to *param_size.
+  virtual float *get_weight(int *param_size) const;
+  // Serializes the optimizer state; the length is written to *state_len.
+  virtual const char *SerializeState(int *state_len) = 0;
+  // Restores the optimizer state from a serialized buffer.
+  virtual void DeserializeState(const std::string &state) = 0;
+
+protected:
+  // parameter tensor, deleted in the destructor
+  Tensor *parameter_;
+  // learning rate policy
+  LrPolicy *lr_policy_;
+  // initialized to 0 by the constructor; passed to LrPolicy::LearningRate by
+  // SGDOptimizer
+  uint64_t num_sample_passed_;
+};
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/parameter_optimizer_test.cpp b/paddle/optimizer/parameter_optimizer_test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4e6254d9e4dab48279b4a880695959526d30d70c
--- /dev/null
+++ b/paddle/optimizer/parameter_optimizer_test.cpp
@@ -0,0 +1,107 @@
+#include "parameter_optimizer.h"
+#include <cmath>
+#include <map>
+#include <vector>
+#include "gtest/gtest.h"
+#include "lr_policy.h"
+
+using namespace paddle;
+using namespace paddle::optimizer;
+
+// Allocates a tensor of `size` elements filled with rand() / RAND_MAX values.
+// The caller owns the returned tensor.
+Tensor* FillTensor(size_t size) {
+  Tensor* tensor = new Tensor(size);
+  const size_t count = tensor->size();
+  for (size_t idx = 0; idx < count; ++idx) {
+    (*tensor)[idx] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
+  }
+  return tensor;
+}
+
+// Allocates a tensor of `size` elements holding 0, 1, 2, ... in order.
+// The caller owns the returned tensor.
+Tensor* FixedTensor(size_t size) {
+  Tensor* tensor = new Tensor(size);
+  const size_t count = tensor->size();
+  for (size_t idx = 0; idx < count; ++idx) {
+    (*tensor)[idx] = idx;
+  }
+  return tensor;
+}
+
+// Fixture that builds one SGD and one Adam optimizer over identical fixed
+// parameters and exercises weight access, update and checkpointing on both.
+class OptimizerTest : public testing::Test {
+public:
+  // init tensor shape
+  const size_t kSize = 5;
+
+  virtual void SetUp() {
+    CreateSGD();
+    CreateAdam();
+  }
+
+  // Each optimizer deletes its parameter tensor and learning-rate policy in
+  // its destructor, so deleting the optimizers releases what SetUp allocated.
+  virtual void TearDown() {
+    for (size_t i = 0; i < opts_.size(); ++i) {
+      delete opts_[i];
+    }
+    opts_.clear();
+  }
+
+  void CreateSGD() {
+    Tensor* parameter = FixedTensor(kSize);
+    config_.set_optimizer(OptimizerConfig::SGD);
+    config_.mutable_sgd()->set_momentum(0.0);
+    config_.mutable_sgd()->set_decay(0.0);
+    config_.mutable_sgd()->set_nesterov(false);
+    config_.set_lr_policy(OptimizerConfig::Const);
+    config_.mutable_const_lr()->set_learning_rate(0.1);
+    std::string str = config_.SerializeAsString();
+    ParameterOptimizer* opt = ParameterOptimizer::Create(str, parameter);
+    opts_.push_back(opt);
+  }
+
+  void CreateAdam() {
+    Tensor* parameter = FixedTensor(kSize);
+    config_.set_optimizer(OptimizerConfig::Adam);
+    config_.mutable_adam()->set_beta_1(0.9);
+    config_.mutable_adam()->set_beta_2(0.1);
+    config_.mutable_adam()->set_epsilon(1e-3);
+    config_.mutable_adam()->set_decay(0.0);
+    config_.set_lr_policy(OptimizerConfig::Const);
+    config_.mutable_const_lr()->set_learning_rate(0.1);
+    std::string str = config_.SerializeAsString();
+    ParameterOptimizer* opt = ParameterOptimizer::Create(str, parameter);
+    opts_.push_back(opt);
+  }
+
+  // Freshly created optimizers must expose exactly the values they were
+  // constructed with, and report the matching size.
+  void TestGetWeight() {
+    Tensor* p = FixedTensor(kSize);
+    for (size_t i = 0; i < opts_.size(); ++i) {
+      int s = 0;
+      float* newp = (float*)opts_[i]->get_weight(&s);
+      EXPECT_EQ(static_cast<size_t>(s), kSize);
+      for (size_t j = 0; j < kSize; ++j) {
+        EXPECT_EQ(newp[j], (*p)[j]);
+      }
+    }
+    delete p;  // reference tensor is owned by this test
+  }
+
+  // Smoke test: one update step on every optimizer must not crash.
+  void TestUpdate() {
+    Tensor* g = FixedTensor(kSize);
+    for (size_t i = 0; i < opts_.size(); ++i) {
+      opts_[i]->Update(g);
+    }
+    delete g;  // Update() takes a const pointer; the gradient stays ours
+  }
+
+  // Round-trips every optimizer's state through Serialize/Deserialize.
+  void TestCheckPoint() {
+    for (size_t i = 0; i < opts_.size(); ++i) {
+      int state_len = 0;
+      const char* buffer = opts_[i]->SerializeState(&state_len);
+      ASSERT_TRUE(buffer != nullptr);
+      // The state is a binary protobuf that may contain '\0' bytes, so the
+      // string must be built from the reported length, not via strlen().
+      std::string state(buffer, state_len);
+      opts_[i]->DeserializeState(state);
+    }
+  }
+
+private:
+  std::vector<ParameterOptimizer*> opts_;
+  OptimizerConfig config_;
+};
+
+TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); }
+
+TEST_F(OptimizerTest, TestUpdate) { TestUpdate(); }
+
+TEST_F(OptimizerTest, TestCheckPoint) { TestCheckPoint(); }
+
+// Test entry point: initializes googletest from the command line and runs
+// every registered test.
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/optimizer/serialization.h b/paddle/optimizer/serialization.h
new file mode 100644
index 0000000000000000000000000000000000000000..92fbf65cc6b98d7f92841bafe4ab77001ca03b7c
--- /dev/null
+++ b/paddle/optimizer/serialization.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include "OptimizerConfig.pb.h"
+#include "paddle/utils/Logging.h"
+#include "tensor.h"
+
+namespace paddle {
+namespace optimizer {
+
+// Writes every element of `tensor` into `proto` as a decimal text entry and
+// tags the proto as FLOAT32.
+static void TensorToProto(const Tensor& tensor, TensorProto* proto) {
+  proto->set_data_type(TensorProto::PADDLE_ELEMENT_TYPE_FLOAT32);
+  std::stringstream os;
+  // The stream default of 6 significant digits loses information for float.
+  // 9 digits (max_digits10 of IEEE-754 binary32) make the text round trip
+  // performed by ProtoToTensor exact.
+  os.precision(9);
+  for (size_t i = 0; i < tensor.size(); ++i) {
+    os << tensor[i];
+    proto->add_content(os.str());
+    os.str(std::string());  // reset the buffer for the next element
+  }
+}
+
+// Parses each text entry of `proto` into the element of `tensor` with the
+// same index. `tensor` must already be large enough; Tensor::operator[]
+// aborts on an out-of-range index.
+static void ProtoToTensor(const TensorProto& proto, Tensor* tensor) {
+  const int entry_count = proto.content_size();
+  for (int idx = 0; idx < entry_count; ++idx) {
+    // A fresh stream per entry replaces the reset-and-reuse of a shared one.
+    std::istringstream reader(proto.content(idx));
+    reader >> (*tensor)[idx];
+  }
+}
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/serialization_test.cpp b/paddle/optimizer/serialization_test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d2454140dc243b40ed8348578360b30894213838
--- /dev/null
+++ b/paddle/optimizer/serialization_test.cpp
@@ -0,0 +1,25 @@
+#include "serialization.h"
+#include "gtest/gtest.h"
+
+using namespace paddle;
+using namespace paddle::optimizer;
+
+// Round-trips a small tensor through TensorProto and expects every element to
+// come back unchanged.
+TEST(TensorToProto, Case1) {
+  const size_t kLength = 3;
+  Tensor source(kLength);
+  Tensor restored(kLength);
+  for (size_t idx = 0; idx < kLength; ++idx) {
+    source[idx] = idx;
+    restored[idx] = 0;
+  }
+
+  TensorProto message;
+  TensorToProto(source, &message);
+  ProtoToTensor(message, &restored);
+
+  for (size_t idx = 0; idx < restored.size(); ++idx) {
+    EXPECT_EQ(restored[idx], source[idx]);
+  }
+}
+
+// Test entry point: initializes googletest from the command line and runs
+// every registered test.
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/optimizer/sgd_optimizer.cc b/paddle/optimizer/sgd_optimizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..34e051003fa83f11b1f4a39c46856e0372836a1a
--- /dev/null
+++ b/paddle/optimizer/sgd_optimizer.cc
@@ -0,0 +1,49 @@
+#include "sgd_optimizer.h"
+#include "serialization.h"
+
+namespace paddle {
+namespace optimizer {
+
+// Applies one SGD step (optionally with momentum, weight decay and Nesterov
+// look-ahead) to the parameter tensor using `gradient`.
+void SGDOptimizer::Update(const Tensor *gradient) {
+  num_sample_passed_ += 1;
+  const double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
+  Tensor &param = *parameter_;
+  const Tensor &grad = *gradient;
+  // momentums_ is only allocated when momentum_ != 0 (see the constructor),
+  // so it must not be dereferenced on the plain-SGD path.
+  const bool use_momentum = (momentum_ != 0.0);
+  for (size_t i = 0; i < param.size(); ++i) {
+    float velocity = 0.0;
+    if (use_momentum) {
+      Tensor &m = *momentums_;
+      m[i] = momentum_ * m[i] - learning_rate * grad[i] -
+             learning_rate * decay_ * param[i];
+      velocity = m[i];
+    } else {
+      velocity = -learning_rate * grad[i] - learning_rate * decay_ * param[i];
+    }
+    if (nesterov_) {
+      param[i] += momentum_ * velocity - learning_rate * grad[i];
+    } else {
+      param[i] += velocity;
+    }
+  }
+}
+
+// Serializes the sample counter, the parameter tensor and (when momentum is
+// enabled) the momentum tensor into an SGDOptimizerState message.
+//
+// Writes the byte length to *state_len and returns a pointer to the bytes.
+// The bytes live in a thread-local buffer: the pointer stays valid until the
+// next SerializeState call on the same thread, so callers must copy the data
+// if they need it for longer.
+const char *SGDOptimizer::SerializeState(int *state_len) {
+  SGDOptimizerState state;
+  state.set_num_sample_passed(num_sample_passed_);
+  TensorToProto(*parameter_, state.mutable_parameter());
+  if (momentum_ != 0.0) TensorToProto(*momentums_, state.mutable_momentums());
+  // Returning c_str() of a function-local string handed callers a dangling
+  // pointer; keep the bytes alive beyond this call instead.
+  static thread_local std::string buffer;
+  buffer = state.SerializeAsString();
+  *state_len = static_cast<int>(buffer.size());
+  return buffer.data();
+}
+
+// Restores the sample counter, the parameter tensor and (when momentum is
+// enabled) the momentum tensor from a serialized SGDOptimizerState.
+// If `str` cannot be parsed, an error is logged and the optimizer is left
+// unchanged.
+void SGDOptimizer::DeserializeState(const std::string &str) {
+  SGDOptimizerState state;
+  if (!state.ParseFromString(str)) {
+    LOG(ERROR) << "failed to parse SGDOptimizerState, state not restored";
+    return;
+  }
+  num_sample_passed_ = state.num_sample_passed();
+  ProtoToTensor(state.parameter(), parameter_);
+  // Momentums are restored from their own field, not from the parameter field.
+  if (momentum_ != 0.0) ProtoToTensor(state.momentums(), momentums_);
+}
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/sgd_optimizer.h b/paddle/optimizer/sgd_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..b74a902e1aa40a7831b36ab826d72372a3588bcf
--- /dev/null
+++ b/paddle/optimizer/sgd_optimizer.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "parameter_optimizer.h"
+
+namespace paddle {
+namespace optimizer {
+
+// Stochastic gradient descent with optional momentum, weight decay and
+// Nesterov look-ahead.
+class SGDOptimizer : public ParameterOptimizer {
+public:
+  // parameter / lr: passed on to ParameterOptimizer, which deletes them.
+  // m: momentum coefficient; a momentum tensor is only allocated when m != 0.
+  // d: weight-decay coefficient.
+  // n: enables the Nesterov variant of the update.
+  SGDOptimizer(Tensor* parameter, LrPolicy* lr, double m, double d, bool n)
+      : ParameterOptimizer(parameter, lr),
+        momentums_(nullptr),
+        momentum_(m),
+        decay_(d),
+        nesterov_(n) {
+    if (momentum_ != 0.0) {
+      size_t size = parameter->size();
+      // TODO: fix it with align aware allocator bind to Tensor
+      momentums_ = new Tensor(size);
+      // Update() reads m[i] before it ever writes it, so start from zero
+      // velocity explicitly instead of relying on how Tensor allocates.
+      for (size_t i = 0; i < size; ++i) {
+        (*momentums_)[i] = 0.0f;
+      }
+    }
+  }
+  virtual ~SGDOptimizer() {
+    delete momentums_;  // deleting nullptr is a no-op
+  }
+  void Update(const Tensor* gradient);
+  const char* SerializeState(int* state_len);
+  void DeserializeState(const std::string& state);
+
+private:
+  // per-element velocity; nullptr when momentum_ == 0
+  Tensor* momentums_;
+  double momentum_;
+  double decay_;
+  bool nesterov_;
+};
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/tensor.h b/paddle/optimizer/tensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..80a8c93081ea7758d3b5ba016a14d424954db913
--- /dev/null
+++ b/paddle/optimizer/tensor.h
@@ -0,0 +1,54 @@
+#pragma once
+/**
+ * @brief tensor used by optimizer
+ */
+
+#include <string.h>
+#include <memory>
+#include "paddle/utils/Common.h"
+#include "paddle/utils/Logging.h"
+
+namespace paddle {
+namespace optimizer {
+
+// Minimal flat tensor used by the optimizers. It either owns its storage
+// (size constructor) or wraps a caller-owned buffer (pointer constructors).
+template <class T>
+class TensorT {
+public:
+  // Allocates and owns `size` value-initialized (zeroed) elements.
+  TensorT(size_t size) : height_(1), width_(size) {
+    data_ptr_ = std::shared_ptr<T>(new T[size](), std::default_delete<T[]>());
+    data_ = data_ptr_.get();
+  }
+
+  // Wraps `size` elements at `data` without taking ownership.
+  TensorT(T* data, size_t size)
+      : height_(1), width_(size), data_ptr_(nullptr), data_(data) {}
+
+  // Wraps an h x w buffer at `data` without taking ownership.
+  TensorT(T* data, size_t h, size_t w)
+      : height_(h), width_(w), data_ptr_(nullptr), data_(data) {}
+
+  virtual ~TensorT() {}
+
+  T* get_buffer() { return this->data_; }
+
+  // Flat element access. The index is checked against size() so that every
+  // element of an h x w tensor is reachable, not just the first row.
+  T& operator[](const size_t idx) {
+    CHECK(idx < this->size()) << "out of index range";
+    return data_[idx];
+  }
+  T& operator[](const size_t idx) const {
+    CHECK(idx < this->size()) << "out of index range";
+    return data_[idx];
+  }
+  // TODO: replace with tensorshape
+  // Total number of elements (width * height).
+  size_t size() const { return this->width_ * this->height_; }
+
+protected:
+  size_t height_;
+  size_t width_;
+  // owning handle; empty when the tensor wraps an external buffer
+  std::shared_ptr<T> data_ptr_;
+  // pointer used for all element access, whether owned or external
+  T* data_;
+};
+
+// TODO(zhihong): design problem of dynamic datatype, need to fix it
+typedef TensorT<float> Tensor;
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
index 91aca98e186aef0ad6b345cf4791ef80c616e3fe..09bd633616730dc9475edc596128166f4f70b0cd 100644
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -149,6 +149,7 @@ struct Argument {
                                      : getBatchSize();
   }
 
+  bool hasSeq() const { return sequenceStartPositions != nullptr; }
   bool hasSubseq() const { return subSequenceStartPositions != nullptr; }
 
   const int* getCpuStartPositions() const {
diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp
index f826e8448c666bb3305c150f2bd95aade23223fb..c8b47687f5d3c00f6609b858103a5fec526b970a 100644
--- a/paddle/parameter/ParameterUpdaterHook.cpp
+++ b/paddle/parameter/ParameterUpdaterHook.cpp
@@ -14,11 +14,13 @@ limitations under the License. */
 
 #include "ParameterUpdaterHook.h"
 
+#include <algorithm>
 #include <atomic>
 #include <fstream>
 #include <mutex>
 #include <thread>
 #include <unordered_map>
+#include <vector>
 
 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
@@ -29,106 +31,76 @@ namespace paddle {
 
 /**
  * The static pruning hook
- *
- * Static means user load a mask map before training started. This map will
- * define which link/weight between neural is disabled.
+ * Static means user specify a sparsity_ratio before training started, and the
+ * network will prune the parameters based on the sparsity_ratio. More details
+ * can be found https://arxiv.org/pdf/1506.02626.pdf.
  */
+
 class StaticPruningHook : public IParameterUpdaterHook {
 public:
-  /**
-   * The Mask Map Header.
-   * The map file started with this header.
-   *
-   * In Version 0, reset file will be:
-   *  contains header.size bit, each bit means such weight is enabled or not.
-   *    if bit is 1, then such weight is enabled.
-   *  at end, the file will round to byte, and the low bits of end byte will be
-   *  filled by zero.
-   *
-   */
-  struct StaticMaskHeader {
-    uint32_t version;
-    size_t size;
-  } __attribute__((__packed__));
-
-  explicit StaticPruningHook(const std::string& mask_filename) : initCount_(0) {
-    bool ok = this->loadMaskFile(mask_filename);
-    if (!ok) {
-      LOG(WARNING) << "Fail to load mask file " << mask_filename
-                   << " in current directory, searching in init_model_path";
-      std::string combineMaskFilename =
-          path::join(FLAGS_init_model_path, mask_filename);
-      CHECK(this->loadMaskFile(combineMaskFilename))
-          << "Cannot load " << mask_filename << " in ./" << mask_filename
-          << " and " << combineMaskFilename;
-    }
-    VLOG(3) << mask_filename << " mask size = " << this->mask_.size();
+  // Reads the sparsity ratio from the hook config; the mask itself is built
+  // later, in init().
+  explicit StaticPruningHook(const ParameterUpdaterHookConfig &hookConfig)
+      : initCount_(0) {
+    sparsityRatio_ = hookConfig.sparsity_ratio();
+  }
 
-  void update(Parameter* para) {
+  // Comparator on the `first` member of the pairs. NOTE: despite the name it
+  // returns true when pair1.first is GREATER, i.e. it orders descending.
+  static bool sortPairAscend(const std::pair<real, size_t> &pair1,
+                             const std::pair<real, size_t> &pair2) {
+    return pair1.first > pair2.first;
+  }
+
+  void update(Parameter *para) {
     updateThreadChecker_.check();
-    auto& vec = para->getBuf(PARAMETER_GRADIENT);
+    auto &vec = para->getBuf(PARAMETER_GRADIENT);
     if (vec) {
       vec->dotMul(*maskVec_);
     }
   }
 
-  void init(Parameter* para) {
-    size_t initCount = this->initCount_.fetch_add(1);
-    CHECK_EQ(initCount, 0UL) << "Currently the StaticPruningHook must invoke "
-                                "in same ParamterUpdater";
-    VLOG(3) << "Initialize Parameter " << para;
-    SetDevice device(para->getDeviceId());
+  void generateMask(Parameter *para) {
+    VectorPtr maskTemp = Vector::create(para->getSize(), false);
+    maskTemp->zeroMem();
+    real *maskTempData = maskTemp->getData();
+    size_t nonZeroNum = para->getSize() * (1 - sparsityRatio_);
 
-    auto maskVec = Vector::create(this->mask_.size(), false);
-    {  // Initialize maskVec with float mask vector
-      real* dataPtr = maskVec->getData();
-      size_t i = 0;
-      for (bool m : mask_) {
-        dataPtr[i++] = m ? 1.0 : 0.0;
-      }
-    }
+    VectorPtr paraVec = para->getBuf(PARAMETER_VALUE);
+    VectorPtr paraCpuCopy = Vector::create(para->getSize(), false);
+
+    paraCpuCopy->copyFrom(*paraVec);
+    std::vector<std::pair<real, size_t>> param;
+
+    for (size_t i = 0; i < para->getSize(); i++)
+      param.push_back(std::make_pair(fabs(paraCpuCopy->getData()[i]), i));
+
+    std::partial_sort(
+        param.begin(), param.begin() + nonZeroNum, param.end(), sortPairAscend);
+    for (size_t i = 0; i < nonZeroNum; i++) maskTempData[param[i].second] = 1.0;
 
     // Currently just use a mask vector for hack.
-    // @TODO(yuyang18): Implemented the mask operation in vector.
     if (para->useGpu()) {
-      maskVec_ = Vector::create(this->mask_.size(), para->useGpu());
-      maskVec_->copyFrom(*maskVec);
+      maskVec_ = Vector::create(para->getSize(), para->useGpu());
+      maskVec_->copyFrom(*maskTemp);
     } else {
-      maskVec_ = maskVec;
+      maskVec_ = maskTemp;
     }
-
-    auto& vec = para->getBuf(PARAMETER_VALUE);
-    vec->dotMul(*maskVec_);
   }
 
-private:
-  bool loadMaskFile(const std::string& mask_filename) {
-    std::ifstream fin;
-    fin.open(mask_filename);
-    if (fin.is_open()) {
-      StaticMaskHeader header;
-      fin.read(reinterpret_cast<char*>(&header), sizeof(StaticMaskHeader));
-      CHECK_EQ(header.version, 0UL);
-      mask_.resize(header.size);
-      uint8_t buf;
-      for (size_t i = 0; i < header.size; ++i, buf <<= 1) {
-        if (i % 8 == 0) {
-          fin.read(reinterpret_cast<char*>(&buf), sizeof(uint8_t));
-        }
-        mask_[i] = buf & 0x80;
-      }
-      fin.close();
-      return true;
-    } else {
-      return false;
-    }
+  // Builds the pruning mask for `para` and applies it once to the parameter
+  // values. May only be invoked once per hook (enforced by the CHECK below).
+  void init(Parameter *para) {
+    // NOTE(review): the mask is generated before SetDevice is constructed
+    // below -- confirm this ordering is intended for GPU parameters.
+    generateMask(para);
+    size_t initCount = this->initCount_.fetch_add(1);
+    CHECK_EQ(initCount, 0UL) << "Currently the StaticPruningHook must invoke "
+                                "in same ParamterUpdater";
+    VLOG(3) << "Initialize Parameter " << para;
+    SetDevice device(para->getDeviceId());
+
+    // Multiply the parameter values element-wise by the mask.
+    auto &paraVec = para->getBuf(PARAMETER_VALUE);
+    paraVec->dotMul(*maskVec_);
+  }
 
+private:
   SameThreadChecker updateThreadChecker_;
   std::atomic<size_t> initCount_;
   VectorPtr maskVec_;
-  std::vector<bool> mask_;
+  real sparsityRatio_;
 };
 
 IParameterUpdaterHook::IParameterUpdaterHook() {}
@@ -145,7 +117,7 @@ IParameterUpdaterHook::~IParameterUpdaterHook() {}
  */
 class StringIntPairHasher {
 public:
-  size_t operator()(const std::pair<std::string, int>& k) const {
+  size_t operator()(const std::pair<std::string, int> &k) const {
     return intHasher_(strHasher_(k.first) + k.second);
   }
 
@@ -162,19 +134,19 @@ static WeakKVCache<std::pair<std::string, int>,
 /**
  * ParameterUpdaterHook actually factory method.
  */
-static IParameterUpdaterHook* createImpl(
-    const ParameterUpdaterHookConfig& config) {
-  auto& type = config.type();
+static IParameterUpdaterHook *createImpl(
+    const ParameterUpdaterHookConfig &config) {
+  auto &type = config.type();
   if (type == "pruning") {
-    if (config.has_purning_mask_filename()) {
-      return new StaticPruningHook(config.purning_mask_filename());
-    }
+    return new StaticPruningHook(config);
   }
+
+  LOG(FATAL) << "Unknown Hook type:  " << type;
   return nullptr;
 }
 
 std::shared_ptr<IParameterUpdaterHook> IParameterUpdaterHook::create(
-    const ParameterConfig& paramConfig, int idx) {
+    const ParameterConfig &paramConfig, int idx) {
   std::pair<std::string, int> key = {paramConfig.name(), idx};
   return g_hookCache_.get(
       key, [&] { return createImpl(paramConfig.update_hooks(idx)); });
diff --git a/paddle/parameter/tests/test_argument.cpp b/paddle/parameter/tests/test_argument.cpp
index 81fe4ee397351a013c8616ad08fb8cb4b8dae4d0..98ab013548734059060eb06ce1a7cec23dbf1b72 100644
--- a/paddle/parameter/tests/test_argument.cpp
+++ b/paddle/parameter/tests/test_argument.cpp
@@ -42,7 +42,7 @@ TEST(Argument, poolSequenceWithStride) {
     CHECK_EQ(outStart[3], 4);
     CHECK_EQ(outStart[4], 7);
 
-    CHECK_EQ(stridePositions->getSize(), 8);
+    CHECK_EQ(stridePositions->getSize(), 8UL);
     auto result = reversed ? strideResultReversed : strideResult;
     for (int i = 0; i < 8; i++) {
       CHECK_EQ(stridePositions->getData()[i], result[i]);
diff --git a/paddle/platform/.clang-format b/paddle/platform/.clang-format
new file mode 100644
index 0000000000000000000000000000000000000000..29282dc87e2c499988c17d90d47d44cd5cf7f115
--- /dev/null
+++ b/paddle/platform/.clang-format
@@ -0,0 +1,5 @@
+---
+Language:        Cpp
+BasedOnStyle:  Google
+Standard:  Cpp11 
+...
diff --git a/paddle/majel/CMakeLists.txt b/paddle/platform/CMakeLists.txt
similarity index 51%
rename from paddle/majel/CMakeLists.txt
rename to paddle/platform/CMakeLists.txt
index 93e5e2c22f0eb5797c635efd8ca34ffb74c03311..7abe2ab89e0798672149e28a8d02f7a58b6de3ea 100644
--- a/paddle/majel/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -1,8 +1,5 @@
+nv_test(cuda_test SRCS cuda_test.cu)
+
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
-
-cc_library(ddim SRCS ddim.cc)
-cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
-
-nv_test(cuda_test SRCS cuda_test.cu)
-nv_test(dim_test SRCS dim_test.cu DEPS ddim)
+cc_test(must_check_test SRCS must_check_test.cc)
diff --git a/paddle/platform/assert.h b/paddle/platform/assert.h
new file mode 100644
index 0000000000000000000000000000000000000000..70d3bf75062c5471ab54ee2c4c7637c679d9a8a3
--- /dev/null
+++ b/paddle/platform/assert.h
@@ -0,0 +1,29 @@
+#pragma once
+
+// Two-level stringification: TOSTRING(x) lets x be macro-expanded before
+// STRINGIFY turns it into a string literal.
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+
+// When all three conditions below hold, the assertions print the failing
+// location and expression with printf and then execute a trap instruction;
+// in every other configuration they forward to assert() from <assert.h>.
+#if defined(__APPLE__) && defined(__CUDA_ARCH__) && !defined(NDEBUG)
+#include <stdio.h>
+// PADDLE_ASSERT(e): traps when e evaluates to false.
+#define PADDLE_ASSERT(e)                                           \
+  do {                                                             \
+    if (!(e)) {                                                    \
+      printf("%s:%d Assertion `%s` failed.\n", __FILE__, __LINE__, \
+             TOSTRING(e));                                         \
+      asm("trap;");                                                \
+    }                                                              \
+  } while (0)
+
+// PADDLE_ASSERT_MSG(e, m): like PADDLE_ASSERT, additionally printing m with
+// the %s format specifier.
+#define PADDLE_ASSERT_MSG(e, m)                                         \
+  do {                                                                  \
+    if (!(e)) {                                                         \
+      printf("%s:%d Assertion `%s` failed (%s).\n", __FILE__, __LINE__, \
+             TOSTRING(e), m);                                           \
+      asm("trap;");                                                     \
+    }                                                                   \
+  } while (0)
+#else
+#include <assert.h>
+#define PADDLE_ASSERT(e) assert(e)
+// m is and-ed into the asserted expression so that it shows up in the
+// assertion text; this presumes m is a non-null string literal.
+#define PADDLE_ASSERT_MSG(e, m) assert((e) && (m))
+#endif
diff --git a/paddle/majel/cuda_test.cu b/paddle/platform/cuda_test.cu
similarity index 100%
rename from paddle/majel/cuda_test.cu
rename to paddle/platform/cuda_test.cu
diff --git a/paddle/majel/detail/hostdevice.h b/paddle/platform/hostdevice.h
similarity index 100%
rename from paddle/majel/detail/hostdevice.h
rename to paddle/platform/hostdevice.h
diff --git a/paddle/utils/Compiler.h b/paddle/platform/must_check.h
similarity index 78%
rename from paddle/utils/Compiler.h
rename to paddle/platform/must_check.h
index cebca5a2a3766110b83231eb0705e48800a7bda6..4fcc62afc05b14949fc43266f0d05be1f1b7891a 100644
--- a/paddle/utils/Compiler.h
+++ b/paddle/platform/must_check.h
@@ -10,24 +10,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-/**
- * This header defines some useful attribute by each compiler. It is the
- * abstract layer of compilers.
- */
-#ifdef __GNUC__
-#define GCC_VERSION \
-  (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
-#else
-#define GCC_VERSION
-#endif
-
 /**
  * __must_check macro. It make the function's return value must be used,
  * otherwise it will raise a compile warning. And also Paddle treat all compile
  * warnings as errors.
  */
-#if GCC_VERSION >= 30400
+#ifdef __GNUC__
+#if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) >= 30400
 #define __must_check __attribute__((warn_unused_result))
 #else
 #define __must_check
 #endif
+#else
+#define __must_check
+#endif
diff --git a/paddle/platform/must_check_test.cc b/paddle/platform/must_check_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6ee3ea49acdc4384b5d5df353bfa1290856e982c
--- /dev/null
+++ b/paddle/platform/must_check_test.cc
@@ -0,0 +1,10 @@
+#include <gtest/gtest.h>
+#include <paddle/platform/must_check.h>
+
+int __must_check SomeFunctionMustCheck() { return 0; }
+
+TEST(MustCheck, all) {
+  //  The call below must fail to compile if uncommented, because the
+  //  return value of SomeFunctionMustCheck is marked as __must_check:
+  //  SomeFunctionMustCheck();
+}
\ No newline at end of file
diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1afd03c01169d395b086c1da458ce25c66a12a51
--- /dev/null
+++ b/paddle/platform/place.cc
@@ -0,0 +1,46 @@
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace platform {
+
+namespace detail {
+
+class PlacePrinter : public boost::static_visitor<> {
+ public:
+  PlacePrinter(std::ostream &os) : os_(os) {}
+  void operator()(const CpuPlace &) { os_ << "CpuPlace"; }
+  void operator()(const GpuPlace &p) { os_ << "GpuPlace(" << p.device << ")"; }
+
+ private:
+  std::ostream &os_;
+};
+
+}  // namespace detail
+
+static Place the_default_place;
+
+void set_place(const Place &place) { the_default_place = place; }
+const Place &get_place() { return the_default_place; }
+
+const GpuPlace default_gpu() { return GpuPlace(0); }
+const CpuPlace default_cpu() { return CpuPlace(); }
+
+bool is_gpu_place(const Place &p) {
+  return boost::apply_visitor(IsGpuPlace(), p);
+}
+bool is_cpu_place(const Place &p) {
+  return !boost::apply_visitor(IsGpuPlace(), p);
+}
+
+bool places_are_same_class(const Place &p1, const Place &p2) {
+  return is_gpu_place(p1) == is_gpu_place(p2);
+}
+
+std::ostream &operator<<(std::ostream &os, const Place &p) {
+  detail::PlacePrinter printer(os);
+  boost::apply_visitor(printer, p);
+  return os;
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/place.h b/paddle/platform/place.h
new file mode 100644
index 0000000000000000000000000000000000000000..489572c526e162500c8f747f0ec8df10da9d86a2
--- /dev/null
+++ b/paddle/platform/place.h
@@ -0,0 +1,49 @@
+#pragma once
+#include <boost/variant.hpp>
+#include <iostream>
+
+namespace paddle {
+namespace platform {
+
+struct CpuPlace {
+  // WORKAROUND: for some reason, omitting this constructor
+  // causes errors with boost 1.59 and OSX
+  CpuPlace() {}
+
+  // needed for variant equality comparison
+  inline bool operator==(const CpuPlace &) const { return true; }
+  inline bool operator!=(const CpuPlace &) const { return false; }
+};
+
+struct GpuPlace {
+  GpuPlace() : GpuPlace(0) {}
+  GpuPlace(int d) : device(d) {}
+
+  // needed for variant equality comparison
+  inline bool operator==(const GpuPlace &o) const { return device == o.device; }
+  inline bool operator!=(const GpuPlace &o) const { return !(*this == o); }
+
+  int device;
+};
+
+struct IsGpuPlace : public boost::static_visitor<bool> {
+  bool operator()(const CpuPlace &) const { return false; }
+  bool operator()(const GpuPlace &gpu) const { return true; }
+};
+
+typedef boost::variant<GpuPlace, CpuPlace> Place;
+
+void set_place(const Place &);
+const Place &get_place();
+
+const GpuPlace default_gpu();
+const CpuPlace default_cpu();
+
+bool is_gpu_place(const Place &);
+bool is_cpu_place(const Place &);
+bool places_are_same_class(const Place &, const Place &);
+
+std::ostream &operator<<(std::ostream &, const Place &);
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/place_test.cc b/paddle/platform/place_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..73fccceedf6918148a26100f64cf322305c3ac20
--- /dev/null
+++ b/paddle/platform/place_test.cc
@@ -0,0 +1,40 @@
+#include "paddle/platform/place.h"
+#include <sstream>
+#include "gtest/gtest.h"
+
+TEST(Place, Equality) {
+  paddle::platform::CpuPlace cpu;
+  paddle::platform::GpuPlace g0(0), g1(1), gg0(0);
+
+  EXPECT_EQ(cpu, cpu);
+  EXPECT_EQ(g0, g0);
+  EXPECT_EQ(g1, g1);
+  EXPECT_EQ(g0, gg0);
+
+  EXPECT_NE(g0, g1);
+
+  EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0));
+  EXPECT_FALSE(paddle::platform::places_are_same_class(g0, cpu));
+}
+
+TEST(Place, Default) {
+  EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::get_place()));
+  EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::default_gpu()));
+  EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::default_cpu()));
+
+  paddle::platform::set_place(paddle::platform::CpuPlace());
+  EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::get_place()));
+}
+
+TEST(Place, Print) {
+  {
+    std::stringstream ss;
+    ss << paddle::platform::GpuPlace(1);
+    EXPECT_EQ("GpuPlace(1)", ss.str());
+  }
+  {
+    std::stringstream ss;
+    ss << paddle::platform::CpuPlace();
+    EXPECT_EQ("CpuPlace", ss.str());
+  }
+}
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 2b48e4dc0f875be9a87797fa14885926999a5010..a182e5f4aef9de8c6f20681328d5ba6c0e6944ef 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -31,6 +31,7 @@ Configuring cmake in /paddle/build ...
       -DWITH_DOC=OFF
       -DWITH_GPU=${WITH_GPU:-OFF}
       -DWITH_AVX=${WITH_AVX:-OFF}
+      -DWITH_GOLANG=${WITH_GOLANG:-OFF}
       -DWITH_SWIG_PY=ON
       -DCUDNN_ROOT=/usr/
       -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
@@ -43,6 +44,7 @@ cmake .. \
       -DWITH_DOC=OFF \
       -DWITH_GPU=${WITH_GPU:-OFF} \
       -DWITH_AVX=${WITH_AVX:-OFF} \
+      -DWITH_GOLANG=${WITH_GOLANG:-OFF} \
       -DWITH_SWIG_PY=ON \
       -DCUDNN_ROOT=/usr/ \
       -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} \
diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh
deleted file mode 100755
index f2cbc561652a3c7502de94be37d75783fc40b9c1..0000000000000000000000000000000000000000
--- a/paddle/scripts/travis/build_and_test.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-source ./common.sh
-
-NPROC=1
-export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages
-export PYTHONHOME=/opt/python/2.7.12
-export PATH=/opt/python/2.7.12/bin:${PATH}
-cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DWITH_COVERAGE=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
-NRPOC=`nproc`
-make -j $NPROC
-make coveralls
-sudo make install
diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/build_doc.sh
similarity index 84%
rename from paddle/scripts/travis/docs.sh
rename to paddle/scripts/travis/build_doc.sh
index c784293695bf134b5e990639778b6e84ba45d00d..a44bd35357fde41c379134bed6b7fb242efe49e5 100755
--- a/paddle/scripts/travis/docs.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -1,15 +1,19 @@
 #!/bin/bash
+set -e
+
+# Create the build directory for CMake.
+mkdir -p $TRAVIS_BUILD_DIR/build
+cd $TRAVIS_BUILD_DIR/build
 
-# Add set -e, cd to directory.
-source ./common.sh
 # Compile Documentation only.
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF ${EXTRA_CMAKE_OPTS}
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF
+
 mkdir output
 make -j `nproc`
 find .. -name '*whl' | xargs pip install  # install all wheels.
 rm -rf *
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS}
-make paddle_docs paddle_docs_cn
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON
+make -j `nproc` paddle_docs paddle_docs_cn
 
 # check websites for broken links
 linkchecker doc/en/html/index.html
diff --git a/paddle/scripts/travis/precommit.sh b/paddle/scripts/travis/check_style.sh
similarity index 54%
rename from paddle/scripts/travis/precommit.sh
rename to paddle/scripts/travis/check_style.sh
index 7a59b1131d0a410be9c5cef08e3cc11633d2ba67..4754bdd4c80de9700d92b0e33ecfdfc582f42813 100755
--- a/paddle/scripts/travis/precommit.sh
+++ b/paddle/scripts/travis/check_style.sh
@@ -1,14 +1,14 @@
 #!/bin/bash
 function abort(){
-    echo "Your commit not fit PaddlePaddle code style" 1>&2
-    echo "Please use pre-commit scripts to auto-format your code" 1>&2
+    echo "Your change doesn't follow PaddlePaddle's code style." 1>&2
+    echo "Please use pre-commit to reformat your code and git push again." 1>&2
     exit 1
 }
 
 trap 'abort' 0
 set -e
-source common.sh
-cd ..
+
+cd $TRAVIS_BUILD_DIR
 export PATH=/usr/bin:$PATH
 pre-commit install
 clang-format --version
diff --git a/paddle/scripts/travis/common.sh b/paddle/scripts/travis/common.sh
deleted file mode 100755
index f05c7530a3b0632948e4b18c477d6dc6aad04c03..0000000000000000000000000000000000000000
--- a/paddle/scripts/travis/common.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-set -e
-mkdir -p ../../../build
-cd ../../../build
-mkdir -p $HOME/third_party
-EXTRA_CMAKE_OPTS="-DTHIRD_PARTY_PATH=${HOME}/third_party"
diff --git a/paddle/scripts/travis/main.sh b/paddle/scripts/travis/main.sh
deleted file mode 100755
index 13f2552d29db38041a73edca0acd202945c67484..0000000000000000000000000000000000000000
--- a/paddle/scripts/travis/main.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-cd `dirname $0`
-
-if [ ${JOB} == "BUILD_AND_TEST" ]; then
-  ./build_and_test.sh
-elif [ ${JOB} == "DOCS" ]; then
-  ./docs.sh
-elif [ ${JOB} == "PRE_COMMIT" ]; then
-  ./precommit.sh
-else
-  echo Unknown job ${JOB}
-  exit 1
-fi
diff --git a/paddle/trainer/CMakeLists.txt b/paddle/trainer/CMakeLists.txt
index 06c019f0a97757b658d1bc3405246d8f47632aad..f34d53ae99f913a8aed8767b7271a538efce4778 100644
--- a/paddle/trainer/CMakeLists.txt
+++ b/paddle/trainer/CMakeLists.txt
@@ -4,6 +4,7 @@ set(TRAINER_SOURCES
         ParameterUpdater.cpp
         ParamUtil.cpp
         RemoteParameterUpdater.cpp
+        NewRemoteParameterUpdater.cpp
         Tester.cpp
         Trainer.cpp
         TrainerInternal.cpp
@@ -16,6 +17,7 @@ set(TRAINER_HEADERS
         ParameterUpdater.h
         ParamUtil.h
         RemoteParameterUpdater.h
+        NewRemoteParameterUpdater.h
         Tester.h
         TesterConfig.h
         Trainer.h
@@ -24,6 +26,13 @@ set(TRAINER_HEADERS
         ThreadParameterUpdater.h
         TrainerConfigHelper.h)
 
+if(NOT WITH_GOLANG)
+  list(REMOVE_ITEM TRAINER_SOURCES
+          NewRemoteParameterUpdater.cpp)
+  list(REMOVE_ITEM TRAINER_HEADERS
+          NewRemoteParameterUpdater.h)
+endif()
+
 add_library(paddle_trainer_lib STATIC
     ${TRAINER_SOURCES})
 
@@ -56,3 +65,13 @@ install(TARGETS paddle_trainer paddle_merge_model
 
 set_target_properties(paddle_trainer PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
 set_target_properties(paddle_merge_model PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
+
+if(APPLE)  # append to, rather than replace, any linker flags already set
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -framework CoreFoundation -framework Security")
+endif()
+
+if(WITH_GOLANG)
+  add_dependencies(paddle_trainer_lib paddle_pserver_cclient)
+  target_link_libraries(paddle_trainer ${CMAKE_BINARY_DIR}/go/pserver/cclient/libpaddle_pserver_cclient.a)
+  target_link_libraries(paddle_trainer_lib ${CMAKE_BINARY_DIR}/go/pserver/cclient/libpaddle_pserver_cclient.a)
+endif(WITH_GOLANG)
diff --git a/paddle/trainer/NewRemoteParameterUpdater.cpp b/paddle/trainer/NewRemoteParameterUpdater.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f25ce2f7f06f6da0feab27da61b8e49689cbe213
--- /dev/null
+++ b/paddle/trainer/NewRemoteParameterUpdater.cpp
@@ -0,0 +1,86 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "NewRemoteParameterUpdater.h"
+#include "Trainer.h"
+#include "paddle/utils/Stat.h"
+
+DECLARE_int32(trainer_id);
+DECLARE_string(save_dir);
+
+namespace paddle {
+NewRemoteParameterUpdater::NewRemoteParameterUpdater(
+    const OptimizationConfig &config, const std::string pserverSpec)
+    : parameterClient_(-1),
+      newParameters_(nullptr),
+      newGradients_(nullptr),
+      pserverSpec_(pserverSpec) {}
+
+void NewRemoteParameterUpdater::init(
+    const std::vector<ParameterPtr> &parameters) {
+  ParameterUpdater::init(parameters);
+
+  for (auto &para : parameters_) {
+    para->getBuf(PARAMETER_VALUE)->zeroMem();
+    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
+  }
+
+  // create parameter server client.
+  parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(),
+                                               FLAGS_trainer_id == 0);
+
+  // init new parameter and gradient.
+  newParameters_ = initNewParameter(PARAMETER_VALUE);
+  newGradients_ = initNewParameter(PARAMETER_GRADIENT);
+
+  // Initialize parameters: one trainer gets the opportunity to init the
+  // parameters and send them to the parameter server; the others fetch the
+  // initialized parameters from the parameter server.
+  if (paddle_begin_init_params(parameterClient_)) {
+    LOG(INFO) << "paddle_begin_init_params start";
+    for (int i = 0; i < parameterSize(); ++i) {
+      auto paramConfig = parameters_[i]->getConfig();
+      std::string bytes = paramConfig.SerializeAsString();
+      const char *array = bytes.data();
+      int size = (int)bytes.size();
+      paddle_init_param(
+          parameterClient_, *newParameters_[i], (void *)array, size);
+    }
+    paddle_finish_init_params(parameterClient_);
+    LOG(INFO) << "paddle_begin_init_params done";
+  } else {
+    paddle_get_params(parameterClient_, newParameters_, parameterSize());
+  }
+
+  LOG(INFO) << "NewRemoteParameterUpdater initialized";
+}
+
+void NewRemoteParameterUpdater::updateImpl(Parameter *para) {}
+
+void NewRemoteParameterUpdater::finishBatch(real cost) {
+  // send gradient to parameter server.
+  paddle_send_grads(parameterClient_, newGradients_, parameterSize());
+  // get the updated parameter from parameterClient.
+  paddle_get_params(parameterClient_, newParameters_, parameterSize());
+
+  // clear gradient after update parameter.
+  for (auto &para : parameters_) {
+    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
+  }
+}
+
+void NewRemoteParameterUpdater::startPass() {}
+
+bool NewRemoteParameterUpdater::finishPass() { return true; }
+}
diff --git a/paddle/trainer/NewRemoteParameterUpdater.h b/paddle/trainer/NewRemoteParameterUpdater.h
new file mode 100644
index 0000000000000000000000000000000000000000..f735185f62b3491a63e34cfc4a2ef73dae12243e
--- /dev/null
+++ b/paddle/trainer/NewRemoteParameterUpdater.h
@@ -0,0 +1,114 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <functional>
+#include <thread>
+#include "ParameterUpdater.h"
+#include "libpaddle_pserver_cclient.h"
+#include "paddle/pserver/ParameterClient2.h"
+#include "paddle/utils/Queue.h"
+#include "paddle/utils/Util.h"
+
+namespace paddle {
+
+/**
+ * New remote parameter updater for dense parameters that use cclient of go.
+ */
+class NewRemoteParameterUpdater : public ParameterUpdater {
+public:
+  NewRemoteParameterUpdater(const OptimizationConfig& config,
+                            const std::string pserverSpec);
+  ~NewRemoteParameterUpdater() {
+    releaseNewParameter(newParameters_);
+    releaseNewParameter(newGradients_);
+    if (parameterClient_ >= 0) paddle_pserver_client_release(parameterClient_);
+  }
+
+  /**
+   * initialize the internal parameter client and itself.
+   */
+  virtual void init(const std::vector<ParameterPtr>& parameters);
+  /**
+   * @brief start batch
+   *
+   * @note  one batch training exhibits stateful feature to help
+   *        to do performance tuning, sgd optimization if necessary.
+   */
+  virtual PassType startBatch(int64_t batchSize) { return PASS_TRAIN; }
+
+  /**
+   * send parameters to pservers and get returned parameters
+   * from all pservers if necessary.
+   */
+  virtual void finishBatch(real cost);
+  virtual void startPass();
+  virtual bool finishPass();
+
+protected:
+  /**
+   * work that needs to be done after finishBatch
+   */
+  virtual void updateImpl(Parameter* para);
+
+private:
+  int parameterSize() { return (int)parameters_.size(); }
+
+  /**
+   * init parameter of go paddle pserver cclient.
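+   * @param type  buffer of each parameter to expose (value or gradient)
+   * @return malloc'ed array; release it with releaseNewParameter()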
+   */
+  paddle_parameter** initNewParameter(ParameterType type) {
+    paddle_parameter** new_params =
+        (paddle_parameter**)malloc(sizeof(paddle_parameter*) * parameterSize());
+    for (int i = 0; i < parameterSize(); ++i) {
+      new_params[i] = (paddle_parameter*)malloc(sizeof(paddle_parameter));
+      memset(new_params[i], 0, sizeof(paddle_parameter));
+    }
+
+    for (int i = 0; i < parameterSize(); ++i) {
+      ParameterPtr param = parameters_[i];
+      new_params[i]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+      new_params[i]->name = (char*)param->getName().c_str();
+      new_params[i]->content =
+          (unsigned char*)(param->getBuf(type).get()->getData());
+      new_params[i]->content_len =
+          (int)param->getBuf(type).get()->getSize() * sizeof(real);
+    }
+    return new_params;
+  }
+
+  void releaseNewParameter(paddle_parameter** newParams) {
+    if (newParams != nullptr) {
+      for (int i = 0; i < parameterSize(); ++i) {
+        free(newParams[i]);
+      }
+      free(newParams);
+    }
+  }
+
+protected:
+  /// internal parameter client object for exchanging data with pserver
+  paddle_pserver_client parameterClient_;
+  /// the parameters for new pserver client
+  paddle_parameter** newParameters_;
+  /// the gradients for new pserver client
+  paddle_parameter** newGradients_;
+  /// the specification of parameter server "host1:port,host2:port"
+  std::string pserverSpec_;
+};
+
+}  // namespace paddle
diff --git a/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf
index d669fbc40cbc19df309d8bf20c942a9d8fc8f47d..741a0aa71df7866c180ab2513f28638117d0f1ca 100644
--- a/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf
+++ b/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf
@@ -35,7 +35,7 @@ def outer_step(dummy_data):
                                  embedding_size=num_words)]
 
     def inner_step(dummy_memory, predict_word):
-        
+
         # simplified RNN for testing
         with mixed_layer(size=num_words) as layer:
             layer += full_matrix_projection(input=predict_word,
@@ -46,15 +46,15 @@ def outer_step(dummy_data):
                                                 param_attr=ParamAttr(name="wordvec"))
 
         return out
-    
+
     beam_gen = beam_search(name="rnn_gen",
                            step=inner_step,
                            input=gen_inputs,
                            bos_id=0,
                            eos_id=num_words-1,
                            beam_size=2 if beam_flag else 1,
-                           num_results_per_sample=2 if beam_flag else 1,
-                           max_length=10) 
+                           num_results_per_sample=1,
+                           max_length=10)
     return beam_gen
 
 beam_gen_concat = recurrent_group(name="rnn_gen_concat",
diff --git a/paddle/trainer/tests/sample_trainer_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_rnn_gen.conf
index 2b337282f6285afb527e9bbf138d2e8184700d8d..58d27f15ae1c0a38885ee105a7963b6e7bd55906 100644
--- a/paddle/trainer/tests/sample_trainer_rnn_gen.conf
+++ b/paddle/trainer/tests/sample_trainer_rnn_gen.conf
@@ -33,7 +33,7 @@ gen_inputs = [StaticInput(input=dummy_data, size=2),
                              embedding_size=num_words)]
 
 def step(dummy_memory, predict_word):
-    
+
     # simplified RNN for testing
     with mixed_layer(size=num_words) as layer:
         layer += full_matrix_projection(input=predict_word,
@@ -44,7 +44,7 @@ def step(dummy_memory, predict_word):
                                             param_attr=ParamAttr(name="wordvec"))
 
     return out
-    
+
 beam_gen = beam_search(name="rnn_gen",
                        step=step,
                        input=gen_inputs,
@@ -52,7 +52,7 @@ beam_gen = beam_search(name="rnn_gen",
                        eos_id=num_words-1,
                        beam_size=2 if beam_flag else 1,
                        num_results_per_sample=2 if beam_flag else 1,
-                       max_length=10) 
+                       max_length=10)
 
 seqtext_printer_evaluator(input=beam_gen,
                           id_input=sent_id,
diff --git a/paddle/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
index 03446b3b2f6d5ff42fbf0d735a24d88bd0429747..1322e77178a4f5674f41943f886a17be8337bd75 100644
--- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp
+++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
@@ -124,6 +124,8 @@ TEST(RecurrentGradientMachine, test_generation) {
                      bool beam_search) {
     FLAGS_config_args = beam_search ? "beam_search=1" : "beam_search=0";
     for (auto useGpu : useGpuConfs) {
+      LOG(INFO) << configFile << " useGpu=" << useGpu
+                << " beam_search=" << beam_search;
       testGeneration(configFile, useGpu, hasSubseq, expRetFile);
     }
   };
diff --git a/paddle/utils/CustomStackTrace.h b/paddle/utils/CustomStackTrace.h
index 6992e856223494d6575ef3261d82cbdf4e375885..52a6df94979fd3d8d7d540ed0e3898bb3375d975 100644
--- a/paddle/utils/CustomStackTrace.h
+++ b/paddle/utils/CustomStackTrace.h
@@ -55,13 +55,17 @@ public:
    *        Else, just set status to popping.
    */
   void pop(const T& item) {
-    pushing() = false;
     auto& s = this->stack();
     if (item == s.top()) {
       s.pop();
     }
   }
 
+  /**
+   * @brief Indicate whether we are at forward or backward stage of computation
+   */
+  void set_stage(bool isForward) { pushing() = isForward; }
+
   /**
    * @brief clear current thread stack.
    */
diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h
index cda1b5c37dada8d0c6c77fc2fb03bb614d5301b5..f3d535c69c53fa350612459560dd9ac7c279aa72 100644
--- a/paddle/utils/Error.h
+++ b/paddle/utils/Error.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #include <stdio.h>
 #include <memory>
 #include <string>
-#include "Compiler.h"
+#include "paddle/platform/must_check.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp
index b5d9f93f1376048eabd726331006b0bb848bce11..c320074fbadab3e211ed72ce715d595c90673d6d 100644
--- a/paddle/utils/tests/test_CustomStackTrace.cpp
+++ b/paddle/utils/tests/test_CustomStackTrace.cpp
@@ -72,7 +72,6 @@ TEST(CustomStackTrace, normalTrain) {
       for (size_t i = 0; i < layerSize; ++i) {
         tracer.push("layer_" + paddle::str::to_string(i));
       }
-      tracer.pop("");
       for (size_t i = 0; i < layerSize; ++i) {
         tracer.pop("layer_" + paddle::str::to_string(layerSize - 1 - i));
       }
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index 62d5b9e38b21ee82d1e78c3bde5aa5df7e4a33ee..c942620990765832f21c887d30f85a2d211a5f32 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -5,6 +5,7 @@ set(proto_filenames
     ParameterConfig.proto
     ParameterService.proto
     TrainerConfig.proto
+    OptimizerConfig.proto
     ParameterServerConfig.proto)
 
 set(PROTO_GEN)
@@ -35,10 +36,8 @@ foreach(filename ${proto_filenames})
         DEPENDS ${filename} ${external_project_dependencies})
 endforeach()
 
-include_directories(${CMAKE_CURRENT_BINARY_DIR}/proto)
-
 add_custom_target(gen_proto_cpp ALL DEPENDS ${PROTO_GEN})
 add_custom_target(gen_proto_py ALL DEPENDS ${PROTO_GEN_PY})
-add_library(paddle_proto STATIC
-    ${PROTO_GEN})
+
+add_library(paddle_proto STATIC ${PROTO_GEN})
 target_include_directories(paddle_proto PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index 29270829bbc3af6990aaf03a5228ef7f6a892a5c..ebe4f5cbb569ff37a46eb44de6362a7df337fe38 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -489,6 +489,15 @@ message EvaluatorConfig {
   // Used by ClassificationErrorEvaluator
   // top # classification error
   optional int32 top_k = 13 [default = 1];
+
+  // Used by DetectionMAPEvaluator
+  optional double overlap_threshold = 14 [default = 0.5];
+
+  optional int32 background_id = 15 [default = 0];
+
+  optional bool evaluate_difficult = 16 [default = false];
+
+  optional string ap_type = 17 [default = "11point"];
 }
 
 message LinkConfig {
diff --git a/proto/OptimizerConfig.proto b/proto/OptimizerConfig.proto
new file mode 100644
index 0000000000000000000000000000000000000000..c698d3c2ddbf58a41ac6ee960af83a257325d1f9
--- /dev/null
+++ b/proto/OptimizerConfig.proto
@@ -0,0 +1,154 @@
+syntax = "proto2";
+ 
+option optimize_for = LITE_RUNTIME;
+
+package paddle;
+
+message SGDConfig {
+  // SGD
+  // momentum: float >= 0. Parameter updates momentum.
+  // decay: float >= 0. Learning rate decay over each update.
+  // nesterov: boolean. Whether to apply Nesterov momentum.
+  optional double momentum = 21 [default = 0.0];
+  optional double decay = 23 [default = 0.0];
+  optional bool nesterov =24 [default = false];
+
+}
+
+
+message AdadeltaConfig {
+  // Adadelta
+  // It is recommended to leave it at the default value.
+  // rho: float >= 0.
+  // epsilon: float >= 0. Fuzz factor.
+  // decay: float >= 0. Learning rate decay over each update.
+
+  // reference : [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701)
+  optional double rho = 33 [default = 0.90];
+  optional double epsilon = 31 [default = 1e-5];
+  optional double decay = 32 [default = 0.0];
+
+}
+
+message AdagradConfig {
+// Adagrad
+// epsilon: float >= 0.
+// decay: float >= 0. Learning rate decay over each update.
+
+// reference : [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+  optional double epsilon = 41 [default = 1e-5];
+  optional double decay = 42 [default = 0.0];
+}
+
+message AdamConfig {
+  // Adam
+  // beta_1: float, 0 < beta < 1. Generally close to 1.
+  // beta_2: float, 0 < beta < 1. Generally close to 1.
+  // epsilon: float >= 0. Fuzz factor.
+  // decay: float >= 0. Learning rate decay over each update.
+  // reference : [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
+  optional double beta_1 = 41;
+  optional double beta_2 = 42;
+  optional double epsilon = 43;
+  optional double decay = 44;
+}
+
+message ConstLrConfig {
+  // learning rate policy
+  optional double learning_rate = 1 [default = 1.0];
+}
+
+message LinearLrConfig {
+  // learning rate policy
+  optional double learning_rate = 1 [default = 1.0];
+  optional double lr_decay_a = 2;
+  optional double lr_decay_b = 3;
+}
+
+message TensorProto {
+enum DataType {
+  PADDLE_ELEMENT_TYPE_INT32 = 0;
+  PADDLE_ELEMENT_TYPE_UINT32 = 1;
+  PADDLE_ELEMENT_TYPE_INT64 = 2;
+  PADDLE_ELEMENT_TYPE_UINT64 = 3;
+  PADDLE_ELEMENT_TYPE_FLOAT32 = 4;
+  PADDLE_ELEMENT_TYPE_FLOAT64 = 5;
+}
+  optional DataType data_type = 1;
+  repeated bytes content = 2;
+}
+
+message SGDOptimizerState {
+  // learning rate policy
+  optional double learning_rate = 101;
+  optional double lr_decay_a = 102;
+  optional double lr_decay_b = 103;
+  optional double num_sample_passed = 104;
+  // state
+  optional TensorProto parameter = 1;
+  optional TensorProto momentums = 2;
+}
+
+message AdadeltaOptimizerState {
+  // learning rate policy
+  optional double learning_rate = 101;
+  optional double lr_decay_a = 102;
+  optional double lr_decay_b = 103;
+  optional double num_sample_passed = 104;
+  // state
+  optional TensorProto parameter = 1;
+  optional TensorProto accum_gradient = 2;
+  optional TensorProto accum_delta = 3;
+  optional TensorProto update_delta = 4;
+}
+
+message AdagradOptimizerState {
+  // learning rate policy
+  optional double learning_rate = 101;
+  optional double lr_decay_a = 102;
+  optional double lr_decay_b = 103;
+  optional double num_sample_passed = 104;
+  // state
+  optional TensorProto parameter = 1;
+  optional TensorProto accum_gradient = 2;
+}
+
+message AdamOptimizerState {
+  // learning rate policy
+  optional double learning_rate = 101;
+  optional double lr_decay_a = 102;
+  optional double lr_decay_b = 103;
+  optional double num_sample_passed = 104;
+  // state
+  optional TensorProto parameter = 1;
+  optional TensorProto momentums = 2;
+  optional TensorProto velocitys = 3;
+}
+
+message OptimizerConfig {
+  enum Optimizer {
+   SGD = 1;
+   Adadelta = 2;
+   Adagrad = 3;
+   Adam = 4;
+  }
+  optional Optimizer optimizer = 1;
+  optional SGDConfig sgd = 3;
+  optional AdadeltaConfig adadelta = 4;
+  optional AdagradConfig adagrad = 5;
+  optional AdamConfig adam = 6;
+
+  enum LrPolicy {
+   Const = 0;
+   Linear = 1;
+  }
+  optional LrPolicy lr_policy = 11;
+  optional ConstLrConfig const_lr = 12;
+  optional LinearLrConfig linear_lr = 13;
+
+  // common config of optimizer
+  // gradient clip when L2 exceeding value
+  optional double clip_norm = 101;
+  // gradient clip when L1 exceeding value
+  optional double clip_value = 102;
+}
diff --git a/proto/ParameterConfig.proto b/proto/ParameterConfig.proto
index cbcd0af598df22c36c66767fdeb7add2aa49e87d..580d66324602df4c655dd2f1e1cd87159b5b346b 100644
--- a/proto/ParameterConfig.proto
+++ b/proto/ParameterConfig.proto
@@ -25,8 +25,10 @@ enum ParameterInitStrategy {
 }
 
 message ParameterUpdaterHookConfig {
+  // hook type such as  'pruning'
   required string type = 1;
-  optional string purning_mask_filename = 2;
+  // the ratio of the parameter's elements to be set to zero
+  optional double sparsity_ratio = 2 [default = 0.6];
 }
 
 message ParameterConfig {
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 0792e2d40b43f5fb2de8d6bb43a62cfa23f77082..58e4902f57aa8018b820f48f6cbf659f1e5f5183 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -126,6 +126,7 @@ def init_config_environment(
         g_config=TrainerConfig(),
         g_layer_map={},
         g_parameter_map={},
+        g_parameter_initializer_map={},
         g_extended_config_funcs={},
 
         # store command args of paddle_trainer
@@ -327,53 +328,33 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name,
     SubModelBegin(name)
     g_current_submodel.is_recurrent_layer_group = True
     g_current_submodel.reversed = seq_reversed
-    g_current_submodel.target_inlinkid = -1
     in_links_count = 0
     for linkid, link in enumerate(in_links):
         if isinstance(link, basestring):
             name = link
-            has_subseq = False
         else:
             name = link.link_name
-            has_subseq = link.has_subseq
-        # assign target_inlinkid according to target_inlinkname
-        if target_inlinkname == name:
-            g_current_submodel.target_inlinkid = linkid
 
-        if in_links_count == 0:
-            in_links_has_subseq = has_subseq
-        else:
-            config_assert(
-                in_links_has_subseq == has_subseq,
-                "The sequence type of in_links should be the same in RecurrentLayerGroup"
-            )
         in_links_count += 1
         layer_name = MakeLayerNameInParentSubmodel(name)
         layer = g_layer_map[layer_name]
-        if has_subseq:
-            SequenceScatterAgentLayer(name=name, size=layer.size)
-        else:
-            ScatterAgentLayer(name=name, size=layer.size)
+        ScatterAgentLayer(name=name, size=layer.size)
 
         pair = g_current_submodel.in_links.add()
         pair.layer_name = layer_name
         pair.link_name = MakeLayerNameInSubmodel(name)
-        pair.has_subseq = has_subseq
 
 
 @config_func
 def RecurrentLayerGroupSetOutLink(link):
     if isinstance(link, basestring):
         name = link
-        has_subseq = False
     else:
         name = link.link_name
-        has_subseq = link.has_subseq
     layer_name = MakeLayerNameInParentSubmodel(name)
     pair = g_current_submodel.out_links.add()
     pair.layer_name = MakeLayerNameInSubmodel(name)
     pair.link_name = layer_name
-    pair.has_subseq = has_subseq
 
 
 def RecurrentLayerGroupSetGenerator(generator=None):
@@ -388,8 +369,7 @@ def RecurrentLayerGroupBegin(name,
                              generator=None,
                              target_inlinkname="",
                              seq_reversed=False):
-    RecurrentLayerGroupWithoutOutLinksBegin(name, in_links, seq_reversed,
-                                            target_inlinkname)
+    RecurrentLayerGroupWithoutOutLinksBegin(name, in_links, seq_reversed)
     for link in out_links:
         RecurrentLayerGroupSetOutLink(link)
 
@@ -424,8 +404,6 @@ def RecurrentLayerGroupEnd(name):
         agent_name = GetLayerBaseName(pair.link_name)
         if prev_submodel.HasField("generator"):
             DataLayer(name=agent_name, size=layer.size)
-        elif pair.has_subseq:
-            SequenceGatherAgentLayer(name=agent_name, size=layer.size)
         else:
             GatherAgentLayer(name=agent_name, size=layer.size)
 
@@ -439,22 +417,22 @@ def model_type(name):
 
 @config_class
 class Bias(Cfg):
-    def __init__(
-            self,
-            parameter_name=None,
-            learning_rate=None,
-            momentum=None,
-            decay_rate=None,
-            decay_rate_l1=None,
-            initial_mean=None,
-            initial_std=None,
-            initial_strategy=None,
-            initial_smart=None,
-            num_batches_regularization=None,
-            sparse_remote_update=None,
-            gradient_clipping_threshold=None,
-            is_static=None,
-            is_shared=None, ):
+    def __init__(self,
+                 parameter_name=None,
+                 learning_rate=None,
+                 momentum=None,
+                 decay_rate=None,
+                 decay_rate_l1=None,
+                 initial_mean=None,
+                 initial_std=None,
+                 initial_strategy=None,
+                 initial_smart=None,
+                 num_batches_regularization=None,
+                 sparse_remote_update=None,
+                 gradient_clipping_threshold=None,
+                 is_static=None,
+                 is_shared=None,
+                 initializer=None):
         self.add_keys(locals())
 
 
@@ -465,6 +443,7 @@ class Input(Cfg):
             self,
             input_layer_name,
             parameter_name=None,
+            initializer=None,
             learning_rate=None,
             momentum=None,
             decay_rate=None,
@@ -521,6 +500,7 @@ class Projection(Input):
             initial_std=None,
             initial_strategy=None,
             initial_smart=None,
+            initializer=None,
             num_batches_regularization=None,
             sparse_remote_update=None,
             sparse_update=None,
@@ -1300,20 +1280,23 @@ def parse_maxout(maxout, input_layer_name, maxout_conf):
 
 # Define an evaluator
 @config_func
-def Evaluator(
-        name,
-        type,
-        inputs,
-        chunk_scheme=None,
-        num_chunk_types=None,
-        classification_threshold=None,
-        positive_label=None,
-        dict_file=None,
-        result_file=None,
-        num_results=None,
-        top_k=None,
-        delimited=None,
-        excluded_chunk_types=None, ):
+def Evaluator(name,
+              type,
+              inputs,
+              chunk_scheme=None,
+              num_chunk_types=None,
+              classification_threshold=None,
+              positive_label=None,
+              dict_file=None,
+              result_file=None,
+              num_results=None,
+              top_k=None,
+              delimited=None,
+              excluded_chunk_types=None,
+              overlap_threshold=None,
+              background_id=None,
+              evaluate_difficult=None,
+              ap_type=None):
     evaluator = g_config.model_config.evaluators.add()
     evaluator.type = type
     evaluator.name = MakeLayerNameInSubmodel(name)
@@ -1347,6 +1330,18 @@ def Evaluator(
     if excluded_chunk_types:
         evaluator.excluded_chunk_types.extend(excluded_chunk_types)
 
+    if overlap_threshold is not None:
+        evaluator.overlap_threshold = overlap_threshold
+
+    if background_id is not None:
+        evaluator.background_id = background_id
+
+    if evaluate_difficult is not None:
+        evaluator.evaluate_difficult = evaluate_difficult
+
+    if ap_type is not None:
+        evaluator.ap_type = ap_type
+
 
 class LayerBase(object):
     def __init__(
@@ -1479,7 +1474,8 @@ class LayerBase(object):
                     gradient_clipping_threshold=bias.
                     gradient_clipping_threshold,
                     is_static=bias.is_static,
-                    is_shared=bias.is_shared, )
+                    is_shared=bias.is_shared,
+                    initializer=bias.initializer)
             if for_self:
                 self.config.bias_parameter_name = bias.parameter_name
             else:
@@ -1536,7 +1532,8 @@ class LayerBase(object):
             format=format,
             is_static=input_config.is_static,
             is_shared=input_config.is_shared,
-            update_hooks=input_config.update_hooks)
+            update_hooks=input_config.update_hooks,
+            initializer=input_config.initializer)
 
     def set_layer_size(self, size):
         if self.config.size == 0:
@@ -1646,8 +1643,14 @@ class SelectiveFCLayer(LayerBase):
 
 @config_layer('print')
 class PrintLayer(LayerBase):
-    def __init__(self, name, inputs):
+    def __init__(self, name, inputs, format=None):
         super(PrintLayer, self).__init__(name, 'print', 0, inputs)
+        if format is None:
+            format = "\n".join([
+                "layer=" + input.input_layer_name + " %s"
+                for input in self.inputs
+            ])
+        self.config.user_arg = format
 
 
 @config_layer('priorbox')
@@ -1944,7 +1947,6 @@ class BatchNormLayer(LayerBase):
     def __init__(self,
                  name,
                  inputs,
-                 active_type="linear",
                  bias=True,
                  use_global_stats=True,
                  moving_average_fraction=0.9,
@@ -1982,12 +1984,7 @@ class BatchNormLayer(LayerBase):
             cudnn_version >= 4007
         self.layer_type = "cudnn_batch_norm" if use_cudnn else "batch_norm"
         super(BatchNormLayer, self).__init__(
-            name,
-            self.layer_type,
-            0,
-            active_type=active_type,
-            inputs=inputs,
-            **xargs)
+            name, self.layer_type, 0, inputs=inputs, **xargs)
 
         if use_global_stats is not None:
             self.config.use_global_stats = use_global_stats
@@ -2248,13 +2245,6 @@ class AgentLayer(LayerBase):
             name, 'agent', size, inputs=[], device=device)
 
 
-@config_layer('sequence_agent')
-class SequenceAgentLayer(LayerBase):
-    def __init__(self, name, size, device=None):
-        super(SequenceAgentLayer, self).__init__(
-            name, 'sequence_agent', size, inputs=[], device=device)
-
-
 @config_layer('gather_agent')
 class GatherAgentLayer(LayerBase):
     def __init__(self, name, size, device=None):
@@ -2269,20 +2259,6 @@ class ScatterAgentLayer(LayerBase):
             name, 'scatter_agent', size, inputs=[], device=device)
 
 
-@config_layer('sequence_gather_agent')
-class SequenceGatherAgentLayer(LayerBase):
-    def __init__(self, name, size, device=None):
-        super(SequenceGatherAgentLayer, self).__init__(
-            name, 'sequence_gather_agent', size, inputs=[], device=device)
-
-
-@config_layer('sequence_scatter_agent')
-class SequenceScatterAgentLayer(LayerBase):
-    def __init__(self, name, size, device=None):
-        super(SequenceScatterAgentLayer, self).__init__(
-            name, 'sequence_scatter_agent', size, inputs=[], device=device)
-
-
 @config_layer('multiplex')
 class MultiplexLayer(LayerBase):
     def __init__(self, name, inputs, size, device=None):
@@ -2298,12 +2274,12 @@ class MultiplexLayer(LayerBase):
 
 
 @config_func
-def Link(
-        name,
-        has_subseq=False, ):
+def Link(name, has_subseq=False):
+    """
+    has_subseq is kept only for backward compatibility and is ignored.
+    """
     link_config = LinkConfig()
     link_config.link_name = name
-    link_config.has_subseq = has_subseq
     return link_config
 
 
@@ -2336,20 +2312,13 @@ def Memory(name,
         config_assert(name is not None, "name needs cannot be None")
         memory_name = name + "+delay1"
     agent_name = memory_name
-    if is_sequence:
-        config_assert(
-            boot_layer is not None,
-            "there must be boot_layer in network when is_sequence = True")
-        agent_layer = SequenceAgentLayer(agent_name, size)
-    else:
-        agent_layer = AgentLayer(agent_name, size)
+    agent_layer = AgentLayer(agent_name, size)
     config_assert(g_current_submodel.is_recurrent_layer_group,
                   'Memory should be used in recurrent layer group only')
     memory = g_current_submodel.memories.add()
     if name is not None:
         memory.layer_name = MakeLayerNameInSubmodel(name)
     memory.link_name = MakeLayerNameInSubmodel(agent_name)
-    memory.is_sequence = is_sequence
     options = sum((boot_layer is not None, bool(boot_bias),
                    boot_with_const_id is not None))
     config_assert(
@@ -2423,15 +2392,23 @@ class ExpandLayer(LayerBase):
 
 @config_layer('featmap_expand')
 class FeatMapExpandLayer(LayerBase):
-    def __init__(self, name, inputs, device=None, num_filters=None, bias=False):
+    def __init__(self,
+                 name,
+                 inputs,
+                 num_filters=None,
+                 as_row_vector=True,
+                 bias=False,
+                 **xargs):
         super(FeatMapExpandLayer, self).__init__(
-            name, 'featmap_expand', 0, inputs=inputs, device=device)
+            name, 'featmap_expand', 0, inputs=inputs, **xargs)
         config_assert(
             len(self.inputs) == 1, 'ExpandLayer takes 1 and only 1 inputs')
         if num_filters is not None:
             self.config.num_filters = num_filters
         else:
             logger.fatal("FeatMapExpandLayer must specify num_filters.")
+        if not as_row_vector:
+            self.config.user_arg = "as_col_vec"
         self.set_layer_size(self.get_input_layer(0).size * num_filters)
 
 
@@ -2441,14 +2418,12 @@ class MaxLayer(LayerBase):
                  name,
                  inputs,
                  trans_type='non-seq',
-                 active_type='linear',
                  bias=False,
                  output_max_index=None,
                  **xargs):
         super(MaxLayer, self).__init__(name, 'max', 0, inputs=inputs, **xargs)
         config_assert(len(self.inputs) == 1, 'MaxLayer must have 1 input')
         self.config.trans_type = trans_type
-        self.config.active_type = active_type
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             self.set_layer_size(input_layer.size)
@@ -2490,18 +2465,12 @@ class SequenceLastInstanceLayer(LayerBase):
     def __init__(self,
                  name,
                  inputs,
-                 active_type='linear',
                  trans_type='non-seq',
                  bias=False,
                  stride=-1,
                  **xargs):
         super(SequenceLastInstanceLayer, self).__init__(
-            name,
-            'seqlastins',
-            0,
-            inputs=inputs,
-            active_type=active_type,
-            **xargs)
+            name, 'seqlastins', 0, inputs=inputs, **xargs)
         config_assert(
             len(inputs) == 1, 'SequenceLastInstanceLayer must have 1 input')
         if trans_type == 'seq':
@@ -2517,7 +2486,6 @@ class SequenceFirstInstanceLayer(SequenceLastInstanceLayer):
     def __init__(self,
                  name,
                  inputs,
-                 active_type='linear',
                  trans_type='non-seq',
                  bias=False,
                  stride=-1,
@@ -2525,7 +2493,6 @@ class SequenceFirstInstanceLayer(SequenceLastInstanceLayer):
         super(SequenceFirstInstanceLayer, self).__init__(
             name,
             inputs=inputs,
-            active_type=active_type,
             trans_type=trans_type,
             bias=bias,
             stride=stride,
@@ -2535,14 +2502,9 @@ class SequenceFirstInstanceLayer(SequenceLastInstanceLayer):
 
 @config_layer('seqconcat')
 class SequenceConcatLayer(LayerBase):
-    def __init__(self, name, inputs, active_type='linear', bias=False, **xargs):
+    def __init__(self, name, inputs, bias=False, **xargs):
         super(SequenceConcatLayer, self).__init__(
-            name,
-            'seqconcat',
-            0,
-            inputs=inputs,
-            active_type=active_type,
-            **xargs)
+            name, 'seqconcat', 0, inputs=inputs, **xargs)
         config_assert(
             len(inputs) == 2, 'SequenceConcatLayer must have 2 inputs')
         for input_index in xrange(len(self.inputs)):
@@ -2553,20 +2515,9 @@ class SequenceConcatLayer(LayerBase):
 
 @config_layer('seqreshape')
 class SequenceReshapeLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 size,
-                 inputs,
-                 active_type='linear',
-                 bias=False,
-                 **xargs):
+    def __init__(self, name, size, inputs, bias=False, **xargs):
         super(SequenceReshapeLayer, self).__init__(
-            name,
-            'seqreshape',
-            size,
-            inputs=inputs,
-            active_type=active_type,
-            **xargs)
+            name, 'seqreshape', size, inputs=inputs, **xargs)
         config_assert(
             len(inputs) == 1, 'SequenceReshapeLayer must have 1 inputs')
         self.set_layer_size(size)
@@ -2575,9 +2526,9 @@ class SequenceReshapeLayer(LayerBase):
 
 @config_layer('subseq')
 class SubSequenceLayer(LayerBase):
-    def __init__(self, name, inputs, active_type='linear', bias=False, **xargs):
+    def __init__(self, name, inputs, bias=False, **xargs):
         super(SubSequenceLayer, self).__init__(
-            name, 'subseq', 0, inputs=inputs, active_type=active_type, **xargs)
+            name, 'subseq', 0, inputs=inputs, **xargs)
         config_assert(len(inputs) == 3, 'SubSequenceLayer must have 3 inputs')
         input_layer0 = self.get_input_layer(0)
         size = input_layer0.size
@@ -2733,11 +2684,10 @@ class AverageLayer(LayerBase):
                  inputs,
                  average_strategy='average',
                  trans_type='non-seq',
-                 active_type='linear',
                  bias=False,
                  **xargs):
         super(AverageLayer, self).__init__(
-            name, 'average', 0, inputs=inputs, active_type=active_type, **xargs)
+            name, 'average', 0, inputs=inputs, **xargs)
         self.config.average_strategy = average_strategy
         self.config.trans_type = trans_type
         config_assert(len(inputs) == 1, 'AverageLayer must have 1 input')
@@ -3189,11 +3139,11 @@ def Layer(name, type, **xargs):
 @config_func
 def ParameterHook(type, **kwargs):
     if type == 'pruning':
-        mask_filename = kwargs.get('mask_filename', None)
-        assert mask_filename is not None
         hook = ParameterUpdaterHookConfig()
         hook.type = type
-        hook.purning_mask_filename = mask_filename
+        sparsity_ratio = kwargs.get('sparsity_ratio', None)
+        if sparsity_ratio is not None:
+            hook.sparsity_ratio = sparsity_ratio
         return hook
     else:
         return None
@@ -3221,7 +3171,8 @@ def Parameter(name,
               need_compact=None,
               is_static=None,
               is_shared=None,
-              update_hooks=None):
+              update_hooks=None,
+              initializer=None):
 
     config_assert(name not in g_parameter_map,
                   'Duplicated parameter name: ' + name)
@@ -3300,15 +3251,20 @@ def Parameter(name,
 
     if update_hooks is not None:
         if hasattr(update_hooks, '__call__'):
-            update_hooks = update_hooks(para.name)
+            update_hooks = update_hooks()
 
         if isinstance(update_hooks, list):
             for hook in update_hooks:
                 para.update_hooks.extend([hook])
         else:
-            para.update_hooks.extend(update_hooks)
+            para.update_hooks.extend([update_hooks])
 
     g_parameter_map[name] = para
+    if initializer is not None:
+        config_assert(
+            callable(initializer),
+            "parameter initializer should be a callable object")
+        g_parameter_initializer_map[name] = initializer
 
 
 @config_func
diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py
index d1167a234caed3753c6beedfc89b01054e3688e1..9b9f979bb615f37ec1dc9baa154d28741b1400d5 100644
--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -14,7 +14,8 @@
 
 from paddle.trainer.config_parser import *
 __all__ = [
-    'ParamAttr', 'ExtraAttr', 'ParameterAttribute', 'ExtraLayerAttribute'
+    'HookAttr', 'ParamAttr', 'ExtraAttr', 'ParameterAttribute',
+    'ExtraLayerAttribute'
 ]
 
 
@@ -55,6 +56,40 @@ def is_compatible_with(x, Type):
         return False
 
 
+class HookAttribute(object):
+    """
+    Hook attribute object. Passed to ParameterAttribute(update_hooks=...), a
+    hook is an auxiliary operation that occurs during the training of a layer
+    with parameters, such as an img_conv layer or an fc layer.
+
+    :param type: Hook type. Currently supported types:
+                 'pruning': the user specifies a sparsity_ratio before training
+                 starts, and the network prunes the parameters based on it.
+                 A hook can be defined as hk = HookAttribute('pruning', 0.6)
+                 and used as paddle.layer.img_conv(input=img, filter_size=3,
+                 num_channels=3, num_filters=64,
+                 param_attr=ParameterAttribute(update_hooks=hk)).
+                 Pruning details: https://arxiv.org/pdf/1506.02626.pdf
+    :type type: string
+    :param sparsity_ratio: Ratio in [0, 1] of parameter elements the 'pruning'
+                           hook sets to zero; None leaves the config unset.
+    :type sparsity_ratio: float or None
+    """
+
+    def __init__(self, type, sparsity_ratio=None):
+        self.type = type
+        self.sparsity_ratio = sparsity_ratio
+        if self.sparsity_ratio is not None:
+            assert is_compatible_with(self.sparsity_ratio, float), \
+                'sparsity_ratio must be float type'
+            assert 0 <= self.sparsity_ratio <= 1, \
+                'sparsity_ratio must be a float between [0, 1] '
+
+    def __call__(self):
+        # Invoked without arguments by Parameter() to build the hook config.
+        return ParameterHook(self.type, sparsity_ratio=self.sparsity_ratio)
+
+
 class ParameterAttribute(object):
     """
     Parameter Attributes object. To fine-tuning network training process, user
@@ -95,6 +130,10 @@ class ParameterAttribute(object):
     :param sparse_update: Enable sparse update for this parameter. It will
                           enable both local and remote sparse update.
     :type sparse_update: bool
+    :param initializer: If not None, it should be a callable object which accepts
+                        a parameter name and returns a numpy array for the initial
+                        value of the parameter
+    :type initializer: callable object
     """
 
     def __init__(self,
@@ -109,7 +148,9 @@ class ParameterAttribute(object):
                  learning_rate=None,
                  momentum=None,
                  gradient_clipping_threshold=None,
-                 sparse_update=False):
+                 sparse_update=False,
+                 update_hooks=None,
+                 initializer=None):
         self.attr = {}
 
         if is_static:
@@ -161,6 +202,11 @@ class ParameterAttribute(object):
                 is_compatible_with(gradient_clipping_threshold, float):
             self.attr['gradient_clipping_threshold'] = \
                 gradient_clipping_threshold
+        if initializer is not None:
+            self.attr['initializer'] = initializer
+
+        if update_hooks:
+            self.attr['update_hooks'] = update_hooks
 
     def set_default_parameter_name(self, name):
         """
@@ -237,5 +283,6 @@ class ExtraLayerAttribute(object):
             return attr.attr
 
 
+HookAttr = HookAttribute
 ParamAttr = ParameterAttribute
 ExtraAttr = ExtraLayerAttribute
diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py
index a5234f3e47f6caa4b365de593648e0ee5ad6e4a2..44d52edfa7bae49bea196eba9387391b171840d8 100644
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
@@ -21,7 +21,8 @@ __all__ = [
     "chunk_evaluator", "sum_evaluator", "column_sum_evaluator",
     "value_printer_evaluator", "gradient_printer_evaluator",
     "maxid_printer_evaluator", "maxframe_printer_evaluator",
-    "seqtext_printer_evaluator", "classification_error_printer_evaluator"
+    "seqtext_printer_evaluator", "classification_error_printer_evaluator",
+    "detection_map_evaluator"
 ]
 
 
@@ -31,10 +32,11 @@ class EvaluatorAttribute(object):
     FOR_RANK = 1 << 2
     FOR_PRINT = 1 << 3
     FOR_UTILS = 1 << 4
+    FOR_DETECTION = 1 << 5
 
     KEYS = [
         "for_classification", "for_regression", "for_rank", "for_print",
-        "for_utils"
+        "for_utils", "for_detection"
     ]
 
     @staticmethod
@@ -57,22 +59,25 @@ def evaluator(*attrs):
     return impl
 
 
-def evaluator_base(
-        input,
-        type,
-        label=None,
-        weight=None,
-        name=None,
-        chunk_scheme=None,
-        num_chunk_types=None,
-        classification_threshold=None,
-        positive_label=None,
-        dict_file=None,
-        result_file=None,
-        num_results=None,
-        delimited=None,
-        top_k=None,
-        excluded_chunk_types=None, ):
+def evaluator_base(input,
+                   type,
+                   label=None,
+                   weight=None,
+                   name=None,
+                   chunk_scheme=None,
+                   num_chunk_types=None,
+                   classification_threshold=None,
+                   positive_label=None,
+                   dict_file=None,
+                   result_file=None,
+                   num_results=None,
+                   delimited=None,
+                   top_k=None,
+                   excluded_chunk_types=None,
+                   overlap_threshold=None,
+                   background_id=None,
+                   evaluate_difficult=None,
+                   ap_type=None):
     """
     Evaluator will evaluate the network status while training/testing.
 
@@ -107,6 +112,14 @@ def evaluator_base(
     :type weight: LayerOutput.
     :param top_k: number k in top-k error rate
     :type top_k: int
+    :param overlap_threshold: In detection tasks to filter detection results
+    :type overlap_threshold: float
+    :param background_id: Identifier of background class
+    :type background_id: int
+    :param evaluate_difficult: Whether to evaluate difficult objects
+    :type evaluate_difficult: bool
+    :param ap_type: How to calculate the average precision
+    :type ap_type: str
     """
     # inputs type assertions.
     assert classification_threshold is None or isinstance(
@@ -136,7 +149,61 @@ def evaluator_base(
         delimited=delimited,
         num_results=num_results,
         top_k=top_k,
-        excluded_chunk_types=excluded_chunk_types, )
+        excluded_chunk_types=excluded_chunk_types,
+        overlap_threshold=overlap_threshold,
+        background_id=background_id,
+        evaluate_difficult=evaluate_difficult,
+        ap_type=ap_type)
+
+
+@evaluator(EvaluatorAttribute.FOR_DETECTION)
+@wrap_name_default()
+def detection_map_evaluator(input,
+                            label,
+                            overlap_threshold=0.5,
+                            background_id=0,
+                            evaluate_difficult=False,
+                            ap_type="11point",
+                            name=None):
+    """
+    Detection mAP Evaluator. It prints the mean Average Precision (mAP) for
+    detection. Based on the output of the detection_output layer, it counts the
+    true positive and false positive bboxes and integrates them to get the mAP.
+
+    The simple usage is:
+
+    .. code-block:: python
+
+       eval =  detection_map_evaluator(input=det_output,label=lbl)
+
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param label: Label layer.
+    :type label: LayerOutput
+    :param overlap_threshold: The bbox overlap threshold of a true positive.
+    :type overlap_threshold: float
+    :param background_id: The background class index.
+    :type background_id: int
+    :param evaluate_difficult: Whether to evaluate difficult ground truths.
+    :type evaluate_difficult: bool
+    :param ap_type: How to calculate the average precision.
+    :type ap_type: str
+    """
+    # Copy a list argument so the append below never mutates the caller's list.
+    input = list(input) if isinstance(input, list) else [input]
+    # NOTE(review): label also goes in via label=; verify it isn't added twice.
+    if label:
+        input.append(label)
+
+    evaluator_base(
+        name=name,
+        type="detection_map",
+        input=input,
+        label=label,
+        overlap_threshold=overlap_threshold,
+        background_id=background_id,
+        evaluate_difficult=evaluate_difficult,
+        ap_type=ap_type)
 
 
 @evaluator(EvaluatorAttribute.FOR_CLASSIFICATION)
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 2d8ddbb9007b241eb1986887d8ea6c2de8235c29..84ed160773065da15fc26bfb5c5882b068874f1c 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -311,18 +311,6 @@ class LayerOutput(object):
         self.outputs = outputs
         self.reverse = reverse
 
-    def __repr__(self):
-        """
-        Disable __repr__ for debug reason. Will be implemented when release
-        """
-        assert False, "this method should not be invoked"
-
-    def __str__(self):
-        """
-        Disable __str__ for debug reason. Will be implemented when release
-        """
-        assert False, "this method should not be invoked"
-
     def set_input(self, input):
         """
         Set the input for a memory layer. Can only be used for memory layer
@@ -976,7 +964,7 @@ def fc_layer(input,
 
 
 @wrap_name_default("print")
-def printer_layer(input, name=None):
+def printer_layer(input, format=None, name=None):
     """
     Print the output value of input layers. This layer is useful for debugging.
 
@@ -994,6 +982,7 @@ def printer_layer(input, name=None):
 
     Layer(
         name=name,
+        format=format,
         type=LayerType.PRINT_LAYER,
         inputs=[l.name for l in input], )
     # this layer don't return anything, can not be input of other layer.
@@ -1565,14 +1554,24 @@ def expand_layer(input,
 
 
 @wrap_name_default()
+@wrap_act_default(act=IdentityActivation())
 @layer_support()
-def repeat_layer(input, num_repeats, name=None, layer_attr=None):
+def repeat_layer(input,
+                 num_repeats,
+                 as_row_vector=True,
+                 act=None,
+                 name=None,
+                 layer_attr=None):
     """
-    A layer for repeating the input for num_repeats times. This is equivalent
-    to apply concat_layer() with num_repeats same input.
+    A layer for repeating the input for num_repeats times.
 
+    If as_row_vector:
+    .. math::
+       y  = [x_1,\cdots, x_n, \cdots, x_1, \cdots, x_n]
+    If not as_row_vector:
     .. math::
-       y  = [x, x, \cdots, x]
+       y  = [x_1,\cdots, x_1, \cdots, x_n, \cdots, x_n]
+
 
     The example usage is:
 
@@ -1585,6 +1584,14 @@ def repeat_layer(input, num_repeats, name=None, layer_attr=None):
     :param num_repeats: Repeat the input so many times
     :type num_repeats: int
     :param name: Layer name.
+    :param as_row_vector: True treats the input as a row vector and repeats it
+                          in the column direction, which is equivalent to
+                          applying concat_layer() to num_repeats copies of the
+                          same input. False treats the input as a column vector
+                          and repeats it in the row direction.
+    :type as_row_vector: bool
+    :param act: Activation type.
+    :type act: BaseActivation
     :type name: basestring
     :param layer_attr: extra layer attributes.
     :type layer_attr: ExtraLayerAttribute.
@@ -1595,13 +1602,16 @@ def repeat_layer(input, num_repeats, name=None, layer_attr=None):
     l = Layer(
         inputs=[input.name],
         name=name,
+        active_type=act.name,
         num_filters=num_repeats,
+        as_row_vector=as_row_vector,
         type=LayerType.FEATURE_MAP_EXPAND_LAYER,
         **ExtraAttr.to_kwargs(layer_attr))
     return LayerOutput(
         name=name,
         size=l.config.size,
         layer_type=LayerType.FEATURE_MAP_EXPAND_LAYER,
+        activation=act,
         parents=[input])
 
 
@@ -2846,11 +2856,13 @@ def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
     Concat sequence a with sequence b.
 
     Inputs:
-      - a = [a1, a2, ..., an]
+      - a = [a1, a2, ..., am]
       - b = [b1, b2, ..., bn]
-      - Note that the length of a and b should be the same.
 
-    Output: [a1, b1, a2, b2, ..., an, bn]
+    Output: [a1, ..., am, b1, ..., bn]
+
+    Note that the above computation is for one sample. Multiple samples are
+    processed in one batch.
 
     The example usage is:
 
@@ -2944,7 +2956,7 @@ def memory(name,
     :param memory_name: the name of the memory.
                         It is ignored when name is provided.
     :type memory_name: basestring
-    :param is_seq: is sequence for boot_layer
+    :param is_seq: DEPRECATED. is sequence for boot_layer
     :type is_seq: bool
     :param boot_layer: boot layer of memory.
     :type boot_layer: LayerOutput|None
@@ -2971,7 +2983,6 @@ def memory(name,
     memory_name = Memory(
         name,
         size,
-        is_sequence=is_seq,
         boot_layer=boot_layer.name if boot_layer is not None else None,
         boot_bias=boot_bias,
         boot_bias_active_type=boot_bias_active_type.name,
@@ -3318,19 +3329,21 @@ class StaticInput(object):
     """
     StaticInput is only used in recurrent_group which defines a read-only memory
     that can be a sequence or non-sequence.
+    :param size: DEPRECATED
+    :param is_seq: DEPRECATED
     """
 
     def __init__(self, input, is_seq=False, size=None):
         assert isinstance(input, LayerOutput)
         self.input = input
-        self.is_seq = is_seq
-        assert input.size is not None or size is not None
+        assert input.size is not None
         if size is not None:
-            input.size = size
+            assert input.size == size
 
 
-class SubsequenceInput(object):
+def SubsequenceInput(input):
     """
+    DEPRECATED.
     Input sequence has sub-sequence, used in recurrent_group.
 
     The example usage is:
@@ -3339,11 +3352,7 @@ class SubsequenceInput(object):
 
        input = SubsequenceInput(layer)
     """
-
-    def __init__(self, input):
-        assert isinstance(input, LayerOutput)
-        assert input.size is not None
-        self.input = input
+    return input
 
 
 @wrap_name_default("recurrent_group")
@@ -3407,7 +3416,8 @@ def recurrent_group(step,
                     input sequence in a reverse order.
     :type reverse: bool
 
-    :param targetInlink: the input layer which share info with layer group's output
+    :param targetInlink: DEPRECATED.
+                         The input layer which share info with layer group's output
 
                          Param input specifies multiple input layers. For
                          SubsequenceInput inputs, config should assign one input
@@ -3429,46 +3439,21 @@ def recurrent_group(step,
     model_type('recurrent_nn')
 
     def is_single_input(x):
-        return isinstance(x, LayerOutput) or isinstance(x, StaticInput) \
-               or isinstance(x, SubsequenceInput)
+        return isinstance(x, LayerOutput) or isinstance(x, StaticInput)
 
     if is_single_input(input):
         input = [input]
     assert isinstance(input, collections.Sequence)
 
     def is_in_links(x):
-        return isinstance(x, LayerOutput) or isinstance(x, SubsequenceInput)
+        return isinstance(x, LayerOutput)
 
     in_links = filter(is_in_links, input)
 
-    def targetInlink_in_inlinks():
-        for inlink in in_links:
-            if isinstance(inlink, SubsequenceInput):
-                if targetInlink == inlink.input:
-                    return True
-            elif targetInlink == inlink:
-                return True
-        return False
-
-    assert (targetInlink == None or targetInlink_in_inlinks())
-    targetInlinkName = None if targetInlink == None \
-        else targetInlink.name if isinstance(targetInlink, LayerOutput) \
-        else targetInlink.input.name
-
-    contains_sub_seq = [False]
-
-    def map_in_links(x):
-        if isinstance(x, SubsequenceInput):
-            contains_sub_seq[0] = True
-            return Link(name=x.input.name, has_subseq=True)
-        else:
-            return x.name
-
     RecurrentLayerGroupWithoutOutLinksBegin(
         name=name,
-        in_links=map(map_in_links, in_links),
-        seq_reversed=reverse,
-        target_inlinkname=targetInlinkName)
+        in_links=map(lambda x: x.name, in_links),
+        seq_reversed=reverse)
     in_args = []
     has_LayerOutput = False
     for each_input in input:
@@ -3476,21 +3461,13 @@ def recurrent_group(step,
         if isinstance(each_input, LayerOutput):
             in_args.append(each_input)
             has_LayerOutput = True
-        elif isinstance(each_input, SubsequenceInput):
-            in_args.append(each_input.input)
-            has_LayerOutput = True
-        else:
+        else:  # StaticInput
             mem_name = "__%s_memory__" % each_input.input.name
             mem = memory(
-                name=mem_name,
-                is_seq=each_input.is_seq,
+                name=None,
                 size=each_input.input.size,
                 boot_layer=each_input.input)
-            with mixed_layer(
-                    name=mem_name,
-                    size=each_input.input.size,
-                    act=IdentityActivation()) as mix:
-                mix += identity_projection(mem)
+            mem.set_input(mem)
             in_args.append(mem)
 
     assert (is_generating != has_LayerOutput)
@@ -3503,10 +3480,7 @@ def recurrent_group(step,
     for ot in layer_outs:
         assert isinstance(ot, LayerOutput)
         ot.reverse = reverse
-        if contains_sub_seq[0]:
-            RecurrentLayerGroupSetOutLink(Link(ot.name, has_subseq=True))
-        else:
-            RecurrentLayerGroupSetOutLink(ot.name)
+        RecurrentLayerGroupSetOutLink(ot.name)
 
     RecurrentLayerGroupEnd(name=name)
 
@@ -3865,7 +3839,8 @@ def classification_cost(input,
                         weight=None,
                         name=None,
                         evaluator=classification_error_evaluator,
-                        layer_attr=None):
+                        layer_attr=None,
+                        coeff=1.):
     """
     classification cost Layer.
 
@@ -3881,6 +3856,8 @@ def classification_cost(input,
     :param evaluator: Evaluator method.
     :param layer_attr: layer's extra attribute.
     :type layer_attr: ExtraLayerAttribute
+    :param coeff: The coefficient affects the gradient in the backward.
+    :type coeff: float
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -3894,6 +3871,7 @@ def classification_cost(input,
         name=name,
         type="multi-class-cross-entropy",
         inputs=ipts,
+        coeff=coeff,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
 
     def __add_evaluator__(e):
@@ -5608,13 +5586,13 @@ def row_conv_layer(input,
     to deploy in an online and low-latency setting. The lookahead convolution
     incorporates information from future subsequences in a computationally
     efficient manner to improve unidirectional recurrent neural networks.
- 
+
     The connection of row convolution is different form the 1D sequence
     convolution. Assumed that, the future context-length is k, that is to say,
     it can get the output at timestep t by using the the input feature from t-th
     timestep to (t+k+1)-th timestep. Assumed that the hidden dim of input
     activations are d, the activations r_t for the new layer at time-step t are:
- 
+
     .. math::
 
         r_{t,r} = \sum_{j=1}^{k + 1} {w_{i,j}h_{t+j-1, i}}
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index 1bf59ed4840ae69afc5bce49c86a08b60e9603ee..67154a8d7d366bd983b4426da87e0b33307fced4 100755
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -1381,7 +1381,7 @@ def inputs(layers, *args):
     if len(args) != 0:
         layers.extend(args)
 
-    Inputs(*[l.name for l in layers])
+    Inputs(* [l.name for l in layers])
 
 
 def outputs(layers, *args):
@@ -1424,7 +1424,7 @@ def outputs(layers, *args):
     assert len(layers) > 0
 
     if HasInputsSet():  # input already set
-        Outputs(*[l.name for l in layers])
+        Outputs(* [l.name for l in layers])
         return  # just return outputs.
 
     if len(layers) != 1:
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index c24102255f5bbed0f551b2dbfec20be7daf5f5b4..c0e87d6de372dfdd9c7e694af71df8f3b011d43a 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-export configs=(test_fc layer_activations projections test_print_layer
+export configs=(test_repeat_layer test_fc layer_activations projections test_print_layer
 test_sequence_pooling test_lstmemory_layer test_grumemory_layer
 last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
 img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr
index 12b2255f3a41119792d0f993ce2e03ce9ee3e994..fee0f8e462bfd211e6aa7698ebfeaf0a19428a62 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr
@@ -9,7 +9,7 @@ layers {
   name: "__first_seq_0__"
   type: "seqlastins"
   size: 30
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "data"
   }
@@ -21,7 +21,7 @@ layers {
   name: "__first_seq_1__"
   type: "seqlastins"
   size: 30
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "data"
   }
@@ -33,7 +33,7 @@ layers {
   name: "__last_seq_0__"
   type: "seqlastins"
   size: 30
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "data"
   }
@@ -44,7 +44,7 @@ layers {
   name: "__last_seq_1__"
   type: "seqlastins"
   size: 30
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "data"
   }
@@ -55,7 +55,7 @@ layers {
   name: "__first_seq_2__"
   type: "seqlastins"
   size: 30
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "data"
   }
@@ -67,7 +67,7 @@ layers {
   name: "__last_seq_2__"
   type: "seqlastins"
   size: 30
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "data"
   }
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr
index 64530146a1458933d4ba0edffc1b1b7e60a21187..7254deb368963914fd1fff7925b6aeedbed59318 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr
@@ -123,7 +123,7 @@ layers {
   name: "__last_seq_0__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__simple_gru_0__"
   }
@@ -134,7 +134,7 @@ layers {
   name: "__last_seq_1__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__simple_gru_1__"
   }
@@ -256,19 +256,15 @@ sub_models {
   memories {
     layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
     link_name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
-    is_sequence: false
   }
   in_links {
     layer_name: "__simple_gru_0___transform"
     link_name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
-    has_subseq: false
   }
   out_links {
     layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
     link_name: "__simple_gru_0__"
-    has_subseq: false
   }
-  target_inlinkid: -1
 }
 sub_models {
   name: "__simple_gru_1___recurrent_group"
@@ -280,18 +276,14 @@ sub_models {
   memories {
     layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
     link_name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
-    is_sequence: false
   }
   in_links {
     layer_name: "__simple_gru_1___transform"
     link_name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
-    has_subseq: false
   }
   out_links {
     layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
     link_name: "__simple_gru_1__"
-    has_subseq: false
   }
-  target_inlinkid: -1
 }
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
index 79fa4c74f081aebadd258e06333de9eafe6a5ee3..7f2aa5a0fea1f4628e4effca5ce9af896f6e6c2c 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
@@ -205,7 +205,7 @@ layers {
   name: "__last_seq_0__"
   type: "seqlastins"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__lstm_group_0__"
   }
@@ -216,7 +216,7 @@ layers {
   name: "__last_seq_1__"
   type: "seqlastins"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__lstm_group_1__"
   }
@@ -341,24 +341,19 @@ sub_models {
   memories {
     layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
     link_name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-    is_sequence: false
   }
   memories {
     layer_name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
     link_name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-    is_sequence: false
   }
   in_links {
     layer_name: "__mixed_0__"
     link_name: "__mixed_0__@__lstm_group_0___recurrent_group"
-    has_subseq: false
   }
   out_links {
     layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
     link_name: "__lstm_group_0__"
-    has_subseq: false
   }
-  target_inlinkid: -1
 }
 sub_models {
   name: "__lstm_group_1___recurrent_group"
@@ -373,23 +368,18 @@ sub_models {
   memories {
     layer_name: "__lstm_group_1__@__lstm_group_1___recurrent_group"
     link_name: "__lstm_group_1__+delay1@__lstm_group_1___recurrent_group"
-    is_sequence: false
   }
   memories {
     layer_name: "__lstm_group_1___state@__lstm_group_1___recurrent_group"
     link_name: "__lstm_group_1___state+delay1@__lstm_group_1___recurrent_group"
-    is_sequence: false
   }
   in_links {
     layer_name: "__mixed_1__"
     link_name: "__mixed_1__@__lstm_group_1___recurrent_group"
-    has_subseq: false
   }
   out_links {
     layer_name: "__lstm_group_1__@__lstm_group_1___recurrent_group"
     link_name: "__lstm_group_1__"
-    has_subseq: false
   }
-  target_inlinkid: -1
 }
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr
index 68fa881b4f1408b8cd20f2417062ce035c0fda54..0d51f70ee01b913051f7d20547f68a22663200a0 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr
@@ -138,7 +138,7 @@ layers {
   name: "__last_seq_0__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__recurrent_layer_0__"
   }
@@ -149,7 +149,7 @@ layers {
   name: "__first_seq_0__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__recurrent_layer_1__"
   }
@@ -161,7 +161,7 @@ layers {
   name: "__last_seq_1__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__lstmemory_0__"
   }
@@ -172,7 +172,7 @@ layers {
   name: "__first_seq_1__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__lstmemory_1__"
   }
@@ -184,7 +184,7 @@ layers {
   name: "__last_seq_2__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__gru_0__"
   }
@@ -195,7 +195,7 @@ layers {
   name: "__first_seq_2__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__gru_1__"
   }
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr
index c402aff174ab7c7d7f63234960d4a24d84622dd4..f4cc492dfb9b5a8c04f6f41cfab017fc613e2a66 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr
@@ -12,6 +12,7 @@ layers {
   inputs {
     input_layer_name: "input"
   }
+  user_arg: "layer=input %s"
 }
 input_layer_names: "input"
 output_layer_names: "input"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_repeat_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_repeat_layer.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..e012386ff9515947d40ddddb6804de08207e1154
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_repeat_layer.protostr
@@ -0,0 +1,42 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 30
+  active_type: ""
+}
+layers {
+  name: "__repeat_layer_0__"
+  type: "featmap_expand"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+  }
+  num_filters: 10
+}
+layers {
+  name: "__repeat_layer_1__"
+  type: "featmap_expand"
+  size: 300
+  active_type: "tanh"
+  inputs {
+    input_layer_name: "data"
+  }
+  num_filters: 10
+  user_arg: "as_col_vec"
+}
+input_layer_names: "data"
+output_layer_names: "__repeat_layer_0__"
+output_layer_names: "__repeat_layer_1__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__repeat_layer_0__"
+  layer_names: "__repeat_layer_1__"
+  input_layer_names: "data"
+  output_layer_names: "__repeat_layer_0__"
+  output_layer_names: "__repeat_layer_1__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
index 77b447aa9db2a6c323fd3c322e7e9ca1ed19a6dd..af1b63c5dfbf0984a20eda02d608f76a454613c6 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
@@ -91,7 +91,7 @@ layers {
   name: "__last_seq_0__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "rnn_forward"
   }
@@ -140,7 +140,7 @@ layers {
   name: "__first_seq_0__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "rnn_back"
   }
@@ -155,7 +155,7 @@ layers {
 }
 layers {
   name: "sub_seq_input@__recurrent_group_2__"
-  type: "sequence_scatter_agent"
+  type: "scatter_agent"
   size: 100
   active_type: ""
 }
@@ -182,7 +182,7 @@ layers {
 }
 layers {
   name: "rnn_subseq_forward"
-  type: "sequence_gather_agent"
+  type: "gather_agent"
   size: 200
   active_type: ""
 }
@@ -190,7 +190,7 @@ layers {
   name: "__last_seq_1__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "rnn_subseq_forward"
   }
@@ -280,7 +280,7 @@ layers {
   name: "__last_seq_2__"
   type: "seqlastins"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__lstm_group_0__"
   }
@@ -329,7 +329,7 @@ layers {
   name: "__last_seq_3__"
   type: "seqlastins"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__gru_group_0__"
   }
@@ -378,7 +378,7 @@ layers {
   name: "__last_seq_4__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__fc_layer_0__"
   }
@@ -618,19 +618,15 @@ sub_models {
   memories {
     layer_name: "rnn_forward@__recurrent_group_0__"
     link_name: "rnn_forward+delay1@__recurrent_group_0__"
-    is_sequence: false
   }
   in_links {
     layer_name: "seq_input"
     link_name: "seq_input@__recurrent_group_0__"
-    has_subseq: false
   }
   out_links {
     layer_name: "rnn_forward@__recurrent_group_0__"
     link_name: "rnn_forward"
-    has_subseq: false
   }
-  target_inlinkid: -1
 }
 sub_models {
   name: "__recurrent_group_1__"
@@ -642,19 +638,15 @@ sub_models {
   memories {
     layer_name: "rnn_back@__recurrent_group_1__"
     link_name: "rnn_back+delay1@__recurrent_group_1__"
-    is_sequence: false
   }
   in_links {
     layer_name: "seq_input"
     link_name: "seq_input@__recurrent_group_1__"
-    has_subseq: false
   }
   out_links {
     layer_name: "rnn_back@__recurrent_group_1__"
     link_name: "rnn_back"
-    has_subseq: false
   }
-  target_inlinkid: -1
 }
 sub_models {
   name: "__recurrent_group_2__"
@@ -666,19 +658,15 @@ sub_models {
   memories {
     layer_name: "rnn_subseq_forward@__recurrent_group_2__"
     link_name: "rnn_subseq_forward+delay1@__recurrent_group_2__"
-    is_sequence: false
   }
   in_links {
     layer_name: "sub_seq_input"
     link_name: "sub_seq_input@__recurrent_group_2__"
-    has_subseq: true
   }
   out_links {
     layer_name: "rnn_subseq_forward@__recurrent_group_2__"
     link_name: "rnn_subseq_forward"
-    has_subseq: true
   }
-  target_inlinkid: -1
 }
 sub_models {
   name: "__lstm_group_0___recurrent_group"
@@ -693,24 +681,19 @@ sub_models {
   memories {
     layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
     link_name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-    is_sequence: false
   }
   memories {
     layer_name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
     link_name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-    is_sequence: false
   }
   in_links {
     layer_name: "__mixed_0__"
     link_name: "__mixed_0__@__lstm_group_0___recurrent_group"
-    has_subseq: false
   }
   out_links {
     layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
     link_name: "__lstm_group_0__"
-    has_subseq: false
   }
-  target_inlinkid: -1
 }
 sub_models {
   name: "__gru_group_0___recurrent_group"
@@ -722,19 +705,15 @@ sub_models {
   memories {
     layer_name: "__gru_group_0__@__gru_group_0___recurrent_group"
     link_name: "__gru_group_0__+delay1@__gru_group_0___recurrent_group"
-    is_sequence: false
   }
   in_links {
     layer_name: "__mixed_1__"
     link_name: "__mixed_1__@__gru_group_0___recurrent_group"
-    has_subseq: false
   }
   out_links {
     layer_name: "__gru_group_0__@__gru_group_0___recurrent_group"
     link_name: "__gru_group_0__"
-    has_subseq: false
   }
-  target_inlinkid: -1
 }
 sub_models {
   name: "__recurrent_group_3__"
@@ -746,18 +725,14 @@ sub_models {
   memories {
     layer_name: "__fc_layer_0__@__recurrent_group_3__"
     link_name: "__memory_6__@__recurrent_group_3__"
-    is_sequence: false
   }
   in_links {
     layer_name: "seq_input"
     link_name: "seq_input@__recurrent_group_3__"
-    has_subseq: false
   }
   out_links {
     layer_name: "__fc_layer_0__@__recurrent_group_3__"
     link_name: "__fc_layer_0__"
-    has_subseq: false
   }
-  target_inlinkid: -1
 }
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
index 91284b4fb32fcfdbf6b9e7384ffe080574b78821..9d1b41c9d5586235984771d610f5df40a8754522 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
@@ -27,7 +27,7 @@ layers {
   name: "__seqreshape_0__"
   type: "seqreshape"
   size: 5
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "data1"
   }
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr
index 1999c006d237eb449d59c8e8a2a83c1e4fab9d0e..5a217f5544a8a3b4704b158dfeb92f747b7bd94b 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr
@@ -9,7 +9,7 @@ layers {
   name: "__seq_pooling_0__"
   type: "max"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "dat_in"
   }
@@ -19,7 +19,7 @@ layers {
   name: "__seq_pooling_1__"
   type: "max"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "dat_in"
   }
@@ -29,7 +29,7 @@ layers {
   name: "__seq_pooling_2__"
   type: "average"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "dat_in"
   }
@@ -40,7 +40,7 @@ layers {
   name: "__seq_pooling_3__"
   type: "average"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "dat_in"
   }
@@ -51,7 +51,7 @@ layers {
   name: "__seq_pooling_4__"
   type: "average"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "dat_in"
   }
@@ -62,7 +62,7 @@ layers {
   name: "__seq_pooling_5__"
   type: "average"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "dat_in"
   }
@@ -73,7 +73,7 @@ layers {
   name: "__seq_pooling_6__"
   type: "max"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "dat_in"
   }
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..004e2a5dd4efa9feab7619643673b37fe28146c5
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py
@@ -0,0 +1,11 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+din = data_layer(name='data', size=30)
+
+# Row-vector repeat without an explicit activation, then a tanh column repeat.
+row_repeat = repeat_layer(input=din, num_repeats=10, as_row_vector=True)
+col_repeat = repeat_layer(
+    input=din, num_repeats=10, act=TanhActivation(), as_row_vector=False)
+outputs(row_repeat, col_repeat)
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
index b9d0a7f29138cae281236b26509a56738f3801f4..6a1e23a343d6a8de9dbec573f257efb4fb658e92 100644
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -37,9 +37,25 @@ import plot
 import image
 
 __all__ = [
-    'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer',
-    'event', 'data_type', 'attr', 'pooling', 'data_feeder', 'dataset', 'reader',
-    'topology', 'networks', 'infer', 'plot', 'evaluator', 'image'
+    'optimizer',
+    'layer',
+    'activation',
+    'parameters',
+    'init',
+    'trainer',
+    'event',
+    'data_type',
+    'attr',
+    'pooling',
+    'data_feeder',
+    'dataset',
+    'reader',
+    'topology',
+    'networks',
+    'infer',
+    'plot',
+    'evaluator',
+    'image',
 ]
 
 
diff --git a/python/paddle/v2/attr.py b/python/paddle/v2/attr.py
index 32f78614e7f8abe7cffdc7a50a9fa77f1fc1a780..5d23894d735c463d469f842b875ecbec1dbaf476 100644
--- a/python/paddle/v2/attr.py
+++ b/python/paddle/v2/attr.py
@@ -17,10 +17,12 @@ import paddle.trainer_config_helpers.attrs
 __all__ = [
     "Param",
     "Extra",
+    "Hook",
 ]
 
 Param = paddle.trainer_config_helpers.attrs.ParameterAttribute
 Extra = paddle.trainer_config_helpers.attrs.ExtraLayerAttribute
+Hook = paddle.trainer_config_helpers.attrs.HookAttribute
 
 for each in paddle.trainer_config_helpers.attrs.__all__:
     globals()[each] = getattr(paddle.trainer_config_helpers.attrs, each)
diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
index 81af0a8e66a44a3476206147684d81bcac1be372..f885b2834e8ad502b752c6fd53daf7ef1693433f 100644
--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
@@ -31,10 +31,10 @@ images per class.
 import cPickle
 import itertools
 import numpy
-from common import download
+import paddle.v2.dataset.common
 import tarfile
 
-__all__ = ['train100', 'test100', 'train10', 'test10']
+__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
 
 URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
 CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
@@ -75,7 +75,8 @@ def train100():
     :rtype: callable
     """
     return reader_creator(
-        download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'train')
+        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+        'train')
 
 
 def test100():
@@ -88,7 +89,9 @@ def test100():
     :return: Test reader creator.
     :rtype: callable
     """
-    return reader_creator(download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'test')
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+        'test')
 
 
 def train10():
@@ -102,7 +105,8 @@ def train10():
     :rtype: callable
     """
     return reader_creator(
-        download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'data_batch')
+        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        'data_batch')
 
 
 def test10():
@@ -116,9 +120,20 @@ def test10():
     :rtype: callable
     """
     return reader_creator(
-        download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'test_batch')
+        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        'test_batch')
 
 
 def fetch():
-    download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
-    download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
+    paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
+    paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
+
+
+def convert(path):
+    """
+    Converts all four CIFAR readers to recordio format, 10 shards each.
+    """
+    jobs = ((train100, "cifar_train100"), (test100, "cifar_test100"),
+            (train10, "cifar_train10"), (test10, "cifar_test10"))
+    for make_reader, prefix in jobs:
+        paddle.v2.dataset.common.convert(path, make_reader(), 10, prefix)
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
index 418b592a5ac638cc61b86a9b3fbdcee1e3a0bcaf..4a2eb59c340f5d0d3818170e56d730330e0bab29 100644
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -15,6 +15,7 @@
 import requests
 import hashlib
 import os
+import errno
 import shutil
 import sys
 import importlib
@@ -22,12 +23,24 @@ import paddle.v2.dataset
 import cPickle
 import glob
 
-__all__ = ['DATA_HOME', 'download', 'md5file', 'split', 'cluster_files_reader']
+__all__ = [
+    'DATA_HOME', 'download', 'md5file', 'split', 'cluster_files_reader',
+    'convert'
+]
 
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
 
-if not os.path.exists(DATA_HOME):
+# When running unit tests, multiple processes may try to create the
+# DATA_HOME directory simultaneously, so we cannot use an if condition
+# to check for the existence of the directory; instead, we use the
+# filesystem as the synchronization mechanism by catching returned
+# errors.
+try:
     os.makedirs(DATA_HOME)
+except OSError as exc:
+    if exc.errno != errno.EEXIST:
+        raise
+    pass
 
 
 def md5file(fname):
@@ -149,3 +162,57 @@ def cluster_files_reader(files_pattern,
                     yield line
 
     return reader
+
+
+def convert(output_path,
+            reader,
+            num_shards,
+            name_prefix,
+            max_lines_to_shuffle=1000):
+    """
+    Convert data from reader to recordio format files.
+
+    :param output_path: directory in which output files will be saved.
+    :param reader: a data reader, from which the convert program will read data instances.
+    :param num_shards: the number of shards that the dataset will be partitioned into.
+    :param name_prefix: the name prefix of generated files.
+    :param max_lines_to_shuffle: the maximum number of lines buffered and shuffled per write.
+    """
+    import recordio
+    import cPickle as pickle
+    import random
+
+    assert num_shards >= 1
+    assert max_lines_to_shuffle >= 1
+
+    def open_writers():
+        w = []
+        for i in range(0, num_shards):
+            n = "%s/%s-%05d-of-%05d" % (output_path, name_prefix, i,
+                                        num_shards - 1)
+            w.append(recordio.writer(n))
+
+        return w
+
+    def close_writers(w):
+        for i in range(0, num_shards):
+            w[i].close()
+
+    def write_data(w, lines):
+        random.shuffle(lines)
+        for i, d in enumerate(lines):
+            d = pickle.dumps(d, pickle.HIGHEST_PROTOCOL)
+            w[i % num_shards].write(d)
+
+    w = open_writers()
+    lines = []
+    try:
+        for d in reader():
+            lines.append(d)
+            # Flush as soon as one full shuffle window has been buffered.
+            if len(lines) >= max_lines_to_shuffle:
+                write_data(w, lines)
+                lines = []
+        write_data(w, lines)
+    finally:
+        close_writers(w)
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py
index 12d648bf6557ed6e437320e56a80294abac29f18..f8aae52e7c29d86c7da9c1da0dd1d093634d4567 100644
--- a/python/paddle/v2/dataset/conll05.py
+++ b/python/paddle/v2/dataset/conll05.py
@@ -23,9 +23,9 @@ to initialize SRL model.
 import tarfile
 import gzip
 import itertools
-from common import download
+import paddle.v2.dataset.common
 
-__all__ = ['test, get_dict', 'get_embedding']
+__all__ = ['test', 'get_dict', 'get_embedding', 'convert']
 
 DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
 DATA_MD5 = '387719152ae52d60422c016e92a742fc'
@@ -182,9 +182,15 @@ def get_dict():
     """
     Get the word, verb and label dictionary of Wikipedia corpus.
     """
-    word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
-    verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
-    label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
+    word_dict = load_dict(
+        paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st',
+                                          WORDDICT_MD5))
+    verb_dict = load_dict(
+        paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st',
+                                          VERBDICT_MD5))
+    label_dict = load_dict(
+        paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st',
+                                          TRGDICT_MD5))
     return word_dict, verb_dict, label_dict
 
 
@@ -192,7 +198,7 @@ def get_embedding():
     """
     Get the trained word vector based on Wikipedia corpus.
     """
-    return download(EMB_URL, 'conll05st', EMB_MD5)
+    return paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
 
 
 def test():
@@ -209,15 +215,23 @@ def test():
     """
     word_dict, verb_dict, label_dict = get_dict()
     reader = corpus_reader(
-        download(DATA_URL, 'conll05st', DATA_MD5),
+        paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5),
         words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
         props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
     return reader_creator(reader, word_dict, verb_dict, label_dict)
 
 
 def fetch():
-    download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
-    download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
-    download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
-    download(EMB_URL, 'conll05st', EMB_MD5)
-    download(DATA_URL, 'conll05st', DATA_MD5)
+    paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
+    paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
+    paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
+    paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
+    paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format; both files are built from test().
+    """
+    paddle.v2.dataset.common.convert(path, test(), 10, "conll05_train")
+    paddle.v2.dataset.common.convert(path, test(), 10, "conll05_test")
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index 5dc5abfe53d90ec3adc9a27a49ed086953146497..c0ec5992e0e6b0a2fd2359910d0f7a6c690c2ec3 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -28,7 +28,7 @@ import re
 import string
 import threading
 
-__all__ = ['build_dict', 'train', 'test']
+__all__ = ['build_dict', 'train', 'test', 'convert']
 
 URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
 MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
@@ -166,3 +166,12 @@ def word_dict():
 
 def fetch():
     paddle.v2.dataset.common.download(URL, 'imdb', MD5)
+
+
+def convert(path):
+    """Converts the train/test sets to recordio format, 10 shards each."""
+    # NOTE(review): common.convert iterates over reader(); confirm that the
+    # object returned by train(w)/test(w) is directly iterable.
+    w = word_dict()
+    for make, prefix in ((train, "imdb_train"), (test, "imdb_test")):
+        paddle.v2.dataset.common.convert(path, lambda: make(w), 10, prefix)
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
index dd3a4552d2e1a2b00dde5ddb7ac1d78445bdca51..b18ee8e9ba91e0e8ccf061223b3c0d4636442956 100644
--- a/python/paddle/v2/dataset/imikolov.py
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -22,7 +22,7 @@ import paddle.v2.dataset.common
 import collections
 import tarfile
 
-__all__ = ['train', 'test', 'build_dict']
+__all__ = ['train', 'test', 'build_dict', 'convert']
 
 URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
 MD5 = '30177ea32e27c525793142b6bf2c8e2d'
@@ -146,3 +146,15 @@ def test(word_idx, n, data_type=DataType.NGRAM):
 
 def fetch():
     paddle.v2.dataset.common.download(URL, "imikolov", MD5)
+
+
+def convert(path):
+    """
+    Converts the train and test sets to recordio format (10 shards each),
+    with n = 5 and the word dictionary returned by build_dict().
+    """
+    ngram_len = 5
+    vocab = build_dict()
+    to_recordio = paddle.v2.dataset.common.convert
+    to_recordio(path, train(vocab, ngram_len), 10, "imikolov_train")
+    to_recordio(path, test(vocab, ngram_len), 10, "imikolov_test")
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
index 435556b2921b7976bbc61160ce3812949981c9e7..ea5891f4f3f6ee1c5023cccee9732cbd9d78b881 100644
--- a/python/paddle/v2/dataset/mnist.py
+++ b/python/paddle/v2/dataset/mnist.py
@@ -21,7 +21,7 @@ import paddle.v2.dataset.common
 import subprocess
 import numpy
 import platform
-__all__ = ['train', 'test']
+__all__ = ['train', 'test', 'convert']
 
 URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/'
 TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
@@ -113,3 +113,11 @@ def fetch():
     paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
     paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
     paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
+
+
+def convert(path):
+    """
+    Converts the train and test sets to recordio format (10 shards each).
+    """
+    paddle.v2.dataset.common.convert(path, train(), 10, "mnist_train")
+    paddle.v2.dataset.common.convert(path, test(), 10, "mnist_test")
diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py
index 837a85912663826f0483aff4f6a38f3945375d82..d9372d422a3293eddeb7c0d5b7c8980f55c44690 100644
--- a/python/paddle/v2/dataset/movielens.py
+++ b/python/paddle/v2/dataset/movielens.py
@@ -23,14 +23,15 @@ set and test set into paddle reader creators.
 """
 
 import zipfile
-from common import download
+import paddle.v2.dataset.common
 import re
 import random
 import functools
 
 __all__ = [
     'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
-    'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info'
+    'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info',
+    'convert'
 ]
 
 age_table = [1, 18, 25, 35, 45, 50, 56]
@@ -99,7 +100,7 @@ USER_INFO = None
 
 
 def __initialize_meta_info__():
-    fn = download(URL, "movielens", MD5)
+    fn = paddle.v2.dataset.common.download(URL, "movielens", MD5)
     global MOVIE_INFO
     if MOVIE_INFO is None:
         pattern = re.compile(r'^(.*)\((\d+)\)$')
@@ -246,7 +247,15 @@ def unittest():
 
 
 def fetch():
-    download(URL, "movielens", MD5)
+    paddle.v2.dataset.common.download(URL, "movielens", MD5)
+
+
+def convert(path):
+    """Converts the train and test sets to recordio format, 10 shards each."""
+    to_recordio = paddle.v2.dataset.common.convert
+    # Each reader is created right before it is converted.
+    to_recordio(path, train(), 10, "movielens_train")
+    to_recordio(path, test(), 10, "movielens_test")
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py
index 4dd34e7383fe2a290fcf61474914183a383e2b9c..e33f120c8734621fd60497298d993e6e43bd06e0 100644
--- a/python/paddle/v2/dataset/sentiment.py
+++ b/python/paddle/v2/dataset/sentiment.py
@@ -26,9 +26,9 @@ from itertools import chain
 import nltk
 from nltk.corpus import movie_reviews
 
-import common
+import paddle.v2.dataset.common
 
-__all__ = ['train', 'test', 'get_word_dict']
+__all__ = ['train', 'test', 'get_word_dict', 'convert']
 NUM_TRAINING_INSTANCES = 1600
 NUM_TOTAL_INSTANCES = 2000
 
@@ -39,12 +39,13 @@ def download_data_if_not_yet():
     """
     try:
         # make sure that nltk can find the data
-        if common.DATA_HOME not in nltk.data.path:
-            nltk.data.path.append(common.DATA_HOME)
+        if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path:
+            nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME)
         movie_reviews.categories()
     except LookupError:
         print "Downloading movie_reviews data set, please wait....."
-        nltk.download('movie_reviews', download_dir=common.DATA_HOME)
+        nltk.download(
+            'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
         print "Download data set success....."
         print "Path is " + nltk.data.find('corpora/movie_reviews').path
 
@@ -128,4 +129,13 @@ def test():
 
 
 def fetch():
-    nltk.download('movie_reviews', download_dir=common.DATA_HOME)
+    nltk.download(
+        'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
+
+
+def convert(path):
+    """Converts the train and test sets to recordio format, 10 shards each."""
+    # train/test are passed uncalled: common.convert invokes the reader itself.
+    for creator, prefix in ((train, "sentiment_train"),
+                            (test, "sentiment_test")):
+        paddle.v2.dataset.common.convert(path, creator, 10, prefix)
diff --git a/python/paddle/v2/dataset/tests/common_test.py b/python/paddle/v2/dataset/tests/common_test.py
index f9815d4f9e1ee3bbe9ccf2dae588c51c262468c1..cfa194eba38ea70311c4deeac2635dc0a0103576 100644
--- a/python/paddle/v2/dataset/tests/common_test.py
+++ b/python/paddle/v2/dataset/tests/common_test.py
@@ -57,6 +57,38 @@ class TestCommon(unittest.TestCase):
         for idx, e in enumerate(reader()):
             self.assertEqual(e, str("0"))
 
+    def test_convert(self):
+        """convert() writes one file per shard and keeps every record."""
+        record_num = 10
+        num_shards = 4
+
+        def test_reader():
+            def reader():
+                for x in xrange(record_num):
+                    yield x
+
+            return reader
+
+        path = tempfile.mkdtemp()
+        paddle.v2.dataset.common.convert(path,
+                                         test_reader(), num_shards,
+                                         'random_images')
+
+        files = glob.glob(path + '/random_images-*')
+        self.assertEqual(len(files), num_shards)
+        recs = []
+        for i in range(0, num_shards):
+            n = "%s/random_images-%05d-of-%05d" % (path, i, num_shards - 1)
+            r = recordio.reader(n)
+            while True:
+                d = r.read()
+                if d is None:
+                    break
+                recs.append(d)
+            r.close()
+
+        self.assertEqual(len(recs), record_num)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
index 3469fd9ce12dd4d934004f90286979b73048a5c8..c715ea96819659c60215d61e5701ca565bb5d3ff 100644
--- a/python/paddle/v2/dataset/uci_housing.py
+++ b/python/paddle/v2/dataset/uci_housing.py
@@ -14,14 +14,14 @@
 """
 UCI Housing dataset.
 
-This module will download dataset from
+This module will download the dataset from
 https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
 parse training set and test set into paddle reader creators.
 """
 
 import numpy as np
 import os
-from common import download
+import paddle.v2.dataset.common
 
 __all__ = ['train', 'test']
 
@@ -29,7 +29,7 @@ URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing
 MD5 = 'd4accdce7a25600298819f8e28e8d593'
 feature_names = [
     'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
-    'PTRATIO', 'B', 'LSTAT'
+    'PTRATIO', 'B', 'LSTAT'
 ]
 
 UCI_TRAIN_DATA = None
@@ -82,7 +82,7 @@ def train():
     :rtype: callable
     """
     global UCI_TRAIN_DATA
-    load_data(download(URL, 'uci_housing', MD5))
+    load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5))
 
     def reader():
         for d in UCI_TRAIN_DATA:
@@ -102,7 +102,7 @@ def test():
     :rtype: callable
     """
     global UCI_TEST_DATA
-    load_data(download(URL, 'uci_housing', MD5))
+    load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5))
 
     def reader():
         for d in UCI_TEST_DATA:
@@ -112,4 +112,12 @@ def test():
 
 
 def fetch():
-    download(URL, 'uci_housing', MD5)
+    paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)
+
+
+def convert(path):
+    """
+    Converts the train and test sets to recordio format (10 shards each).
+    """
+    paddle.v2.dataset.common.convert(path, train(), 10, "uci_housing_train")
+    paddle.v2.dataset.common.convert(path, test(), 10, "uci_housing_test")
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
index 0902f87741c342b237439081703081b467dc6f35..e1dc4f4c30051202e8fd077087679c4fd6cbd7a0 100644
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -22,10 +22,10 @@ parse training set and test set into paddle reader creators.
 import tarfile
 import gzip
 
-from paddle.v2.dataset.common import download
+import paddle.v2.dataset.common
 from paddle.v2.parameters import Parameters
 
-__all__ = ['train', 'test', 'build_dict']
+__all__ = ['train', 'test', 'build_dict', 'convert']
 
 URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz'
 MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
@@ -115,7 +115,8 @@ def train(dict_size):
     :rtype: callable
     """
     return reader_creator(
-        download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'train/train', dict_size)
+        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        'train/train', dict_size)
 
 
 def test(dict_size):
@@ -130,16 +131,18 @@ def test(dict_size):
     :rtype: callable
     """
     return reader_creator(
-        download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'test/test', dict_size)
+        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        'test/test', dict_size)
 
 
 def gen(dict_size):
     return reader_creator(
-        download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'gen/gen', dict_size)
+        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        'gen/gen', dict_size)
 
 
 def model():
-    tar_file = download(URL_MODEL, 'wmt14', MD5_MODEL)
+    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
     with gzip.open(tar_file, 'r') as f:
         parameters = Parameters.from_tar(f)
     return parameters
@@ -148,7 +151,7 @@ def model():
 def get_dict(dict_size, reverse=True):
     # if reverse = False, return dict = {'a':'001', 'b':'002', ...}
     # else reverse = true, return dict = {'001':'a', '002':'b', ...}
-    tar_file = download(URL_TRAIN, 'wmt14', MD5_TRAIN)
+    tar_file = paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
     src_dict, trg_dict = __read_to_dict__(tar_file, dict_size)
     if reverse:
         src_dict = {v: k for k, v in src_dict.items()}
@@ -157,5 +160,14 @@ def get_dict(dict_size, reverse=True):
 
 
 def fetch():
-    download(URL_TRAIN, 'wmt14', MD5_TRAIN)
-    download(URL_MODEL, 'wmt14', MD5_MODEL)
+    paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
+    paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
+
+
+def convert(path):
+    """
+    Converts the train and test sets to recordio format (10 shards each),
+    building both readers with a dictionary size of 30000.
+    """
+    for creator, prefix in ((train, "wmt14_train"), (test, "wmt14_test")):
+        paddle.v2.dataset.common.convert(path, creator(30000), 10, prefix)
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
index aeed9ebd7d4d64efa5d0bf1638742a485c0fa44a..4ade1c6f329ae39769279963af6809f938807bdd 100644
--- a/python/paddle/v2/layer.py
+++ b/python/paddle/v2/layer.py
@@ -45,12 +45,12 @@ __all__ = ['data', 'parse_network']
 def __need_to_keep__(name):
     return name in [
         'StaticInput', 'SubsequenceInput', 'GeneratedInput', 'LayerType',
-        'layer_support'
+        'layer_support', 'BaseGeneratedInput'
     ]
 
 
 def __need_to_wrap__(name):
-    return name not in ['AggregateLevel', 'ExpandLevel']
+    return name not in ['AggregateLevel', 'ExpandLevel', 'BaseGeneratedInput']
 
 
 def __convert_name__(inname):
@@ -199,6 +199,15 @@ def __get_used_submodels__(layer_names):
     return submodel_names
 
 
+def __get_submodel_data_out_links__():
+    """Return the names of sub-model out-links that are 'data' layers."""
+    out_link_names = (link.link_name
+                      for submodel in cp.g_config.model_config.sub_models
+                      for link in submodel.out_links)
+    return set(name for name in out_link_names
+               if cp.g_layer_map[name].type == 'data')
+
+
 def __get_used_evaluators__(layer_names):
     evaluator_names = set()
     for e in cp.g_config.model_config.evaluators:
@@ -260,10 +269,11 @@ def parse_network(output_layers, extra_layers=None):
     else:
         extra_layers = []
 
-    layer_names = __get_used_layers__(output_layers + extra_layers)
+    layer_names = __get_used_layers__(list(output_layers) + list(extra_layers))
     submodel_names = __get_used_submodels__(layer_names)
     submodel_names.add('root')
     evaluator_names = __get_used_evaluators__(layer_names)
+    data_out_links = __get_submodel_data_out_links__()
     input_layer_names = set()
     output_layer_names = set()
 
@@ -279,7 +289,7 @@ def parse_network(output_layers, extra_layers=None):
             continue
         model_config.layers.extend([l])
         if l.type == 'data':
-            if l.name in model_config.output_layer_names:
+            if l.name in data_out_links:
                 """
                 In text generation, the outlink to save the generated word
                 indices is a data_layer defined in recurrent_group. This
diff --git a/python/paddle/v2/master/.gitignore b/python/paddle/v2/master/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..a3ac6e1a33e74631136fc95574532284db7cd7cd
--- /dev/null
+++ b/python/paddle/v2/master/.gitignore
@@ -0,0 +1,3 @@
+*.whl
+*.so
+*.pyc
diff --git a/python/paddle/v2/master/__init__.py b/python/paddle/v2/master/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8975b5d4a33cbecb4fa5a144bc610c36591d629
--- /dev/null
+++ b/python/paddle/v2/master/__init__.py
@@ -0,0 +1,3 @@
+from client import *
+
+__all__ = ['client']
diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py
new file mode 100644
index 0000000000000000000000000000000000000000..de8e9bb88e1064e41a80e0ef7838e307089a1331
--- /dev/null
+++ b/python/paddle/v2/master/client.py
@@ -0,0 +1,39 @@
+import ctypes
+import os
+
+path = os.path.join(os.path.dirname(__file__), "libpaddle_master.so")
+lib = ctypes.cdll.LoadLibrary(path)
+
+
+class client(object):
+    """
+    client is a client to the master server.
+    """
+
+    def __init__(self, addr, buf_size):
+        self.c = lib.paddle_new_master_client(addr, buf_size)
+
+    def close(self):
+        lib.paddle_release_master_client(self.c)
+        self.c = None
+
+    def set_dataset(self, paths):
+        """Hand the list of dataset file paths to the master client."""
+        holder_type = ctypes.c_char_p * len(paths)
+        holder = holder_type()
+        for idx, path in enumerate(paths):
+            # Store each path as a C string in the array given to the library.
+            holder[idx] = ctypes.c_char_p(path)
+        lib.paddle_set_dataset(self.c, holder, len(paths))
+
+    def next_record(self):
+        """Return the next record as a byte string, "" if it is empty."""
+        p = ctypes.c_char_p()
+        size = lib.paddle_next_record(self.c, ctypes.pointer(p))
+        if size == 0:
+            return ""
+        # Copy exactly `size` bytes: c_char_p.value stops at the first NUL,
+        # which would truncate binary (e.g. pickled) records.
+        record = ctypes.string_at(p, size)
+        lib.mem_free(p)  # Memory created from C should be freed.
+        return record
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
index 5e99d4a241b7fe2b0f9ff4ba191db4b341c4d30e..1ef2dceca910e806bddf17c95d1c345a144d9e31 100644
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -45,7 +45,12 @@ class Optimizer(object):
         return swig_api.ParameterUpdater.createRemoteUpdater(
             self.__opt_conf__, pass_num, use_sparse_updater)
 
-    def create_updater(self, is_local, num_passes, use_sparse_updater):
+    def __create_new_remote_updater__(self, pserver_spec):
+        return swig_api.ParameterUpdater.createNewRemoteUpdater(
+            self.__opt_conf__, pserver_spec)
+
+    def create_updater(self, is_local, num_passes, use_sparse_updater,
+                       pserver_spec):
         """
         create proper parameter_updater by configuration.
         :param is_local: create local or remote parameter updater
@@ -64,8 +69,12 @@ class Optimizer(object):
         if is_local:
             parameter_updater = self.__create_local_updater__()
         else:
-            parameter_updater = self.__create_remote_updater__(
-                num_passes, use_sparse_updater)
+            if pserver_spec is None:
+                parameter_updater = self.__create_remote_updater__(
+                    num_passes, use_sparse_updater)
+            else:
+                parameter_updater = self.__create_new_remote_updater__(
+                    pserver_spec)
         return parameter_updater
 
 
diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
index 64805d0c504b876f4d1f6657fe94457534a0b278..ad20241b98302f136326ae491c6723a6c12ae284 100644
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
@@ -1,6 +1,7 @@
 import numpy as np
 import py_paddle.swig_paddle as api
 from paddle.proto.ParameterConfig_pb2 import ParameterConfig
+import paddle.trainer.config_parser as cp
 import struct
 import tarfile
 import cStringIO
@@ -18,8 +19,11 @@ def create(layers):
     """
     topology = Topology(layers)
     pool = Parameters()
+    initializers = cp.g_parameter_initializer_map
     for param in topology.proto().parameters:
         pool.__append_config__(param)
+        if param.name in initializers:
+            pool[param.name] = initializers[param.name](param.name)
     return pool
 
 
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
index 07142056f872db5113acdd296b17c52b343c1be6..9f888b16d6b2fbf457ee4f4fe94fcb51b6f37fc9 100644
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py
@@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could be used in user
 program.
 """
 
-__all__ = ['np_array', 'text_file']
+__all__ = ['np_array', 'text_file', "recordio"]
 
 
 def np_array(x):
@@ -55,3 +55,24 @@ def text_file(path):
         f.close()
 
     return reader
+
+
+def recordio(path):
+    """
+    Creates a data reader that yields records one by one from a recordio file
+    :param path: path of recordio file
+    :returns: data reader of recordio file
+    """
+
+    import recordio as rec
+
+    def reader():
+        f = rec.reader(path)
+        try:
+            # read() returns None at end of file, which stops the iteration.
+            for r in iter(f.read, None):
+                yield r
+        finally:
+            f.close()
+
+    return reader
diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py
index c76faa596c9fb9079cab3456b721c18ef9768e95..e432003129d2b8dea60138d08f13ec5e9d29a7ad 100644
--- a/python/paddle/v2/reader/decorator.py
+++ b/python/paddle/v2/reader/decorator.py
@@ -230,7 +230,7 @@ class XmapEndSignal():
     pass
 
 
-def xmap_readers(mapper, reader, process_num, buffer_size):
+def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
     """
     Use multiprocess to map samples from reader by a mapper defined by user.
     And this function contains a buffered decorator.
@@ -242,12 +242,15 @@ def xmap_readers(mapper, reader, process_num, buffer_size):
     :type process_num: int
     :param buffer_size: max buffer size
     :type buffer_size: int
+    :param order: keep the order of reader
+    :type order: bool
     :return: the decarated reader
     :rtype: callable
     """
     end = XmapEndSignal()
     in_queue = Queue(buffer_size)
     out_queue = Queue(buffer_size)
+    out_order = [0]
 
     # define a worker to read samples from reader to in_queue
     def read_worker(reader, in_queue):
@@ -255,8 +258,17 @@ def xmap_readers(mapper, reader, process_num, buffer_size):
             in_queue.put(i)
         in_queue.put(end)
 
+    # define a worker to read samples from reader to in_queue with order flag
+    def order_read_worker(reader, in_queue):
+        in_order = 0
+        for i in reader():
+            in_queue.put((in_order, i))
+            in_order += 1
+        in_queue.put(end)
+
     # start a read worker in a thread
-    t = Thread(target=read_worker, args=(reader, in_queue))
+    target = order_read_worker if order else read_worker
+    t = Thread(target=target, args=(reader, in_queue))
     t.daemon = True
     t.start()
 
@@ -271,11 +283,28 @@ def xmap_readers(mapper, reader, process_num, buffer_size):
         in_queue.put(end)
         out_queue.put(end)
 
+    # define a worker to handle samples from in_queue by mapper
+    # and put mapped samples into out_queue by order
+    def order_handle_worker(in_queue, out_queue, mapper, out_order):
+        ins = in_queue.get()
+        while not isinstance(ins, XmapEndSignal):
+            order, sample = ins
+            r = mapper(sample)
+            while order != out_order[0]:
+                pass
+            out_queue.put(r)
+            out_order[0] += 1
+            ins = in_queue.get()
+        in_queue.put(end)
+        out_queue.put(end)
+
     # start several handle_workers
+    target = order_handle_worker if order else handle_worker
+    args = (in_queue, out_queue, mapper, out_order) if order else (
+        in_queue, out_queue, mapper)
     workers = []
     for i in xrange(process_num):
-        worker = Thread(
-            target=handle_worker, args=(in_queue, out_queue, mapper))
+        worker = Thread(target=target, args=args)
         worker.daemon = True
         workers.append(worker)
     for w in workers:
diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py
index 9f8d7133b8694aae5541eff9576eaba8a31e77dc..ba4f558874a0155d276fcb0e0d2d9258f0903f0e 100644
--- a/python/paddle/v2/reader/tests/creator_test.py
+++ b/python/paddle/v2/reader/tests/creator_test.py
@@ -13,9 +13,7 @@
 # limitations under the License.
 import os
 import unittest
-
 import numpy as np
-
 import paddle.v2.reader.creator
 
 
@@ -36,5 +34,14 @@ class TestTextFile(unittest.TestCase):
             self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1))
 
 
+class TestRecordIO(unittest.TestCase):
+    def test_recordio(self):
+        # Record number i of the fixture file must equal str(i).
+        here = os.path.dirname(__file__)
+        dat = os.path.join(here, "test_recordio_creator.dat")
+        for idx, r in enumerate(paddle.v2.reader.creator.recordio(dat)()):
+            self.assertSequenceEqual(r, str(idx))
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py
index 734154b9790a4dc118d11992343648364c907305..bb3c5d220b9ce1552d2fc429abb1863930cd4d17 100644
--- a/python/paddle/v2/reader/tests/decorator_test.py
+++ b/python/paddle/v2/reader/tests/decorator_test.py
@@ -121,5 +121,27 @@ class TestShuffle(unittest.TestCase):
             self.assertEqual(total, 10)
 
 
+class TestXmap(unittest.TestCase):
+    def test_xmap(self):
+        """xmap_readers must map every sample, whatever the configuration."""
+
+        mapper = lambda x: (x + 1)
+        # The same values serve as thread counts and as buffer sizes.
+        sizes = (1, 2, 4, 8, 16)
+        configs = [(order, tNum, size)
+                   for order in (True, False) for tNum in sizes
+                   for size in sizes]
+
+        for order, tNum, size in configs:
+            xmapped = paddle.v2.reader.xmap_readers(
+                mapper, reader_creator_10(0), tNum, size, order)
+            result = list(xmapped())
+            if not order:
+                # Only ordered runs are compared as-is; others are sorted.
+                result.sort()
+            expected = [mapper(idx) for idx in range(len(result))]
+            self.assertEqual(result, expected)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/reader/tests/test_recordio_creator.dat b/python/paddle/v2/reader/tests/test_recordio_creator.dat
new file mode 100644
index 0000000000000000000000000000000000000000..17aa89b6796184407e83246d3f342a55a66b4a69
Binary files /dev/null and b/python/paddle/v2/reader/tests/test_recordio_creator.dat differ
diff --git a/python/paddle/v2/tests/test_parameters.py b/python/paddle/v2/tests/test_parameters.py
index ebb182caab6430862a8e4da2ae4ea6b1e72f726c..45372e7dd0ec7cbdd6a2eb5c0397ef7e74284cd0 100644
--- a/python/paddle/v2/tests/test_parameters.py
+++ b/python/paddle/v2/tests/test_parameters.py
@@ -11,6 +11,9 @@ except ImportError:
     sys.exit(0)
 
 import paddle.v2.parameters as parameters
+import paddle.v2.data_type as data_type
+import paddle.v2.layer as layer
+from paddle.v2.attr import ParamAttr
 from paddle.proto.ParameterConfig_pb2 import ParameterConfig
 import random
 import cStringIO
@@ -55,6 +58,25 @@ class TestParameters(unittest.TestCase):
             p1 = params_dup.get(name)
             self.assertTrue(numpy.isclose(p0, p1).all())
 
+    def test_initializer(self):
+        """A ParamAttr initializer must supply the created parameter value."""
+        expected = numpy.array([[1, 1], [1, 2], [1, 1]], numpy.float32)
+
+        def initializer(name):
+            # All ones except for a single 2 at position [1, 1].
+            assert name == "fc.w"
+            weights = numpy.ones((3, 2), dtype=numpy.float32)
+            weights[1, 1] = 2
+            return weights
+
+        data = layer.data(name="x", type=data_type.dense_vector(3))
+        attr = ParamAttr(name="fc.w", initializer=initializer)
+        fc = layer.fc(data, size=2, bias_attr=False, param_attr=attr)
+        val = parameters.create(fc)["fc.w"]
+        assert val.shape == (3, 2)
+        # Element-wise comparison, flattened and AND-ed into a single bool.
+        assert numpy.logical_and.reduce(numpy.reshape(val == expected, 6))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index 8fdb67cc2688a67ed815af396b214e339195c73f..f9658a8c5df9562073c8a187074a6cb3459ac5d9 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -49,7 +49,8 @@ class SGD(object):
                  parameters,
                  update_equation,
                  extra_layers=None,
-                 is_local=True):
+                 is_local=True,
+                 pserver_spec=None):
 
         if not isinstance(parameters, v2_parameters.Parameters):
             raise TypeError('parameters should be parameters')
@@ -63,6 +64,7 @@ class SGD(object):
         self.__parameters__ = parameters
         self.__topology_in_proto__ = topology.proto()
         self.__is_local__ = is_local
+        self.__pserver_spec__ = pserver_spec
 
         self.__use_sparse_updater__ = self.__topology__.use_sparse_updater()
         # # In local mode, disable sparse_remote_update.
@@ -126,7 +128,8 @@ class SGD(object):
         __check_train_args__(**locals())
 
         self.__parameter_updater__ = self.__optimizer__.create_updater(
-            self.__is_local__, num_passes, self.__use_sparse_updater__)
+            self.__is_local__, num_passes, self.__use_sparse_updater__,
+            self.__pserver_spec__)
         self.__parameter_updater__.init(self.__gradient_machine__)
 
         self.__gradient_machine__.start()
diff --git a/python/setup.py.in b/python/setup.py.in
index 93724f918801ea706517a1df158ceb78a1c2335c..86fc0fc5c0318b03659bf84f8ad9e2a114467c74 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -1,6 +1,5 @@
 from setuptools import setup
 
-
 packages=['paddle',
           'paddle.proto',
           'paddle.trainer',
@@ -14,6 +13,7 @@ packages=['paddle',
 setup_requires=["requests",
                 "numpy",
                 "protobuf==3.1",
+                "recordio",
                 "matplotlib",
                 "rarfile"]
 
@@ -27,5 +27,5 @@ setup(name='paddle',
       packages=packages,
       package_dir={
           '': '${CMAKE_CURRENT_SOURCE_DIR}'
-      }
+      },
 )