Commit 2e02952b authored by: H hedaoyuan

Merge branch 'develop' of https://github.com/baidu/Paddle into nnpack_lib

language: cpp
cache:
directories:
- $HOME/third_party
- $HOME/.ccache
- $HOME/.cache/pip
- $TRAVIS_BUILD_DIR/build/third_party
sudo: required
dist: trusty
os:
- linux
env:
- JOB=DOCS
- JOB=BUILD_AND_TEST
- JOB=PRE_COMMIT
- JOB=build_doc
- JOB=check_style
addons:
apt:
packages:
- gcc-4.8
- g++-4.8
- gfortran-4.8
- git
- build-essential
- python
......@@ -34,27 +32,17 @@ addons:
- libtool
- ccache
before_install:
- |
if [ ${JOB} == "BUILD_AND_TEST" ]; then
change_list=`git diff --name-only $TRAVIS_COMMIT_RANGE`
if [ $? -eq 0 ]; then # if git diff fails (returns non-zero), skip the check and run the tests anyway.
if ! echo ${change_list} | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)'
then
echo "Only markdown docs were updated, stopping build process."
exit
fi
fi
fi
- if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
- if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
# Paddle currently uses protobuf 3.1. Protobuf 3.2 breaks compatibility, so we pin the Python
# protobuf version.
- pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
- pip install rarfile
- eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
- |
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:
- |
timeout 2580 paddle/scripts/travis/main.sh # 43min timeout
- |
timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
notifications:
email:
......
......@@ -47,6 +47,7 @@ option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
......@@ -70,7 +71,7 @@ if(ANDROID)
"Disable RDMA when cross-compiling for Android" FORCE)
endif(ANDROID)
set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING
set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
"A path setting third party libraries download & build directories.")
if (WITH_C_API AND WITH_PYTHON)
......@@ -107,6 +108,7 @@ include(configure) # add paddle env configuration
include_directories("${PROJ_ROOT}")
include_directories("${PROJ_ROOT}/paddle/cuda/include")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/cclient")
set(EXTERNAL_LIBS
${GFLAGS_LIBRARIES}
......@@ -128,6 +130,11 @@ add_subdirectory(proto)
add_subdirectory(paddle)
add_subdirectory(python)
if(WITH_GOLANG)
#TODO (add go/master/c back when fixed)
add_subdirectory(go/pserver/cclient)
endif(WITH_GOLANG)
if(WITH_DOC)
add_subdirectory(doc)
endif()
......@@ -25,7 +25,7 @@ COPY ./paddle/scripts/docker/root/ /root/
RUN apt-get update && \
apt-get install -y \
git python-pip python-dev openssh-server bison \
wget unzip tar xz-utils bzip2 gzip coreutils \
wget unzip tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-numpy python-matplotlib gcc g++ \
automake locales clang-format-3.8 swig doxygen cmake \
......
......@@ -40,6 +40,10 @@ if(NOT CMAKE_CROSSCOMPILING)
endif()
endif()
if(NOT WITH_GOLANG)
add_definitions(-DPADDLE_WITHOUT_GOLANG)
endif(NOT WITH_GOLANG)
if(NOT WITH_GPU)
add_definitions(-DPADDLE_ONLY_CPU)
add_definitions(-DHPPL_STUB_FUNC)
......
......@@ -21,7 +21,8 @@ IF(NOT ${CBLAS_FOUND})
SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/${LIBRARY_PREFIX}openblas${STATIC_LIBRARY_SUFFIX}"
SET(CBLAS_LIBRARIES
"${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
CACHE FILEPATH "openblas library." FORCE)
SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs)
......
......@@ -14,11 +14,41 @@
INCLUDE(ExternalProject)
# Print and set the protobuf library information,
# finish this cmake process and exit from this file.
macro(PROMPT_PROTOBUF_LIB)
SET(protobuf_DEPS ${ARGN})
MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}")
MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}")
MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}")
INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
# Assuming that all the protobuf libraries are of the same type.
IF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_STATIC_LIBRARY_SUFFIX}$")
SET(protobuf_LIBTYPE STATIC)
ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$")
SET(protobuf_LIBTYPE SHARED)
ELSE()
MESSAGE(FATAL_ERROR "Unknown library type: ${PROTOBUF_LIBRARY}")
ENDIF()
ADD_LIBRARY(protobuf ${protobuf_LIBTYPE} IMPORTED GLOBAL)
SET_PROPERTY(TARGET protobuf PROPERTY IMPORTED_LOCATION ${PROTOBUF_LIBRARY})
ADD_LIBRARY(protobuf_lite ${protobuf_LIBTYPE} IMPORTED GLOBAL)
SET_PROPERTY(TARGET protobuf_lite PROPERTY IMPORTED_LOCATION ${PROTOBUF_LITE_LIBRARY})
ADD_LIBRARY(protoc ${protobuf_LIBTYPE} IMPORTED GLOBAL)
SET_PROPERTY(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOC_LIBRARY})
FOREACH(dep ${protobuf_DEPS})
ADD_DEPENDENCIES(protobuf ${dep})
ADD_DEPENDENCIES(protobuf_lite ${dep})
ADD_DEPENDENCIES(protoc ${dep})
ENDFOREACH()
LIST(APPEND external_project_dependencies protobuf)
RETURN()
endmacro()
macro(SET_PROTOBUF_VERSION)
......@@ -43,22 +73,23 @@ if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
endif()
FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_NAME})
SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_NAME})
STRING(REPLACE "extern_" "" TARGET_DIR_NAME "${TARGET_NAME}")
SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME})
SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_DIR_NAME})
SET(${TARGET_NAME}_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE)
SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE)
SET(${TARGET_NAME}_LITE_LIBRARY
"${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${STATIC_LIBRARY_SUFFIX}"
"${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}"
PARENT_SCOPE)
SET(${TARGET_NAME}_LIBRARY
"${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${STATIC_LIBRARY_SUFFIX}"
"${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}"
PARENT_SCOPE)
SET(${TARGET_NAME}_PROTOC_LIBRARY
"${PROTOBUF_INSTALL_DIR}/lib/libprotoc${STATIC_LIBRARY_SUFFIX}"
"${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}"
PARENT_SCOPE)
SET(${TARGET_NAME}_PROTOC_EXECUTABLE
"${PROTOBUF_INSTALL_DIR}/bin/protoc${EXECUTABLE_SUFFIX}"
"${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}"
PARENT_SCOPE)
SET(OPTIONAL_CACHE_ARGS "")
......@@ -109,6 +140,8 @@ IF(NOT CMAKE_CROSSCOMPILING)
SET_PROTOBUF_VERSION()
IF("${PROTOBUF_VERSION}" VERSION_LESS "3.1.0")
SET(PROTOBUF_FOUND OFF)
ELSE()
PROMPT_PROTOBUF_LIB()
ENDIF()
ENDIF(PROTOBUF_FOUND)
ELSE()
......@@ -120,18 +153,22 @@ ELSE()
ENDIF()
IF(NOT PROTOBUF_FOUND)
build_protobuf(protobuf FALSE)
LIST(APPEND external_project_dependencies protobuf)
build_protobuf(extern_protobuf FALSE)
SET(PROTOBUF_INCLUDE_DIR ${protobuf_INCLUDE_DIR}
SET(PROTOBUF_INCLUDE_DIR ${extern_protobuf_INCLUDE_DIR}
CACHE PATH "protobuf include directory." FORCE)
IF(NOT CMAKE_CROSSCOMPILING)
SET(PROTOBUF_PROTOC_EXECUTABLE ${protobuf_PROTOC_EXECUTABLE}
SET(PROTOBUF_LITE_LIBRARY ${extern_protobuf_LITE_LIBRARY}
CACHE FILEPATH "protobuf lite library." FORCE)
SET(PROTOBUF_LIBRARY ${extern_protobuf_LIBRARY}
CACHE FILEPATH "protobuf library." FORCE)
SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY}
CACHE FILEPATH "protoc library." FORCE)
IF(CMAKE_CROSSCOMPILING)
PROMPT_PROTOBUF_LIB(protobuf_host extern_protobuf)
ELSE()
SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE}
CACHE FILEPATH "protobuf executable." FORCE)
PROMPT_PROTOBUF_LIB(extern_protobuf)
ENDIF()
SET(PROTOBUF_LITE_LIBRARY ${protobuf_LITE_LIBRARY} CACHE FILEPATH "protobuf lite library." FORCE)
SET(PROTOBUF_LIBRARY ${protobuf_LIBRARY} CACHE FILEPATH "protobuf library." FORCE)
SET(PROTOBUF_PROTOC_LIBRARY ${protobuf_PROTOC_LIBRARY} CACHE FILEPATH "protoc library." FORCE)
ENDIF(NOT PROTOBUF_FOUND)
PROMPT_PROTOBUF_LIB()
\ No newline at end of file
......@@ -11,22 +11,80 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# To simplify the build process of PaddlePaddle, we define a couple of
# fundamental abstractions, e.g., how to build a library, a binary, and a
# test in C++, CUDA and Go.
# generic.cmake defines CMake functions that look like Bazel's
# build rules (https://bazel.build/).
#
#
# -------------------------------------------
# C++ CUDA C++ Go
# -------------------------------------------
# cc_library nv_library go_library
# cc_binary nv_binary go_binary
# cc_test nv_test go_test
# -------------------------------------------
#
# To build a static library example.a from example.cc using the system
# compiler (like GCC):
#
# cc_library(example SRCS example.cc)
#
# To build a static library example.a from multiple source files
# example{1,2,3}.cc:
#
# cc_library(example SRCS example1.cc example2.cc example3.cc)
#
# To build a shared library example.so from example.cc:
#
# cc_library(example SHARED SRCS example.cc)
#
# To build a library using Nvidia's NVCC from .cu file(s), use the nv_
# prefixed version:
#
# nv_library(example SRCS example.cu)
#
# To specify that a library new_example.a depends on other libraries:
#
# cc_library(new_example SRCS new_example.cc DEPS example)
#
# Static libraries can be composed of other static libraries:
#
# cc_library(composed DEPS dependent1 dependent2 dependent3)
#
# To build an executable binary file from some source files and
# dependent libraries:
#
# cc_binary(example SRCS main.cc something.cc DEPS example1 example2)
#
# To build an executable binary file using NVCC, use the nv_ prefixed
# version:
#
# nv_binary(example SRCS main.cc something.cu DEPS example1 example2)
#
# To build a unit test binary, which is an executable binary with
# GoogleTest linked:
#
# cc_test(example_test SRCS example_test.cc DEPS example)
#
# To build a unit test binary using NVCC, use the nv_ prefixed version:
#
# nv_test(example_test SRCS example_test.cu DEPS example)
#
# Quite often, executable and test binaries depend on
# pre-defined external libraries like glog and gflags defined in
# /cmake/external/*.cmake:
#
# cc_test(example_test SRCS example_test.cc DEPS example glog gflags)
#
# To build a go static library using Golang, use the go_ prefixed version:
#
# go_library(example STATIC)
#
# To build a go shared library using Golang, use the go_ prefixed version:
#
# cmake_parse_arguments can help us to achieve this goal.
# https://cmake.org/cmake/help/v3.0/module/CMakeParseArguments.html
# go_library(example SHARED)
#
if(NOT APPLE)
......@@ -34,33 +92,92 @@ if(NOT APPLE)
link_libraries(${CMAKE_THREAD_LIBS_INIT})
endif(NOT APPLE)
# cc_library parses tensor.cc and figures out that the target also depends on tensor.h.
# cc_library(tensor
# SRCS
# tensor.cc
# DEPS
# variant)
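# merge_static_libs merges several static library targets into one archive,
# e.g. (hypothetical target names):
#   merge_static_libs(paddle_merged paddle_math paddle_utils)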
function(merge_static_libs TARGET_NAME)
set(libs ${ARGN})
list(REMOVE_DUPLICATES libs)
# First get the file names of the libraries to be merged
foreach(lib ${libs})
get_target_property(libtype ${lib} TYPE)
if(NOT libtype STREQUAL "STATIC_LIBRARY")
message(FATAL_ERROR "merge_static_libs can only process static libraries")
endif()
set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
endforeach()
if(APPLE) # Use OSX's libtool to merge archives
add_custom_target(${TARGET_NAME}_archive
COMMAND libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
DEPENDS ${libs}
)
add_library(${TARGET_NAME} STATIC IMPORTED GLOBAL)
set_property(TARGET ${TARGET_NAME} PROPERTY
IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a")
add_dependencies(${TARGET_NAME} ${TARGET_NAME}_archive)
else() # general UNIX: use "ar" to extract objects and re-add to a common lib
foreach(lib ${libs})
set(objlistfile ${lib}.objlist) # list of objects in the input library
set(objdir ${lib}.objdir)
add_custom_command(OUTPUT ${objdir}
COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir})
add_custom_command(OUTPUT ${objlistfile}
COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ../${objlistfile}
DEPENDS ${lib} ${objdir}
WORKING_DIRECTORY ${objdir})
# Empty dummy source file that goes into merged library
set(mergebase ${lib}.mergebase.c)
add_custom_command(OUTPUT ${mergebase}
COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
DEPENDS ${objlistfile})
list(APPEND mergebases "${mergebase}")
endforeach()
# We need a target for the output merged library
add_library(${TARGET_NAME} STATIC ${mergebases})
set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
foreach(lib ${libs})
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND ${CMAKE_AR} ru ${outlibfile} @"../${objlistfile}"
WORKING_DIRECTORY ${objdir})
endforeach()
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND ${CMAKE_RANLIB} ${outlibfile})
endif()
endfunction(merge_static_libs)
function(cc_library TARGET_NAME)
set(options OPTIONAL)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if (${cc_library_OPTIONAL} STREQUAL "SHARED")
add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
else()
add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
endif()
if (cc_library_DEPS)
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
endif()
if (cc_library_SRCS)
if (cc_library_SHARED OR cc_library_shared) # build *.so
add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
else()
add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
endif()
if (cc_library_DEPS)
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
endif()
else(cc_library_SRCS)
if (cc_library_DEPS)
merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
else()
message(FATAL "Please specify source file or library in cc_library.")
endif()
endif(cc_library_SRCS)
endfunction(cc_library)
# cc_binary parses tensor.cc and figures out that the target also depends on tensor.h.
# cc_binary(tensor
# SRCS
# tensor.cc)
function(cc_binary TARGET_NAME)
set(options OPTIONAL)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(cc_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -71,13 +188,6 @@ function(cc_binary TARGET_NAME)
endif()
endfunction(cc_binary)
# The dependency to target tensor implies that if any of
# tensor{.h,.cc,_test.cc} is changed, tensor_test needs to be re-built.
# cc_test(tensor_test
# SRCS
# tensor_test.cc
# DEPS
# tensor)
function(cc_test TARGET_NAME)
if(WITH_TESTING)
set(options "")
......@@ -91,28 +201,28 @@ function(cc_test TARGET_NAME)
endif()
endfunction(cc_test)
# Suppose that ops.cu includes global functions that take Tensor as
# their parameters, so ops depend on tensor. This implies that if
# any of tensor.{h.cc}, ops.{h,cu} is changed, ops need to be re-built.
# nv_library(ops
# SRCS
# ops.cu
# DEPS
# tensor)
function(nv_library TARGET_NAME)
if (WITH_GPU)
set(options OPTIONAL)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if (${nv_library_OPTIONAL} STREQUAL "SHARED")
cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
else()
cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
endif()
if (nv_library_DEPS)
add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
endif()
if(nv_library_SRCS)
if (nv_library_SHARED OR nv_library_shared) # build *.so
cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
else()
cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
endif()
if (nv_library_DEPS)
add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
endif()
else(nv_library_SRCS)
if (nv_library_DEPS)
merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})
else()
message(FATAL "Please specify source file or library in nv_library.")
endif()
endif(nv_library_SRCS)
endif()
endfunction(nv_library)
......@@ -130,13 +240,6 @@ function(nv_binary TARGET_NAME)
endif()
endfunction(nv_binary)
# The dependency to target tensor implies that if any of
# ops{.h,.cu,_test.cu} is changed, ops_test needs to be re-built.
# nv_test(ops_test
# SRCS
# ops_test.cu
# DEPS
# ops)
function(nv_test TARGET_NAME)
if (WITH_GPU AND WITH_TESTING)
set(options "")
......@@ -152,42 +255,53 @@ endfunction(nv_test)
set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
file(MAKE_DIRECTORY ${GOPATH})
set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle/Paddle")
# Because api.go defines a Go wrapper to ops and tensor, it depends on
# both. This implies that if any of tensor.{h,cc}, ops.{h,cu}, or
# api.go is changed, api needs to be re-built.
# go_library(api
# SRCS
# api.go
# DEPS
# tensor # Because ops depends on tensor, this line is optional.
# ops)
function(go_library TARGET_NAME)
set(options OPTIONAL)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
set(multiValueArgs DEPS)
cmake_parse_arguments(go_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if (${go_library_OPTIONAL} STREQUAL "SHARED")
if (go_library_SHARED OR go_library_shared)
set(BUILD_MODE "-buildmode=c-shared")
if(APPLE)
set(LIB_NAME "lib${TARGET_NAME}.dylib")
else()
set(LIB_NAME "lib${TARGET_NAME}.so")
endif()
set(LIB_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}")
else()
set(BUILD_MODE "-buildmode=c-archive")
set(LIB_NAME "lib${TARGET_NAME}.a")
set(LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}")
endif()
add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
# Add a dummy source file so that `make target_name` works from the command line.
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
if (go_library_SHARED OR go_library_shared)
add_library(${TARGET_NAME} SHARED ${dummyfile})
else()
add_library(${TARGET_NAME} STATIC ${dummyfile})
endif()
if(go_library_DEPS)
add_dependencies(${TARGET_NAME} ${go_library_DEPS})
endif(go_library_DEPS)
# we need to symlink Paddle directory into GOPATH. If we
# don't do it and we have code that depends on Paddle, go
# get ./... will download a new Paddle repo from Github,
# without the changes in our current Paddle repo that we
# want to build.
file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
# Symlink Paddle directory into GOPATH
COMMAND mkdir -p ${PADDLE_IN_GOPATH}
COMMAND rm -rf ${PADDLE_IN_GOPATH}
COMMAND ln -sf ${CMAKE_SOURCE_DIR} ${PADDLE_IN_GOPATH}
# Automatically get all dependencies specified in the source code
COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get -d ./...
# Build the Go source code
COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
-o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
${go_library_SRCS}
${GO_SOURCE}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
add_custom_target(${TARGET_NAME}_lib ALL DEPENDS ${TARGET_NAME}_timestamp ${go_library_DEPS})
add_library(${TARGET_NAME} STATIC IMPORTED)
set_property(TARGET ${TARGET_NAME} PROPERTY
IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}")
add_dependencies(${TARGET_NAME} ${TARGET_NAME}_lib)
endfunction(go_library)
function(go_binary TARGET_NAME)
......@@ -217,10 +331,3 @@ function(go_test TARGET_NAME)
add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_test_DEPS})
add_test(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME})
endfunction(go_test)
# go_extern will download extern go project.
# go_extern(target_name extern_source)
# go_extern(go_redis github.com/hoisie/redis)
function(go_extern TARGET_NAME)
add_custom_target(${TARGET_NAME} env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get ${ARGN})
endfunction(go_extern)
......@@ -33,6 +33,7 @@ ELSE(WIN32)
SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING
"Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.")
ENDIF()
set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
ELSE(APPLE)
IF(EXISTS "/etc/issue")
......@@ -84,24 +85,6 @@ IF(DEFINED CMAKE_SYSTEM_NAME)
ENDIF()
ENDIF()
# prefix and suffix on different os
IF(WIN32)
SET(LIBRARY_PREFIX "")
SET(SHARED_LIBRARY_SUFFIX ".dll")
SET(STATIC_LIBRARY_SUFFIX ".lib")
SET(EXECUTABLE_SUFFIX ".exe")
ELSE(WIN32)
SET(LIBRARY_PREFIX "lib")
IF(APPLE)
SET(SHARED_LIBRARY_SUFFIX ".dylib")
ELSE(APPLE)
SET(SHARED_LIBRARY_SUFFIX ".so")
ENDIF(APPLE)
SET(STATIC_LIBRARY_SUFFIX ".a")
SET(EXECUTABLE_SUFFIX "")
ENDIF(WIN32)
# external dependencies log output
SET(EXTERNAL_PROJECT_LOG_ARGS
LOG_DOWNLOAD 0 # Wrap download in script to log output
......
......@@ -84,6 +84,7 @@ function(link_paddle_exe TARGET_NAME)
paddle_parameter
paddle_proto
paddle_cuda
paddle_optimizer
${EXTERNAL_LIBS}
${CMAKE_THREAD_LIBS_INIT}
${CMAKE_DL_LIBS}
......
......@@ -99,3 +99,12 @@ value_printer
.. automodule:: paddle.v2.evaluator
:members: value_printer
:noindex:
Detection
=========
detection_map
-------------
.. automodule:: paddle.v2.evaluator
:members: detection_map
:noindex:
......@@ -74,14 +74,25 @@ typedef enum {
typedef struct {
char* name;
paddle_element_type element_type;
void* content;
unsigned char* content;
int content_len;
} paddle_parameter, paddle_gradient;
typedef struct paddle_pserver_client paddle_pserver_client;
typedef int paddle_pserver_client;
paddle_pserver_client* paddle_new_pserver_client();
void paddle_pserver_client_release(paddle_pserver_client* client);
/**
* @brief creates a pserver client that talks to etcd for coordination.
*/
paddle_pserver_client paddle_new_etcd_pserver_client(char* etcd_addr);
/**
* @brief creates a pserver client given pserver addresses.
*
* @param pserver_addrs comma-separated pserver addresses.
* @param selected whether the current pserver client is selected to initialize all parameter servers.
*/
paddle_pserver_client paddle_new_pserver_client(char* pserver_addrs, int selected);
void paddle_pserver_client_release(paddle_pserver_client c);
/**
* @brief paddle_begin_init_params begins to initialize parameters on
......@@ -95,7 +106,7 @@ void paddle_pserver_client_release(paddle_pserver_client* client);
* @return 1 if the trainer is selected to initialize parameter
* servers, otherwise 0.
*/
int paddle_begin_init_params(paddle_pserver_client* client);
int paddle_begin_init_params(paddle_pserver_client client);
/**
* @brief paddle_init_param initializes the parameter on parameter
......@@ -109,7 +120,7 @@ int paddle_begin_init_params(paddle_pserver_client* client);
* @paddle_begin_init_param). Or simply exit the program and wait for
* the cluster management system to restart the trainer.
*/
int paddle_init_param(paddle_pserver_client* client, paddle_parameter param, const unsigned char* param_config_proto, int config_len);
int paddle_init_param(paddle_pserver_client client, paddle_parameter param, const unsigned char* param_config_proto, int config_len);
/**
* @brief paddle_finish_init_params tells parameter servers client has
......@@ -120,7 +131,7 @@ int paddle_init_param(paddle_pserver_client* client, paddle_parameter param, con
* @paddle_begin_init_param). Or simply exit the program and wait for
* the cluster management system to restart the trainer.
*/
int paddle_finish_init_params(paddle_pserver_client* client);
int paddle_finish_init_params(paddle_pserver_client client);
/**
* @brief paddle_send_grads sends gradients to parameter servers for
......@@ -131,7 +142,7 @@ int paddle_finish_init_params(paddle_pserver_client* client);
* @param learning_rate the learning rate for the gradients.
* @return 0 if successful, otherwise -1.
*/
int paddle_send_grads(paddle_pserver_client* client, const paddle_gradient* grads, int len);
int paddle_send_grads(paddle_pserver_client client, const paddle_gradient* grads, int len);
/**
* @brief paddle_get_params gets parameters from parameter servers.
......@@ -139,13 +150,15 @@ int paddle_send_grads(paddle_pserver_client* client, const paddle_gradient* grad
* paddle_get_params will block until parameters are initialized on
* the parameter servers.
*
* @param names the array of names of the parameters to get.
* @param dst the destination array of parameters to save to.
* @param dst the destination array of parameter pointers to save to.
* The parameter pointer must be pre-populated with the required parameter name,
* and the parameter content must be pre-allocated to the size of the corresponding
* parameter on the pserver.
* @param len the length of the names array and the paddle_parameter
* array.
* @return 0 if successful, otherwise -1.
*/
int paddle_get_params(paddle_pserver_client* client, const char** names, paddle_parameter* dst, int len);
int paddle_get_params(paddle_pserver_client client, paddle_parameter** dst, int len);
/**
* @brief paddle_save_model indicates parameters to save the parameter
......@@ -154,5 +167,5 @@ int paddle_get_params(paddle_pserver_client* client, const char** names, paddle_
* @param path the path to save parameters.
* @return 0 if successful, otherwise -1.
*/
int paddle_save_model(paddle_pserver_client* client, const char* path);
int paddle_save_model(paddle_pserver_client client, const char* path);
```
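To make the calling convention concrete, the following is a minimal, hypothetical sketch of how a trainer might drive this client for one pass. The helper name `train_one_pass` and the error-handling policy are illustrative assumptions, not part of the proposed API.

```cpp
// Illustrative sketch only; assumes the declarations above are visible.
int train_one_pass(paddle_pserver_client client,
                   paddle_parameter param,
                   const unsigned char* param_config, int config_len,
                   const paddle_gradient* grads, int grad_count) {
  // Exactly one trainer is selected to initialize the parameter servers.
  if (paddle_begin_init_params(client)) {
    if (paddle_init_param(client, param, param_config, config_len) != 0) return -1;
    if (paddle_finish_init_params(client) != 0) return -1;
  }

  // Push local gradients; the parameter servers apply the update.
  if (paddle_send_grads(client, grads, grad_count) != 0) return -1;

  // Pull the updated parameter back. Each destination entry must be
  // pre-populated with the parameter name and a pre-allocated content buffer.
  paddle_parameter* dst[1] = {&param};
  return paddle_get_params(client, dst, 1);
}
```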
# Design Doc: Remote Parameter Updater for Cluster Train
For an overview of distributed training, please refer to the [distributed training design doc](README.md). In this design doc, we discuss the parameter updater that uses the parameter server cclient ([The Client Library of Parameter Server Design Doc](pserver_client.md)) to manage and update parameters.
## Parameter Updater
The parameter updater is used by the trainer to manage and update parameters. There are mainly two kinds of parameter updaters: local and remote. Since this design is for cluster training, we only discuss the remote parameter updater here.
### Remote Parameter Updater
The remote parameter updater manages parameters on remote parameter servers through the client that communicates with the pservers ([The Client Library of Parameter Server Design Doc](pserver_client.md)).
In the PaddlePaddle Python V2 API, the trainer is implemented in Python; it holds an instance of the parameter updater and calls its functions directly. In this design, we will also expose the API of RemoteParameterUpdater to Python with SWIG.
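As a purely illustrative sketch (the concrete interface design is TBD below), the relationship could look roughly as follows; the class shape and method names are assumptions, not the proposed API:

```cpp
// Illustrative only: the trainer owns one updater and calls it per batch.
class RemoteParameterUpdater {
 public:
  explicit RemoteParameterUpdater(paddle_pserver_client client)
      : client_(client) {}

  // Called after backward(): push gradients, then pull the updated
  // parameters through the pserver cclient
  // (paddle_send_grads / paddle_get_params).
  void finishBatch();

 private:
  paddle_pserver_client client_;  // handle from paddle_new_pserver_client()
};
```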
#### Sparse Remote Parameter Updater
Since we will only implement dense parameter management for now, the mechanism for sparse parameters will be discussed in the next stage.
### Interface Design
TBD
# Design of Scope in Paddle
## Overview
Scope is an important concept in programming languages: it defines a program region in which a set of bindings between names and entities applies. In a specific scope, a valid name is uniquely associated with an entity, such as a variable. In another scope, the same name may refer to another entity or to nothing at all. A scope clearly restricts the visibility and validity of names in a program. Hence **Scope** is introduced to PaddlePaddle to manage variables in context. But different from the original abstract concept, Scope here becomes an object with two important attributes:
- Scope is an association of a name to a variable.
- Variables in a parent scope can be retrieved from local scope.
A detailed explanation of these two attributes follows.
## Scope is an association of a name to a variable.
Scope is an association of a name to a variable. All variables belong to a `Scope`. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. One net can run in different scopes and update different variables in each scope.
1. Scope only contains a map from name to variable.
All parameters, data, and states in a Net should be variables stored inside a scope. Each op should get its inputs and outputs from a scope, e.g., data buffers, state (momentum), etc.
1. A variable can only be created by a Scope, and a variable can only be retrieved from a Scope. Users cannot create or get a variable outside a scope. This is a constraint of our framework that keeps it simple and clear.
1. Scope only contains methods to Create and Get variables. A Scope does not contain operators and has no information about how to run them.
`Net` is designed to drive the computation and Scope only contains a map of variables. There is no computation logic inside a `Scope`. Scope just handles the lifetime management of variables.
- `Create` is used to create a Variable by its name and add the mapping relation.
- `Get` is used to find a Variable by name.
1. Every variable belongs to exactly one Scope.
A variable cannot belong to many scopes. If you want to use variables from a parent scope, you can go through the `parent scope`.
1. A Scope should destruct all Variables inside it when it is destructed itself. Users should never store a `Variable` pointer elsewhere.
Because a Variable can only be obtained from a Scope, when a Scope is destroyed we also need to destroy all the Variables in it. If a user stores a `Variable` pointer in a private data member or some global variable, the pointer becomes invalid once the associated `Scope` is destroyed.
```cpp
class Scope {
public:
Variable* CreateVariable(const std::string& name);
const Variable* GetVariable(const std::string& name) const;
private:
std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
};
```
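The following is a minimal usage sketch of the interface above; the `Net` type and the variable names are illustrative assumptions, not part of this design.

```cpp
// Sketch only: create the variables a net needs, then run it in the scope.
void SetUpAndRun(Net* net) {
  Scope scope;
  scope.CreateVariable("W");         // parameter
  scope.CreateVariable("momentum");  // optimizer state also lives in the scope
  net->Run(&scope);                  // ops look variables up by name
  const Variable* out = scope.GetVariable("output");
  // `out` is nullptr if no op created a variable named "output".
  (void)out;
}
```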
## Parent scope and local scope
Just like [scope](https://en.wikipedia.org/wiki/Scope_(computer_science)) in programming languages, a `Scope` in the neural network can also be a local scope. There are two attributes of a local scope.
1. We can create local variables in a local scope. When that local scope is destroyed, all local variables should also be destroyed.
2. Variables in a parent scope can be retrieved from local scopes of that parent scope, i.e., when a user gets a variable from a scope, the lookup first searches the current scope. If the variable is not found in the local scope, `scope` keeps searching its parent, until the variable is found or there is no parent.
```cpp
class Scope {
public:
Scope(const std::shared_ptr<Scope>& scope): parent_(scope) {}
Variable* GetVariable(const std::string& name) const {
Variable* var = GetVarLocally(name);
if (var != nullptr) {
return var;
} else if (parent_ != nullptr) {
return parent_->GetVariable(name);
} else {
return nullptr;
}
}
private:
std::shared_ptr<Scope> parent_ {nullptr};
};
```
In the `Scope` class, there is a private data member called `parent_`, a smart pointer to the parent scope. When a user `Get`s a variable by its `name`, the `name` is first searched inside the current scope. If the variable cannot be found locally and the parent scope is not `nullptr`, the variable is searched inside the parent scope. The default value of `parent_` is `nullptr`, which means the scope is a global scope.
A local scope is very useful when we implement a recurrent neural network. Each timestep of an RNN should be a `Net`. The `Net` of each timestep (`StepNet` for short) should use an independent local scope, just like variables in a while loop live inside a local scope in programming languages. By using a single `StepNet` and changing the local scope, we can implement an RNN easily, as sketched below.
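As a sketch against the snippet above, each time step could get its own local scope that shares one parent; `RunStepNet` and the variable names are illustrative assumptions.

```cpp
// Sketch only: one local scope per RNN time step, all sharing a parent scope.
void RunRNN(const std::shared_ptr<Scope>& parent, int num_steps) {
  for (int i = 0; i < num_steps; ++i) {
    Scope step_scope(parent);  // local scope for this time step
    // step_scope.GetVariable("W") falls back to the parent scope, while
    // variables created only for this step vanish with step_scope.
    RunStepNet(&step_scope);
  }
}
```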
# Interface Design
```cpp
class Variable {
private:
Variable() = default;
friend class Scope;
};
class Scope {
private:
Scope(const std::shared_ptr<Scope>& parent = nullptr);
public:
static std::shared_ptr<Scope> Create(const std::shared_ptr<Scope>& parent = nullptr);
// return nullptr if not found.
Variable* GetVariable(const std::string& name) const;
// return Error if already contains same name variable.
Error CreateVariable(const std::string& name);
private:
std::shared_ptr<Scope> parent_;
std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
};
```
## Only scope can create a variable
To ensure that `only scope can create a variable`, we mark `Variable`'s constructor as private and make `Scope` a friend class of `Variable`. Then only `CreateVariable` can construct a `Variable`.
## When scope destroyed, all variables inside this scope should be destroyed together
The scope holds unique pointers to all of its variables. A user can `GetVariable` from a scope, but should not keep that pointer as a member variable, because when the scope is destroyed, all variables inside it are destroyed together.
## Sharing a parent scope
A local scope contains a `parent_` pointer, so scopes form a linked list. We use a `shared_ptr` because while a local scope is in use, its parents must not be destroyed.
Also, since the parent scope is a `shared_ptr`, we can only `Create()` a scope as a shared pointer. We cannot construct a scope as a plain variable, because it could not then be passed to another scope as the `parent` pointer.
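A minimal sketch of this convention, assuming the interface above:

```cpp
// Scopes are created only through the static factory, so a child scope can
// share ownership of its parent.
std::shared_ptr<Scope> global = Scope::Create();
std::shared_ptr<Scope> local = Scope::Create(global);  // keeps `global` alive
```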
## Orthogonal interface
`GetVariable` returns `nullptr` when `name` is not found, so it can double as a `Contains` method. `CreateVariable` returns an `Error` when there is a local name conflict. Combining `GetVariable` and `CreateVariable`, we can implement `CreateOrGetVariable` easily, as sketched below.
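A sketch of that composition, assuming the interface above:

```cpp
// Combine the two orthogonal calls into a get-or-create helper.
Variable* CreateOrGetVariable(const std::shared_ptr<Scope>& scope,
                              const std::string& name) {
  if (Variable* var = scope->GetVariable(name)) {
    return var;  // found locally or in an ancestor scope
  }
  scope->CreateVariable(name);  // returns an Error only on a local name clash
  return scope->GetVariable(name);
}
```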
......@@ -22,6 +22,7 @@ To compile the source code, your computer must be equipped with the following de
- **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X)
- **BLAS**: MKL, OpenBlas or ATLAS
- **Python**: only support Python 2.7
- **Go**
**Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
For CUDA 8.0, GCC versions later than 5.3 are not supported!
......@@ -107,6 +108,18 @@ As a simple example, consider the following:
sudo apt-get install -y python python-pip python-numpy libpython-dev bison
sudo pip install 'protobuf==3.1.0.post1'
# Install Go
# You can follow https://golang.org/doc/install for a detailed explanation.
wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
tar -C $HOME -xzf go.tgz && \
mkdir $HOME/gopath && \
rm go.tgz
# Setup environment variables
export GOROOT=$HOME/go
export GOPATH=$HOME/gopath
export PATH=$PATH:$GOROOT/bin
# install cmake 3.4
curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \
......
......@@ -4,6 +4,7 @@ RNN相关模型
.. toctree::
:maxdepth: 1
rnn_config_cn.rst
recurrent_group_cn.md
hierarchical_layer_cn.rst
hrnn_rnn_api_compare_cn.rst
RNN Models
==========
.. toctree::
:maxdepth: 1
rnn_config_en.rst
......@@ -5,36 +5,13 @@ RNN配置
中配置循环神经网络(RNN)。PaddlePaddle
高度支持灵活和高效的循环神经网络配置。 在本教程中,您将了解如何:
- 准备用来学习循环神经网络的序列数据。
- 配置循环神经网络架构。
- 使用学习完成的循环神经网络模型生成序列。
我们将使用 vanilla 循环神经网络和 sequence to sequence
模型来指导你完成这些步骤。sequence to sequence
模型的代码可以在\ ``demo / seqToseq``\ 找到。
准备序列数据
------------
PaddlePaddle
不需要对序列数据进行任何预处理,例如填充。唯一需要做的是将相应类型设置为输入。例如,以下代码段定义了三个输入。
它们都是序列,它们的大小是\ ``src_dict``\ ,\ ``trg_dict``\ 和\ ``trg_dict``\ :
.. code:: python
settings.input_types = [
integer_value_sequence(len(settings.src_dict)),
integer_value_sequence(len(settings.trg_dict)),
integer_value_sequence(len(settings.trg_dict))]
在\ ``process``\ 函数中,每个\ ``yield``\ 函数将返回三个整数列表。每个整数列表被视为一个整数序列:
.. code:: python
yield src_ids, trg_ids, trg_ids_next
有关如何编写数据提供程序的更多细节描述,请参考 :ref:`api_pydataprovider2` 。完整的数据提供文件在
``demo/seqToseq/dataprovider.py``\ 。
模型的代码可以在 `book/08.machine_translation <https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation>`_ 找到。
wmt14数据的提供文件在 `python/paddle/v2/dataset/wmt14.py <https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/dataset/wmt14.py>`_ 。
配置循环神经网络架构
--------------------
......@@ -85,19 +62,19 @@ vanilla
act=None,
rnn_layer_attr=None):
def __rnn_step__(ipt):
out_mem = memory(name=name, size=size)
rnn_out = mixed_layer(input = [full_matrix_projection(ipt),
full_matrix_projection(out_mem)],
name = name,
bias_attr = rnn_bias_attr,
act = act,
layer_attr = rnn_layer_attr,
size = size)
out_mem = paddle.layer.memory(name=name, size=size)
rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt),
paddle.layer.full_matrix_projection(input=out_mem)],
name = name,
bias_attr = rnn_bias_attr,
act = act,
layer_attr = rnn_layer_attr,
size = size)
return rnn_out
return recurrent_group(name='%s_recurrent_group' % name,
step=__rnn_step__,
reverse=reverse,
input=input)
return paddle.layer.recurrent_group(name='%s_recurrent_group' % name,
step=__rnn_step__,
reverse=reverse,
input=input)
PaddlePaddle
使用“Memory”(记忆模块)实现单步函数。\ **Memory**\ 是在PaddlePaddle中构造循环神经网络时最重要的概念。
......@@ -140,43 +117,52 @@ Sequence to Sequence Model with Attention
.. code:: python
# 定义源语句的数据层
src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
src_word_id = paddle.layer.data(
name='source_language_word',
type=paddle.data_type.integer_value_sequence(source_dict_dim))
# 计算每个词的词向量
src_embedding = embedding_layer(
src_embedding = paddle.layer.embedding(
input=src_word_id,
size=word_vector_dim,
param_attr=ParamAttr(name='_source_language_embedding'))
param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
# 应用前向循环神经网络
src_forward = grumemory(input=src_embedding, size=encoder_size)
src_forward = paddle.networks.simple_gru(
input=src_embedding, size=encoder_size)
# 应用反向递归神经网络(reverse=True表示反向循环神经网络)
src_backward = grumemory(input=src_embedding,
size=encoder_size,
reverse=True)
src_backward = paddle.networks.simple_gru(
input=src_embedding, size=encoder_size, reverse=True)
# 将循环神经网络的前向和反向部分混合在一起
encoded_vector = concat_layer(input=[src_forward, src_backward])
encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
# 投射编码向量到 decoder_size
encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)],
size = decoder_size)
encoded_proj = paddle.layer.mixed(
size=decoder_size,
input=paddle.layer.full_matrix_projection(encoded_vector))
# 计算反向RNN的第一个实例
backward_first = first_seq(input=src_backward)
backward_first = paddle.layer.first_seq(input=src_backward)
# 投射反向RNN的第一个实例到 decoder size
decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation())
decoder_boot = paddle.layer.mixed(
size=decoder_size,
act=paddle.activation.Tanh(),
input=paddle.layer.full_matrix_projection(backward_first))
解码器使用 ``recurrent_group`` 来定义循环神经网络。单步函数和输出函数在
``gru_decoder_with_attention`` 中定义:
.. code:: python
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
StaticInput(input=encoded_proj,is_seq=True)]
trg_embedding = embedding_layer(
input=data_layer(name='target_language_word',
size=target_dict_dim),
size=word_vector_dim,
param_attr=ParamAttr(name='_target_language_embedding'))
group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2]
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name='target_language_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
# 对于配备有注意力机制的解码器,在训练中,
......@@ -185,9 +171,10 @@ Sequence to Sequence Model with Attention
# StaticInput 意味着不同时间步的输入都是相同的值,
# 否则它以一个序列输入,不同时间步的输入是不同的。
# 所有输入序列应该有相同的长度。
decoder = recurrent_group(name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
单步函数的实现如下所示。首先,它定义解码网络的\ **Memory**\ 。然后定义
attention,门控循环单元单步函数和输出函数:
......@@ -198,27 +185,32 @@ attention,门控循环单元单步函数和输出函数:
# 定义解码器的Memory
# Memory的输出定义在 gru_step 内
# 注意 gru_step 应该与它的Memory名字相同
decoder_mem = memory(name='gru_decoder',
size=decoder_size,
boot_layer=decoder_boot)
decoder_mem = paddle.layer.memory(
name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
# 计算 attention 加权编码向量
context = simple_attention(encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem)
context = paddle.networks.simple_attention(
encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem)
# 混合当前词向量和attention加权编码向量
decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context),
full_matrix_projection(current_word)],
size = decoder_size * 3)
decoder_inputs = paddle.layer.mixed(
size=decoder_size * 3,
input=[
paddle.layer.full_matrix_projection(input=context),
paddle.layer.full_matrix_projection(input=current_word)
])
# 定义门控循环单元循环神经网络单步函数
gru_step = gru_step_layer(name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
gru_step = paddle.layer.gru_step(
name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
# 定义输出函数
out = mixed_layer(input=[full_matrix_projection(input=gru_step)],
size=target_dict_dim,
bias_attr=True,
act=SoftmaxActivation())
out = paddle.layer.mixed(
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax(),
input=paddle.layer.full_matrix_projection(input=gru_step))
return out
生成序列
......@@ -238,41 +230,32 @@ attention,门控循环单元单步函数和输出函数:
- ``beam_size``: beam search 算法中的beam大小。
- ``max_length``: 生成序列的最大长度。
- 使用 ``seqtext_printer_evaluator``
根据索引矩阵和字典打印文本。这个函数需要设置:
- ``id_input``: 数据的整数ID,用于标识生成的文件中的相应输出。
- ``dict_file``: 用于将词ID转换为词的字典文件。
- ``result_file``: 生成结果文件的路径。
代码如下:
.. code:: python
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
StaticInput(input=encoded_proj,is_seq=True)]
group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2]
# 在生成时,解码器基于编码源序列和最后生成的目标词预测下一目标词。
# 编码源序列(编码器输出)必须由只读Memory的 StaticInput 指定。
# 这里, GeneratedInputs 自动获取上一个生成的词,并在最开始初始化为起始词,如 <s>。
trg_embedding = GeneratedInput(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
trg_embedding = paddle.layer.GeneratedInput(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
group_inputs.append(trg_embedding)
beam_gen = beam_search(name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs,
bos_id=0, # Beginning token.
eos_id=1, # End of sentence token.
beam_size=beam_size,
max_length=max_length)
seqtext_printer_evaluator(input=beam_gen,
id_input=data_layer(name="sent_id", size=1),
dict_file=trg_dict_path,
result_file=gen_trans_file)
outputs(beam_gen)
注意,这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务,请参阅 :ref:`semantic_role_labeling` 了解更多详细信息。
完整的配置文件在\ ``demo/seqToseq/seqToseq_net.py``\ 。
beam_gen = paddle.layer.beam_search(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs,
bos_id=0, # Beginning token.
eos_id=1, # End of sentence token.
beam_size=beam_size,
max_length=max_length)
return beam_gen
注意,这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务,请参阅 `book/06.understand_sentiment <https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment>`_ 了解更多详细信息。
完整的配置文件在 `book/08.machine_translation/train.py <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py>`_ 。
......@@ -3,34 +3,11 @@ RNN Configuration
This tutorial will guide you through configuring recurrent neural networks (RNNs) in PaddlePaddle. PaddlePaddle supports highly flexible and efficient recurrent neural network configuration. In this tutorial, you will learn how to:
- prepare sequence data for learning recurrent neural networks.
- configure recurrent neural network architecture.
- generate sequence with learned recurrent neural network models.
We will use vanilla recurrent neural network, and sequence to sequence model to guide you through these steps. The code of sequence to sequence model can be found at :code:`demo/seqToseq`.
=====================
Prepare Sequence Data
=====================
PaddlePaddle does not need any preprocessing to sequence data, such as padding. The only thing that needs to be done is to set the type of the corresponding type to input. For example, the following code snippets defines three input. All of them are sequences, and the size of them are :code:`src_dict`, :code:`trg_dict`, and :code:`trg_dict`:
.. code-block:: python
settings.input_types = [
integer_value_sequence(len(settings.src_dict)),
integer_value_sequence(len(settings.trg_dict)),
integer_value_sequence(len(settings.trg_dict))]
Then at the :code:`process` function, each :code:`yield` function will return three integer lists. Each integer list is treated as a sequence of integers:
.. code-block:: python
yield src_ids, trg_ids, trg_ids_next
For more details description of how to write a data provider, please refer to :ref:`api_pydataprovider2` . The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`.
We will use a vanilla recurrent neural network and a sequence-to-sequence model to guide you through these steps. The code of the sequence-to-sequence model can be found at `book/08.machine_translation <https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation>`_ .
The data preparation for this model can be found at `python/paddle/v2/dataset/wmt14.py <https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/dataset/wmt14.py>`_
===============================================
Configure Recurrent Neural Network Architecture
......@@ -75,19 +52,19 @@ Its **output function** simply takes :math:`x_t` as the output.
act=None,
rnn_layer_attr=None):
def __rnn_step__(ipt):
out_mem = memory(name=name, size=size)
rnn_out = mixed_layer(input = [full_matrix_projection(ipt),
full_matrix_projection(out_mem)],
name = name,
bias_attr = rnn_bias_attr,
act = act,
layer_attr = rnn_layer_attr,
size = size)
out_mem = paddle.layer.memory(name=name, size=size)
rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt),
paddle.layer.full_matrix_projection(input=out_mem)],
name = name,
bias_attr = rnn_bias_attr,
act = act,
layer_attr = rnn_layer_attr,
size = size)
return rnn_out
return recurrent_group(name='%s_recurrent_group' % name,
step=__rnn_step__,
reverse=reverse,
input=input)
return paddle.layer.recurrent_group(name='%s_recurrent_group' % name,
step=__rnn_step__,
reverse=reverse,
input=input)
PaddlePaddle uses memory to construct the step function. **Memory** is the most important concept when constructing recurrent neural networks in PaddlePaddle. A memory is a state that is used recurrently in step functions, such as :math:`x_{t+1} = f_x(x_t)`. One memory contains an **output** and an **input**. The output of the memory at the current time step is utilized as the input of the memory at the next time step. A memory can also have a **boot layer**, whose output is utilized as the initial value of the memory. In our case, the output of the gated recurrent unit is employed as the output memory. Notice that the name of the layer :code:`rnn_out` is the same as the name of :code:`out_mem`. This means the output of the layer :code:`rnn_out` (:math:`x_{t+1}`) is utilized as the **output** of the :code:`out_mem` memory.
......@@ -113,43 +90,52 @@ We also project the encoder vector to :code:`decoder_size` dimensional space, ge
.. code-block:: python
# Define the data layer of the source sentence.
src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
src_word_id = paddle.layer.data(
name='source_language_word',
type=paddle.data_type.integer_value_sequence(source_dict_dim))
# Calculate the word embedding of each word.
src_embedding = embedding_layer(
src_embedding = paddle.layer.embedding(
input=src_word_id,
size=word_vector_dim,
param_attr=ParamAttr(name='_source_language_embedding'))
param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
# Apply forward recurrent neural network.
src_forward = grumemory(input=src_embedding, size=encoder_size)
src_forward = paddle.networks.simple_gru(
input=src_embedding, size=encoder_size)
# Apply backward recurrent neural network. reverse=True means backward recurrent neural network.
src_backward = grumemory(input=src_embedding,
size=encoder_size,
reverse=True)
src_backward = paddle.networks.simple_gru(
input=src_embedding, size=encoder_size, reverse=True)
# Mix the forward and backward parts of the recurrent neural network together.
encoded_vector = concat_layer(input=[src_forward, src_backward])
encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
# Project encoding vector to decoder_size.
encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)],
size = decoder_size)
encoded_proj = paddle.layer.mixed(
size=decoder_size,
input=paddle.layer.full_matrix_projection(encoded_vector))
# Compute the first instance of the backward RNN.
backward_first = first_seq(input=src_backward)
backward_first = paddle.layer.first_seq(input=src_backward)
# Project the first instance of backward RNN to decoder size.
decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation())
decoder_boot = paddle.layer.mixed(
size=decoder_size,
act=paddle.activation.Tanh(),
input=paddle.layer.full_matrix_projection(backward_first))
The decoder uses :code:`recurrent_group` to define the recurrent neural network. The step and output functions are defined in :code:`gru_decoder_with_attention`:
.. code-block:: python
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
StaticInput(input=encoded_proj,is_seq=True)]
trg_embedding = embedding_layer(
input=data_layer(name='target_language_word',
size=target_dict_dim),
size=word_vector_dim,
param_attr=ParamAttr(name='_target_language_embedding'))
group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2]
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name='target_language_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
# For decoder equipped with attention mechanism, in training,
......@@ -158,9 +144,10 @@ The decoder uses :code:`recurrent_group` to define the recurrent neural network.
# StaticInput means the same value is utilized at different time steps.
# Otherwise, it is a sequence input. Inputs at different time steps are different.
# All sequence inputs should have the same length.
decoder = recurrent_group(name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
The implementation of the step function is listed as below. First, it defines the **memory** of the decoder network. Then it defines attention, gated recurrent unit step function, and the output function:
......@@ -171,27 +158,32 @@ The implementation of the step function is listed as below. First, it defines th
# Defines the memory of the decoder.
# The output of this memory is defined in gru_step.
# Notice that the name of gru_step should be the same as the name of this memory.
decoder_mem = memory(name='gru_decoder',
size=decoder_size,
boot_layer=decoder_boot)
decoder_mem = paddle.layer.memory(
name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
# Compute attention weighted encoder vector.
context = simple_attention(encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem)
context = paddle.networks.simple_attention(
encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem)
# Mix the current word embedding and the attention weighted encoder vector.
decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context),
full_matrix_projection(current_word)],
size = decoder_size * 3)
decoder_inputs = paddle.layer.mixed(
size=decoder_size * 3,
input=[
paddle.layer.full_matrix_projection(input=context),
paddle.layer.full_matrix_projection(input=current_word)
])
# Define Gated recurrent unit recurrent neural network step function.
gru_step = gru_step_layer(name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
gru_step = paddle.layer.gru_step(
name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
# Defines the output function.
out = mixed_layer(input=[full_matrix_projection(input=gru_step)],
size=target_dict_dim,
bias_attr=True,
act=SoftmaxActivation())
out = paddle.layer.mixed(
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax(),
input=paddle.layer.full_matrix_projection(input=gru_step))
return out
......@@ -207,45 +199,37 @@ After training the model, we can use it to generate sequences. A common practice
- :code:`eos_id`: the end token. Every sentence ends with the end token.
- :code:`beam_size`: the beam size used in beam search.
- :code:`max_length`: the maximum length of the generated sentences.
* use :code:`seqtext_printer_evaluator` to print text according to index matrix and dictionary. This function needs to set:
- :code:`id_input`: the integer ID of the data, used to identify the corresponding output in the generated files.
- :code:`dict_file`: the dictionary file for converting word id to word.
- :code:`result_file`: the path of the generation result file.
The code is listed below:
.. code-block:: python
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
StaticInput(input=encoded_proj,is_seq=True)]
group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2]
# In generation, decoder predicts a next target word based on
# the encoded source sequence and the last generated target word.
# The encoded source sequence (encoder's output) must be specified by
# StaticInput which is a read-only memory.
# Here, GeneratedInputs automatically fetches the last generated word,
# which is initialized by a start mark, such as <s>.
trg_embedding = GeneratedInput(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
trg_embedding = paddle.layer.GeneratedInput(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
group_inputs.append(trg_embedding)
beam_gen = beam_search(name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs,
bos_id=0, # Beginning token.
eos_id=1, # End of sentence token.
beam_size=beam_size,
max_length=max_length)
beam_gen = paddle.layer.beam_search(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs,
bos_id=0, # Beginning token.
eos_id=1, # End of sentence token.
beam_size=beam_size,
max_length=max_length)
seqtext_printer_evaluator(input=beam_gen,
id_input=data_layer(name="sent_id", size=1),
dict_file=trg_dict_path,
result_file=gen_trans_file)
outputs(beam_gen)
return beam_gen
Notice that this generation technique is only useful for decoder-like generation processes. If you are working on sequence tagging tasks, please refer to :ref:`semantic_role_labeling` for more details.
Notice that this generation technique is only useful for decoder-like generation processes. If you are working on sequence tagging tasks, please refer to `book/06.understand_sentiment <https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment>`_ for more details.
The full configuration file is located at :code:`demo/seqToseq/seqToseq_net.py`.
The full configuration file is located at `book/08.machine_translation/train.py <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py>`_ .
if(NOT CMAKE_Go_COMPILER)
if(NOT $ENV{GO_COMPILER} STREQUAL "")
get_filename_component(CMAKE_Go_COMPILER_INIT $ENV{GO_COMPILER} PROGRAM PROGRAM_ARGS CMAKE_Go_FLAGS_ENV_INIT)
if(CMAKE_Go_FLAGS_ENV_INIT)
set(CMAKE_Go_COMPILER_ARG1 "${CMAKE_Go_FLAGS_ENV_INIT}" CACHE STRING "First argument to Go compiler")
endif()
if(NOT EXISTS ${CMAKE_Go_COMPILER_INIT})
message(SEND_ERROR "Could not find compiler set in environment variable GO_COMPILER:\n$ENV{GO_COMPILER}.")
endif()
endif()
set(Go_BIN_PATH
$ENV{GOPATH}
$ENV{GOROOT}
$ENV{GOROOT}/../bin
$ENV{GO_COMPILER}
/usr/bin
/usr/local/bin
)
if(CMAKE_Go_COMPILER_INIT)
set(CMAKE_Go_COMPILER ${CMAKE_Go_COMPILER_INIT} CACHE PATH "Go Compiler")
else()
find_program(CMAKE_Go_COMPILER
NAMES go
PATHS ${Go_BIN_PATH}
)
EXEC_PROGRAM(${CMAKE_Go_COMPILER} ARGS version OUTPUT_VARIABLE GOLANG_VERSION)
STRING(REGEX MATCH "go[0-9]+.[0-9]+.[0-9]+[ /A-Za-z0-9]*" VERSION "${GOLANG_VERSION}")
message("-- The Golang compiler identification is ${VERSION}")
message("-- Check for working Golang compiler: ${CMAKE_Go_COMPILER}")
endif()
endif()
mark_as_advanced(CMAKE_Go_COMPILER)
configure_file(${CMAKE_MODULE_PATH}/CMakeGoCompiler.cmake.in
${CMAKE_PLATFORM_INFO_DIR}/CMakeGoCompiler.cmake @ONLY)
set(CMAKE_Go_COMPILER_ENV_VAR "GO_COMPILER")
set(CMAKE_Go_COMPILER "@CMAKE_Go_COMPILER@")
set(CMAKE_Go_COMPILER_LOADED 1)
set(CMAKE_Go_SOURCE_FILE_EXTENSIONS go)
set(CMAKE_Go_LINKER_PREFERENCE 40)
set(CMAKE_Go_OUTPUT_EXTENSION .o)
set(CMAKE_Go_OUTPUT_EXTENSION_REPLACE 1)
set(CMAKE_Go_COMPILER_ENV_VAR "GO_COMPILER")
if(NOT CMAKE_Go_COMPILE_OBJECT)
set(CMAKE_Go_COMPILE_OBJECT "go tool compile -l -N -o <OBJECT> <SOURCE> ")
endif()
if(NOT CMAKE_Go_LINK_EXECUTABLE)
set(CMAKE_Go_LINK_EXECUTABLE "go tool link -o <TARGET> <OBJECTS> ")
endif()
set(CMAKE_Go_COMPILER_WORKS 1 CACHE INTERNAL "")
# Setting Paddle Compile Flags
include(CheckCXXCompilerFlag)
include(CheckCCompilerFlag)
include(CheckCXXSymbolExists)
include(CheckTypeSize)
function(CheckCompilerCXX11Flag)
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
endif()
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
# cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
# Apple Clang is a different compiler than upstream Clang which has different version numbers.
# https://gist.github.com/yamaya/2924292
if(APPLE) # cmake < 3.0 compiler id "Clang" on Mac OS X
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.1)
message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.")
endif()
else()
if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3)
message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
endif()
endif()
endif()
endfunction()
CheckCompilerCXX11Flag()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
# Common gpu architectures: Kepler, Maxwell
foreach(capability 30 35 50)
list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
endforeach()
if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0")
list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52")
endif()
# Modern gpu architectures: Pascal
if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
endif()
set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
file(MAKE_DIRECTORY ${GOPATH})
set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle")
file(MAKE_DIRECTORY ${PADDLE_IN_GOPATH})
function(GO_LIBRARY NAME BUILD_TYPE)
if(BUILD_TYPE STREQUAL "STATIC")
set(BUILD_MODE -buildmode=c-archive)
set(LIB_NAME "lib${NAME}.a")
else()
set(BUILD_MODE -buildmode=c-shared)
if(APPLE)
set(LIB_NAME "lib${NAME}.dylib")
else()
set(LIB_NAME "lib${NAME}.so")
endif()
endif()
file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
file(RELATIVE_PATH rel ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
# find Paddle directory.
get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)
get_filename_component(PADDLE_DIR ${PARENT_DIR} DIRECTORY)
# automatically get all dependencies specified in the source code
# for the given target.
add_custom_target(goGet env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get -d ${rel}/...)
# make a symlink that references Paddle inside $GOPATH, so go get
# will use the local changes in Paddle rather than checking out
# Paddle from GitHub.
add_custom_target(copyPaddle
COMMAND ln -sf ${PADDLE_DIR} ${PADDLE_IN_GOPATH})
add_dependencies(goGet copyPaddle)
add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
-o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
${CMAKE_GO_FLAGS} ${GO_SOURCE}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
add_custom_target(${NAME} ALL DEPENDS ${OUTPUT_DIR}/.timestamp ${ARGN})
add_dependencies(${NAME} goGet)
if(NOT BUILD_TYPE STREQUAL "STATIC")
install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME} DESTINATION bin)
endif()
endfunction(GO_LIBRARY)
......@@ -5,89 +5,65 @@ import (
"net"
"net/http"
"net/rpc"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"github.com/namsral/flag"
log "github.com/sirupsen/logrus"
"github.com/PaddlePaddle/Paddle/go/master"
"github.com/PaddlePaddle/recordio"
"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
)
func main() {
port := flag.Int("port", 8080, "port of the master server.")
dataset := flag.String("training_dataset", "", "dataset: comma separated paths to RecordIO files, supports glob patterns.")
faultTolerance := flag.Bool("fault_tolerance", false, "enable fault tolerance (requires etcd).")
ttlSec := flag.Int("ttl", 60, "etcd lease TTL in seconds.")
endpoints := flag.String("endpoints", "http://127.0.0.1:2379", "comma separated etcd endpoints. If empty, fault tolerance will not be enabled.")
taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timeout duration.")
taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timeout count for each task before it is declared a failed task.")
chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.")
flag.Parse()
if *dataset == "" {
panic("no dataset specified.")
if *endpoints == "" {
log.Warningln("-endpoints not set, fault tolerance not be enabled.")
}
if *faultTolerance {
panic("fault tolernance not implemented.")
}
var chunks []master.Chunk
var paths []string
ss := strings.Split(*dataset, ",")
fmt.Println(ss)
for _, s := range ss {
match, err := filepath.Glob(s)
var store master.Store
if *endpoints != "" {
eps := strings.Split(*endpoints, ",")
ip, err := networkhelper.GetExternalIP()
if err != nil {
panic(err)
log.Fatal(err)
}
paths = append(paths, match...)
}
if len(paths) == 0 {
panic("no valid datset specified.")
}
idx := 0
for _, path := range paths {
f, err := os.Open(path)
addr := fmt.Sprintf("%s:%d", ip, *port)
store, err = master.NewEtcdClient(eps, addr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, *ttlSec)
if err != nil {
panic(err)
log.Fatal(err)
}
} else {
store = &master.InMemStore{}
}
index, err := recordio.LoadIndex(f)
if err != nil {
panic(err)
}
f.Close()
count := index.NumChunks()
for i := 0; i < count; i++ {
chunk := master.Chunk{
Idx: idx,
Path: path,
Index: *index.ChunkIndex(i),
}
chunks = append(chunks, chunk)
}
s, err := master.NewService(store, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
if err != nil {
log.Fatal(err)
}
s := master.NewService(chunks, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
err := rpc.Register(s)
err = rpc.Register(s)
if err != nil {
panic(err)
log.Fatal(err)
}
rpc.HandleHTTP()
l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
if err != nil {
panic(err)
log.Fatal(err)
}
err = http.Serve(l, nil)
if err != nil {
panic(err)
log.Fatal(err)
}
}
......@@ -5,18 +5,36 @@ import (
"net/http"
"net/rpc"
"strconv"
"time"
"github.com/namsral/flag"
"github.com/PaddlePaddle/Paddle/go/pserver"
log "github.com/sirupsen/logrus"
)
func main() {
port := flag.Int("port", 0, "port of the pserver")
etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379",
"comma separated endpoint string for pserver to connect to etcd")
etcdTimeout := flag.Int("etcd-timeout", 5, "timeout for etcd calls")
numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job")
logLevel := flag.String("log-level", "info",
"log level, possible values: debug, info, warning, error, fatal, panic")
flag.Parse()
s := pserver.NewService()
err := rpc.Register(s)
level, err := log.ParseLevel(*logLevel)
if err != nil {
panic(err)
}
log.SetLevel(level)
timeout := time.Second * time.Duration((*etcdTimeout))
s, err := pserver.NewService(*etcdEndpoint, *numPservers, timeout)
if err != nil {
panic(err)
}
err = rpc.Register(s)
if err != nil {
panic(err)
}
......@@ -27,7 +45,9 @@ func main() {
panic(err)
}
log.Infof("start pserver at port %d", *port)
err = http.Serve(l, nil)
if err != nil {
panic(err)
}
......
......@@ -4,6 +4,8 @@ import (
"errors"
"net/rpc"
"sync"
log "github.com/sirupsen/logrus"
)
// TODO(helin): add TCP re-connect logic
......@@ -21,6 +23,18 @@ func New() *Conn {
return c
}
// Close closes the connection.
func (c *Conn) Close() error {
c.mu.Lock()
defer c.mu.Unlock()
if c.client == nil {
return nil
}
return c.client.Close()
}
// Connect connects the connection to a address.
func (c *Conn) Connect(addr string) error {
c.mu.Lock()
......@@ -50,12 +64,20 @@ func (c *Conn) Connect(addr string) error {
c.waitConn = nil
}
} else {
err := client.Close()
if err != nil {
log.Errorln(err)
}
return errors.New("client already set from a concurrent goroutine")
}
return nil
}
// TODO(helin): refactor Call to be able to perform given retry
// policy.
// Call make a RPC call.
//
// Call will be blocked until the connection to remote RPC service
......
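For clarity, here is a minimal, self-contained Go sketch (not part of this patch) of how a caller might use the connection package above: create a Conn, connect to an address, issue an RPC call, and close. The server address and the method name Service.SomeMethod are illustrative assumptions only; the master and pserver packages define the real RPC methods.

package main

import (
	"fmt"

	"github.com/PaddlePaddle/Paddle/go/connection"
)

func main() {
	conn := connection.New()
	// Connect dials the RPC server at the given address; Close tears the
	// connection down again. "localhost:8080" is an assumed address.
	if err := conn.Connect("localhost:8080"); err != nil {
		fmt.Println("connect failed:", err)
		return
	}
	defer conn.Close()

	// Call blocks until the connection is ready and then performs the RPC.
	// "Service.SomeMethod" is a hypothetical method name for illustration.
	var reply int
	if err := conn.Call("Service.SomeMethod", 0, &reply); err != nil {
		fmt.Println("call failed:", err)
		return
	}
	fmt.Println("reply:", reply)
}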
cmake_minimum_required(VERSION 3.0)
get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PARENT_DIR}/cmake")
project(cxx_go C Go)
include(golang)
include(flags)
set(MASTER_LIB_NAME "paddle_master")
go_library(${MASTER_LIB_NAME} SHARED)
if(PROJ_ROOT)
add_custom_command(OUTPUT ${PROJ_ROOT}/python/paddle/v2/master/lib${MASTER_LIB_NAME}.so
COMMAND rm ${CMAKE_CURRENT_BINARY_DIR}/lib${MASTER_LIB_NAME}.h
COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/lib${MASTER_LIB_NAME}.so ${PROJ_ROOT}/python/paddle/v2/master/
DEPENDS ${MASTER_LIB_NAME})
add_custom_target(paddle_master_shared ALL DEPENDS ${PROJ_ROOT}/python/paddle/v2/master/lib${MASTER_LIB_NAME}.so)
endif(PROJ_ROOT)
package main
/*
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#define PADDLE_MASTER_OK 0
#define PADDLE_MASTER_ERROR -1
typedef int paddle_master_client;
*/
import "C"
import (
"sync"
"unsafe"
"github.com/PaddlePaddle/Paddle/go/master"
log "github.com/sirupsen/logrus"
)
var nullPtr = unsafe.Pointer(uintptr(0))
var mu sync.Mutex
var handleMap = make(map[C.paddle_master_client]*master.Client)
var curHandle C.paddle_master_client
func add(c *master.Client) C.paddle_master_client {
mu.Lock()
defer mu.Unlock()
client := curHandle
curHandle++
handleMap[client] = c
return client
}
func get(client C.paddle_master_client) *master.Client {
mu.Lock()
defer mu.Unlock()
return handleMap[client]
}
func remove(client C.paddle_master_client) *master.Client {
mu.Lock()
defer mu.Unlock()
h := handleMap[client]
delete(handleMap, client)
return h
}
type addresser string
func (a addresser) Address() string {
return string(a)
}
//export paddle_new_master_client
func paddle_new_master_client(addr *C.char, bufSize int) C.paddle_master_client {
a := C.GoString(addr)
c := master.NewClient(addresser(a), bufSize)
return add(c)
}
//export paddle_release_master_client
func paddle_release_master_client(client C.paddle_master_client) {
remove(client)
}
//export paddle_set_dataset
func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int) C.int {
c := get(client)
var paths []string
for i := 0; i < int(size); i++ {
ptr := (**C.char)(unsafe.Pointer(uintptr(unsafe.Pointer(path)) + uintptr(i)*unsafe.Sizeof(*path)))
str := C.GoString(*ptr)
paths = append(paths, str)
}
err := c.SetDataset(paths)
if err != nil {
log.Errorln(err)
return C.PADDLE_MASTER_ERROR
}
return C.PADDLE_MASTER_OK
}
//export paddle_next_record
func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
c := get(client)
r := c.NextRecord()
if len(r) == 0 {
*record = (*C.uchar)(nullPtr)
return 0
}
size := C.size_t(len(r))
*record = (*C.uchar)(C.malloc(size))
C.memcpy(unsafe.Pointer(*record), unsafe.Pointer(&r[0]), size)
return C.int(size)
}
//export mem_free
func mem_free(p unsafe.Pointer) {
// "free" may be a better name for this function, but doing so
// will cause calling any function of this library from Python
// ctypes hanging.
C.free(p)
}
func main() {}
package master
import (
"os"
"time"
"github.com/PaddlePaddle/Paddle/go/connection"
"github.com/PaddlePaddle/recordio"
log "github.com/sirupsen/logrus"
)
// Addresser provide the address of the master server.
type Addresser interface {
Address() string
}
// Client is the client of the master server.
type Client struct {
conn *connection.Conn
ch chan []byte
}
// NewClient creates a new Client.
//
// bufSize is the record buffer size. NextRecord will read from this
// buffer.
func NewClient(addr Addresser, bufSize int) *Client {
c := &Client{}
c.conn = connection.New()
c.ch = make(chan []byte, bufSize)
go c.monitorMaster(addr)
go c.getRecords()
return c
}
func (c *Client) getRecords() {
for {
t, err := c.getTask()
if err != nil {
// TODO(helin): wait before move on with next
// getTask call.
log.Errorln(err)
continue
}
for _, chunk := range t.Chunks {
f, err := os.Open(chunk.Path)
if err != nil {
log.Errorln(err)
continue
}
s := recordio.NewRangeScanner(f, &chunk.Index, -1, -1)
for s.Scan() {
c.ch <- s.Record()
}
if s.Err() != nil {
log.Errorln(err, chunk.Path)
}
err = f.Close()
if err != nil {
log.Errorln(err)
}
}
// We treat a task as finished whenever the last data
// instance of the task is read. This is not exactly
// correct, but a reasonable approximation.
c.taskFinished(t.ID)
}
}
func (c *Client) monitorMaster(addr Addresser) {
lastMaster := ""
monitor := func() {
// get the latest address of the master server,
// and connect to the new address once it changes.
curMaster := addr.Address()
if curMaster != lastMaster {
if curMaster == "" {
err := c.conn.Close()
if err != nil {
log.Errorln(err)
}
} else {
err := c.conn.Connect(curMaster)
if err != nil {
log.Errorln(err)
// connect to addr failed, set
// to last known addr in order
// to retry next time.
curMaster = lastMaster
}
}
}
lastMaster = curMaster
}
monitor()
ticker := time.NewTicker(10 * time.Second)
for _ = range ticker.C {
monitor()
}
}
// SetDataset set dataset for the master server to dispatch.
//
// SetDataset can be called multiple times from different nodes. But
// only the first call will be honored.
func (c *Client) SetDataset(globPaths []string) error {
return c.conn.Call("Service.SetDataset", globPaths, nil)
}
// getTask gets a new task from the master server.
func (c *Client) getTask() (Task, error) {
var t Task
err := c.conn.Call("Service.GetTask", 0, &t)
return t, err
}
// TaskFinished tells the master server a task is finished.
func (c *Client) taskFinished(taskID int) error {
return c.conn.Call("Service.TaskFinished", taskID, nil)
}
// NextRecord returns next record in the dataset.
//
// NextRecord will block until the next record is available. It is
// thread-safe.
func (c *Client) NextRecord() []byte {
return <-c.ch
}
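For reference, a minimal sketch (not part of this patch) of how a trainer process might consume records through the master client defined above. The fixed master address, the buffer size, and the dataset glob path are illustrative assumptions.

package main

import (
	"fmt"

	"github.com/PaddlePaddle/Paddle/go/master"
)

// staticAddresser satisfies master.Addresser with a fixed master address.
type staticAddresser string

func (a staticAddresser) Address() string { return string(a) }

func main() {
	// Buffer up to 16 records read ahead from dispatched tasks.
	c := master.NewClient(staticAddresser("localhost:8080"), 16)

	// Register the dataset with the master; only the first successful call
	// across all trainers takes effect. The glob path is an assumption.
	if err := c.SetDataset([]string{"/data/train-*.recordio"}); err != nil {
		fmt.Println("SetDataset failed:", err)
		return
	}

	// NextRecord blocks until the next record is available.
	for i := 0; i < 10; i++ {
		r := c.NextRecord()
		fmt.Printf("record %d: %d bytes\n", i, len(r))
	}
}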
package master
import (
"fmt"
"net"
"net/http"
"net/rpc"
"os"
"strconv"
"strings"
"testing"
"time"
log "github.com/sirupsen/logrus"
"github.com/PaddlePaddle/Paddle/go/connection"
"github.com/PaddlePaddle/recordio"
)
const (
totalTask = 20
chunkPerTask = 10
)
func init() {
log.SetLevel(log.ErrorLevel)
}
type TestAddresser string
func (a TestAddresser) Address() string {
return string(a)
}
func TestGetFinishTask(t *testing.T) {
const path = "/tmp/master_client_test_0"
l, err := net.Listen("tcp", ":0")
if err != nil {
panic(err)
}
ss := strings.Split(l.Addr().String(), ":")
p, err := strconv.Atoi(ss[len(ss)-1])
if err != nil {
panic(err)
}
go func(l net.Listener) {
s, err := NewService(&InMemStore{}, chunkPerTask, time.Second, 1)
if err != nil {
panic(err)
}
server := rpc.NewServer()
err = server.Register(s)
if err != nil {
panic(err)
}
mux := http.NewServeMux()
mux.Handle(rpc.DefaultRPCPath, server)
err = http.Serve(l, mux)
if err != nil {
panic(err)
}
}(l)
f, err := os.Create(path)
if err != nil {
panic(err)
}
for i := 0; i < totalTask*chunkPerTask; i++ {
w := recordio.NewWriter(f, -1, -1)
w.Write(nil)
// call Close to force RecordIO writing a chunk.
w.Close()
}
f.Close()
// Manually initialize the client to avoid calling c.getRecords()
c := &Client{}
c.conn = connection.New()
go c.monitorMaster(TestAddresser(fmt.Sprintf(":%d", p)))
c.SetDataset([]string{path})
checkOnePass := func(i int) {
var tasks []Task
for idx := 0; idx < totalTask; idx++ {
task, err := c.getTask()
if err != nil {
t.Fatalf("Error: %v, pass: %d\n", err, i)
}
tasks = append(tasks, task)
}
_, err = c.getTask()
if err == nil {
t.Fatalf("Should get error, pass: %d\n", i)
}
err = c.taskFinished(tasks[0].ID)
if err != nil {
t.Fatalf("Error: %v, pass: %d\n", err, i)
}
tasks = tasks[1:]
task, err := c.getTask()
if err != nil {
t.Fatal(err)
}
tasks = append(tasks, task)
for _, task := range tasks {
err = c.taskFinished(task.ID)
if err != nil {
t.Fatalf("Error: %v, pass: %d\n", err, i)
}
}
}
for i := 0; i < 10; i++ {
checkOnePass(i)
}
}
package master_test
import (
"fmt"
"net"
"net/http"
"net/rpc"
"os"
"strconv"
"strings"
"testing"
"time"
"github.com/PaddlePaddle/Paddle/go/master"
"github.com/PaddlePaddle/recordio"
)
func TestNextRecord(t *testing.T) {
const (
path = "/tmp/master_client_TestFull"
total = 50
)
l, err := net.Listen("tcp", ":0")
if err != nil {
panic(err)
}
ss := strings.Split(l.Addr().String(), ":")
p, err := strconv.Atoi(ss[len(ss)-1])
if err != nil {
panic(err)
}
go func(l net.Listener) {
s, err := master.NewService(&master.InMemStore{}, 10, time.Second, 1)
if err != nil {
panic(err)
}
server := rpc.NewServer()
err = server.Register(s)
if err != nil {
panic(err)
}
mux := http.NewServeMux()
mux.Handle(rpc.DefaultRPCPath, server)
err = http.Serve(l, mux)
if err != nil {
panic(err)
}
}(l)
f, err := os.Create(path)
if err != nil {
panic(err)
}
w := recordio.NewWriter(f, -1, -1)
for i := 0; i < total; i++ {
w.Write([]byte{byte(i)})
}
w.Close()
f.Close()
c := master.NewClient(master.TestAddresser(fmt.Sprintf(":%d", p)), 10)
c.SetDataset([]string{path})
for pass := 0; pass < 50; pass++ {
received := make(map[byte]bool)
for i := 0; i < total; i++ {
r := c.NextRecord()
if len(r) != 1 {
t.Fatal("Length should be 1.", r)
}
if received[r[0]] {
t.Fatal("Received duplicate.", received, r)
}
received[r[0]] = true
}
}
}
package master
import (
"context"
"time"
"github.com/coreos/etcd/clientv3"
"github.com/coreos/etcd/clientv3/concurrency"
log "github.com/sirupsen/logrus"
)
const (
// DefaultLockPath is the default etcd master lock path.
DefaultLockPath = "/master/lock"
// DefaultStatePath is the default etcd key for master state.
DefaultStatePath = "/master/state"
// DefaultAddrPath is the default etcd key for master address.
DefaultAddrPath = "/master/addr"
)
// EtcdClient is the etcd client that master uses for fault tolerance
// and service registry.
type EtcdClient struct {
lockPath string
statePath string
client *clientv3.Client
lock *concurrency.Mutex
}
// NewEtcdClient creates a new EtcdClient.
func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) {
log.Debugf("Connecting to etcd at %v", endpoints)
// TODO(helin): gracefully shut down the etcd store. Because the etcd
// store holds an etcd lock, and even though the lock will expire
// when the lease times out, we need to implement graceful
// shutdown to release the lock.
cli, err := clientv3.New(clientv3.Config{
Endpoints: endpoints,
DialTimeout: dialTimeout,
})
if err != nil {
return nil, err
}
sess, err := concurrency.NewSession(cli, concurrency.WithTTL(ttlSec))
if err != nil {
return nil, err
}
lock := concurrency.NewMutex(sess, lockPath)
// It's fine for the lock to get stuck: in that case we have
// multiple master servers running (the job is configured to have
// only one master, but a split-brain problem may cause
// multiple masters to run), and the cluster management
// software will kill one of them.
log.Debugf("Trying to acquire lock at %s.", lockPath)
err = lock.Lock(context.TODO())
if err != nil {
return nil, err
}
log.Debugf("Successfully acquired lock at %s.", lockPath)
put := clientv3.OpPut(addrPath, string(addr))
resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
if err != nil {
return nil, err
}
if !resp.Succeeded {
log.Fatal("No longer owns the master lock. Exiting.")
}
e := &EtcdClient{
lockPath: lockPath,
statePath: statePath,
client: cli,
lock: lock,
}
return e, nil
}
// Save saves the state into the etcd.
func (e *EtcdClient) Save(state []byte) error {
ctx := context.TODO()
put := clientv3.OpPut(e.statePath, string(state))
resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit()
if err != nil {
return err
}
if !resp.Succeeded {
log.Errorln("No longer owns the lock, trying to lock again")
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
err := e.lock.Lock(ctx)
cancel()
if err != nil {
// We lost the master lock and can not acquire
// it back, it means some other master is
// already started. We don't want cluster
// management system to kill the master server
// who is holding the lock and running
// correctly. So the most feasible solution is
// to kill current master server. The current
// state is not saved, but the trainer's RPC
// call will fail, so the trainer will retry.
log.Fatalf("Could not acquire the lock at %s: %v. Exiting.", e.lockPath, err)
}
log.Infof("Successfully acquired lock at %s.", e.lockPath)
return e.Save(state)
}
return nil
}
// Load loads the state from etcd.
func (e *EtcdClient) Load() ([]byte, error) {
ctx := context.TODO()
get := clientv3.OpGet(e.statePath)
resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(get).Commit()
if err != nil {
return nil, err
}
if !resp.Succeeded {
log.Errorln("No longer owns the lock, trying to lock and load again.")
err = e.lock.Lock(context.Background())
if err != nil {
return nil, err
}
return e.Load()
}
kvs := resp.Responses[0].GetResponseRange().Kvs
if len(kvs) == 0 {
// No state exists
return nil, nil
}
state := kvs[0].Value
return state, nil
}
package master
import "sync"
// InMemStore is an in memory implementation of Store interface.
//
// It does not tolerate faults that cause the program to crash.
type InMemStore struct {
mu sync.Mutex
buf []byte
}
// Save saves the state into the in-memory store.
func (m *InMemStore) Save(state []byte) error {
m.mu.Lock()
defer m.mu.Unlock()
m.buf = state
return nil
}
// Load loads the state from the in-memory store.
func (m *InMemStore) Load() ([]byte, error) {
m.mu.Lock()
defer m.mu.Unlock()
return m.buf, nil
}
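A minimal sketch (not part of this patch) exercising the Store contract the master service relies on: whatever bytes Save receives, Load must return unchanged. InMemStore is the simplest implementation; EtcdClient above provides the fault-tolerant one. The example payload string is arbitrary.

package main

import (
	"bytes"
	"fmt"

	"github.com/PaddlePaddle/Paddle/go/master"
)

func main() {
	// InMemStore satisfies the Store interface used by the master service.
	var store master.Store = &master.InMemStore{}

	state := []byte("gob+gzip encoded task queues would go here")
	if err := store.Save(state); err != nil {
		fmt.Println("save failed:", err)
		return
	}

	loaded, err := store.Load()
	if err != nil {
		fmt.Println("load failed:", err)
		return
	}
	fmt.Println("round trip ok:", bytes.Equal(state, loaded))
}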
package master
import (
"bytes"
"compress/gzip"
"encoding/gob"
"errors"
"log"
"os"
"path/filepath"
"sync"
"time"
log "github.com/sirupsen/logrus"
"github.com/PaddlePaddle/recordio"
)
const (
targetTaskCount = 300
dialTimeout = 5 * time.Second
)
// errors
var (
ErrNoMoreTask = errors.New("no more task for current pass")
ErrPendingTaskNotFound = errors.New("pending task not found")
)
// Store is the interface for save and load the master state.
type Store interface {
Save([]byte) error
Load() ([]byte, error)
}
// Chunk is a chunk of data consisting of several data instances.
type Chunk struct {
Path string
Index recordio.Index // chunk index
}
// Task is the basic unit of data instances assigned to trainers.
type Task struct {
ID int
Chunks []Chunk
}
type taskEntry struct {
Epoch int
NumTimeout int
Task Task
}
type taskQueues struct {
Todo []taskEntry
Pending map[int]taskEntry // map from task ID to task entry
Done []taskEntry
Failed []Task
}
// Service is the master server service.
type Service struct {
timeoutDur time.Duration
timeoutMax int
chunksPerTask int
timeoutDur time.Duration
timeoutMax int
ready chan struct{}
store Store
mu sync.Mutex
initDone bool
taskQueues taskQueues
}
// Recover recovers service state from etcd.
func Recover() (*Service, error) {
// TODO(helin): recover from snapshot state from etcd.
return nil, nil
}
func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
id := 0
if chunksPerTask <= 0 {
......@@ -55,7 +84,6 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
if len(cur.Task.Chunks) > 0 {
cur.Task.ID = id
id++
result = append(result, cur)
}
......@@ -63,55 +91,251 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
}
// NewService creates a new service.
func NewService(chunks []Chunk, chunksPerTask int, timeoutDur time.Duration, timeoutMax int) *Service {
func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, timeoutMax int) (*Service, error) {
s := &Service{}
s.chunksPerTask = chunksPerTask
s.timeoutDur = timeoutDur
s.timeoutMax = timeoutMax
s.taskQueues = taskQueues{}
s.taskQueues.Pending = make(map[int]taskEntry)
s.taskQueues.Todo = partition(chunks, chunksPerTask)
return s
}
s.ready = make(chan struct{})
s.store = store
recovered, err := s.recover()
if err != nil {
return nil, err
}
// Chunk is a chunk of data consisting of several data instances.
type Chunk struct {
Idx int // index of the chunk within the file
Path string
Index recordio.Index // block index
if recovered {
// Recovered. Now the state is already initialized,
// and the master is ready.
s.initDone = true
close(s.ready)
log.Info("Master recovered from saved state.")
}
return s, nil
}
// Task is the basic unit of data instances assigned to trainers.
type Task struct {
ID int
Chunks []Chunk
// recover recovers service state from etcd.
func (s *Service) recover() (bool, error) {
state, err := s.store.Load()
if err != nil {
return false, err
}
if state == nil {
log.Infoln("No state exists, not recovered.")
return false, nil
}
log.Infof("Loaded snapshot of size: %d bytes.", len(state))
gr, err := gzip.NewReader(bytes.NewReader(state))
if err != nil {
return false, err
}
dec := gob.NewDecoder(gr)
var tqs taskQueues
err = dec.Decode(&tqs)
if err != nil {
return false, err
}
err = gr.Close()
if err != nil {
// Only the close failed; the recovery actually succeeded, so
// just log the error.
log.Errorln(err)
}
s.taskQueues = tqs
return true, nil
}
type taskEntry struct {
Epoch int
NumTimeout int
Task Task
// snapshot *must* be called with s.mu being held.
func (s *Service) snapshot() error {
// TODO(helin): etcd request has a size limit, so the snapshot
// size is limited by the max request size. We should either
// divide the snapshot into smaller chunks and save under
// different keys, or configure the request size to be big
// enough:
// https://github.com/coreos/etcd/blob/2f84f3d8d8ed8f9537ab6ffa44a3a1c7eddfa9b1/embed/config.go#L44
var buf bytes.Buffer
gw := gzip.NewWriter(&buf)
enc := gob.NewEncoder(gw)
err := enc.Encode(s.taskQueues)
if err != nil {
return err
}
err = gw.Close()
if err != nil {
return err
}
state := buf.Bytes()
log.Infof("Saving snapshot of size: %d bytes.", len(state))
return s.store.Save(state)
}
type taskQueues struct {
Todo []taskEntry
Pending map[int]taskEntry // map from task ID to task entry
Done []taskEntry
Failed []Task
func readChunks(globPaths []string) ([]Chunk, error) {
var chunks []Chunk
var paths []string
for _, s := range globPaths {
match, err := filepath.Glob(s)
if err != nil {
return nil, err
}
paths = append(paths, match...)
}
if len(paths) == 0 {
return nil, errors.New("no valid dataset specified")
}
for _, path := range paths {
f, err := os.Open(path)
if err != nil {
return nil, err
}
index, err := recordio.LoadIndex(f)
if err != nil {
return nil, err
}
err = f.Close()
if err != nil {
return nil, err
}
count := index.NumChunks()
for i := 0; i < count; i++ {
chunk := Chunk{
Path: path,
Index: *index.ChunkIndex(i),
}
chunks = append(chunks, chunk)
}
}
return chunks, nil
}
// *must* be called with s.mu being held.
func (s *Service) snapshot() error {
// TODO(helin): snapshot state on etcd.
// SetDataset sets dataset to dispatch for the master server.
//
// SetDataset can be called multiple times. But only the first call will
// be honored.
func (s *Service) SetDataset(globPaths []string, dummy *int) error {
if len(globPaths) == 0 {
return errors.New("no dataset specified")
}
s.mu.Lock()
defer s.mu.Unlock()
if s.initDone {
// Already initialized. All trainers will call
// SetDataset, but we only handle the first one. Treat
// other calls as successful but do nothing.
return nil
}
chunks, err := readChunks(globPaths)
if err != nil {
return err
}
s.taskQueues.Todo = partition(chunks, s.chunksPerTask)
err = s.snapshot()
if err != nil {
log.Errorln(err)
return err
}
close(s.ready)
s.initDone = true
return nil
}
func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
return func() {
s.mu.Lock()
defer s.mu.Unlock()
t, ok := s.taskQueues.Pending[taskID]
if !ok {
return
}
if t.Epoch != epoch {
// new epoch: the task was launched after
// this timeout check was scheduled.
return
}
defer func() {
err := s.snapshot()
if err != nil {
log.Errorln(err)
}
}()
delete(s.taskQueues.Pending, t.Task.ID)
t.NumTimeout++
if t.NumTimeout > s.timeoutMax {
log.Warningf("Task %v timed out %d times, discard.", t.Task, t.NumTimeout)
s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task)
return
}
log.Warningf("Task %v timed out %d times, retry.", t.Task, t.NumTimeout)
s.taskQueues.Todo = append(s.taskQueues.Todo, t)
}
}
// must be called with lock held.
func (s *Service) logFields() log.Fields {
return log.Fields{
"todoLen": len(s.taskQueues.Todo),
"pendingLen": len(s.taskQueues.Pending),
"doneLen": len(s.taskQueues.Done),
"failedLen": len(s.taskQueues.Failed),
}
}
// GetTask gets a new task from the service.
func (s *Service) GetTask(dummy int, task *Task) error {
select {
case <-s.ready:
}
s.mu.Lock()
defer s.mu.Unlock()
if len(s.taskQueues.Todo) == 0 {
return ErrNoMoreTask
if len(s.taskQueues.Done) == 0 {
if len(s.taskQueues.Pending) == 0 {
err := errors.New("all task failed")
log.WithFields(s.logFields()).Warningln("All tasks failed.")
return err
}
// TODO(helin): the client needs to retry in this
// error case. Gotcha: the RPC client can't
// compare returned error with predefined
// errors like io.EOF, because the error
// instance deserialized from RPC is a
// different instance than the error defined
// in package. So we need to figure out a way
// for client to check this error correctly.
err := errors.New("no more available task")
log.WithFields(s.logFields()).Warningln("No more available task.")
return err
}
s.taskQueues.Todo = s.taskQueues.Done
s.taskQueues.Done = nil
log.WithFields(s.logFields()).Infoln("No more todo task, but trainer is requesting task to do. Move all done task to todo.")
}
t := s.taskQueues.Todo[0]
......@@ -123,56 +347,45 @@ func (s *Service) GetTask(dummy int, task *Task) error {
return err
}
time.AfterFunc(s.timeoutDur, func(taskID int, epoch int) func() {
return func() {
s.mu.Lock()
defer s.mu.Unlock()
t, ok := s.taskQueues.Pending[taskID]
if !ok {
return
}
*task = t.Task
log.WithFields(s.logFields()).Infof("Task #%d dispatched.", task.ID)
if t.Epoch != epoch {
// new epoch, task launched after the
// schedule of this timeout check.
return
}
defer func() {
err := s.snapshot()
if err != nil {
log.Println(err)
}
}()
delete(s.taskQueues.Pending, t.Task.ID)
t.NumTimeout++
if t.NumTimeout > s.timeoutMax {
s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task)
return
}
s.taskQueues.Todo = append(s.taskQueues.Todo, t)
}
}(t.Task.ID, t.Epoch))
time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.ID, t.Epoch))
return nil
}
// TaskFinished tells the service that a task is finished.
func (s *Service) TaskFinished(taskID int, dummy *int) error {
select {
case <-s.ready:
}
s.mu.Lock()
defer s.mu.Unlock()
t, ok := s.taskQueues.Pending[taskID]
if !ok {
return ErrPendingTaskNotFound
err := errors.New("pending task not found")
log.WithFields(s.logFields()).Warningf("Pending task #%d not found.", taskID)
return err
}
// task finished, reset timeout
t.NumTimeout = 0
s.taskQueues.Done = append(s.taskQueues.Done, t)
delete(s.taskQueues.Pending, taskID)
return s.snapshot()
log.WithFields(s.logFields()).Infof("Task #%d finished.", taskID)
if len(s.taskQueues.Pending) == 0 && len(s.taskQueues.Todo) == 0 {
log.WithFields(s.logFields()).Infoln("No more todo and pending task, start a new pass.")
s.taskQueues.Todo = append(s.taskQueues.Todo, s.taskQueues.Done...)
s.taskQueues.Done = nil
}
err := s.snapshot()
if err != nil {
log.Errorln(err)
}
return err
}
cmake_minimum_required(VERSION 3.0)
go_library(paddle_pserver_cclient STATIC)
get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PARENT_DIR}/cmake")
project(cxx_go C Go)
include(golang)
include(flags)
go_library(client STATIC)
add_subdirectory(test)
package main
/*
#include <stdlib.h>
#include <string.h>
typedef enum {
PADDLE_ELEMENT_TYPE_INT32 = 0,
......@@ -19,39 +18,27 @@ typedef struct {
int content_len;
} paddle_parameter, paddle_gradient;
static inline void paddle_release_param(paddle_parameter* param) {
if (param != NULL) {
if (param->name != NULL) {
free(param->name);
}
if (param->content != NULL) {
free(param->content);
}
free(param);
}
}
typedef int client;
typedef int paddle_pserver_client;
#define PSERVER_ERROR -1
#define PSERVER_OK 0
*/
import "C"
import (
"log"
"strings"
"sync"
"unsafe"
"github.com/PaddlePaddle/Paddle/go/pserver"
log "github.com/sirupsen/logrus"
)
var nullPtr = unsafe.Pointer(uintptr(0))
var mu sync.Mutex
var handleMap = make(map[C.client]*pserver.Client)
var curHandle C.client
var handleMap = make(map[C.paddle_pserver_client]*pserver.Client)
var curHandle C.paddle_pserver_client
func add(c *pserver.Client) C.client {
func add(c *pserver.Client) C.paddle_pserver_client {
mu.Lock()
defer mu.Unlock()
client := curHandle
......@@ -60,13 +47,13 @@ func add(c *pserver.Client) C.client {
return client
}
func get(client C.client) *pserver.Client {
func get(client C.paddle_pserver_client) *pserver.Client {
mu.Lock()
defer mu.Unlock()
return handleMap[client]
}
func remove(client C.client) *pserver.Client {
func remove(client C.paddle_pserver_client) *pserver.Client {
mu.Lock()
defer mu.Unlock()
h := handleMap[client]
......@@ -100,7 +87,7 @@ func (l lister) List() []pserver.Server {
}
//export paddle_new_pserver_client
func paddle_new_pserver_client(addrs *C.char, selected int) C.client {
func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_client {
a := C.GoString(addrs)
as := strings.Split(a, ",")
servers := make([]pserver.Server, len(as))
......@@ -113,27 +100,27 @@ func paddle_new_pserver_client(addrs *C.char, selected int) C.client {
}
//export paddle_new_etcd_pserver_client
func paddle_new_etcd_pserver_client(etcd_addr *C.char) C.client {
func paddle_new_etcd_pserver_client(etcd_addr *C.char) C.paddle_pserver_client {
// TODO(helin): fault tolerant pserver client using etcd.
panic("not implemented.")
}
//export paddle_pserver_client_release
func paddle_pserver_client_release(client C.client) {
func paddle_pserver_client_release(client C.paddle_pserver_client) {
remove(client)
}
//export paddle_begin_init_params
func paddle_begin_init_params(client C.client) C.int {
func paddle_begin_init_params(client C.paddle_pserver_client) C.int {
c := get(client)
if selected := c.BeginInitParams(); selected {
return 1
}
return 0
return C.PSERVER_OK
}
//export paddle_init_param
func paddle_init_param(client C.client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int {
func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int {
et := pserver.ElementType(param.element_type)
name := C.GoString(param.name)
content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len))
......@@ -143,31 +130,41 @@ func paddle_init_param(client C.client, param C.paddle_parameter, param_config u
}
c := get(client)
err := c.InitParam(pc)
if err != nil {
log.Println(err)
return -1
if err.Error() == pserver.AlreadyInitialized {
log.Warningf("parameter %s already initialized, treat paddle_init_param as sucessful.", name)
return C.PSERVER_OK
}
log.Errorln(err)
return C.PSERVER_ERROR
}
return 0
return C.PSERVER_OK
}
//export paddle_finish_init_params
func paddle_finish_init_params(client C.client) C.int {
func paddle_finish_init_params(client C.paddle_pserver_client) C.int {
c := get(client)
err := c.FinishInitParams()
if err != nil {
log.Println(err)
return -1
if err.Error() == pserver.AlreadyInitialized {
log.Warningln("parameters already initialized, treat paddle_finish_init_params as sucessful.")
return C.PSERVER_OK
}
log.Errorln(err)
return C.PSERVER_ERROR
}
return 0
return C.PSERVER_OK
}
//export paddle_send_grads
func paddle_send_grads(client C.client, grads *C.paddle_gradient, total C.int) C.int {
func paddle_send_grads(client C.paddle_pserver_client, grads **C.paddle_gradient, total C.int) C.int {
var gs []pserver.Gradient
for i := 0; i < int(total); i++ {
grad := (*C.paddle_gradient)(unsafe.Pointer((uintptr(unsafe.Pointer(grads)) + uintptr(i)*unsafe.Sizeof(*grads))))
grad := *(**C.paddle_gradient)(unsafe.Pointer((uintptr(unsafe.Pointer(grads)) + uintptr(i)*unsafe.Sizeof(*grads))))
et := pserver.ElementType(grad.element_type)
name := C.GoString(grad.name)
content := cArrayToSlice(unsafe.Pointer(grad.content), int(grad.content_len))
......@@ -177,84 +174,82 @@ func paddle_send_grads(client C.client, grads *C.paddle_gradient, total C.int) C
c := get(client)
err := c.SendGrads(gs)
if err != nil {
log.Println(err)
return -1
log.Errorln(err)
return C.PSERVER_ERROR
}
return 0
return C.PSERVER_OK
}
//export paddle_get_params
func paddle_get_params(client C.client, names **C.char, dst **C.paddle_parameter, total C.int) C.int {
func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter, total C.int) C.int {
var ns []string
for i := 0; i < int(total); i++ {
name := *(**C.char)(unsafe.Pointer((uintptr(unsafe.Pointer(names)) + uintptr(i)*unsafe.Sizeof(*names))))
ns = append(ns, C.GoString(name))
param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
ns = append(ns, C.GoString(param.name))
}
c := get(client)
ps, err := c.GetParams(ns)
if err != nil {
log.Println(err)
return -1
log.Errorln(err)
return C.PSERVER_ERROR
}
for i := 0; i < int(total); i++ {
if i >= len(ps) {
break
if len(ps) != len(ns) {
pn := make([]string, len(ps))
for i, p := range ps {
pn[i] = p.Name
}
log.Errorf("pserver returned wrong number of parameters. Requested: %s, returned: %s.", strings.Join(pn, ", "), strings.Join(ns, ", "))
return C.PSERVER_ERROR
}
for i := range ps {
if ns[i] != ps[i].Name {
pn := make([]string, len(ps))
for i, p := range ps {
pn[i] = p.Name
}
log.Errorf("pserver returned wrong parameters, or not in requested order. Requested: %s, returned: %s.", strings.Join(pn, ", "), strings.Join(ns, ", "))
return C.PSERVER_ERROR
}
}
for i := 0; i < int(total); i++ {
p := ps[i]
param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
nameReady := false
contentAllocated := false
if unsafe.Pointer(param) == nullPtr {
param = (*C.paddle_parameter)(C.calloc(1, C.size_t(unsafe.Sizeof(*param))))
} else {
if unsafe.Pointer(param.name) != nullPtr {
if n := C.GoString(param.name); n != p.Name {
log.Println("Warning: the pre-allocated parameter name does not match the parameter name, it will be freed.", n, p.Name)
C.free(unsafe.Pointer(param.name))
} else {
nameReady = true
}
}
log.Errorln("must pre-allocate parameter.")
return C.PSERVER_ERROR
}
if unsafe.Pointer(param.content) != nullPtr {
if int(param.content_len) == len(p.Content) {
contentAllocated = true
} else {
log.Println("Warning: the pre-allocated content len does not match parameter content len, the pre-allocated content will be freed.", param.content_len, len(p.Content))
C.free(unsafe.Pointer(param.content))
}
if unsafe.Pointer(param.content) != nullPtr {
if int(param.content_len) != len(p.Content) {
log.Errorf("the pre-allocated content len does not match parameter content len. Pre-allocated len: %d, returned len: %d", param.content_len, len(p.Content))
return C.PSERVER_ERROR
}
}
if !nameReady {
param.name = C.CString(p.Name)
}
if !contentAllocated {
param.content = (*C.uchar)(C.malloc(C.size_t(len(p.Content))))
}
C.memcpy(unsafe.Pointer(param.content), unsafe.Pointer(&p.Content[0]), C.size_t(len(p.Content)))
param.content_len = C.int(len(p.Content))
param.element_type = C.paddle_element_type(p.ElementType)
}
return 0
return C.PSERVER_OK
}
//export paddle_save_model
func paddle_save_model(client C.client, path *C.char) C.int {
func paddle_save_model(client C.paddle_pserver_client, path *C.char) C.int {
p := C.GoString(path)
c := get(client)
err := c.Save(p)
if err != nil {
log.Println(err)
return -1
log.Errorln(err)
return C.PSERVER_ERROR
}
return 0
return C.PSERVER_OK
}
func main() {} // Required but ignored
cmake_minimum_required(VERSION 3.0)
include_directories(${CMAKE_BINARY_DIR})
add_executable(main main.c)
add_dependencies(main client)
if(APPLE)
set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
endif()
target_link_libraries(main ${CMAKE_BINARY_DIR}/libclient.a)
cc_library(main SRCS main.c DEPS paddle_pserver_cclient)
cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient)
#include <stdio.h>
#include <stdlib.h>
#include "libclient.h"
#include "libpaddle_pserver_cclient.h"
void fail() {
// TODO(helin): fix: gtest using cmake is not working, using this
// hacky way for now.
printf("test failed.\n");
// TODO(helin): Fix: gtest using cmake is not working, using this
// hacky way for now.
#define fail() \
fprintf(stderr, "info: %s:%d: ", __FILE__, __LINE__); \
exit(-1);
void sendGrads(paddle_pserver_client c) {
unsigned char grad_a[2000] = {2};
unsigned char grad_b[3000] = {3};
paddle_gradient grad1 = {
"param_a", PADDLE_ELEMENT_TYPE_FLOAT32, grad_a, 2000};
paddle_gradient grad2 = {
"param_b", PADDLE_ELEMENT_TYPE_FLOAT32, grad_b, 3000};
paddle_gradient* grads[2] = {&grad1, &grad2};
if (paddle_send_grads(c, grads, 2)) {
fail();
}
}
void getParams(paddle_pserver_client c) {
paddle_parameter param_a;
paddle_parameter param_b;
char name_a[] = "param_a";
char name_b[] = "param_b";
// Must pre-allocate the parameter content before calling paddle_get_params.
unsigned char content_a[2000] = {};
unsigned char content_b[3000] = {};
param_a.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
param_a.name = name_a;
param_a.content = content_a;
param_a.content_len = 2000;
param_b.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
param_b.name = name_b;
param_b.content = content_b;
param_b.content_len = 3000;
paddle_parameter* params[2] = {&param_a, &param_b};
if (paddle_get_params(c, params, 2)) {
fail();
}
}
int main() {
char addr[] = "localhost:3000";
client c = paddle_new_pserver_client(addr, 1);
paddle_pserver_client c = paddle_new_pserver_client(addr, 1);
retry:
if (paddle_begin_init_params(c)) {
paddle_parameter param;
char name_a[] = "param_a";
char name_b[] = "param_b";
unsigned char content[] = {0x00, 0x11, 0x22};
unsigned char content_a[2000] = {1};
unsigned char content_b[3000] = {0};
param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
param.name = name_a;
param.content = content;
param.content_len = 3;
if (paddle_init_param(c, param, NULL, 0) != 0) {
param.content = content_a;
param.content_len = 2000;
int error = paddle_init_param(c, param, NULL, 0);
if (error != 0) {
goto retry;
}
param.element_type = PADDLE_ELEMENT_TYPE_INT32;
param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
param.name = name_b;
param.content = content;
param.content_len = 3;
if (paddle_init_param(c, param, NULL, 0) != 0) {
param.content = content_b;
param.content_len = 3000;
error = paddle_init_param(c, param, NULL, 0);
if (error != 0) {
goto retry;
}
if (paddle_finish_init_params(c) != 0) {
error = paddle_finish_init_params(c);
if (error != 0) {
goto retry;
}
} else {
fail();
}
unsigned char content[] = {0x00, 0x11, 0x22};
paddle_gradient grads[2] = {
{"param_a", PADDLE_ELEMENT_TYPE_INT32, content, 3},
{"param_b", PADDLE_ELEMENT_TYPE_FLOAT32, content, 3}};
if (!paddle_send_grads(c, grads, 2)) {
fail();
}
paddle_parameter* params[2] = {NULL, NULL};
char* names[] = {"param_a", "param_b"};
if (!paddle_get_params(c, names, params, 2)) {
fail();
int i;
for (i = 0; i < 100; i++) {
sendGrads(c);
getParams(c);
}
// get parameters again by reusing the allocated parameter buffers.
if (!paddle_get_params(c, names, params, 2)) {
fail();
}
paddle_release_param(params[0]);
paddle_release_param(params[1]);
if (!paddle_save_model(c, "/tmp/")) {
if (paddle_save_model(c, "/tmp/")) {
fail();
}
......
#include <stdio.h>
#include <stdlib.h>
#include "libpaddle_pserver_cclient.h"
typedef float real;
void fail() {
// TODO(helin): fix: gtest using cmake is not working, using this
// hacky way for now.
printf("test failed.\n");
exit(-1);
}
void print_parameter(paddle_gradient* param) {
if (param == NULL) {
printf("param is NULL!!\n");
} else {
printf("==== parameter ====\n");
printf("name: %s\n", param->name);
printf("content_len: %d\n", param->content_len);
printf("content_type: %d\n", param->element_type);
int i;
for (i = 0; i < param->content_len / (int)sizeof(real); ++i) {
printf("%f ", ((float*)param->content)[i]);
}
printf("\n\n");
}
}
int main() {
char addr[] = "localhost:3000";
paddle_pserver_client c = paddle_new_pserver_client(addr, 1);
char* names[] = {"param_a", "param_b"};
retry:
printf("init parameter to pserver:\n");
real param_content1[] = {0.1, 0.2, 0.3};
real param_content2[] = {0.4, 0.5, 0.6};
paddle_parameter** params =
(paddle_parameter**)malloc(sizeof(paddle_parameter*) * 2);
params[0] = (paddle_parameter*)malloc(sizeof(paddle_parameter));
params[0]->name = names[0];
params[0]->content = (unsigned char*)param_content1;
params[0]->content_len = 3 * sizeof(real);
params[0]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
params[1] = (paddle_parameter*)malloc(sizeof(paddle_parameter));
params[1]->name = names[1];
params[1]->content = (unsigned char*)param_content2;
params[1]->content_len = 3 * sizeof(real);
params[1]->element_type = PADDLE_ELEMENT_TYPE_INT32;
if (paddle_begin_init_params(c)) {
if (paddle_init_param(c, *params[0], NULL, 0) != 0) {
goto retry;
}
if (paddle_init_param(c, *params[1], NULL, 0) != 0) {
goto retry;
}
if (paddle_finish_init_params(c) != 0) {
goto retry;
}
} else {
fail();
}
printf("get inited parameters from pserver:\n");
// get parameters again by reusing the allocated parameter buffers.
if (paddle_get_params(c, params, 2) != 0) {
fail();
}
print_parameter(params[0]);
print_parameter(params[1]);
printf("send gradient to pserver:\n");
real gradient_content1[] = {0.01, 0.02, 0.03};
real gradient_content2[] = {0.04, 0.05, 0.06};
paddle_gradient** grads =
(paddle_gradient**)malloc(sizeof(paddle_gradient*) * 2);
grads[0] = (paddle_gradient*)malloc(sizeof(paddle_gradient));
grads[0]->name = names[0];
grads[0]->content = (unsigned char*)gradient_content1;
grads[0]->content_len = 3 * sizeof(real);
grads[0]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
grads[1] = (paddle_gradient*)malloc(sizeof(paddle_gradient));
grads[1]->name = names[1];
grads[1]->content = (unsigned char*)gradient_content2;
grads[1]->content_len = 3 * sizeof(real);
grads[1]->element_type = PADDLE_ELEMENT_TYPE_INT32;
printf("print gradient sent to pserver:\n");
print_parameter(grads[0]);
print_parameter(grads[1]);
if (paddle_send_grads(c, grads, 2) != 0) {
fail();
}
printf("get updated parameters from pserver:\n");
// get parameters again by reusing the allocated parameter buffers.
if (paddle_get_params(c, params, 2) != 0) {
fail();
}
print_parameter(params[0]);
print_parameter(params[1]);
if (paddle_save_model(c, "/tmp/") != 0) {
fail();
}
return 0;
}
import paddle.v2 as paddle
import gzip
def softmax_regression(img):
predict = paddle.layer.fc(input=img,
size=10,
act=paddle.activation.Softmax())
return predict
def multilayer_perceptron(img):
# The first fully-connected layer
hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu())
# The second fully-connected layer and the according activation function
hidden2 = paddle.layer.fc(input=hidden1,
size=64,
act=paddle.activation.Relu())
# The third fully-connected layer, note that the hidden size should be 10,
# which is the number of unique digits
predict = paddle.layer.fc(input=hidden2,
size=10,
act=paddle.activation.Softmax())
return predict
def convolutional_neural_network(img):
# first conv layer
conv_pool_1 = paddle.networks.simple_img_conv_pool(
input=img,
filter_size=5,
num_filters=20,
num_channel=1,
pool_size=2,
pool_stride=2,
act=paddle.activation.Tanh())
# second conv layer
conv_pool_2 = paddle.networks.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
num_channel=20,
pool_size=2,
pool_stride=2,
act=paddle.activation.Tanh())
# The first fully-connected layer
fc1 = paddle.layer.fc(input=conv_pool_2,
size=128,
act=paddle.activation.Tanh())
# The softmax layer, note that the hidden size should be 10,
# which is the number of unique digits
predict = paddle.layer.fc(input=fc1,
size=10,
act=paddle.activation.Softmax())
return predict
def main():
paddle.init(use_gpu=False, trainer_count=1)
# define network topology
images = paddle.layer.data(
name='pixel', type=paddle.data_type.dense_vector(784))
label = paddle.layer.data(
name='label', type=paddle.data_type.integer_value(10))
# Here we can build the prediction network in different ways. Please
# choose one by uncommenting the corresponding line.
predict = softmax_regression(images)
#predict = multilayer_perceptron(images)
#predict = convolutional_neural_network(images)
cost = paddle.layer.classification_cost(input=predict, label=label)
parameters = paddle.parameters.create(cost)
optimizer = paddle.optimizer.Momentum(
learning_rate=0.1 / 128.0,
momentum=0.9,
regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer,
is_local=False,
pserver_spec="localhost:3000")
lists = []
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 1000 == 0:
print "Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
elif isinstance(event, paddle.event.EndPass):
result = trainer.test(reader=paddle.batch(
paddle.dataset.mnist.test(), batch_size=128))
print "Test with Pass %d, Cost %f, %s\n" % (
event.pass_id, result.cost, result.metrics)
lists.append((event.pass_id, result.cost,
result.metrics['classification_error_evaluator']))
trainer.train(
reader=paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=8192),
batch_size=128),
event_handler=event_handler,
num_passes=100)
# find the best pass
best = sorted(lists, key=lambda list: float(list[1]))[0]
print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1])
print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100)
test_creator = paddle.dataset.mnist.test()
test_data = []
for item in test_creator():
test_data.append((item[0], ))
if len(test_data) == 100:
break
# output is a softmax layer. It returns probabilities.
# Shape should be (100, 10)
probs = paddle.infer(
output_layer=predict, parameters=parameters, input=test_data)
print probs.shape
if __name__ == '__main__':
main()
import paddle.v2 as paddle
import paddle.v2.dataset.uci_housing as uci_housing
def main():
# init
paddle.init(use_gpu=False, trainer_count=1)
# network config
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x,
param_attr=paddle.attr.Param(name='w'),
size=1,
act=paddle.activation.Linear(),
bias_attr=paddle.attr.Param(name='b'))
y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
cost = paddle.layer.mse_cost(input=y_predict, label=y)
# create parameters
parameters = paddle.parameters.create(cost)
# create optimizer
optimizer = paddle.optimizer.Momentum(momentum=0)
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer,
is_local=False,
pserver_spec="localhost:3000")
# event_handler to print training and testing info
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "Pass %d, Batch %d, Cost %f" % (
event.pass_id, event.batch_id, event.cost)
if isinstance(event, paddle.event.EndPass):
if (event.pass_id + 1) % 10 == 0:
result = trainer.test(
reader=paddle.batch(
uci_housing.test(), batch_size=2),
feeding={'x': 0,
'y': 1})
print "Test %d, %.2f" % (event.pass_id, result.cost)
# training
trainer.train(
reader=paddle.batch(
paddle.reader.shuffle(
uci_housing.train(), buf_size=500),
batch_size=2),
feeding={'x': 0,
'y': 1},
event_handler=event_handler,
num_passes=30)
if __name__ == '__main__':
main()
......@@ -2,11 +2,11 @@ package pserver
import (
"hash/fnv"
"log"
"sort"
"time"
"github.com/PaddlePaddle/Paddle/go/pserver/internal/connection"
"github.com/PaddlePaddle/Paddle/go/connection"
log "github.com/sirupsen/logrus"
)
// TODO(helin): add RPC call retry logic
......@@ -47,7 +47,7 @@ func NewClient(l Lister, pserverNum int, sel Selector) *Client {
// monitorPservers monitors pserver addresses, and updates connection
// when the address changes.
func (c *Client) monitorPservers(l Lister, pserverNum int) {
knownServers := make([]Server, pserverNum)
lastServers := make([]Server, pserverNum)
ticker := time.NewTicker(10 * time.Second)
monitor := func() {
curServers := make([]Server, pserverNum)
......@@ -56,25 +56,37 @@ func (c *Client) monitorPservers(l Lister, pserverNum int) {
curServers[l.Index] = l
}
for i := range knownServers {
if knownServers[i].Addr != curServers[i].Addr {
err := c.pservers[i].Connect(curServers[i].Addr)
if err != nil {
log.Println(err)
for i := range lastServers {
if lastServers[i].Addr == curServers[i].Addr {
continue
}
// connect to addr failed, set
// to last known addr in order
// to retry next time.
curServers[i].Addr = knownServers[i].Addr
if curServers[i].Addr == "" {
err := c.pservers[i].Close()
if err != nil {
log.Errorln(err)
}
continue
}
err := c.pservers[i].Connect(curServers[i].Addr)
if err != nil {
log.Errorln(err)
// connect to addr failed, set
// to last known addr in order
// to retry next time.
curServers[i].Addr = lastServers[i].Addr
}
}
knownServers = curServers
lastServers = curServers
}
monitor()
for _ = range ticker.C {
for range ticker.C {
monitor()
}
}
......@@ -93,16 +105,14 @@ func (c *Client) BeginInitParams() bool {
// InitParam initializes the parameter on parameter servers.
func (c *Client) InitParam(paramWithConfigs ParameterWithConfig) error {
var dummy int
return c.pservers[c.partition(paramWithConfigs.Param.Name)].Call("Service.InitParam", paramWithConfigs, &dummy)
return c.pservers[c.partition(paramWithConfigs.Param.Name)].Call("Service.InitParam", paramWithConfigs, nil)
}
// FinishInitParams tells parameter servers client has sent all
// parameters to parameter servers as initialization.
func (c *Client) FinishInitParams() error {
for _, p := range c.pservers {
var dummy int
err := p.Call("Service.FinishInitParams", dummy, &dummy)
err := p.Call("Service.FinishInitParams", 0, nil)
if err != nil {
return err
}
......@@ -116,8 +126,7 @@ func (c *Client) SendGrads(grads []Gradient) error {
errCh := make(chan error, len(grads))
for _, g := range grads {
go func(g Gradient) {
var dummy int
err := c.pservers[c.partition(g.Name)].Call("Service.SendGrad", g, &dummy)
err := c.pservers[c.partition(g.Name)].Call("Service.SendGrad", g, nil)
errCh <- err
}(g)
}
......@@ -196,8 +205,7 @@ func (c *Client) Save(path string) error {
errCh := make(chan error, len(c.pservers))
for _, p := range c.pservers {
var dummy int
err := p.Call("Service.Save", path, &dummy)
err := p.Call("Service.Save", path, nil)
errCh <- err
}
......
......@@ -7,6 +7,7 @@ import (
"strconv"
"strings"
"testing"
"time"
"github.com/PaddlePaddle/Paddle/go/pserver"
)
......@@ -30,9 +31,12 @@ func init() {
port[i] = p
go func(l net.Listener) {
s := pserver.NewService()
s, err := pserver.NewService("", time.Second*5)
if err != nil {
panic(err)
}
server := rpc.NewServer()
err := server.Register(s)
err = server.Register(s)
if err != nil {
panic(err)
}
......@@ -117,7 +121,7 @@ func TestClientFull(t *testing.T) {
for i := range params {
if names[i] != params[i].Name {
t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", names[i], params[i])
t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", names[i], params[i].Name)
}
}
}
......@@ -32,7 +32,13 @@ int update_SGD(void* optimizer,
const void* gradient,
int num_bytes) {
SGD_optimizer* o = (SGD_optimizer*)optimizer;
// TODO
float* parameter = (float*)buffer;
float* grad = (float*)gradient;
int i;
for (i = 0; i < num_bytes / sizeof(float); ++i) {
parameter[i] -= o->learning_rate * grad[i];
}
return 0;
}
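For reference, the loop above applies the plain elementwise SGD update to the parameter buffer:

$$\theta_i \leftarrow \theta_i - \eta \, g_i,$$

where $\eta$ is `o->learning_rate` and $g_i$ is the $i$-th float stored in `gradient`.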
......
package pserver
import (
"context"
"errors"
"fmt"
"strconv"
"strings"
"sync"
"time"
"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
"github.com/coreos/etcd/clientv3"
"github.com/coreos/etcd/clientv3/concurrency"
log "github.com/sirupsen/logrus"
)
// ElementType is the type of elements of a Parameter.
type ElementType int
var ErrAlreadyInitialized = errors.New("pserver already initialized")
var ErrUninitialized = errors.New("pserver not fully initialized")
const (
AlreadyInitialized = "pserver already initialized"
Uninitialized = "pserver not fully initialized"
)
// Supported element types
const (
......@@ -22,6 +33,9 @@ const (
Float64
)
// PsDesired is the etcd path used to store the desired pserver count
const PsDesired = "/ps_desired"
// Parameter is a piece of data to sync with the parameter server.
type Parameter struct {
Name string
......@@ -45,21 +59,161 @@ type Service struct {
mu sync.Mutex
opt *optimizer
paramMap map[string]Parameter
etcdEndpoints string
etcdClient *clientv3.Client
// etcdTimeout is also used as the retry interval.
etcdTimeout time.Duration
// desired number of pservers in the job.
// assume desired will not change during one training job.
desired int
// FIXME: ensure GetExternalIP gets the correct ip for trainers to connect.
externalIP string
}
// NewService creates a new service.
func NewService() *Service {
s := &Service{opt: newOptimizer(sgd, 0.01)}
// NewService creates a new service. It bypasses etcd registration if no
// endpoints are specified.
func NewService(endpoints string, numPservers int, timeout time.Duration) (*Service, error) {
s := &Service{opt: newOptimizer(sgd, 0.005)}
s.paramMap = make(map[string]Parameter)
s.initialized = make(chan struct{})
return s
s.etcdEndpoints = endpoints
s.etcdTimeout = timeout
var err error
s.externalIP, err = networkhelper.GetExternalIP()
if err != nil {
return nil, err
}
if endpoints != "" {
// initialize connection to etcd, try
ep := strings.Split(s.etcdEndpoints, ",")
for {
cli, err := clientv3.New(clientv3.Config{
Endpoints: ep,
DialTimeout: s.etcdTimeout,
})
if err != nil {
log.Errorf("connect to etcd error: %v", err)
time.Sleep(s.etcdTimeout)
continue
}
s.etcdClient = cli
log.Debugf("inited client to %s", s.etcdEndpoints)
break
}
// Initialize /ps_desired using a transaction, since multiple pservers may try
// to write it at the same time.
for {
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
_, err := s.initDesiredPsercers(ctx, numPservers)
cancel()
if err != nil {
log.Warn(err)
time.Sleep(s.etcdTimeout)
continue
}
break
}
// TODO: when support for adding or removing pservers is implemented, /ps_desired
// will change; we will then need to watch the /ps_desired node for events. For now,
// just write it once at init time and read from it afterwards.
// wait for and set the initial value of s.desired
for {
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
resp, err := s.etcdClient.Get(ctx, PsDesired)
cancel()
if err != nil {
log.Errorf("getting %s error: %v", PsDesired, err)
time.Sleep(s.etcdTimeout)
continue
}
if len(resp.Kvs) != 0 {
s.desired, err = strconv.Atoi(string(resp.Kvs[0].Value))
if err != nil {
log.Errorf("value of %s invalid %v\n", PsDesired, err)
time.Sleep(s.etcdTimeout)
// NOTE: wait until the ps_desired value changes
continue
}
break
}
}
// try to register this pserver node on etcd
for {
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
_, err := s.registerPserverEtcd(ctx)
cancel()
if err != nil {
log.Warn(err)
time.Sleep(s.etcdTimeout)
continue
}
break
}
} // if endpoints != ""
// Bypass etcd registration if no endpoints specified
return s, nil
}
func (s *Service) initDesiredPsercers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) {
return concurrency.NewSTM(s.etcdClient, func(c concurrency.STM) error {
dsStr := c.Get(PsDesired)
if dsStr == "" {
c.Put(PsDesired, strconv.Itoa(numPservers))
}
return nil
}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
}
// registerPserverEtcd registers pserver node on etcd using transaction.
func (s *Service) registerPserverEtcd(ctx context.Context) (*clientv3.TxnResponse, error) {
return concurrency.NewSTM(s.etcdClient, func(c concurrency.STM) error {
registered := false
for i := 0; i < s.desired; i++ {
psKey := "/ps/" + strconv.Itoa(i)
log.Debugf("checking %s", psKey)
ps := c.Get(psKey)
log.Debugf("got value (%s) for key: %s", ps, psKey)
if ps == "" {
resp, err := s.etcdClient.Grant(context.TODO(), 5)
if err != nil {
log.Fatal(err)
}
// find the first id and write info
c.Put(psKey, s.externalIP, clientv3.WithLease(resp.ID))
log.Debugf("set pserver node %s with value %s", psKey, s.externalIP)
ch, kaerr := s.etcdClient.KeepAlive(context.TODO(), resp.ID)
if kaerr != nil {
log.Errorf("keepalive etcd node error: %v", kaerr)
return kaerr
}
// Eat the keep alive message so etcd
// will not expire the lease.
go func(ch <-chan *clientv3.LeaseKeepAliveResponse) {
ka := <-ch
log.Debugf("keepalive: %d\n", ka.TTL)
}(ch)
log.Debug("register finished")
registered = true
break
}
}
if registered {
return nil
}
return errors.New("not registerd, may due to already have enough pservers")
}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
}
// InitParam initializes a parameter.
func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) error {
select {
case <-s.initialized:
return ErrAlreadyInitialized
return errors.New(AlreadyInitialized)
default:
}
......@@ -80,7 +234,7 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) er
func (s *Service) FinishInitParams(dummy0 int, dummy1 *int) error {
select {
case <-s.initialized:
return ErrAlreadyInitialized
return errors.New(AlreadyInitialized)
default:
}
......@@ -94,7 +248,7 @@ func (s *Service) SendGrad(g Gradient, dummy *int) error {
select {
case <-s.initialized:
default:
return ErrUninitialized
return errors.New(Uninitialized)
}
s.mu.Lock()
......
......@@ -10,13 +10,15 @@ import (
)
func TestFull(t *testing.T) {
s := pserver.NewService()
s, err := pserver.NewService("", time.Second*5)
if err != nil {
t.Error(err)
}
var p pserver.Parameter
p.Name = "param_a"
p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
p.ElementType = pserver.Int32
var dummy int
err := s.InitParam(pserver.ParameterWithConfig{p, nil}, &dummy)
err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: nil}, nil)
if err != nil {
t.FailNow()
}
......@@ -25,12 +27,12 @@ func TestFull(t *testing.T) {
p1.Name = "param_b"
p1.Content = []byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
p1.ElementType = pserver.Float32
err = s.InitParam(pserver.ParameterWithConfig{p1, nil}, &dummy)
err = s.InitParam(pserver.ParameterWithConfig{Param: p1, Config: nil}, nil)
if err != nil {
t.FailNow()
}
err = s.FinishInitParams(0, &dummy)
err = s.FinishInitParams(0, nil)
if err != nil {
t.FailNow()
}
......@@ -46,11 +48,11 @@ func TestFull(t *testing.T) {
}
g1, g2 := pserver.Gradient(p1), pserver.Gradient(p)
err = s.SendGrad(g1, &dummy)
err = s.SendGrad(g1, nil)
if err != nil {
t.FailNow()
}
err = s.SendGrad(g2, &dummy)
err = s.SendGrad(g2, nil)
if err != nil {
t.FailNow()
......@@ -73,38 +75,43 @@ func TestFull(t *testing.T) {
}
func TestMultipleInit(t *testing.T) {
s := pserver.NewService()
var dummy int
err := s.FinishInitParams(0, &dummy)
s, err := pserver.NewService("", time.Second*5)
if err != nil {
t.Error(err)
}
err = s.FinishInitParams(0, nil)
if err != nil {
t.FailNow()
}
err = s.FinishInitParams(0, &dummy)
if err != pserver.ErrAlreadyInitialized {
err = s.FinishInitParams(0, nil)
if err.Error() != pserver.AlreadyInitialized {
t.FailNow()
}
}
func TestUninitialized(t *testing.T) {
s := pserver.NewService()
var dummy int
err := s.SendGrad(pserver.Gradient{}, &dummy)
if err != pserver.ErrUninitialized {
s, err := pserver.NewService("", time.Second*5)
err = s.SendGrad(pserver.Gradient{}, nil)
if err.Error() != pserver.Uninitialized {
t.FailNow()
}
}
func TestBlockUntilInitialized(t *testing.T) {
s := pserver.NewService()
s, err := pserver.NewService("", time.Second*5)
if err != nil {
t.Error(err)
}
ch := make(chan struct{}, 2)
errCh := make(chan error, 2)
var wg sync.WaitGroup
wg.Add(1)
go func() {
var param pserver.Parameter
err := s.GetParam("param_a", &param)
if err != nil {
t.FailNow()
errCh <- err
}
wg.Done()
ch <- struct{}{}
......@@ -112,10 +119,9 @@ func TestBlockUntilInitialized(t *testing.T) {
wg.Add(1)
go func() {
var dummy int
err := s.Save("", &dummy)
err := s.Save("", nil)
if err != nil {
t.FailNow()
errCh <- err
}
wg.Done()
ch <- struct{}{}
......@@ -127,6 +133,8 @@ func TestBlockUntilInitialized(t *testing.T) {
case <-ch:
// some function returned before initialization is completed.
t.FailNow()
case <-errCh:
t.FailNow()
default:
}
......@@ -134,13 +142,12 @@ func TestBlockUntilInitialized(t *testing.T) {
p.Name = "param_a"
p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
p.ElementType = pserver.Int32
var dummy int
err := s.InitParam(pserver.ParameterWithConfig{p, nil}, &dummy)
err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: nil}, nil)
if err != nil {
t.FailNow()
}
err = s.FinishInitParams(0, &dummy)
err = s.FinishInitParams(0, nil)
if err != nil {
t.FailNow()
}
......
package networkhelper
import (
"errors"
"net"
)
// GetExternalIP returns the IP address of a local network interface, not the
// loopback device.
func GetExternalIP() (string, error) {
ifaces, err := net.Interfaces()
if err != nil {
return "", err
}
for _, iface := range ifaces {
if iface.Flags&net.FlagUp == 0 {
continue // interface down
}
if iface.Flags&net.FlagLoopback != 0 {
continue // loopback interface
}
addrs, err := iface.Addrs()
if err != nil {
return "", err
}
for _, addr := range addrs {
var ip net.IP
switch v := addr.(type) {
case *net.IPNet:
ip = v.IP
case *net.IPAddr:
ip = v.IP
}
if ip == nil || ip.IsLoopback() {
continue
}
ip = ip.To4()
if ip == nil {
continue // not an ipv4 address
}
return ip.String(), nil
}
}
return "", errors.New("are you connected to the network?")
}
package networkhelper
import "testing"
func TestGetIP(t *testing.T) {
_, err := GetExternalIP()
if err != nil {
t.Errorf("GetExternalIP returns error : %v\n", err)
}
}
......@@ -8,6 +8,7 @@ add_subdirectory(gserver)
add_subdirectory(pserver)
add_subdirectory(trainer)
add_subdirectory(scripts)
add_subdirectory(optimizer)
add_subdirectory(strings)
# Do not build go directory until go cmake is working smoothly.
......@@ -19,8 +20,8 @@ find_package(Boost QUIET)
if(Boost_FOUND)
include_directories(${Boost_INCLUDE_DIRS})
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_subdirectory(majel)
add_subdirectory(platform)
add_subdirectory(framework)
endif()
if(WITH_C_API)
......
......@@ -16,7 +16,7 @@ set(API_HEADER
Internal.h)
add_library(paddle_api STATIC ${API_SOURCES})
add_dependencies(paddle_api gen_proto_cpp)
add_dependencies(paddle_api gen_proto_cpp paddle_trainer_lib)
INCLUDE(${SWIG_USE_FILE})
INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle)
......@@ -45,7 +45,7 @@ SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS
)
IF(APPLE)
SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load")
SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load -framework CoreFoundation -framework Security")
ELSE(APPLE)
SET(START_GROUP "-Xlinker -start-group")
SET(END_GROUP "-Xlinker -end-group")
......
......@@ -179,6 +179,7 @@ namespace std {
%newobject ParameterOptimizer::needSpecialTraversal;
%newobject ParameterUpdater::createLocalUpdater;
%newobject ParameterUpdater::createRemoteUpdater;
%newobject ParameterUpdater::createNewRemoteUpdater;
%feature("director") UpdateCallback;
%feature("autodoc", 1); // To generate method stub, for code hint in ide
......
......@@ -841,6 +841,9 @@ public:
static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
int passCount,
bool useSparseUpdater);
static ParameterUpdater* createNewRemoteUpdater(
OptimizationConfig* config,
const std::string pserverSpec) throw(UnsupportError);
~ParameterUpdater();
/**
......
......@@ -15,6 +15,9 @@ limitations under the License. */
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#ifndef PADDLE_WITHOUT_GOLANG
#include "paddle/trainer/NewRemoteParameterUpdater.h"
#endif
#include "paddle/trainer/RemoteParameterUpdater.h"
#include "paddle/trainer/ThreadParameterUpdater.h"
......@@ -28,6 +31,19 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater(
return updater;
}
ParameterUpdater *ParameterUpdater::createNewRemoteUpdater(
OptimizationConfig *config,
const std::string pserverSpec) throw(UnsupportError) {
#ifndef PADDLE_WITHOUT_GOLANG
auto updater = new ParameterUpdater();
updater->m->updater.reset(new paddle::NewRemoteParameterUpdater(
config->m->getConfig(), pserverSpec));
return updater;
#else
throw UnsupportError();
#endif
}
ParameterUpdater *ParameterUpdater::createRemoteUpdater(
OptimizationConfig *config, int passCount, bool useSparseUpdater) {
auto updater = new ParameterUpdater();
......
---
Language: Cpp
BasedOnStyle: Google
Standard: Cpp11
...
cc_library(ddim SRCS ddim.cc)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim)
cc_test(variable_test SRCS variable_test.cc)
#include "paddle/majel/ddim.h"
#include "paddle/framework/ddim.h"
namespace majel {
namespace paddle {
namespace framework {
///@cond HIDDEN
......@@ -66,7 +67,7 @@ DDim make_ddim(const std::vector<int>& dims) {
///@cond HIDDEN
// XXX For some reason, putting this in an anonymous namespace causes errors
class DynamicMutableIndexer : public boost::static_visitor<int&> {
public:
public:
DynamicMutableIndexer(int idx) : idx_(idx) {}
template <int D>
......@@ -74,12 +75,12 @@ public:
return dim[idx_];
}
private:
private:
int idx_;
};
class DynamicConstIndexer : public boost::static_visitor<int> {
public:
public:
DynamicConstIndexer(int idx) : idx_(idx) {}
template <int D>
......@@ -87,7 +88,7 @@ public:
return dim[idx_];
}
private:
private:
int idx_;
};
......@@ -213,10 +214,11 @@ struct DDimPrinter : boost::static_visitor<void> {
///\endcond
std::ostream& operator<<(std::ostream& os, const majel::DDim& ddim) {
std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
DDimPrinter printer(os);
boost::apply_visitor(printer, ddim);
return os;
}
} // namespace majel
} // namespace framework
} // namespace paddle
......@@ -5,20 +5,14 @@
#include <stdexcept>
#include <vector>
#include "paddle/majel/dim.h"
#include "paddle/framework/dim.h"
namespace majel {
namespace paddle {
namespace framework {
namespace {
typedef boost::variant<Dim<1>,
Dim<2>,
Dim<3>,
Dim<4>,
Dim<5>,
Dim<6>,
Dim<7>,
Dim<8>,
Dim<9>>
typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>,
Dim<8>, Dim<9>>
DDimVar;
}
......@@ -95,14 +89,15 @@ ssize_t product(const DDim& ddim);
int arity(const DDim& ddim);
std::ostream& operator<<(std::ostream&, const majel::DDim&);
std::ostream& operator<<(std::ostream&, const DDim&);
} // namespace majel
} // namespace framework
} // namespace paddle
namespace boost {
template <typename T>
T get(const majel::DDim& in) {
T get(const paddle::framework::DDim& in) {
return boost::get<T>(in.var);
}
......
//#include <stdexcept>
//#include <unittest/unittest.h>
#include <sstream>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/majel/ddim.h"
#include "paddle/framework/ddim.h"
TEST(DDim, Equality) {
// construct a DDim from an initialization list
majel::DDim ddim = majel::make_ddim({9, 1, 5});
paddle::framework::DDim ddim = paddle::framework::make_ddim({9, 1, 5});
EXPECT_EQ(ddim[0], 9);
EXPECT_EQ(ddim[1], 1);
EXPECT_EQ(ddim[2], 5);
// construct a DDim from a vector
std::vector<int> vec({9, 1, 5});
majel::DDim vddim = majel::make_ddim(vec);
paddle::framework::DDim vddim = paddle::framework::make_ddim(vec);
EXPECT_EQ(ddim[0], 9);
EXPECT_EQ(ddim[1], 1);
EXPECT_EQ(ddim[2], 5);
......@@ -23,43 +21,43 @@ TEST(DDim, Equality) {
// mutate a DDim
ddim[1] = 2;
EXPECT_EQ(ddim[1], 2);
majel::set(ddim, 0, 6);
EXPECT_EQ(majel::get(ddim, 0), 6);
paddle::framework::set(ddim, 0, 6);
EXPECT_EQ(paddle::framework::get(ddim, 0), 6);
// vectorize a DDim
std::vector<int> res_vec = majel::vectorize(vddim);
std::vector<int> res_vec = paddle::framework::vectorize(vddim);
EXPECT_EQ(res_vec[0], 9);
EXPECT_EQ(res_vec[1], 1);
EXPECT_EQ(res_vec[2], 5);
majel::Dim<3> d(3, 2, 1);
res_vec = majel::vectorize(majel::DDim(d));
paddle::framework::Dim<3> d(3, 2, 1);
res_vec = paddle::framework::vectorize(paddle::framework::DDim(d));
EXPECT_EQ(res_vec[0], 3);
EXPECT_EQ(res_vec[1], 2);
EXPECT_EQ(res_vec[2], 1);
// add two DDims
majel::DDim ddim_sum = ddim + vddim;
paddle::framework::DDim ddim_sum = ddim + vddim;
EXPECT_EQ(ddim_sum[0], 15);
EXPECT_EQ(ddim_sum[1], 3);
EXPECT_EQ(ddim_sum[2], 10);
// multiply two DDims
majel::DDim ddim_mul = ddim * vddim;
paddle::framework::DDim ddim_mul = ddim * vddim;
EXPECT_EQ(ddim_mul[0], 54);
EXPECT_EQ(ddim_mul[1], 2);
EXPECT_EQ(ddim_mul[2], 25);
// arity of a DDim
EXPECT_EQ(majel::arity(ddim), 3);
EXPECT_EQ(paddle::framework::arity(ddim), 3);
// product of a DDim
EXPECT_EQ(majel::product(vddim), 45);
EXPECT_EQ(paddle::framework::product(vddim), 45);
}
TEST(DDim, Print) {
// print a DDim
std::stringstream ss;
majel::DDim ddim = majel::make_ddim({2, 3, 4});
paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 3, 4});
ss << ddim;
EXPECT_EQ("2, 3, 4", ss.str());
}
......@@ -5,10 +5,11 @@
#include <stdexcept>
#include <type_traits>
#include "paddle/majel/detail/cuda_assert.h"
#include "paddle/majel/detail/hostdevice.h"
#include "paddle/platform/assert.h"
#include "paddle/platform/hostdevice.h"
namespace majel {
namespace paddle {
namespace framework {
// Statically sized, statically indexed dimension
template <int i>
......@@ -74,7 +75,7 @@ struct Dim<1> {
throw std::invalid_argument("Index out of range.");
}
#else
MAJEL_ASSERT(idx < size.head);
PADDLE_ASSERT(idx < size.head);
#endif
}
......@@ -131,7 +132,7 @@ HOSTDEVICE int& indexer(Dim<D>& dim, int idx) {
throw std::invalid_argument("Tried to access a negative dimension");
}
#else
MAJEL_ASSERT(idx >= 0);
PADDLE_ASSERT(idx >= 0);
#endif
if (idx == 0) {
return dim.head;
......@@ -146,7 +147,7 @@ HOSTDEVICE int& indexer<1>(Dim<1>& dim, int idx) {
throw std::invalid_argument("Invalid index");
}
#else
MAJEL_ASSERT(idx == 0);
PADDLE_ASSERT(idx == 0);
#endif
return dim.head;
}
......@@ -158,7 +159,7 @@ HOSTDEVICE int indexer(const Dim<D>& dim, int idx) {
throw std::invalid_argument("Tried to access a negative dimension");
}
#else
MAJEL_ASSERT(idx >= 0);
PADDLE_ASSERT(idx >= 0);
#endif
if (idx == 0) {
return dim.head;
......@@ -173,7 +174,7 @@ HOSTDEVICE int indexer<1>(const Dim<1>& dim, int idx) {
throw std::invalid_argument("Invalid index");
}
#else
MAJEL_ASSERT(idx == 0);
PADDLE_ASSERT(idx == 0);
#endif
return dim.head;
}
......@@ -411,7 +412,7 @@ HOSTDEVICE Dim<sizeof...(Args)> make_dim(Args... idxes) {
// XXX For some reason, overloading fails to resolve this correctly
template <int i>
typename std::enable_if<(i > 1), std::ostream&>::type operator<<(
std::ostream& os, const majel::Dim<i>& d) {
std::ostream& os, const Dim<i>& d) {
os << d.head << ", " << d.tail;
return os;
}
......@@ -420,7 +421,7 @@ typename std::enable_if<(i > 1), std::ostream&>::type operator<<(
// XXX I wish this could be an overload instead of a template
template <int i>
typename std::enable_if<(i == 1), std::ostream&>::type operator<<(
std::ostream& os, const majel::Dim<i>& d) {
std::ostream& os, const Dim<i>& d) {
os << d.head;
return os;
}
......@@ -448,4 +449,5 @@ HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) {
return result;
}
} // namespace majel
} // namespace framework
} // namespace paddle
#include <thrust/device_vector.h>
#include <sstream>
#include "paddle/framework/dim.h"
#include "gtest/gtest.h"
__global__ void test(paddle::framework::Dim<2>* o) {
o[0] = paddle::framework::make_dim(5, 6);
}
__global__ void dyn_idx_gpu(int* o) {
auto d = paddle::framework::make_dim(5, 6);
o[0] = d[1];
}
TEST(Dim, Equality) {
// construct a Dim on the CPU
auto a = paddle::framework::make_dim(3, 4);
EXPECT_EQ(paddle::framework::get<0>(a), 3);
EXPECT_EQ(paddle::framework::get<1>(a), 4);
// construct a Dim on the GPU
thrust::device_vector<paddle::framework::Dim<2>> t(2);
test<<<1,1>>>(thrust::raw_pointer_cast(t.data()));
a = t[0];
EXPECT_EQ(paddle::framework::get<0>(a), 5);
EXPECT_EQ(paddle::framework::get<1>(a), 6);
// linearization
auto b = paddle::framework::make_dim(7, 8);
EXPECT_EQ(paddle::framework::linearize(a, b), 83);
// product
EXPECT_EQ(paddle::framework::product(a), 30);
// mutate a Dim
paddle::framework::get<1>(b) = 10;
EXPECT_EQ(paddle::framework::get<0>(b), 7);
EXPECT_EQ(paddle::framework::get<1>(b), 10);
// dynamic access
paddle::framework::get(b, 0) = 8;
b[1] = 11;
EXPECT_EQ(paddle::framework::get<0>(b), 8);
EXPECT_EQ(paddle::framework::get<1>(b), 11);
EXPECT_EQ(paddle::framework::get(b, 0), 8);
EXPECT_EQ(b[1], 11);
// dynamic access on GPU
thrust::device_vector<int> r(1);
dyn_idx_gpu<<<1,1>>>(thrust::raw_pointer_cast(r.data()));
int res = r[0];
EXPECT_EQ(res, 6);
// ex_prefix_mul
paddle::framework::Dim<3> c = paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5));
EXPECT_EQ(paddle::framework::get<0>(c), 1);
EXPECT_EQ(paddle::framework::get<1>(c), 3);
EXPECT_EQ(paddle::framework::get<2>(c), 12);
// contiguous_strides
c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(10, 1, 10));
EXPECT_EQ(paddle::framework::get<0>(c), 1);
EXPECT_EQ(paddle::framework::get<1>(c), 0);
EXPECT_EQ(paddle::framework::get<2>(c), 10);
c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(10, 10, 1));
EXPECT_EQ(paddle::framework::get<0>(c), 1);
EXPECT_EQ(paddle::framework::get<1>(c), 10);
EXPECT_EQ(paddle::framework::get<2>(c), 0);
c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(1, 10, 10));
EXPECT_EQ(paddle::framework::get<0>(c), 0);
EXPECT_EQ(paddle::framework::get<1>(c), 1);
EXPECT_EQ(paddle::framework::get<2>(c), 10);
c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(2, 3, 4));
EXPECT_EQ(paddle::framework::get<0>(c), 1);
EXPECT_EQ(paddle::framework::get<1>(c), 2);
EXPECT_EQ(paddle::framework::get<2>(c), 6);
// generate from an index
auto size = paddle::framework::make_dim(4, 5, 2);
c = paddle::framework::Dim<3>(14, size);
EXPECT_EQ(paddle::framework::get<0>(c), 2);
EXPECT_EQ(paddle::framework::get<1>(c), 3);
EXPECT_EQ(paddle::framework::get<2>(c), 0);
c = paddle::framework::Dim<3>(25, size);
EXPECT_EQ(paddle::framework::get<0>(c), 1);
EXPECT_EQ(paddle::framework::get<1>(c), 1);
EXPECT_EQ(paddle::framework::get<2>(c), 1);
}
TEST(Dim, Bool) {
auto a = paddle::framework::make_dim(3, 4);
auto b = paddle::framework::make_dim(5, 6);
auto c = paddle::framework::make_dim(3, 4);
// in_bounds check
EXPECT_TRUE(paddle::framework::contained(a, b));
EXPECT_FALSE(paddle::framework::contained(b, a));
// comparison
EXPECT_TRUE(a == a);
EXPECT_FALSE(a == b);
EXPECT_TRUE(a == c);
// contiguous check
int x = 4, y = 5, z = 2;
paddle::framework::Dim<3> sizef(x, y, z);
paddle::framework::Dim<3> stridea(1, x, x*y);
paddle::framework::Dim<3> strideb(2, 2*x, 2*x*y);
paddle::framework::Dim<3> stridec(1, x, 2*x*y);
EXPECT_TRUE(paddle::framework::contiguous(sizef, stridea));
EXPECT_FALSE(paddle::framework::contiguous(sizef, strideb));
EXPECT_FALSE(paddle::framework::contiguous(sizef, stridec));
}
TEST(Dim, Print) {
{
std::stringstream ss;
auto a = paddle::framework::make_dim(2, 3);
ss << a;
EXPECT_EQ(ss.str(), "2, 3");
}
{
std::stringstream ss;
ss << paddle::framework::make_dim(8);
EXPECT_EQ(ss.str(), "8");
}
}
/*
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#pragma once
#include <memory>
#include <typeindex>
#include <typeinfo>
#include "paddle/platform/assert.h"
namespace paddle {
namespace framework {
class Variable {
public:
template <typename T>
const T& Get() const {
PADDLE_ASSERT(IsType<T>());
return *static_cast<const T*>(holder_->Ptr());
}
template <typename T>
T* GetMutable() {
if (!IsType<T>()) {
holder_.reset(new PlaceholderImpl<T>(new T()));
}
return static_cast<T*>(holder_->Ptr());
}
template <typename T>
bool IsType() const {
return holder_ != nullptr &&
std::type_index(typeid(T)) == std::type_index(holder_->Type());
}
private:
struct Placeholder {
virtual ~Placeholder() {}
virtual const std::type_info& Type() const = 0;
virtual void* Ptr() const = 0;
};
// Placeholder hides type T, so it doesn't appear as a template
// parameter of Variable.
template <typename T>
struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {}
virtual const std::type_info& Type() const { return type_; }
virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); }
std::unique_ptr<T> ptr_;
const std::type_info& type_;
};
std::unique_ptr<Placeholder>
holder_;  // points to a PlaceholderImpl object.
};
} // namespace framework
} // namespace paddle
# Design Doc: Variable
Variable is also known as a *blob* in MXNet and Caffe2. It is the input and output type of operators, and a neural network is a graph of operators.
## Requirements: Lazy Memory Allocation
For the flexibility of a DL system, a variable should be able to contain any typed value -- a tensor in most cases, but it could also be some integer IDs or a scope of other variables in the case of an RNN.
To use the minimum amount of memory, we'd like a variable to allocate memory only when it has to, i.e., lazy memory allocation. Let's take the following example:
```cpp
Variable vr, v1, v2;
Tensor* t1 = new Tensor();
Tensor* t2 = new Tensor();
Randomize(
/* malloc */ v1.GetMutable<Tensor>()->mutable_data<float16>(DDim(100,200)),
/* size */ t1->Size());
Randomize(
/* malloc */ v2.GetMutable<Tensor>()->mutable_data<float16>(DDim(200,300)),
/* size */ t2->Size());
Mult(
/*result*/ vr.GetMutable<Tensor>()->mutable_data<v1.Type()>(SizeOfMult(v1, v2)),
/*input1*/ v1.Get<Tensor>().data(),
/*input2*/ v2.Get<Tensor>().data());
```
We see that a variable holds nothing until `Variable::GetMutable<Tensor>()` allocates a tensor and puts it in the variable. Similarly, a tensor does not get its memory until `Tensor::mutable_data()` is called.
This syntax gives us lazy memory allocation when we call `Randomize` and `Mult`, the functions that mutate the variable, so it saves us some lines of C++ code.
## Implementation: Type Hiding
To make memory allocation lazy, we cannot assume that we know the type held by a variable at definition time. In other words, `class Variable` cannot be a template `template <T> class Variable`.
Because we don't know the type `T`, we cannot save a `T*` as `Variable`'s data member. Instead, we save an interface object `Placeholder`, which can return the pointer to the saved object via `Placeholder::Ptr()` as `void*`.
However, `Variable` still needs to know `T` so that it can `delete<T>(ptr)` and so that `Variable::Get` can check the expected type against the saved object's type.
We save `T` in `PlaceholderImpl`, the implementation of `Placeholder`. Please be aware that `PlaceholderImpl` is a class template and `T` is passed in as a template parameter.
Because `PlaceholderImpl` knows `T`, it can save and return `typeid(T)` for the type comparison in `Variable::Get` and `Variable::GetMutable`.
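As a minimal, self-contained sketch of the type-hiding idea (independent of the actual `Variable` code; `Holder` and `HolderImpl` are illustrative names, not part of the API), the runtime check boils down to comparing the `typeid(T)` captured by the templated implementation with the expected type:

```cpp
#include <cassert>
#include <memory>
#include <typeindex>
#include <typeinfo>

// Interface that erases the concrete type T.
struct Holder {
  virtual ~Holder() {}
  virtual const std::type_info& Type() const = 0;
};

// Templated implementation that remembers typeid(T).
template <typename T>
struct HolderImpl : Holder {
  const std::type_info& Type() const override { return typeid(T); }
};

int main() {
  std::unique_ptr<Holder> h(new HolderImpl<float>());
  // The check used by IsType<T>(): compare the erased type with the expected one.
  assert(std::type_index(h->Type()) == std::type_index(typeid(float)));
  assert(std::type_index(h->Type()) != std::type_index(typeid(int)));
  return 0;
}
```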
## Conclusion
The type-hiding technique utilizes C++ class templates, interfaces and derivation, and C++ RTTI (`typeid`). This combination saves us from defining something like `caffe2::TypeMeta`, which takes hundreds of lines of C++ code.
/*
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <memory>
#include <string>
#include "gtest/gtest.h"
#include "paddle/framework/variable.h"
TEST(Variable, GetMutable) {
using paddle::framework::Variable;
struct Tensor {
int content_;
};
std::unique_ptr<Variable> v(new Variable());
Tensor* t = v->GetMutable<Tensor>();
t->content_ = 1234;
const Tensor& tt = v->Get<Tensor>();
EXPECT_EQ(1234, tt.content_);
std::string* s = v->GetMutable<std::string>();
*s = "hello";
const std::string& ss = v->Get<std::string>();
EXPECT_EQ("hello", ss);
}
......@@ -14,8 +14,8 @@ add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
add_dependencies(paddle_function ${external_project_dependencies})
add_dependencies(paddle_function gen_proto_cpp)
if(WITH_GPU)
if(WITH_TESTING)
if(WITH_GPU)
# TODO:
# file(GLOB test_files . *OpTest.cpp)
# add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
......@@ -30,6 +30,8 @@ if(WITH_TESTING)
add_simple_unittest(CosSimOpTest)
add_simple_unittest(RowConvOpTest)
endif()
add_simple_unittest(ConvOpTest)
endif()
add_style_check_target(paddle_function ${h_files})
......
......@@ -28,7 +28,7 @@ void testMatrixProjectionForward(int context_start,
std::max(0, (int)(context_start + context_length - 1));
if (pad == 0) is_padding = false;
FunctionCompare test(
CpuGpuFuncCompare test(
"ContextProjectionForward",
FuncConfig()
.set("context_length", context_length)
......@@ -60,7 +60,7 @@ void testMatrixProjectionBackward(int context_start,
std::max(0, (int)(context_start + context_length - 1));
if (pad == 0) is_padding = false;
FunctionCompare test(
CpuGpuFuncCompare test(
"ContextProjectionBackward",
FuncConfig()
.set("context_length", context_length)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Function.h"
namespace paddle {
/*
* \brief Based on the ConvFunctionBase class, the forward calculation,
* backward input calculation and backward filter calculation
* of convolution operations can be implemented.
*
* Arguments of forward and backward calculation:
* 1. Forward calculation of convolution.
* inputs = {INPUT, FILTER}, outputs = {OUTPUT}
* The first and second input arguments are input image and filter data.
* The output argument is output image.
*
* 2. Backward input calculation of convolution.
* inputs = {OUTPUT_GRAD, FILTER}, outputs = {INPUT_GRAD}
* The first and second input arguments are output grad image
* and filter data.
* The output argument is input grad image.
*
* 3. Backward filter calculation of convolution.
* inputs = {OUTPUT_GRAD, INPUT}, outputs = {FILTER_GRAD}
* The first and second input arguments are output grad image
* and input image.
* The output argument is filter grad.
*
* Arguments format of input, filter and output:
* 1. Input image, output image, input image gradient, output image gradient
 *    are all in NCHW format, where N is the batch size, C is the number of channels,
 *    and H and W are the height and width of the image or image gradient.
*
* 2. The format of the filter data is MCHW, where M is the number of output
* image channels, C is the number of input image channels,
 *    and H and W are the height and width of the filter.
*
* If `groups` is greater than 1, the filter's data format should be GMCHW,
* where G is the `groups`, and G * M is the number of output image
* channels, G * C is the number of input image channels,
 *    and H and W are the height and width of the filter.
*/
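// A hypothetical shape example (illustrative only, not part of the original
// header), assuming the usual convolution output-size arithmetic:
//   groups_ == 1:
//     input  NCHW = {64, 3, 32, 32}, filter MCHW = {16, 3, 5, 5}
//     output NCHW = {64, 16, outH, outW},
//     with outH = (32 + 2 * paddingH() - 5) / strideH() + 1 (likewise for outW).
//   groups_ == 2:
//     filter GMCHW = {2, 8, 3, 5, 5} means the input has 2 * 3 = 6 channels
//     and the output has 2 * 8 = 16 channels (see checkShape below).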
class ConvFunctionBase : public FunctionBase {
public:
void init(const FuncConfig& config) override {
// function arguments
strides_ = config.get<std::vector<size_t>>("strides");
paddings_ = config.get<std::vector<size_t>>("paddings");
groups_ = config.get<size_t>("groups");
// number of inputs and outputs
numInputs_ = 2;
numOutputs_ = 1;
}
// input can be INPUT and INPUT_GRAD
// filter can be FILTER and FILTER_GRAD
// output can be OUTPUT and OUTPUT_GRAD
void checkShape(const TensorShape& input,
const TensorShape& filter,
const TensorShape& output) {
// The input and output arguments should be 4-dimensional.
CHECK_EQ(input.ndims(), (size_t)4);
CHECK_EQ(output.ndims(), (size_t)4);
// The batchSize of the input needs to be equal to
// the batchSize of the output.
CHECK_EQ(input[0], output[0]);
if (filter.ndims() == (size_t)4) {
// If the filter's dimension is 4, groups convolution is not supported.
CHECK_EQ(groups_, (size_t)1);
// The input and output channel dimensions are the second and first
// dimensions of the filter shape.
CHECK_EQ(input[1], filter[1]);
CHECK_EQ(output[1], filter[0]);
} else {
// filter argument should be 5-dimensional.
CHECK_EQ(filter.ndims(), (size_t)5);
// The first dimension of the filter is the size of the group
CHECK_EQ(filter[0], groups_);
// The input and output channel dimensions are the third and second
// dimensions of the filter shape.
CHECK_EQ(input[1], filter[2] * groups_);
CHECK_EQ(output[1], filter[1] * groups_);
}
}
protected:
size_t getFilterHeight(const TensorShape& filter) const {
return filter[filter.ndims() - 2];
}
size_t getFilterWidth(const TensorShape& filter) const {
return filter[filter.ndims() - 1];
}
std::vector<size_t> strides_;
std::vector<size_t> paddings_;
/// Group size, refer to grouped convolution in
/// Alex Krizhevsky's paper: when group=2, the first half of the
/// filters are only connected to the first half of the input channels,
/// and the second half only connected to the second half.
size_t groups_;
inline int strideH() const { return strides_[0]; }
inline int strideW() const { return strides_[1]; }
inline int paddingH() const { return paddings_[0]; }
inline int paddingW() const { return paddings_[1]; }
// A temporary memory in convolution calculation.
MemoryHandlePtr memory_;
template <DeviceType Device>
void resizeBuffer(size_t newSize) {
if (!memory_ || newSize * sizeof(real) > memory_->getAllocSize()) {
if (Device == DEVICE_TYPE_CPU) {
memory_ = std::make_shared<CpuMemoryHandle>(newSize * sizeof(real));
} else {
memory_ = std::make_shared<GpuMemoryHandle>(newSize * sizeof(real));
}
}
}
};
} // namespace paddle