diff --git a/.gitignore b/.gitignore
index 2b30f7938c8a1672acd0a14b7051af12c37889fb..275173b9677bffe028152fe8eadb3384329aeb5a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,6 @@ third_party/
 *~
 bazel-*
 third_party/
+
+# clion workspace.
+cmake-build-*
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9b138576fcc695408c4cc0a03d227da7d0c6f440..4cd8eb12f6b23c67e8fb22f43d57afd4a96770fd 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,8 +3,8 @@
     hooks:
     -   id: remove-crlf
         files: (?!.*third_party)^.*$ | (?!.*book)^.*$
--   repo: https://github.com/reyoung/mirrors-yapf.git
-    sha: v0.13.2
+-   repo: https://github.com/PaddlePaddle/mirrors-yapf.git
+    sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
     hooks:
     -   id: yapf
         files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
diff --git a/.travis.yml b/.travis.yml
index 865e21f046b7f3ac4bc3de09c1300a0a1d0337d4..44b755ee32d204c883f0d74e7ad0f78380918954 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -25,6 +25,7 @@ addons:
       - python2.7-dev
       - python-numpy
       - python-wheel
+      - libboost-dev
       - curl
       - swig
       - graphviz
@@ -47,7 +48,8 @@ before_install:
   - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
   # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python 
   # protobuf version.
-  - pip install numpy wheel 'protobuf==3.1' sphinx recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
+  - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
+  - pip install rarfile
   - |
     function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index aa4f1eaff9125f2ff11a6ef83e503acd56b79e21..79210d043648de5d493f0b998eeb885c993a6106 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,18 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 
+cmake_minimum_required(VERSION 3.0)
+
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
 
 include(system)
 
-if(ANDROID)
-    cmake_minimum_required(VERSION 3.7)
-else()
-    cmake_minimum_required(VERSION 3.0)
-endif()
-
-project(paddle CXX C)
+project(paddle CXX C Go)
 
 find_package(Sphinx)
 if(NOT CMAKE_CROSSCOMPILING)
@@ -96,6 +92,7 @@ include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
 include(external/any)       # download libn::any
 
+include(generic)            # simplify cmake module
 include(package)            # set paddle packages
 include(cpplint)            # set paddle c++ style
 include(ccache)             # set ccache for compilation
diff --git a/Dockerfile b/Dockerfile
index ad0d086d3c65b5901178aa681aa36ccc0ea0c246..b6f99ca539d077164c71d797a5ccda7b1b5c44ba 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -33,6 +33,15 @@ RUN apt-get update && \
     clang-3.8 llvm-3.8 libclang-3.8-dev && \
     apt-get clean -y
 
+# Install Go
+RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
+    tar -C /usr/local -xzf go.tgz && \
+    mkdir /root/gopath && \
+    rm go.tgz
+ENV GOROOT=/usr/local/go GOPATH=/root/gopath
+# PATH must be set in its own ENV instruction: ${GOROOT} is not yet defined inside the ENV instruction that declares it.
+ENV PATH=${PATH}:${GOROOT}/bin
+
 # git credential to skip password typing
 RUN git config --global credential.helper store
 
@@ -47,7 +56,8 @@ RUN pip install --upgrade pip && \
     pip install -U docopt PyYAML sphinx && \
     pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \
     pip install pre-commit 'requests==2.9.2' 'ipython==5.3.0' && \
-    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0'
+    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip install rarfile
 
 # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
diff --git a/Dockerfile.android b/Dockerfile.android
new file mode 100644
index 0000000000000000000000000000000000000000..fa24f6f06c4e76444c83bcf13fe312afdcb6c348
--- /dev/null
+++ b/Dockerfile.android
@@ -0,0 +1,38 @@
+FROM ubuntu:16.04
+MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+
+ARG UBUNTU_MIRROR
+RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
+
+ENV HOME=/root \
+    ANDROID_NDK_HOME=/opt/android-ndk-linux \
+    ANDROID_STANDALONE_TOOLCHAIN=/opt/android-toolchain-gcc
+
+RUN apt-get update && \
+    apt-get install -y \
+    git python-dev python-pip python-numpy \
+    wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
+    apt-get clean -y
+
+# git credential to skip password typing
+RUN git config --global credential.helper store
+
+# Fix locales to en_US.UTF-8
+RUN localedef -i en_US -f UTF-8 en_US.UTF-8
+
+RUN pip install --upgrade pip && \
+    pip install -U 'protobuf==3.1.0' && \
+    pip install -U wheel sphinx && \
+    pip install pre-commit
+
+# Android NDK
+RUN mkdir /opt/android-ndk-tmp && \
+    cd /opt/android-ndk-tmp && \
+    wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \
+    unzip -q android-ndk-r14b-linux-x86_64.zip && \
+    mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
+    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-21 --install-dir=${ANDROID_STANDALONE_TOOLCHAIN} && \
+    rm -rf /opt/android-ndk-tmp && \
+    rm -rf ${ANDROID_NDK_HOME}
+
+CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]
diff --git a/README.md b/README.md
index bcc24b84128df282a2e3f0bc62aafe1ffe172338..fa16cc3cf2ef9c1200a19e03192c94c65fc08679 100644
--- a/README.md
+++ b/README.md
@@ -70,7 +70,7 @@ before looking into the
 We provide [English](http://www.paddlepaddle.org/develop/doc/) and
 [Chinese](http://www.paddlepaddle.org/doc_cn/) documentation.
 
-- [Deep Learning 101](http://book.paddlepaddle.org/index.en.html)
+- [Deep Learning 101](http://book.paddlepaddle.org/index.html)
 
   You might want to start from the this online interactive book that can run in Jupyter Notebook.
 
diff --git a/RELEASE.cn.md b/RELEASE.cn.md
old mode 100755
new mode 100644
diff --git a/cmake/CMakeDetermineGoCompiler.cmake b/cmake/CMakeDetermineGoCompiler.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..abf0a00c5e99e4201dede36f13200cfc9c151ad3
--- /dev/null
+++ b/cmake/CMakeDetermineGoCompiler.cmake
@@ -0,0 +1,46 @@
+# Locate the Go compiler: prefer $GO_COMPILER, otherwise search for "go".
+if(NOT CMAKE_Go_COMPILER)
+  # Quoted so that an unset/empty GO_COMPILER still yields a real argument.
+  if(NOT "$ENV{GO_COMPILER}" STREQUAL "")
+    get_filename_component(CMAKE_Go_COMPILER_INIT "$ENV{GO_COMPILER}" PROGRAM PROGRAM_ARGS CMAKE_Go_FLAGS_ENV_INIT)
+
+    if(CMAKE_Go_FLAGS_ENV_INIT)
+      set(CMAKE_Go_COMPILER_ARG1 "${CMAKE_Go_FLAGS_ENV_INIT}" CACHE STRING "First argument to Go compiler")
+    endif()
+
+    if(NOT EXISTS "${CMAKE_Go_COMPILER_INIT}")
+      message(SEND_ERROR "Could not find compiler set in environment variable GO_COMPILER:\n$ENV{GO_COMPILER}.")
+    endif()
+  endif()
+
+  # Extra directories handed to find_program() through its PATHS option.
+  set(Go_BIN_PATH
+    $ENV{GOPATH}
+    $ENV{GOROOT}
+    $ENV{GOROOT}/bin
+    $ENV{GO_COMPILER}
+    /usr/bin
+    /usr/local/bin
+    )
+
+  if(CMAKE_Go_COMPILER_INIT)
+    set(CMAKE_Go_COMPILER ${CMAKE_Go_COMPILER_INIT} CACHE PATH "Go Compiler")
+  else()
+    find_program(CMAKE_Go_COMPILER NAMES go PATHS ${Go_BIN_PATH})
+    if(CMAKE_Go_COMPILER)
+      # execute_process() replaces exec_program(), deprecated since CMake 3.0.
+      execute_process(COMMAND "${CMAKE_Go_COMPILER}" version
+        OUTPUT_VARIABLE GOLANG_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE)
+      string(REGEX MATCH "go[0-9]+[.0-9]*[ /A-Za-z0-9]*" VERSION "${GOLANG_VERSION}")
+      message(STATUS "The Golang compiler identification is ${VERSION}")
+      message(STATUS "Check for working Golang compiler: ${CMAKE_Go_COMPILER}")
+    endif()
+  endif()
+endif()
+
+mark_as_advanced(CMAKE_Go_COMPILER)
+
+# Resolve the template next to this module, not via the including directory.
+configure_file(${CMAKE_CURRENT_LIST_DIR}/CMakeGoCompiler.cmake.in
+  ${CMAKE_PLATFORM_INFO_DIR}/CMakeGoCompiler.cmake @ONLY)
+set(CMAKE_Go_COMPILER_ENV_VAR "GO_COMPILER")
diff --git a/cmake/CMakeGoCompiler.cmake.in b/cmake/CMakeGoCompiler.cmake.in
new file mode 100644
index 0000000000000000000000000000000000000000..a71f08e064656fbaad8cfa77aea6f216515712ef
--- /dev/null
+++ b/cmake/CMakeGoCompiler.cmake.in
@@ -0,0 +1,8 @@
+set(CMAKE_Go_COMPILER "@CMAKE_Go_COMPILER@")
+set(CMAKE_Go_COMPILER_LOADED 1)
+
+set(CMAKE_Go_SOURCE_FILE_EXTENSIONS go)
+set(CMAKE_Go_LINKER_PREFERENCE 40)
+set(CMAKE_Go_OUTPUT_EXTENSION .o)
+set(CMAKE_Go_OUTPUT_EXTENSION_REPLACE 1)
+set(CMAKE_Go_COMPILER_ENV_VAR "GO_COMPILER")
diff --git a/cmake/CMakeGoInformation.cmake b/cmake/CMakeGoInformation.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..ba51ac93fcd429478f324b66bd5129d94ea2a8f4
--- /dev/null
+++ b/cmake/CMakeGoInformation.cmake
@@ -0,0 +1,7 @@
+# Rule templates invoke the detected compiler, not whichever "go" is on PATH.
+if(NOT CMAKE_Go_COMPILE_OBJECT)
+  set(CMAKE_Go_COMPILE_OBJECT "<CMAKE_Go_COMPILER> tool compile -l -N -o <OBJECT> <SOURCE> ")
+endif()
+if(NOT CMAKE_Go_LINK_EXECUTABLE)
+  set(CMAKE_Go_LINK_EXECUTABLE "<CMAKE_Go_COMPILER> tool link -o <TARGET> <OBJECTS>  ")
+endif()
diff --git a/cmake/CMakeTestGoCompiler.cmake b/cmake/CMakeTestGoCompiler.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..b9891b015baced05b51e34dba562fd98a84fe14c
--- /dev/null
+++ b/cmake/CMakeTestGoCompiler.cmake
@@ -0,0 +1 @@
+set(CMAKE_Go_COMPILER_WORKS 1 CACHE INTERNAL "")
diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake
index 02a5c0b2c9be782c459a255c6ffd6ba6441f2693..48f705818b70c92adef107fd3c973ae1ab3d34bb 100644
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -59,7 +59,7 @@ macro(add_style_check_target TARGET_NAME)
                                 "--filter=${STYLE_FILTER}"
                                 "--write-success=${CUR_GEN}" ${filename}
                     DEPENDS ${filename}
-                    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
+                    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
             endif()
         endforeach()
     endif()
diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..9724c16122ab2e6be55864c8716698c9b9d7c3f0
--- /dev/null
+++ b/cmake/cross_compiling/android.cmake
@@ -0,0 +1,191 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This is a toolchain file for cross-compiling for Android, and the
+# configuration refers to the open-source repository:
+#     https://github.com/taka-no-me/android-cmake
+# Most of the variables are compatible with those used in
+#     https://developer.android.com/ndk/guides/cmake.html
+# The supported variables are listed below:
+# 
+# ANDROID_STANDALONE_TOOLCHAIN
+# ANDROID_ABI
+# ANDROID_NATIVE_API_LEVEL
+# ANDROID_ARM_MODE
+# ANDROID_ARM_NEON
+#
+# For CMake >= 3.7.0, all the settings will be delivered to CMake system
+# variables to let CMake do the cross-compiling configurations itself.
+# More detail of cross-compiling settings
+#     https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html
+
+IF(NOT ANDROID)
+    return()
+ENDIF()
+
+# Check for an Android standalone toolchain; default to the environment value.
+IF(NOT DEFINED ANDROID_STANDALONE_TOOLCHAIN)
+    SET(ANDROID_STANDALONE_TOOLCHAIN $ENV{ANDROID_STANDALONE_TOOLCHAIN}
+        CACHE PATH "Folder holds the standalone toolchain of Android NDK")
+ENDIF()
+IF(NOT ANDROID_STANDALONE_TOOLCHAIN)
+    MESSAGE(WARNING "It is recommended to set ANDROID_STANDALONE_TOOLCHAIN to "
+            "use a standalone toolchain.\n"
+            "To cross-compile for Android, you need to:\n"
+            "1. Download an Android NDK from"
+            " https://developer.android.com/ndk/downloads/index.html\n"
+            "2. Setup a standalone toolchain"
+            " https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn\n")
+ENDIF()
+
+IF(NOT DEFINED CMAKE_SYSTEM_VERSION AND ANDROID_NATIVE_API_LEVEL)
+    IF(ANDROID_NATIVE_API_LEVEL MATCHES "^android-[0-9]+$")
+        STRING(REPLACE "android-" "" CMAKE_SYSTEM_VERSION "${CMAKE_MATCH_0}")
+    ELSEIF(ANDROID_NATIVE_API_LEVEL MATCHES "^[0-9]+$")
+        SET(CMAKE_SYSTEM_VERSION ${ANDROID_NATIVE_API_LEVEL})
+    ENDIF()
+ENDIF()
+
+IF(NOT DEFINED ANDROID_ABI)
+    SET(ANDROID_ABI "armeabi-v7a")
+ENDIF()
+
+IF(NOT DEFINED ANDROID_ARM_MODE)
+    SET(ANDROID_ARM_MODE ON)
+ENDIF()
+IF(ANDROID_ARM_MODE)
+    SET(ANDROID_ARM_MODE_NAME "arm")
+ELSE(ANDROID_ARM_MODE)
+    SET(ANDROID_ARM_MODE_NAME "thumb")
+ENDIF(ANDROID_ARM_MODE)
+
+IF(NOT DEFINED ANDROID_ARM_NEON)
+    SET(ANDROID_ARM_NEON ON)
+ENDIF()
+
+IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
+    IF("${CMAKE_VERSION}" VERSION_LESS "3.1.0")
+        SET(CMAKE_SYSTEM_NAME "Linux")
+    ENDIF()
+    MESSAGE(WARNING "It is recommended to use CMake >= 3.7.0 (current version: "
+            "${CMAKE_VERSION}), when cross-compiling for Android.")
+
+    IF(ANDROID_STANDALONE_TOOLCHAIN)
+        SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot")
+
+        IF(NOT CMAKE_SYSTEM_VERSION)
+            SET(ANDROID_STANDALONE_TOOLCHAIN_API "")
+            SET(ANDROID_API_LEVEL_H_REGEX "^[\t ]*#[\t ]*define[\t ]+__ANDROID_API__[\t ]+([0-9]+)")
+            FILE(STRINGS "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot/usr/include/android/api-level.h"
+                ANDROID_API_LEVEL_H_CONTENT REGEX "${ANDROID_API_LEVEL_H_REGEX}")
+            IF(ANDROID_API_LEVEL_H_CONTENT MATCHES "${ANDROID_API_LEVEL_H_REGEX}")
+                SET(ANDROID_STANDALONE_TOOLCHAIN_API "${CMAKE_MATCH_1}")
+            ENDIF()
+            SET(CMAKE_SYSTEM_VERSION ${ANDROID_STANDALONE_TOOLCHAIN_API})
+        ENDIF()
+
+        # Toolchain
+        SET(ANDROID_TOOLCHAIN "gcc")
+        SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN})
+        IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+            SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi)
+            IF(ANDROID_ABI STREQUAL "armeabi")
+                SET(CMAKE_SYSTEM_PROCESSOR armv5te)
+            ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
+                SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
+            ENDIF()
+        ENDIF()
+        SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
+    ENDIF()
+
+    # C compiler
+    IF(NOT CMAKE_C_COMPILER)
+        SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}gcc")
+    ELSE()
+        GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
+    ENDIF()
+    IF(NOT EXISTS ${ANDROID_C_COMPILER})
+        MESSAGE(FATAL_ERROR "Cannot find C compiler: ${ANDROID_C_COMPILER}")
+    ENDIF()
+
+    # CXX compiler
+    IF(NOT CMAKE_CXX_COMPILER)
+        SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}g++")
+    ELSE()
+        GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
+    ENDIF()
+    IF(NOT EXISTS ${ANDROID_CXX_COMPILER})
+        MESSAGE(FATAL_ERROR "Cannot find CXX compiler: ${ANDROID_CXX_COMPILER}")
+    ENDIF()
+
+    SET(CMAKE_C_COMPILER ${ANDROID_C_COMPILER} CACHE PATH "C compiler" FORCE)
+    SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
+
+    # Toolchain and ABI specific flags.
+    SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections -finline-limit=64")
+    SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections")
+
+    IF(ANDROID_ABI STREQUAL "armeabi")
+        LIST(APPEND ANDROID_COMPILER_FLAGS
+             -march=armv5te
+             -mtune=xscale
+             -msoft-float)
+    ENDIF()
+    IF(ANDROID_ABI STREQUAL "armeabi-v7a")
+        LIST(APPEND ANDROID_COMPILER_FLAGS
+             -march=armv7-a
+             -mfloat-abi=softfp)
+        IF(ANDROID_ARM_NEON)
+            LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=neon)
+        ELSE()
+            LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16)
+        ENDIF()
+        LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8)
+    ENDIF()
+
+    IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+        IF(ANDROID_ARM_MODE)
+            LIST(APPEND ANDROID_COMPILER_FLAGS -marm)
+        ELSE()
+            LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb)
+        ENDIF()
+    ENDIF()
+
+    STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}")
+    STRING(REPLACE ";" " " ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}")
+
+    SET(CMAKE_C_FLAGS "${ANDROID_COMPILER_FLAGS} ${CMAKE_C_FLAGS}"
+        CACHE STRING "C flags")
+    SET(CMAKE_CXX_FLAGS "${ANDROID_COMPILER_FLAGS} ${CMAKE_CXX_FLAGS}"
+        CACHE STRING "CXX flags")
+    SET(CMAKE_SHARED_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}"
+        CACHE STRING "shared linker flags")
+
+    SET(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
+    SET(CMAKE_EXE_LINKER_FLAGS "-pie -fPIE ${ANDROID_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}"
+        CACHE STRING "executable linker flags")
+
+    MESSAGE(STATUS "Android: Targeting API '${CMAKE_SYSTEM_VERSION}' "
+            "with architecture '${ANDROID_ARM_MODE_NAME}', "
+            "ABI '${ANDROID_ABI}', and processor '${CMAKE_SYSTEM_PROCESSOR}'")
+    MESSAGE(STATUS "System CMAKE_C_FLAGS: " ${CMAKE_C_FLAGS})
+    MESSAGE(STATUS "System CMAKE_CXX_FLAGS: " ${CMAKE_CXX_FLAGS})
+ELSE()
+    IF(ANDROID_STANDALONE_TOOLCHAIN)
+        SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN})
+    ENDIF()
+    SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI})
+    SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE})
+    SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON})
+ENDIF()
diff --git a/cmake/cross_compiling/host.cmake b/cmake/cross_compiling/host.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..14c35266ec60b439aaef30e5e4e0540c534160ae
--- /dev/null
+++ b/cmake/cross_compiling/host.cmake
@@ -0,0 +1,49 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Find the host C compiler: -DHOST_C_COMPILER, then $CC, then plain "cc".
+IF(HOST_C_COMPILER)
+    SET(HOST_C_COMPILER_NAME ${HOST_C_COMPILER})
+ELSEIF(NOT "$ENV{CC}" STREQUAL "")
+    SET(HOST_C_COMPILER_NAME $ENV{CC})
+ELSE()
+    SET(HOST_C_COMPILER_NAME cc)
+ENDIF()
+
+GET_FILENAME_COMPONENT(HOST_C_COMPILER_PATH ${HOST_C_COMPILER_NAME} PROGRAM)
+IF(NOT HOST_C_COMPILER_PATH OR NOT EXISTS "${HOST_C_COMPILER_PATH}")
+    MESSAGE(FATAL_ERROR "Cannot find host C compiler, set host C compiler:\n"
+            "\tcmake .. -DHOST_C_COMPILER=...")
+ENDIF()
+
+# Find the host CXX compiler: -DHOST_CXX_COMPILER, then $CXX, then "c++".
+IF(HOST_CXX_COMPILER)
+    SET(HOST_CXX_COMPILER_NAME ${HOST_CXX_COMPILER})
+ELSEIF(NOT "$ENV{CXX}" STREQUAL "")
+    SET(HOST_CXX_COMPILER_NAME $ENV{CXX})
+ELSE()
+    SET(HOST_CXX_COMPILER_NAME c++)
+ENDIF()
+
+GET_FILENAME_COMPONENT(HOST_CXX_COMPILER_PATH ${HOST_CXX_COMPILER_NAME} PROGRAM)
+IF(NOT HOST_CXX_COMPILER_PATH OR NOT EXISTS "${HOST_CXX_COMPILER_PATH}")
+    MESSAGE(FATAL_ERROR "Cannot find host CXX compiler, set host CXX compiler:\n"
+            "\tcmake .. -DHOST_CXX_COMPILER=...")
+ENDIF()
+
+SET(HOST_C_COMPILER ${HOST_C_COMPILER_PATH} CACHE PATH "Host C compiler")
+SET(HOST_CXX_COMPILER ${HOST_CXX_COMPILER_PATH} CACHE PATH "Host CXX compiler")
+
+MESSAGE(STATUS "Found host C compiler: " ${HOST_C_COMPILER})
+MESSAGE(STATUS "Found host CXX compiler: " ${HOST_CXX_COMPILER})
diff --git a/cmake/cross_compiling/raspberry_pi.cmake b/cmake/cross_compiling/raspberry_pi.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..817b39f6833e37c340d4ee465048480cfc3db151
--- /dev/null
+++ b/cmake/cross_compiling/raspberry_pi.cmake
@@ -0,0 +1,84 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This is a toolchain file for cross-compiling for Raspberry Pi.
+# The supported variables are listed below:
+#
+# RPI_TOOLCHAIN
+# RPI_ARM_NEON
+#
+# Also you can set CMAKE_C/CXX_COMPILER yourself, through cmake arguments.
+
+IF(NOT RPI)
+    return()
+ENDIF()
+ 
+SET(CMAKE_SYSTEM_NAME Linux)
+SET(CMAKE_SYSTEM_VERSION 1)
+SET(CMAKE_SYSTEM_PROCESSOR arm)
+
+# Check for a Raspberry Pi toolchain; default to the environment value.
+IF(NOT DEFINED RPI_TOOLCHAIN)
+    SET(RPI_TOOLCHAIN $ENV{RPI_TOOLCHAIN}
+        CACHE PATH "Folder holds the toolchain of Raspberry Pi")
+ENDIF()
+IF(NOT RPI_TOOLCHAIN)
+    MESSAGE(WARNING "It is recommended to set RPI_TOOLCHAIN to use toolchain.\n"
+            "To cross-compile for Raspberry Pi, you need to download the tools using:\n"
+            " git clone https://github.com/raspberrypi/tools\n")
+ENDIF()
+
+IF(NOT DEFINED RPI_ARM_NEON)
+    SET(RPI_ARM_NEON ON)
+ENDIF()
+
+IF(RPI_TOOLCHAIN)
+    SET(RPI_TOOLCHAIN_ROOT ${RPI_TOOLCHAIN})
+    IF(RPI_TOOLCHAIN_ROOT MATCHES "gcc-linaro-arm-linux-gnueabihf-raspbian(-x64)?$")
+        # gcc-linaro-arm-linux-gnueabihf-raspbian
+        # gcc-linaro-arm-linux-gnueabihf-raspbian-x64
+        SET(RPI_TOOLCHAIN_NAME arm-linux-gnueabihf)
+    ENDIF()
+    SET(RPI_TOOLCHAIN_PREFIX "${RPI_TOOLCHAIN_ROOT}/bin/${RPI_TOOLCHAIN_NAME}-")
+ENDIF()
+
+# C compiler
+IF(NOT CMAKE_C_COMPILER)
+    SET(RPI_C_COMPILER "${RPI_TOOLCHAIN_PREFIX}gcc")
+ELSE()
+    GET_FILENAME_COMPONENT(RPI_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
+ENDIF()
+IF(NOT EXISTS ${RPI_C_COMPILER})
+    MESSAGE(FATAL_ERROR "Cannot find C compiler: ${RPI_C_COMPILER}")
+ENDIF()
+
+# CXX compiler
+IF(NOT CMAKE_CXX_COMPILER)
+    SET(RPI_CXX_COMPILER "${RPI_TOOLCHAIN_PREFIX}g++")
+ELSE()
+    GET_FILENAME_COMPONENT(RPI_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
+ENDIF()
+IF(NOT EXISTS ${RPI_CXX_COMPILER})
+    MESSAGE(FATAL_ERROR "Cannot find CXX compiler: ${RPI_CXX_COMPILER}")
+ENDIF()
+
+SET(CMAKE_C_COMPILER ${RPI_C_COMPILER} CACHE PATH "C compiler" FORCE)
+SET(CMAKE_CXX_COMPILER ${RPI_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
+
+IF(RPI_ARM_NEON)
+    SET(RPI_C_FLAGS "${RPI_C_FLAGS} -mfpu=neon")
+ENDIF()
+
+SET(CMAKE_C_FLAGS "${RPI_C_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
+SET(CMAKE_CXX_FLAGS "${RPI_C_FLAGS} ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index af9be86961833dcd62371227165d411a3b61d79e..92dce20c698acb7257321bf50c569331a13b106b 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -11,11 +11,23 @@ find_path(CUDNN_INCLUDE_DIR cudnn.h
 
 get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
 
+# Determine HOST_ARCH (host CPU architecture), preferring CMake's own value.
+if(NOT CMAKE_HOST_SYSTEM_PROCESSOR)  # test the variable, not its expanded value
+    execute_process(COMMAND uname -m
+        OUTPUT_VARIABLE HOST_ARCH OUTPUT_STRIP_TRAILING_WHITESPACE
+        RESULT_VARIABLE UNAME_RESULT)
+    if(UNAME_RESULT)  # non-zero exit or launch error: fall back to x86_64
+        set(HOST_ARCH "x86_64")
+    endif()
+else()
+    set(HOST_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR})
+endif()
+
 list(APPEND CUDNN_CHECK_LIBRARY_DIRS
     ${CUDNN_ROOT}
     ${CUDNN_ROOT}/lib64
     ${CUDNN_ROOT}/lib
-    ${CUDNN_ROOT}/lib/x86_64-linux-gnu
+    ${CUDNN_ROOT}/lib/${HOST_ARCH}-linux-gnu
     $ENV{CUDNN_ROOT}
     $ENV{CUDNN_ROOT}/lib64
     $ENV{CUDNN_ROOT}/lib
diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake
index 8116f235d535917c03deb646ff4ec083a0cdadc7..62eea42692b4191e53d0bbb0805786fd15ac7944 100644
--- a/cmake/external/any.cmake
+++ b/cmake/external/any.cmake
@@ -18,3 +18,4 @@ ExternalProject_Add(
 )
 
 add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE)
+LIST(APPEND external_project_dependencies linb_any)
\ No newline at end of file
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index 0afb3ab9af48046af01f03838eefa0bd2fcb2821..a0d0a892c4b3cc3743ac725f3cd90444f18abf34 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -26,7 +26,7 @@ ENDIF(WIN32)
 INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
 
 ExternalProject_Add(
-    gflags
+    extern_gflags
     ${EXTERNAL_PROJECT_LOG_ARGS}
     GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
     PREFIX          ${GFLAGS_SOURCES_DIR}
@@ -44,4 +44,8 @@ ExternalProject_Add(
                      -DCMAKE_BUILD_TYPE:STRING=Release
 )
 
+ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
+ADD_DEPENDENCIES(gflags extern_gflags)
+
 LIST(APPEND external_project_dependencies gflags)
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index 4a9e2ecc6bbe74c5856a55fb0c982777d7ac25b7..b70e94a170f17cc61f61673609e6eb941662ea62 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -27,7 +27,7 @@ ENDIF(WIN32)
 INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
 
 ExternalProject_Add(
-    glog
+    extern_glog
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS gflags
     GIT_REPOSITORY  "https://github.com/google/glog.git"
@@ -48,4 +48,8 @@ ExternalProject_Add(
                      -DCMAKE_BUILD_TYPE:STRING=Release
 )
 
+ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
+ADD_DEPENDENCIES(glog extern_glog)
+
 LIST(APPEND external_project_dependencies glog)
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index 49c7d71443cda700a14af6be65ff6658eec7229f..77e06e983e9f8bfaf6320e3c67b85b692ed877fc 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -35,7 +35,7 @@ IF(WITH_TESTING)
     ENDIF(WIN32)
 
     ExternalProject_Add(
-        gtest
+        extern_gtest
         ${EXTERNAL_PROJECT_LOG_ARGS}
         GIT_REPOSITORY  "https://github.com/google/googletest.git"
         GIT_TAG         "release-1.8.0"
@@ -55,5 +55,14 @@ IF(WITH_TESTING)
                          -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                          -DCMAKE_BUILD_TYPE:STRING=Release
     )
-    LIST(APPEND external_project_dependencies gtest)
+
+    ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET gtest PROPERTY IMPORTED_LOCATION ${GTEST_LIBRARIES})
+    ADD_DEPENDENCIES(gtest extern_gtest)
+
+    ADD_LIBRARY(gtest_main STATIC IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES})
+    ADD_DEPENDENCIES(gtest_main extern_gtest)
+
+    LIST(APPEND external_project_dependencies gtest gtest_main)
 ENDIF(WITH_TESTING)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 18ac74aa6f7531c4001fe91960f8332619c99342..cb67793cf974cb8cdd0779227e8642cf7437f7fb 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -21,26 +21,46 @@ IF(NOT ${CBLAS_FOUND})
     SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
     SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
 
-    IF(WIN32)
-        SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/openblas.lib" CACHE FILEPATH "openblas library." FORCE)
-    ELSE(WIN32)
-        SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/libopenblas.a" CACHE FILEPATH "openblas library" FORCE)
-    ENDIF(WIN32)
+    SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/${LIBRARY_PREFIX}openblas${STATIC_LIBRARY_SUFFIX}"
+        CACHE FILEPATH "openblas library." FORCE)
+
+    SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1)
+
+    IF(ANDROID)
+        # arm_soft_fp_abi branch of OpenBLAS to support softfp
+        #   https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi
+        SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
+        SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0 libs)
+    ELSEIF(RPI)
+        # use hardfp
+        SET(OPENBLAS_COMMIT "v0.2.19")
+        SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0 libs)
+    ELSE()
+        SET(OPENBLAS_COMMIT "v0.2.19")
+        SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 libs NUM_THREADS=64)
+    ENDIF()
 
     ExternalProject_Add(
-        openblas
+        extern_openblas
         ${EXTERNAL_PROJECT_LOG_ARGS}
         GIT_REPOSITORY      https://github.com/xianyi/OpenBLAS.git
-        GIT_TAG             v0.2.19
+        GIT_TAG             ${OPENBLAS_COMMIT}
         PREFIX              ${CBLAS_SOURCES_DIR}
         INSTALL_DIR         ${CBLAS_INSTALL_DIR}
         BUILD_IN_SOURCE     1
-        BUILD_COMMAND       ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} NO_LAPACK=1 DYNAMIC_ARCH=1 NO_SHARED=1 libs netlib
+        BUILD_COMMAND       ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS}
         INSTALL_COMMAND     ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR>
         UPDATE_COMMAND      ""
         CONFIGURE_COMMAND   ""
     )
-    LIST(APPEND external_project_dependencies openblas)
 ENDIF(NOT ${CBLAS_FOUND})
 
+MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
 INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
+
+ADD_LIBRARY(cblas STATIC IMPORTED)
+SET_PROPERTY(TARGET cblas PROPERTY IMPORTED_LOCATION ${CBLAS_LIBRARIES})
+IF(NOT ${CBLAS_FOUND})
+    ADD_DEPENDENCIES(cblas extern_openblas)
+    LIST(APPEND external_project_dependencies cblas)
+ENDIF(NOT ${CBLAS_FOUND})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index a9db4e8ba410c718f1ee4d69f4551e2773c60125..7340394b1e1fad9e1893ac87d62febb8dd72751c 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -14,42 +14,70 @@
 
 INCLUDE(ExternalProject)
 
-set(PROTOBUF_VERSION 3.1)
-FIND_PACKAGE(Protobuf ${PROTOBUF_VERSION})
-
-IF(PROTOBUF_FOUND)
+macro(PROMPT_PROTOBUF_LIB)  # Print the selected protobuf settings, add its include dir, then stop processing the caller.
+    MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}")
+    MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}")
+    MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}")
+    INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
+    RETURN()  # A macro has no scope of its own, so this returns from the code that invoked the macro.
+endmacro()
+macro(SET_PROTOBUF_VERSION)  # Overwrite PROTOBUF_VERSION with the first <digits>.<digits> match in the output of `protoc --version`.
     EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION)
     STRING(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}")
-    IF ("${PROTOBUF_VERSION}" VERSION_LESS "3.1.0")
-        SET(PROTOBUF_FOUND OFF)
-    ENDIF()
-ENDIF(PROTOBUF_FOUND)
+endmacro()
 
-IF(NOT PROTOBUF_FOUND)
-    SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf)
-    SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/protobuf)
-    SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." FORCE)
+set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
+if (NOT "${PROTOBUF_ROOT}" STREQUAL "")  # A protobuf prefix was supplied: look for every piece under it only (NO_DEFAULT_PATH).
+    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
+    find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
+    if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
+        message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
+        SET_PROTOBUF_VERSION()
+        PROMPT_PROTOBUF_LIB()  # Ends processing of this file (the macro calls RETURN()).
+    else()
+        message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}.")
+    endif()
+endif()
 
-    IF(WIN32)
-        SET(PROTOBUF_LITE_LIBRARY
-            "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.lib" CACHE FILEPATH "protobuf lite library." FORCE)
-        SET(PROTOBUF_LIBRARY
-            "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.lib" CACHE FILEPATH "protobuf library." FORCE)
-        SET(PROTOBUF_PROTOC_LIBRARY
-            "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE)
-        SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE)
-    ELSE(WIN32)
-        SET(PROTOBUF_LITE_LIBRARY
-            "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE)
-        SET(PROTOBUF_LIBRARY
-            "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE)
-        SET(PROTOBUF_PROTOC_LIBRARY
-            "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.a" CACHE FILEPATH "protoc library." FORCE)
-        SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE)
-    ENDIF(WIN32)
+FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
+    SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_NAME})
+    SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_NAME})
+
+    SET(${TARGET_NAME}_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE)
+    SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE)
+    SET(${TARGET_NAME}_LITE_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${STATIC_LIBRARY_SUFFIX}"
+         PARENT_SCOPE)
+    SET(${TARGET_NAME}_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${STATIC_LIBRARY_SUFFIX}"
+         PARENT_SCOPE)
+    SET(${TARGET_NAME}_PROTOC_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${STATIC_LIBRARY_SUFFIX}"
+         PARENT_SCOPE)
+    SET(${TARGET_NAME}_PROTOC_EXECUTABLE
+        "${PROTOBUF_INSTALL_DIR}/bin/protoc${EXECUTABLE_SUFFIX}"
+         PARENT_SCOPE)
+
+    SET(OPTIONAL_CACHE_ARGS "")
+    SET(OPTIONAL_ARGS "")
+    IF(BUILD_FOR_HOST)
+        SET(OPTIONAL_ARGS "-Dprotobuf_WITH_ZLIB=OFF")
+    ELSE()
+        SET(OPTIONAL_ARGS
+            "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
+            "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
+            "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
+            "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
+            "-Dprotobuf_WITH_ZLIB=ON"
+            "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}")
+        SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}")
+    ENDIF()
 
     ExternalProject_Add(
-        protobuf
+        ${TARGET_NAME}
         ${EXTERNAL_PROJECT_LOG_ARGS}
         PREFIX          ${PROTOBUF_SOURCES_DIR}
         UPDATE_COMMAND  ""
@@ -57,11 +85,9 @@ IF(NOT PROTOBUF_FOUND)
         GIT_REPOSITORY  "https://github.com/google/protobuf.git"
         GIT_TAG         "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
         CONFIGURE_COMMAND
-        ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake
+        ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
+            ${OPTIONAL_ARGS}
             -Dprotobuf_BUILD_TESTS=OFF
-            -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
-            -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
             -DCMAKE_BUILD_TYPE=Release
             -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
@@ -71,10 +97,41 @@ IF(NOT PROTOBUF_FOUND)
             -DCMAKE_BUILD_TYPE:STRING=Release
             -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
             -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-            -DZLIB_ROOT:STRING=${ZLIB_ROOT}
+            ${OPTIONAL_CACHE_ARGS}
     )
+ENDFUNCTION()
 
+SET(PROTOBUF_VERSION 3.1)  # Version passed to FIND_PACKAGE below.
+IF(NOT CMAKE_CROSSCOMPILING)
+    FIND_PACKAGE(Protobuf ${PROTOBUF_VERSION})
+
+    IF(PROTOBUF_FOUND)
+        SET_PROTOBUF_VERSION()  # Re-read the version from the protoc executable itself.
+        IF("${PROTOBUF_VERSION}" VERSION_LESS "3.1.0")
+            SET(PROTOBUF_FOUND OFF)  # Too old: treat it as not found.
+        ENDIF()
+    ENDIF(PROTOBUF_FOUND)
+ELSE()  # Cross-compiling: build a separate protobuf_host project and take protoc from it.
+    build_protobuf(protobuf_host TRUE)
+    LIST(APPEND external_project_dependencies protobuf_host)
+
+    SET(PROTOBUF_PROTOC_EXECUTABLE ${protobuf_host_PROTOC_EXECUTABLE}
+        CACHE FILEPATH "protobuf executable." FORCE)
+ENDIF()
+
+IF(NOT PROTOBUF_FOUND)  # PROTOBUF_FOUND is unset or was cleared above: build protobuf with build_protobuf.
+    build_protobuf(protobuf FALSE)
     LIST(APPEND external_project_dependencies protobuf)
+
+    SET(PROTOBUF_INCLUDE_DIR ${protobuf_INCLUDE_DIR}
+        CACHE PATH "protobuf include directory." FORCE)
+    IF(NOT CMAKE_CROSSCOMPILING)  # When cross-compiling, protoc stays the protobuf_host one set above.
+        SET(PROTOBUF_PROTOC_EXECUTABLE ${protobuf_PROTOC_EXECUTABLE}
+            CACHE FILEPATH "protobuf executable." FORCE)
+    ENDIF()
+    SET(PROTOBUF_LITE_LIBRARY ${protobuf_LITE_LIBRARY} CACHE FILEPATH "protobuf lite library." FORCE)
+    SET(PROTOBUF_LIBRARY ${protobuf_LIBRARY} CACHE FILEPATH "protobuf library." FORCE)
+    SET(PROTOBUF_PROTOC_LIBRARY ${protobuf_PROTOC_LIBRARY} CACHE FILEPATH "protoc library." FORCE)
 ENDIF(NOT PROTOBUF_FOUND)
 
-INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
+PROMPT_PROTOBUF_LIB()  # Report the final protobuf selection and add its include directory.
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 293070c3cfcc1196001f64469f3254289b0de792..2d7daed9bcd5b8d854ffae6dc1ea191d154c16fe 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -43,7 +43,7 @@ ELSE()
 ENDIF()
 
 ExternalProject_Add(
-    warpctc
+    extern_warpctc
     ${EXTERNAL_PROJECT_LOG_ARGS}
     GIT_REPOSITORY  "https://github.com/gangliao/warp-ctc.git"
     PREFIX          ${WARPCTC_SOURCES_DIR}
@@ -65,4 +65,8 @@ ExternalProject_Add(
                      -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
 )
 
+ADD_LIBRARY(warpctc STATIC IMPORTED GLOBAL)  # Imported target located at WARPCTC_LIBRARIES; GLOBAL makes it visible in every directory.
+SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
+ADD_DEPENDENCIES(warpctc extern_warpctc)  # Build the extern_warpctc external project before anything that depends on warpctc.
+
 LIST(APPEND external_project_dependencies warpctc)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..43cd6b398b1caac55b938d576b96eb0282c00fda
--- /dev/null
+++ b/cmake/generic.cmake
@@ -0,0 +1,226 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# To simplify the build process of PaddlePaddle, we define a couple of
+# fundamental abstractions, e.g., how to build a library, a binary and
+# a test in C++, CUDA and Go.
+#
+# -------------------------------------------
+#    C++	      CUDA C++	      Go
+# -------------------------------------------
+# cc_library	 nv_library	  go_library
+# cc_binary  	 nv_binary	  go_binary
+# cc_test        nv_test	  go_test
+# -------------------------------------------
+#
+# cmake_parse_arguments can help us to achieve this goal.
+# https://cmake.org/cmake/help/v3.0/module/CMakeParseArguments.html
+#
+
+if(NOT APPLE)  # On non-Apple platforms, link the thread library found by find_package(Threads).
+    find_package(Threads REQUIRED)
+    link_libraries(${CMAKE_THREAD_LIBS_INIT})  # Directory-scoped: applies to targets created after this point.
+endif(NOT APPLE)
+
+# cc_library parses tensor.cc and figures out that the target also depends on tensor.h.
+# cc_library(tensor
+#   SRCS
+#   tensor.cc
+#   DEPS
+#   variant)
+function(cc_library TARGET_NAME)  # Build library TARGET_NAME from SRCS; STATIC unless the SHARED flag is passed.
+  set(options OPTIONAL SHARED)  # OPTIONAL is still accepted (and ignored) for backward compatibility.
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  if (cc_library_SHARED)  # Option flags parse to TRUE/FALSE, so test the flag itself rather than a "SHARED" string.
+    add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
+  else()
+    add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
+  endif()
+  if (cc_library_DEPS)
+    add_dependencies(${TARGET_NAME} ${cc_library_DEPS})  # Build-order dependency only; DEPS are not linked here.
+  endif()
+endfunction(cc_library)
+
+# cc_binary parses tensor.cc and figures out that the target also depends on tensor.h.
+# cc_binary(tensor
+#   SRCS
+#   tensor.cc)
+function(cc_binary TARGET_NAME)  # Executable built from SRCS; DEPS are linked in and built first.
+  set(flag_keywords OPTIONAL)
+  set(single_keywords "")
+  set(list_keywords SRCS DEPS)
+  cmake_parse_arguments(bin "${flag_keywords}" "${single_keywords}" "${list_keywords}" ${ARGN})
+  add_executable(${TARGET_NAME} ${bin_SRCS})
+  if(bin_DEPS)
+    add_dependencies(${TARGET_NAME} ${bin_DEPS})
+    target_link_libraries(${TARGET_NAME} ${bin_DEPS})
+  endif()
+endfunction()
+
+# The dependency on target tensor implies that if any of
+# tensor{.h,.cc,_test.cc} is changed, tensor_test needs to be re-built.
+# cc_test(tensor_test
+#   SRCS
+#   tensor_test.cc
+#   DEPS
+#   tensor)
+function(cc_test TARGET_NAME)  # gtest-based test executable; does nothing unless WITH_TESTING is on.
+  if(NOT WITH_TESTING)
+    return()
+  endif()
+  set(list_keywords SRCS DEPS)
+  cmake_parse_arguments(test "" "" "${list_keywords}" ${ARGN})
+  set(test_libs ${test_DEPS} gtest gtest_main)  # DEPS plus the gtest targets, both linked and built first.
+  add_executable(${TARGET_NAME} ${test_SRCS})
+  add_dependencies(${TARGET_NAME} ${test_libs})
+  target_link_libraries(${TARGET_NAME} ${test_libs})
+  add_test(${TARGET_NAME} ${TARGET_NAME})
+endfunction()
+
+# Suppose that ops.cu includes global functions that take Tensor as
+# their parameters, so ops depends on tensor. This implies that if
+# any of tensor.{h,cc}, ops.{h,cu} is changed, ops needs to be re-built.
+# nv_library(ops
+#   SRCS
+#   ops.cu
+#   DEPS
+#   tensor)
+function(nv_library TARGET_NAME)  # CUDA library from SRCS (no-op unless WITH_GPU); STATIC unless the SHARED flag is passed.
+  if (WITH_GPU)
+    set(options OPTIONAL SHARED)  # OPTIONAL is still accepted (and ignored) for backward compatibility.
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    if (nv_library_SHARED)  # Option flags parse to TRUE/FALSE, so test the flag itself rather than a "SHARED" string.
+      cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
+    else()
+      cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
+    endif()
+    if (nv_library_DEPS)
+      add_dependencies(${TARGET_NAME} ${nv_library_DEPS})  # Build-order dependency only; DEPS are not linked here.
+    endif()
+  endif()
+endfunction(nv_library)
+
+function(nv_binary TARGET_NAME)  # CUDA executable built from SRCS; does nothing unless WITH_GPU is on.
+  if (WITH_GPU)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS})
+    if(nv_binary_DEPS)  # DEPS are both linked in and built before the executable.
+      target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
+      add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
+    endif()
+  endif()
+endfunction(nv_binary)
+
+# The dependency on target ops implies that if any of
+# ops{.h,.cu,_test.cu} is changed, ops_test needs to be re-built.
+# nv_test(ops_test
+#   SRCS
+#   ops_test.cu
+#   DEPS
+#   ops)
+function(nv_test TARGET_NAME)  # CUDA gtest executable; does nothing unless both WITH_GPU and WITH_TESTING are on.
+  if (WITH_GPU AND WITH_TESTING)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
+    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main)  # DEPS plus the gtest targets.
+    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main)
+    add_test(${TARGET_NAME} ${TARGET_NAME})  # Register the executable as a test of the same name.
+  endif()
+endfunction(nv_test)
+
+set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")  # Go workspace inside the build tree; passed as GOPATH by the go_* helpers below.
+file(MAKE_DIRECTORY ${GOPATH})  # Created at configure time.
+
+# Because api.go defines a Go wrapper for ops and tensor, it depends on
+# both.  This implies that if any of tensor.{h,cc}, ops.{h,cu}, or
+# api.go is changed, api needs to be re-built.
+# go_library(api
+#   SRCS
+#   api.go
+#   DEPS
+#   tensor # Because ops depend on tensor, this line is optional.
+#   ops)
+function(go_library TARGET_NAME)  # `go build` SRCS into a C archive, or a C shared library when the SHARED flag is passed.
+  set(options OPTIONAL SHARED)  # OPTIONAL is still accepted (and ignored) for backward compatibility.
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  cmake_parse_arguments(go_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  if (go_library_SHARED)  # Option flags parse to TRUE/FALSE, so test the flag itself rather than a "SHARED" string.
+    set(BUILD_MODE "-buildmode=c-shared")
+    if(APPLE)
+      set(LIB_NAME "lib${TARGET_NAME}.dylib")
+    else()
+      set(LIB_NAME "lib${TARGET_NAME}.so")
+    endif()
+  else()
+    set(BUILD_MODE "-buildmode=c-archive")
+    set(LIB_NAME "lib${TARGET_NAME}.a")
+  endif()
+  add_custom_command(OUTPUT ${TARGET_NAME}_timestamp  # NOTE(review): nothing here writes this file, so the rule presumably re-runs on every build.
+    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
+    -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
+    ${go_library_SRCS}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+  add_custom_target(${TARGET_NAME}_lib ALL DEPENDS ${TARGET_NAME}_timestamp ${go_library_DEPS})
+  add_library(${TARGET_NAME} STATIC IMPORTED)  # Imported target pointing at the file produced by `go build`.
+  set_property(TARGET ${TARGET_NAME} PROPERTY
+    IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}")
+  add_dependencies(${TARGET_NAME} ${TARGET_NAME}_lib)
+endfunction(go_library)
+
+function(go_binary TARGET_NAME)  # `go build` SRCS into an executable in the current binary dir and install it to bin.
+  set(options OPTIONAL)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  add_custom_command(OUTPUT ${TARGET_NAME}_timestamp  # NOTE(review): nothing here writes this file, so the rule presumably re-runs on every build.
+    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
+    -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
+    ${go_binary_SRCS}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+  add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_binary_DEPS})
+  install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin)
+endfunction(go_binary)
+
+function(go_test TARGET_NAME)  # `go test -c` SRCS into a test binary and register that binary as a test.
+  set(list_keywords SRCS DEPS)
+  cmake_parse_arguments(gt "OPTIONAL" "" "${list_keywords}" ${ARGN})
+  set(test_binary "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}")
+  set(stamp ${TARGET_NAME}_timestamp)
+  add_custom_command(OUTPUT ${stamp}
+    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -c
+    -o "${test_binary}"
+    ${gt_SRCS}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+  add_custom_target(${TARGET_NAME} ALL DEPENDS ${stamp} ${gt_DEPS})
+  add_test(${TARGET_NAME} ${test_binary})
+endfunction()
+
+# go_extern downloads an external Go project.
+# go_extern(target_name extern_source)
+# go_extern(go_redis github.com/hoisie/redis)
+function(go_extern TARGET_NAME)  # Custom target that runs `go get <args>` with the build-tree GOPATH.
+  add_custom_target(${TARGET_NAME} COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get ${ARGN})
+endfunction()
diff --git a/cmake/rdma.cmake b/cmake/rdma.cmake
index 9ff1a77cac74fb1bdfe470a78d225ed1767bb1b5..b698f3bdc3ff586a72badee3e0109e29285b457f 100644
--- a/cmake/rdma.cmake
+++ b/cmake/rdma.cmake
@@ -10,7 +10,7 @@ if(WITH_RDMA)
 
   function(generate_rdma_links)
     #redirect to current DIR to isolate the pollution from system runtime environment
-    #it can benifits unified control for different gcc environment. 
+    #it can benefit unified control for different gcc environment.
     #e.g, by default gcc48 did not refer /usr/lib64 which could contain low version
     #runtime libraries that will crash process while loading it. That redirect trick
     #can fix it.
@@ -19,7 +19,9 @@ if(WITH_RDMA)
       COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
       COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
       COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
-      COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so 
+      COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so
+      COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so.1
+      COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so
       WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
     )
   endfunction(generate_rdma_links)
@@ -44,7 +46,7 @@ if(WITH_RDMA)
       RDMA_INC_XIO AND
       RDMA_INC_EVENT AND
       RDMA_INC_NUMA AND
-      RDMA_LIB_SXISOCK AND 
+      RDMA_LIB_SXISOCK AND
       RDMA_LIB_XIO AND
       RDMA_LIB_EVENT AND
       RDMA_LIB_EVENT_CORE AND
@@ -53,19 +55,19 @@ if(WITH_RDMA)
       RDMA_LIB_NUMA
       )
 
-    set(RDMA_INC_DIR 
-      ${RDMA_INC_SXISOCK} 
+    set(RDMA_INC_DIR
+      ${RDMA_INC_SXISOCK}
       ${RDMA_INC_XIO}
       ${RDMA_INC_EVENT}
       ${RDMA_INC_NUMA})
-    set(RDMA_LIBS  
-      ${RDMA_LIB_SXISOCK} 
-      ${RDMA_LIB_XIO} 
-      ${RDMA_LIB_EVENT} 
-      ${RDMA_LIB_EVENT_CORE} 
-      ${RDMA_LIB_EVENT_EXTRA} 
-      ${RDMA_LIB_EVENT_PTHREADS} 
-      ${RDMA_LIB_NUMA} 
+    set(RDMA_LIBS
+      ${RDMA_LIB_SXISOCK}
+      ${RDMA_LIB_XIO}
+      ${RDMA_LIB_EVENT}
+      ${RDMA_LIB_EVENT_CORE}
+      ${RDMA_LIB_EVENT_EXTRA}
+      ${RDMA_LIB_EVENT_PTHREADS}
+      ${RDMA_LIB_NUMA}
       )
     set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
     include_directories("${RDMA_INC_DIR}")
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 75a9d8fc25674e1dd0f5b73cd0ccde48204f63aa..904652413e026e3a7f3f2a19f48f4e906ce6babb 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -13,9 +13,9 @@
 # limitations under the License.
 
 # Detects the OS and sets appropriate variables.
-# CMAKE_SYSTEM_NAME only give us a coarse-grained name,
-# but the name like centos is necessary in some scenes
-# to distinguish system for customization.
+# CMAKE_SYSTEM_NAME only gives us a coarse-grained name of the OS CMake is
+# building for, but a finer-grained host system name such as centos is
+# necessary in some scenarios to distinguish systems for customization.
 #
 # for instance, protobuf libs path is <install_dir>/lib64
 # on CentOS, but <install_dir>/lib on other systems.
@@ -72,12 +72,36 @@ MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
 MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}")
 MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")
 
+# configuration for cross-compiling
 IF(DEFINED CMAKE_SYSTEM_NAME)
+    INCLUDE(cross_compiling/host)
     IF(${CMAKE_SYSTEM_NAME} STREQUAL "Android")
         SET(ANDROID TRUE)
+        INCLUDE(cross_compiling/android)
+    ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "RPi")
+        SET(RPI TRUE)
+        INCLUDE(cross_compiling/raspberry_pi)
     ENDIF()
 ENDIF()
 
+# Library/executable file-name prefix and suffixes per OS.
+# Start from the generic Unix values, then override per platform.
+SET(LIBRARY_PREFIX "lib")
+SET(SHARED_LIBRARY_SUFFIX ".so")
+SET(STATIC_LIBRARY_SUFFIX ".a")
+SET(EXECUTABLE_SUFFIX "")
+IF(APPLE)
+    SET(SHARED_LIBRARY_SUFFIX ".dylib")
+ENDIF()
+
+# Windows replaces all four values.
+IF(WIN32)
+    SET(LIBRARY_PREFIX "")
+    SET(SHARED_LIBRARY_SUFFIX ".dll")
+    SET(STATIC_LIBRARY_SUFFIX ".lib")
+    SET(EXECUTABLE_SUFFIX ".exe")
+ENDIF()
+
 # external dependencies log output
 SET(EXTERNAL_PROJECT_LOG_ARGS
     LOG_DOWNLOAD    0     # Wrap download in script to log output
diff --git a/cmake/util.cmake b/cmake/util.cmake
index b828eef322bc570c07f5c357353641117a094c16..8c9143462227e7081142f6be250b1a45e4b6d51b 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -149,8 +149,9 @@ endfunction()
 # Create a python unittest using run_python_tests.sh,
 # which takes care of making correct running environment
 function(add_python_test TEST_NAME)
-    add_test(NAME ${TEST_NAME}
-        COMMAND bash ${PROJ_ROOT}/paddle/scripts/run_python_tests.sh
+  add_test(NAME ${TEST_NAME}
+        COMMAND env PADDLE_PACKAGE_DIR=${PADDLE_PYTHON_PACKAGE_DIR}
+        bash ${PROJ_ROOT}/paddle/scripts/run_python_tests.sh
         ${USE_VIRTUALENV_FOR_TEST} ${PYTHON_EXECUTABLE} ${ARGN}
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
 endfunction()
diff --git a/demo/image_classification/.gitignore b/demo/image_classification/.gitignore
deleted file mode 100644
index 6a05b8f6632db0977fceade8b48a89b9f7f6e6cc..0000000000000000000000000000000000000000
--- a/demo/image_classification/.gitignore
+++ /dev/null
@@ -1,9 +0,0 @@
-data/cifar-10-batches-py
-data/cifar-out
-cifar_vgg_model/*
-plot.png
-train.log
-image_provider_copy_1.py
-*pyc
-train.list
-test.list
diff --git a/demo/image_classification/api_v2_resnet.py b/demo/image_classification/api_v2_resnet.py
deleted file mode 100644
index 19d20540780becf504973a23b50445d4b65dc2ef..0000000000000000000000000000000000000000
--- a/demo/image_classification/api_v2_resnet.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2 as paddle
-
-__all__ = ['resnet_cifar10']
-
-
-def conv_bn_layer(input,
-                  ch_out,
-                  filter_size,
-                  stride,
-                  padding,
-                  active_type=paddle.activation.Relu(),
-                  ch_in=None):
-    tmp = paddle.layer.img_conv(
-        input=input,
-        filter_size=filter_size,
-        num_channels=ch_in,
-        num_filters=ch_out,
-        stride=stride,
-        padding=padding,
-        act=paddle.activation.Linear(),
-        bias_attr=False)
-    return paddle.layer.batch_norm(input=tmp, act=active_type)
-
-
-def shortcut(ipt, n_in, n_out, stride):
-    if n_in != n_out:
-        return conv_bn_layer(ipt, n_out, 1, stride, 0,
-                             paddle.activation.Linear())
-    else:
-        return ipt
-
-
-def basicblock(ipt, ch_out, stride):
-    ch_in = ch_out * 2
-    tmp = conv_bn_layer(ipt, ch_out, 3, stride, 1)
-    tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, paddle.activation.Linear())
-    short = shortcut(ipt, ch_in, ch_out, stride)
-    return paddle.layer.addto(input=[tmp, short], act=paddle.activation.Relu())
-
-
-def layer_warp(block_func, ipt, features, count, stride):
-    tmp = block_func(ipt, features, stride)
-    for i in range(1, count):
-        tmp = block_func(tmp, features, 1)
-    return tmp
-
-
-def resnet_cifar10(ipt, depth=32):
-    # depth should be one of 20, 32, 44, 56, 110, 1202
-    assert (depth - 2) % 6 == 0
-    n = (depth - 2) / 6
-    nStages = {16, 64, 128}
-    conv1 = conv_bn_layer(
-        ipt, ch_in=3, ch_out=16, filter_size=3, stride=1, padding=1)
-    res1 = layer_warp(basicblock, conv1, 16, n, 1)
-    res2 = layer_warp(basicblock, res1, 32, n, 2)
-    res3 = layer_warp(basicblock, res2, 64, n, 2)
-    pool = paddle.layer.img_pool(
-        input=res3, pool_size=8, stride=1, pool_type=paddle.pooling.Avg())
-    return pool
diff --git a/demo/image_classification/api_v2_train.py b/demo/image_classification/api_v2_train.py
deleted file mode 100644
index 53cffa6fb4e8b2e19725f4f44bf7b9ffffb25232..0000000000000000000000000000000000000000
--- a/demo/image_classification/api_v2_train.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License
-
-import sys
-
-import paddle.v2 as paddle
-
-from api_v2_vgg import vgg_bn_drop
-
-
-def main():
-    datadim = 3 * 32 * 32
-    classdim = 10
-
-    # PaddlePaddle init
-    paddle.init(use_gpu=False, trainer_count=1)
-
-    image = paddle.layer.data(
-        name="image", type=paddle.data_type.dense_vector(datadim))
-
-    # Add neural network config
-    # option 1. resnet
-    # net = resnet_cifar10(image, depth=32)
-    # option 2. vgg
-    net = vgg_bn_drop(image)
-
-    out = paddle.layer.fc(input=net,
-                          size=classdim,
-                          act=paddle.activation.Softmax())
-
-    lbl = paddle.layer.data(
-        name="label", type=paddle.data_type.integer_value(classdim))
-    cost = paddle.layer.classification_cost(input=out, label=lbl)
-
-    # Create parameters
-    parameters = paddle.parameters.create(cost)
-
-    # Create optimizer
-    momentum_optimizer = paddle.optimizer.Momentum(
-        momentum=0.9,
-        regularization=paddle.optimizer.L2Regularization(rate=0.0002 * 128),
-        learning_rate=0.1 / 128.0,
-        learning_rate_decay_a=0.1,
-        learning_rate_decay_b=50000 * 100,
-        learning_rate_schedule='discexp',
-        batch_size=128)
-
-    # End batch and end pass event handler
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 100 == 0:
-                print "\nPass %d, Batch %d, Cost %f, %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics)
-            else:
-                sys.stdout.write('.')
-                sys.stdout.flush()
-        if isinstance(event, paddle.event.EndPass):
-            result = trainer.test(
-                reader=paddle.batch(
-                    paddle.dataset.cifar.test10(), batch_size=128),
-                feeding={'image': 0,
-                         'label': 1})
-            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
-
-    # Create trainer
-    trainer = paddle.trainer.SGD(cost=cost,
-                                 parameters=parameters,
-                                 update_equation=momentum_optimizer)
-    trainer.train(
-        reader=paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.cifar.train10(), buf_size=50000),
-            batch_size=128),
-        num_passes=5,
-        event_handler=event_handler,
-        feeding={'image': 0,
-                 'label': 1})
-
-
-if __name__ == '__main__':
-    main()
diff --git a/demo/image_classification/api_v2_vgg.py b/demo/image_classification/api_v2_vgg.py
deleted file mode 100644
index 1e0e6b93adde30425f17aa9cd07542275f4fec37..0000000000000000000000000000000000000000
--- a/demo/image_classification/api_v2_vgg.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2 as paddle
-
-__all__ = ['vgg_bn_drop']
-
-
-def vgg_bn_drop(input):
-    def conv_block(ipt, num_filter, groups, dropouts, num_channels=None):
-        return paddle.networks.img_conv_group(
-            input=ipt,
-            num_channels=num_channels,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * groups,
-            conv_filter_size=3,
-            conv_act=paddle.activation.Relu(),
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type=paddle.pooling.Max())
-
-    conv1 = conv_block(input, 64, 2, [0.3, 0], 3)
-    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
-    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
-    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
-    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-
-    drop = paddle.layer.dropout(input=conv5, dropout_rate=0.5)
-    fc1 = paddle.layer.fc(input=drop, size=512, act=paddle.activation.Linear())
-    bn = paddle.layer.batch_norm(
-        input=fc1,
-        act=paddle.activation.Relu(),
-        layer_attr=paddle.attr.Extra(drop_rate=0.5))
-    fc2 = paddle.layer.fc(input=bn, size=512, act=paddle.activation.Linear())
-    return fc2
diff --git a/demo/image_classification/data/download_cifar.sh b/demo/image_classification/data/download_cifar.sh
deleted file mode 100755
index 532178d627fe19ab8ea79ecae73e5328b5294bea..0000000000000000000000000000000000000000
--- a/demo/image_classification/data/download_cifar.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
-tar zxf cifar-10-python.tar.gz
-rm cifar-10-python.tar.gz
-rm -rf cifar-out/*
-echo Converting CIFAR data to images.....
-python process_cifar.py ./cifar-10-batches-py ./cifar-out
diff --git a/demo/image_classification/data/process_cifar.py b/demo/image_classification/data/process_cifar.py
deleted file mode 100644
index db6666189e5b8008a6b66fb64afcdf98980e72bb..0000000000000000000000000000000000000000
--- a/demo/image_classification/data/process_cifar.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import sys
-import os
-import PIL.Image as Image
-"""
-  Usage: python process_cifar input_dir output_dir
-"""
-
-
-def mkdir_not_exist(path):
-    """
-    Make dir if the path does not exist.
-    path: the path to be created.
-    """
-    if not os.path.exists(path):
-        os.mkdir(path)
-
-
-def create_dir_structure(output_dir):
-    """
-    Create the directory structure for the directory.
-    output_dir: the direcotry structure path.
-    """
-    mkdir_not_exist(os.path.join(output_dir))
-    mkdir_not_exist(os.path.join(output_dir, "train"))
-    mkdir_not_exist(os.path.join(output_dir, "test"))
-
-
-def convert_batch(batch_path, label_set, label_map, output_dir, data_split):
-    """
-    Convert CIFAR batch to the structure of Paddle format.
-    batch_path: the batch to be converted.
-    label_set: the set of labels.
-    output_dir: the output path.
-    data_split: whether it is training or testing data.
-    """
-    data = np.load(batch_path)
-    for data, label, filename in zip(data['data'], data['labels'],
-                                     data['filenames']):
-        data = data.reshape((3, 32, 32))
-        data = np.transpose(data, (1, 2, 0))
-        label = label_map[label]
-        output_dir_this = os.path.join(output_dir, data_split, str(label))
-        output_filename = os.path.join(output_dir_this, filename)
-        if not label in label_set:
-            label_set[label] = True
-            mkdir_not_exist(output_dir_this)
-        Image.fromarray(data).save(output_filename)
-
-
-if __name__ == '__main__':
-    input_dir = sys.argv[1]
-    output_dir = sys.argv[2]
-    num_batch = 5
-    create_dir_structure(output_dir)
-    label_map = {
-        0: "airplane",
-        1: "automobile",
-        2: "bird",
-        3: "cat",
-        4: "deer",
-        5: "dog",
-        6: "frog",
-        7: "horse",
-        8: "ship",
-        9: "truck"
-    }
-    labels = {}
-    for i in range(1, num_batch + 1):
-        convert_batch(
-            os.path.join(input_dir, "data_batch_%d" % i), labels, label_map,
-            output_dir, "train")
-    convert_batch(
-        os.path.join(input_dir, "test_batch"), {}, label_map, output_dir,
-        "test")
diff --git a/demo/image_classification/image_provider.py b/demo/image_classification/image_provider.py
deleted file mode 100644
index 6a315ff094c1af5f8250d8a22ff96740dddd9808..0000000000000000000000000000000000000000
--- a/demo/image_classification/image_provider.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import io
-import random
-
-import paddle.utils.image_util as image_util
-from paddle.trainer.PyDataProvider2 import *
-
-
-#
-# {'img_size': 32,
-# 'settings': a global object,
-# 'color': True,
-# 'mean_img_size': 32,
-# 'meta': './data/cifar-out/batches/batches.meta',
-# 'num_classes': 10,
-# 'file_list': ('./data/cifar-out/batches/train_batch_000',),
-# 'use_jpeg': True}
-def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg,
-         is_train, **kwargs):
-    settings.mean_img_size = mean_img_size
-    settings.img_size = img_size
-    settings.num_classes = num_classes
-    settings.color = color
-    settings.is_train = is_train
-
-    if settings.color:
-        settings.img_raw_size = settings.img_size * settings.img_size * 3
-    else:
-        settings.img_raw_size = settings.img_size * settings.img_size
-
-    settings.meta_path = meta
-    settings.use_jpeg = use_jpeg
-
-    settings.img_mean = image_util.load_meta(settings.meta_path,
-                                             settings.mean_img_size,
-                                             settings.img_size, settings.color)
-
-    settings.logger.info('Image size: %s', settings.img_size)
-    settings.logger.info('Meta path: %s', settings.meta_path)
-    settings.input_types = {
-        'image': dense_vector(settings.img_raw_size),
-        'label': integer_value(settings.num_classes)
-    }
-
-    settings.logger.info('DataProvider Initialization finished')
-
-
-@provider(init_hook=hook, min_pool_size=0)
-def processData(settings, file_list):
-    """
-    The main function for loading data.
-    Load the batch, iterate all the images and labels in this batch.
-    file_list: the batch file list.
-    """
-    with open(file_list, 'r') as fdata:
-        lines = [line.strip() for line in fdata]
-        random.shuffle(lines)
-        for file_name in lines:
-            with io.open(file_name.strip(), 'rb') as file:
-                data = cPickle.load(file)
-                indexes = list(range(len(data['images'])))
-                if settings.is_train:
-                    random.shuffle(indexes)
-                for i in indexes:
-                    if settings.use_jpeg == 1:
-                        img = image_util.decode_jpeg(data['images'][i])
-                    else:
-                        img = data['images'][i]
-                    img_feat = image_util.preprocess_img(
-                        img, settings.img_mean, settings.img_size,
-                        settings.is_train, settings.color)
-                    label = data['labels'][i]
-                    yield {
-                        'image': img_feat.astype('float32'),
-                        'label': int(label)
-                    }
diff --git a/demo/image_classification/image_util.py b/demo/image_classification/image_util.py
deleted file mode 100644
index f09605394a19e09d92e555eeefb0b5646625b618..0000000000000000000000000000000000000000
--- a/demo/image_classification/image_util.py
+++ /dev/null
@@ -1,221 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-from PIL import Image
-from cStringIO import StringIO
-
-
-def resize_image(img, target_size):
-    """
-    Resize an image so that the shorter edge has length target_size.
-    img: the input image to be resized.
-    target_size: the target resized image size.
-    """
-    percent = (target_size / float(min(img.size[0], img.size[1])))
-    resized_size = int(round(img.size[0] * percent)), int(
-        round(img.size[1] * percent))
-    img = img.resize(resized_size, Image.ANTIALIAS)
-    return img
-
-
-def flip(im):
-    """
-    Return the flipped image.
-    Flip an image along the horizontal direction.
-    im: input image, (H x W x K) ndarrays 
-    """
-    if len(im.shape) == 3:
-        return im[:, :, ::-1]
-    else:
-        return im[:, ::-1]
-
-
-def crop_img(im, inner_size, color=True, test=True):
-    """
-    Return cropped image.
-    The size of the cropped image is inner_size * inner_size.
-    im: (K x H x W) ndarrays
-    inner_size: the cropped image size.
-    color: whether it is color image.
-    test: whether in test mode.
-      If False, does random cropping and flipping.
-      If True, crop the center of images.
-    """
-    if color:
-        height, width = max(inner_size, im.shape[1]), max(inner_size,
-                                                          im.shape[2])
-        padded_im = np.zeros((3, height, width))
-        startY = (height - im.shape[1]) / 2
-        startX = (width - im.shape[2]) / 2
-        endY, endX = startY + im.shape[1], startX + im.shape[2]
-        padded_im[:, startY:endY, startX:endX] = im
-    else:
-        im = im.astype('float32')
-        height, width = max(inner_size, im.shape[0]), max(inner_size,
-                                                          im.shape[1])
-        padded_im = np.zeros((height, width))
-        startY = (height - im.shape[0]) / 2
-        startX = (width - im.shape[1]) / 2
-        endY, endX = startY + im.shape[0], startX + im.shape[1]
-        padded_im[startY:endY, startX:endX] = im
-    if test:
-        startY = (height - inner_size) / 2
-        startX = (width - inner_size) / 2
-    else:
-        startY = np.random.randint(0, height - inner_size + 1)
-        startX = np.random.randint(0, width - inner_size + 1)
-    endY, endX = startY + inner_size, startX + inner_size
-    if color:
-        pic = padded_im[:, startY:endY, startX:endX]
-    else:
-        pic = padded_im[startY:endY, startX:endX]
-    if (not test) and (np.random.randint(2) == 0):
-        pic = flip(pic)
-    return pic
-
-
-def decode_jpeg(jpeg_string):
-    np_array = np.array(Image.open(StringIO(jpeg_string)))
-    if len(np_array.shape) == 3:
-        np_array = np.transpose(np_array, (2, 0, 1))
-    return np_array
-
-
-def preprocess_img(im, img_mean, crop_size, is_train, color=True):
-    """
-    Does data augmentation for images.
-    If is_train is false, cropping the center region from the image.
-    If is_train is true, randomly crop a region from the image,
-    and randomy does flipping.
-    im: (K x H x W) ndarrays
-    """
-    im = im.astype('float32')
-    test = not is_train
-    pic = crop_img(im, crop_size, color, test)
-    pic -= img_mean
-    return pic.flatten()
-
-
-def load_meta(meta_path, mean_img_size, crop_size, color=True):
-    """
-    Return the loaded meta file.
-    Load the meta image, which is the mean of the images in the dataset.
-    The mean image is subtracted from every input image so that the expected mean
-    of each input image is zero.
-    """
-    mean = np.load(meta_path)['data_mean']
-    border = (mean_img_size - crop_size) / 2
-    if color:
-        assert (mean_img_size * mean_img_size * 3 == mean.shape[0])
-        mean = mean.reshape(3, mean_img_size, mean_img_size)
-        mean = mean[:, border:border + crop_size, border:border +
-                    crop_size].astype('float32')
-    else:
-        assert (mean_img_size * mean_img_size == mean.shape[0])
-        mean = mean.reshape(mean_img_size, mean_img_size)
-        mean = mean[border:border + crop_size, border:border +
-                    crop_size].astype('float32')
-    return mean
-
-
-def load_image(img_path, is_color=True):
-    """
-    Load image and return. 
-    img_path: image path.
-    is_color: is color image or not.
-    """
-    img = Image.open(img_path)
-    img.load()
-    return img
-
-
-def oversample(img, crop_dims):
-    """
-    image : iterable of (H x W x K) ndarrays
-    crop_dims: (height, width) tuple for the crops.
-    Returned data contains ten crops of input image, namely,
-    four corner patches and the center patch as well as their
-    horizontal reflections.
-    """
-    # Dimensions and center.
-    im_shape = np.array(img[0].shape)
-    crop_dims = np.array(crop_dims)
-    im_center = im_shape[:2] / 2.0
-
-    # Make crop coordinates
-    h_indices = (0, im_shape[0] - crop_dims[0])
-    w_indices = (0, im_shape[1] - crop_dims[1])
-    crops_ix = np.empty((5, 4), dtype=int)
-    curr = 0
-    for i in h_indices:
-        for j in w_indices:
-            crops_ix[curr] = (i, j, i + crop_dims[0], j + crop_dims[1])
-            curr += 1
-    crops_ix[4] = np.tile(im_center, (1, 2)) + np.concatenate(
-        [-crop_dims / 2.0, crop_dims / 2.0])
-    crops_ix = np.tile(crops_ix, (2, 1))
-
-    # Extract crops
-    crops = np.empty(
-        (10 * len(img), crop_dims[0], crop_dims[1], im_shape[-1]),
-        dtype=np.float32)
-    ix = 0
-    for im in img:
-        for crop in crops_ix:
-            crops[ix] = im[crop[0]:crop[2], crop[1]:crop[3], :]
-            ix += 1
-        crops[ix - 5:ix] = crops[ix - 5:ix, :, ::-1, :]  # flip for mirrors
-    return crops
-
-
-class ImageTransformer:
-    def __init__(self,
-                 transpose=None,
-                 channel_swap=None,
-                 mean=None,
-                 is_color=True):
-        self.transpose = transpose
-        self.channel_swap = None
-        self.mean = None
-        self.is_color = is_color
-
-    def set_transpose(self, order):
-        if self.is_color:
-            assert 3 == len(order)
-        self.transpose = order
-
-    def set_channel_swap(self, order):
-        if self.is_color:
-            assert 3 == len(order)
-        self.channel_swap = order
-
-    def set_mean(self, mean):
-        # mean value, may be one value per channel 
-        if mean.ndim == 1:
-            mean = mean[:, np.newaxis, np.newaxis]
-        else:
-            # elementwise mean
-            if self.is_color:
-                assert len(mean.shape) == 3
-        self.mean = mean
-
-    def transformer(self, data):
-        if self.transpose is not None:
-            data = data.transpose(self.transpose)
-        if self.channel_swap is not None:
-            data = data[self.channel_swap, :, :]
-        if self.mean is not None:
-            data -= self.mean
-        return data
diff --git a/demo/image_classification/predict.sh b/demo/image_classification/predict.sh
deleted file mode 100755
index 9d5785c9a1a4dac12f7940fa708b1a79c6ec8a93..0000000000000000000000000000000000000000
--- a/demo/image_classification/predict.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-model=cifar_vgg_model/pass-00299/
-image=data/cifar-out/test/airplane/seaplane_s_000978.png
-use_gpu=1
-python prediction.py $model $image $use_gpu
diff --git a/demo/image_classification/prediction.py b/demo/image_classification/prediction.py
deleted file mode 100755
index 49c0ff600c40e0222093ff0a8a2f7e8e38ccba29..0000000000000000000000000000000000000000
--- a/demo/image_classification/prediction.py
+++ /dev/null
@@ -1,159 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os, sys
-import numpy as np
-import logging
-from PIL import Image
-from optparse import OptionParser
-
-import paddle.utils.image_util as image_util
-
-from py_paddle import swig_paddle, DataProviderConverter
-from paddle.trainer.PyDataProvider2 import dense_vector
-from paddle.trainer.config_parser import parse_config
-
-logging.basicConfig(
-    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
-logging.getLogger().setLevel(logging.INFO)
-
-
-class ImageClassifier():
-    def __init__(self,
-                 train_conf,
-                 use_gpu=True,
-                 model_dir=None,
-                 resize_dim=None,
-                 crop_dim=None,
-                 mean_file=None,
-                 oversample=False,
-                 is_color=True):
-        """
-        train_conf: network configure.
-        model_dir: string, directory of model.
-        resize_dim: int, resized image size.
-        crop_dim: int, crop size.
-        mean_file: string, image mean file.
-        oversample: bool, oversample means multiple crops, namely five
-                    patches (the four corner patches and the center
-                    patch) as well as their horizontal reflections,
-                    ten crops in all.
-        """
-        self.train_conf = train_conf
-        self.model_dir = model_dir
-        if model_dir is None:
-            self.model_dir = os.path.dirname(train_conf)
-
-        self.resize_dim = resize_dim
-        self.crop_dims = [crop_dim, crop_dim]
-        self.oversample = oversample
-        self.is_color = is_color
-
-        self.transformer = image_util.ImageTransformer(is_color=is_color)
-        self.transformer.set_transpose((2, 0, 1))
-
-        self.mean_file = mean_file
-        mean = np.load(self.mean_file)['data_mean']
-        mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1])
-        self.transformer.set_mean(mean)  # mean pixel
-        gpu = 1 if use_gpu else 0
-        conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (gpu)
-        conf = parse_config(train_conf, conf_args)
-        swig_paddle.initPaddle("--use_gpu=%d" % (gpu))
-        self.network = swig_paddle.GradientMachine.createFromConfigProto(
-            conf.model_config)
-        assert isinstance(self.network, swig_paddle.GradientMachine)
-        self.network.loadParameters(self.model_dir)
-
-        data_size = 3 * self.crop_dims[0] * self.crop_dims[1]
-        slots = [dense_vector(data_size)]
-        self.converter = DataProviderConverter(slots)
-
-    def get_data(self, img_path):
-        """
-        1. load image from img_path.
-        2. resize or oversampling.
-        3. transformer data: transpose, sub mean.
-        return K x H x W ndarray.
-        img_path: image path.
-        """
-        image = image_util.load_image(img_path, self.is_color)
-        if self.oversample:
-            # image_util.resize_image: short side is self.resize_dim
-            image = image_util.resize_image(image, self.resize_dim)
-            image = np.array(image)
-            input = np.zeros(
-                (1, image.shape[0], image.shape[1], 3), dtype=np.float32)
-            input[0] = image.astype(np.float32)
-            input = image_util.oversample(input, self.crop_dims)
-        else:
-            image = image.resize(self.crop_dims, Image.ANTIALIAS)
-            input = np.zeros(
-                (1, self.crop_dims[0], self.crop_dims[1], 3), dtype=np.float32)
-            input[0] = np.array(image).astype(np.float32)
-
-        data_in = []
-        for img in input:
-            img = self.transformer.transformer(img).flatten()
-            data_in.append([img.tolist()])
-        return data_in
-
-    def forward(self, input_data):
-        in_arg = self.converter(input_data)
-        return self.network.forwardTest(in_arg)
-
-    def forward(self, data, output_layer):
-        """
-        input_data: py_paddle input data.
-        output_layer: specify the name of probability, namely the layer with
-                      softmax activation.
-        return: the predicting probability of each label.
-        """
-        input = self.converter(data)
-        self.network.forwardTest(input)
-        output = self.network.getLayerOutputs(output_layer)
-        # For oversampling, average predictions across crops.
-        # If not, the shape of output[name]: (1, class_number),
-        # the mean is also applicable.
-        return output[output_layer]['value'].mean(0)
-
-    def predict(self, image=None, output_layer=None):
-        assert isinstance(image, basestring)
-        assert isinstance(output_layer, basestring)
-        data = self.get_data(image)
-        prob = self.forward(data, output_layer)
-        lab = np.argsort(-prob)
-        logging.info("Label of %s is: %d", image, lab[0])
-
-
-if __name__ == '__main__':
-    image_size = 32
-    crop_size = 32
-    multi_crop = True
-    config = "vgg_16_cifar.py"
-    output_layer = "__fc_layer_1__"
-    mean_path = "data/cifar-out/batches/batches.meta"
-    model_path = sys.argv[1]
-    image = sys.argv[2]
-    use_gpu = bool(int(sys.argv[3]))
-
-    obj = ImageClassifier(
-        train_conf=config,
-        model_dir=model_path,
-        resize_dim=image_size,
-        crop_dim=crop_size,
-        mean_file=mean_path,
-        use_gpu=use_gpu,
-        oversample=multi_crop)
-    obj.predict(image, output_layer)
diff --git a/demo/image_classification/preprocess.py b/demo/image_classification/preprocess.py
deleted file mode 100755
index 2947ad239c36f9a02ed67ccf5906380cb70e37ce..0000000000000000000000000000000000000000
--- a/demo/image_classification/preprocess.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.utils.preprocess_img import ImageClassificationDatasetCreater
-from optparse import OptionParser
-
-
-def option_parser():
-    parser = OptionParser(usage="usage: python preprcoess.py "\
-                          "-i data_dir [options]")
-    parser.add_option(
-        "-i",
-        "--input",
-        action="store",
-        dest="input",
-        help="Input data directory.")
-    parser.add_option(
-        "-s",
-        "--size",
-        action="store",
-        dest="size",
-        help="Processed image size.")
-    parser.add_option(
-        "-c",
-        "--color",
-        action="store",
-        dest="color",
-        help="whether to use color images.")
-    return parser.parse_args()
-
-
-if __name__ == '__main__':
-    options, args = option_parser()
-    data_dir = options.input
-    processed_image_size = int(options.size)
-    color = options.color == "1"
-    data_creator = ImageClassificationDatasetCreater(
-        data_dir, processed_image_size, color)
-    data_creator.train_list_name = "train.txt"
-    data_creator.test_list_name = "test.txt"
-    data_creator.num_per_batch = 1000
-    data_creator.overwrite = True
-    data_creator.create_batches()
diff --git a/demo/image_classification/preprocess.sh b/demo/image_classification/preprocess.sh
deleted file mode 100755
index c7396c6393599ef3f2c55089eb05f2435b2b4b82..0000000000000000000000000000000000000000
--- a/demo/image_classification/preprocess.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-data_dir=./data/cifar-out
-
-python preprocess.py -i $data_dir -s 32 -c 1
-
-echo "data/cifar-out/batches/train.txt" > train.list
-echo "data/cifar-out/batches/test.txt" > test.list
diff --git a/demo/image_classification/train.sh b/demo/image_classification/train.sh
deleted file mode 100755
index e45bd47ad5925c6674d628a70a7ad7c4d5d5c173..0000000000000000000000000000000000000000
--- a/demo/image_classification/train.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-config=vgg_16_cifar.py
-output=./cifar_vgg_model
-log=train.log
-
-paddle train \
---config=$config \
---dot_period=10 \
---log_period=100 \
---test_all_data_in_one_period=1 \
---use_gpu=1 \
---trainer_count=1 \
---num_passes=300 \
---save_dir=$output \
-2>&1 | tee $log
-paddle usage -l $log -e $? -n "image_classification_train" >/dev/null 2>&1
-
-python -m paddle.utils.plotcurve -i $log > plot.png
diff --git a/demo/image_classification/vgg_16_cifar.py b/demo/image_classification/vgg_16_cifar.py
deleted file mode 100755
index 8ee4a64c15f885023a6e19812885b4f76bb12af9..0000000000000000000000000000000000000000
--- a/demo/image_classification/vgg_16_cifar.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-is_predict = get_config_arg("is_predict", bool, False)
-
-####################Data Configuration ##################
-if not is_predict:
-    data_dir = 'data/cifar-out/batches/'
-    meta_path = data_dir + 'batches.meta'
-
-    args = {
-        'meta': meta_path,
-        'mean_img_size': 32,
-        'img_size': 32,
-        'num_classes': 10,
-        'use_jpeg': 1,
-        'color': "color"
-    }
-
-    define_py_data_sources2(
-        train_list="train.list",
-        test_list="train.list",
-        module='image_provider',
-        obj='processData',
-        args=args)
-
-######################Algorithm Configuration #############
-settings(
-    batch_size=128,
-    learning_rate=0.1 / 128.0,
-    learning_method=MomentumOptimizer(0.9),
-    regularization=L2Regularization(0.0005 * 128))
-
-#######################Network Configuration #############
-data_size = 3 * 32 * 32
-label_size = 10
-img = data_layer(name='image', size=data_size)
-# small_vgg is predefined in trainer_config_helpers.networks
-predict = small_vgg(input_image=img, num_channels=3, num_classes=label_size)
-
-if not is_predict:
-    lbl = data_layer(name="label", size=label_size)
-    outputs(classification_cost(input=predict, label=lbl))
-else:
-    outputs(predict)
diff --git a/demo/introduction/.gitignore b/demo/introduction/.gitignore
deleted file mode 100644
index c54f3f9480ce4ceefda98f77a812ec2d6cd4a5e3..0000000000000000000000000000000000000000
--- a/demo/introduction/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-dataprovider.pyc
-empty.list
-train.log
-output
-train.list
diff --git a/demo/introduction/README.md b/demo/introduction/README.md
deleted file mode 100644
index 0614a7afe645677ef0b65a17ea05f1dcfa45214f..0000000000000000000000000000000000000000
--- a/demo/introduction/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-This folder contains scripts used in PaddlePaddle introduction.
-- use `bash train.sh` to train a simple linear regression model
-- use `python evaluate_model.py` to read model parameters. You can see that `w` and `b` are very close to [2, 0.3].
diff --git a/demo/introduction/api_train_v2.py b/demo/introduction/api_train_v2.py
deleted file mode 100644
index 1ba971b3688ce3dec078998df2c0b183a4e449f8..0000000000000000000000000000000000000000
--- a/demo/introduction/api_train_v2.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import paddle.v2 as paddle
-import paddle.v2.dataset.uci_housing as uci_housing
-
-
-def main():
-    # init
-    paddle.init(use_gpu=False, trainer_count=1)
-
-    # network config
-    x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
-    y_predict = paddle.layer.fc(input=x,
-                                param_attr=paddle.attr.Param(name='w'),
-                                size=1,
-                                act=paddle.activation.Linear(),
-                                bias_attr=paddle.attr.Param(name='b'))
-    y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
-    cost = paddle.layer.mse_cost(input=y_predict, label=y)
-
-    # create parameters
-    parameters = paddle.parameters.create(cost)
-
-    # create optimizer
-    optimizer = paddle.optimizer.Momentum(momentum=0)
-
-    trainer = paddle.trainer.SGD(cost=cost,
-                                 parameters=parameters,
-                                 update_equation=optimizer)
-
-    # event_handler to print training and testing info
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 100 == 0:
-                print "Pass %d, Batch %d, Cost %f" % (
-                    event.pass_id, event.batch_id, event.cost)
-
-        if isinstance(event, paddle.event.EndPass):
-            if (event.pass_id + 1) % 10 == 0:
-                result = trainer.test(
-                    reader=paddle.batch(
-                        uci_housing.test(), batch_size=2),
-                    feeding={'x': 0,
-                             'y': 1})
-                print "Test %d, %.2f" % (event.pass_id, result.cost)
-
-    # training
-    trainer.train(
-        reader=paddle.batch(
-            paddle.reader.shuffle(
-                uci_housing.train(), buf_size=500),
-            batch_size=2),
-        feeding={'x': 0,
-                 'y': 1},
-        event_handler=event_handler,
-        num_passes=30)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/demo/introduction/dataprovider.py b/demo/introduction/dataprovider.py
deleted file mode 100644
index 5b48aad0408800676ae7da16eba2dcbb2124f25f..0000000000000000000000000000000000000000
--- a/demo/introduction/dataprovider.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.PyDataProvider2 import *
-import random
-
-
-# define data types of input: 2 real numbers
-@provider(
-    input_types={'x': dense_vector(1),
-                 'y': dense_vector(1)}, use_seq=False)
-def process(settings, input_file):
-    for i in xrange(2000):
-        x = random.random()
-        yield {'x': [x], 'y': [2 * x + 0.3]}
diff --git a/demo/introduction/train.sh b/demo/introduction/train.sh
deleted file mode 100755
index 2ce6446d7c943ffc9bea8da43d153539f6f9f15f..0000000000000000000000000000000000000000
--- a/demo/introduction/train.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-paddle train \
-    --config=trainer_config.py \
-    --save_dir=./output \
-    --num_passes=30 \
-    2>&1 |tee 'train.log'
-paddle usage -l "train.log" -e $? -n "introduction" >/dev/null 2>&1
diff --git a/demo/introduction/trainer_config.py b/demo/introduction/trainer_config.py
deleted file mode 100644
index 651dfaa4b7b4873810a0b393655541a62d1a311b..0000000000000000000000000000000000000000
--- a/demo/introduction/trainer_config.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-# 1. read data. Suppose you saved above python code as dataprovider.py
-define_py_data_sources2(
-    train_list=['no_matter.txt'],
-    test_list=None,
-    module='dataprovider',
-    obj='process',
-    args={})
-
-# 2. learning algorithm
-settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
-
-# 3. Network configuration
-x = data_layer(name='x', size=1)
-y = data_layer(name='y', size=1)
-y_predict = fc_layer(
-    input=x,
-    param_attr=ParamAttr(name='w'),
-    size=1,
-    act=LinearActivation(),
-    bias_attr=ParamAttr(name='b'))
-cost = mse_cost(input=y_predict, label=y)
-outputs(cost)
diff --git a/demo/mnist/api_train_v2.py b/demo/mnist/api_train_v2.py
deleted file mode 100644
index 6b95a88042a13a280bcb80f753b3887fcef37296..0000000000000000000000000000000000000000
--- a/demo/mnist/api_train_v2.py
+++ /dev/null
@@ -1,137 +0,0 @@
-import paddle.v2 as paddle
-import gzip
-
-
-def softmax_regression(img):
-    predict = paddle.layer.fc(input=img,
-                              size=10,
-                              act=paddle.activation.Softmax())
-    return predict
-
-
-def multilayer_perceptron(img):
-    # The first fully-connected layer
-    hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu())
-    # The second fully-connected layer and the according activation function
-    hidden2 = paddle.layer.fc(input=hidden1,
-                              size=64,
-                              act=paddle.activation.Relu())
-    # The thrid fully-connected layer, note that the hidden size should be 10,
-    # which is the number of unique digits
-    predict = paddle.layer.fc(input=hidden2,
-                              size=10,
-                              act=paddle.activation.Softmax())
-    return predict
-
-
-def convolutional_neural_network(img):
-    # first conv layer
-    conv_pool_1 = paddle.networks.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        num_channel=1,
-        pool_size=2,
-        pool_stride=2,
-        act=paddle.activation.Tanh())
-    # second conv layer
-    conv_pool_2 = paddle.networks.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        num_channel=20,
-        pool_size=2,
-        pool_stride=2,
-        act=paddle.activation.Tanh())
-    # The first fully-connected layer
-    fc1 = paddle.layer.fc(input=conv_pool_2,
-                          size=128,
-                          act=paddle.activation.Tanh())
-    # The softmax layer, note that the hidden size should be 10,
-    # which is the number of unique digits
-    predict = paddle.layer.fc(input=fc1,
-                              size=10,
-                              act=paddle.activation.Softmax())
-    return predict
-
-
-def main():
-    paddle.init(use_gpu=False, trainer_count=1)
-
-    # define network topology
-    images = paddle.layer.data(
-        name='pixel', type=paddle.data_type.dense_vector(784))
-    label = paddle.layer.data(
-        name='label', type=paddle.data_type.integer_value(10))
-
-    # Here we can build the prediction network in different ways. Please
-    # choose one by uncomment corresponding line.
-    predict = softmax_regression(images)
-    #predict = multilayer_perceptron(images)
-    #predict = convolutional_neural_network(images)
-
-    cost = paddle.layer.classification_cost(input=predict, label=label)
-
-    try:
-        with gzip.open('params.tar.gz', 'r') as f:
-            parameters = paddle.parameters.Parameters.from_tar(f)
-    except IOError:
-        parameters = paddle.parameters.create(cost)
-
-    optimizer = paddle.optimizer.Momentum(
-        learning_rate=0.1 / 128.0,
-        momentum=0.9,
-        regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
-
-    trainer = paddle.trainer.SGD(cost=cost,
-                                 parameters=parameters,
-                                 update_equation=optimizer)
-
-    lists = []
-
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 1000 == 0:
-                print "Pass %d, Batch %d, Cost %f, %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics)
-
-                with gzip.open('params.tar.gz', 'w') as f:
-                    parameters.to_tar(f)
-
-        elif isinstance(event, paddle.event.EndPass):
-            result = trainer.test(reader=paddle.batch(
-                paddle.dataset.mnist.test(), batch_size=128))
-            print "Test with Pass %d, Cost %f, %s\n" % (
-                event.pass_id, result.cost, result.metrics)
-            lists.append((event.pass_id, result.cost,
-                          result.metrics['classification_error_evaluator']))
-
-    trainer.train(
-        reader=paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.mnist.train(), buf_size=8192),
-            batch_size=128),
-        event_handler=event_handler,
-        num_passes=100)
-
-    # find the best pass
-    best = sorted(lists, key=lambda list: float(list[1]))[0]
-    print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1])
-    print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100)
-
-    test_creator = paddle.dataset.mnist.test()
-    test_data = []
-    for item in test_creator():
-        test_data.append((item[0], ))
-        if len(test_data) == 100:
-            break
-
-    # output is a softmax layer. It returns probabilities.
-    # Shape should be (100, 10)
-    probs = paddle.infer(
-        output_layer=predict, parameters=parameters, input=test_data)
-    print probs.shape
-
-
-if __name__ == '__main__':
-    main()
diff --git a/demo/recommendation/.gitignore b/demo/recommendation/.gitignore
deleted file mode 100644
index fd27ef62a87cae51f2392c0eba50a44490d029af..0000000000000000000000000000000000000000
--- a/demo/recommendation/.gitignore
+++ /dev/null
@@ -1,10 +0,0 @@
-log.txt
-data/meta.bin
-data/ml-1m
-data/ratings.dat.train
-data/ratings.dat.test
-data/train.list
-data/test.list
-dataprovider_copy_1.py
-*.pyc
-output
diff --git a/demo/recommendation/api_train_v2.py b/demo/recommendation/api_train_v2.py
deleted file mode 100644
index f6a061799e3ac50236a68beedaf700dd6c698a05..0000000000000000000000000000000000000000
--- a/demo/recommendation/api_train_v2.py
+++ /dev/null
@@ -1,125 +0,0 @@
-import paddle.v2 as paddle
-import cPickle
-import copy
-
-
-def main():
-    paddle.init(use_gpu=False)
-    movie_title_dict = paddle.dataset.movielens.get_movie_title_dict()
-    uid = paddle.layer.data(
-        name='user_id',
-        type=paddle.data_type.integer_value(
-            paddle.dataset.movielens.max_user_id() + 1))
-    usr_emb = paddle.layer.embedding(input=uid, size=32)
-
-    usr_gender_id = paddle.layer.data(
-        name='gender_id', type=paddle.data_type.integer_value(2))
-    usr_gender_emb = paddle.layer.embedding(input=usr_gender_id, size=16)
-
-    usr_age_id = paddle.layer.data(
-        name='age_id',
-        type=paddle.data_type.integer_value(
-            len(paddle.dataset.movielens.age_table)))
-    usr_age_emb = paddle.layer.embedding(input=usr_age_id, size=16)
-
-    usr_job_id = paddle.layer.data(
-        name='job_id',
-        type=paddle.data_type.integer_value(paddle.dataset.movielens.max_job_id(
-        ) + 1))
-
-    usr_job_emb = paddle.layer.embedding(input=usr_job_id, size=16)
-
-    usr_combined_features = paddle.layer.fc(
-        input=[usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb],
-        size=200,
-        act=paddle.activation.Tanh())
-
-    mov_id = paddle.layer.data(
-        name='movie_id',
-        type=paddle.data_type.integer_value(
-            paddle.dataset.movielens.max_movie_id() + 1))
-    mov_emb = paddle.layer.embedding(input=mov_id, size=32)
-
-    mov_categories = paddle.layer.data(
-        name='category_id',
-        type=paddle.data_type.sparse_binary_vector(
-            len(paddle.dataset.movielens.movie_categories())))
-
-    mov_categories_hidden = paddle.layer.fc(input=mov_categories, size=32)
-
-    mov_title_id = paddle.layer.data(
-        name='movie_title',
-        type=paddle.data_type.integer_value_sequence(len(movie_title_dict)))
-    mov_title_emb = paddle.layer.embedding(input=mov_title_id, size=32)
-    mov_title_conv = paddle.networks.sequence_conv_pool(
-        input=mov_title_emb, hidden_size=32, context_len=3)
-
-    mov_combined_features = paddle.layer.fc(
-        input=[mov_emb, mov_categories_hidden, mov_title_conv],
-        size=200,
-        act=paddle.activation.Tanh())
-
-    inference = paddle.layer.cos_sim(
-        a=usr_combined_features, b=mov_combined_features, size=1, scale=5)
-    cost = paddle.layer.mse_cost(
-        input=inference,
-        label=paddle.layer.data(
-            name='score', type=paddle.data_type.dense_vector(1)))
-
-    parameters = paddle.parameters.create(cost)
-
-    trainer = paddle.trainer.SGD(cost=cost,
-                                 parameters=parameters,
-                                 update_equation=paddle.optimizer.Adam(
-                                     learning_rate=1e-4))
-    feeding = {
-        'user_id': 0,
-        'gender_id': 1,
-        'age_id': 2,
-        'job_id': 3,
-        'movie_id': 4,
-        'category_id': 5,
-        'movie_title': 6,
-        'score': 7
-    }
-
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 100 == 0:
-                print "Pass %d Batch %d Cost %.2f" % (
-                    event.pass_id, event.batch_id, event.cost)
-
-    trainer.train(
-        reader=paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.movielens.train(), buf_size=8192),
-            batch_size=256),
-        event_handler=event_handler,
-        feeding=feeding,
-        num_passes=1)
-
-    user_id = 234
-    movie_id = 345
-
-    user = paddle.dataset.movielens.user_info()[user_id]
-    movie = paddle.dataset.movielens.movie_info()[movie_id]
-
-    feature = user.value() + movie.value()
-
-    def reader():
-        yield feature
-
-    infer_dict = copy.copy(feeding)
-    del infer_dict['score']
-
-    prediction = paddle.infer(
-        output=inference,
-        parameters=parameters,
-        reader=paddle.batch(
-            reader, batch_size=32),
-        feeding=infer_dict)
-    print(prediction + 5) / 2
-
-
-if __name__ == '__main__':
-    main()
diff --git a/demo/recommendation/common_utils.py b/demo/recommendation/common_utils.py
deleted file mode 100755
index c20c65286621d701ad58409b539bbe9c813d453a..0000000000000000000000000000000000000000
--- a/demo/recommendation/common_utils.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer.PyDataProvider2 import *
-
-
-def meta_to_header(meta, name):
-    metas = meta[name]['__meta__']['raw_meta']
-    for each_meta in metas:
-        slot_name = each_meta.get('name', '%s_id' % name)
-        if each_meta['type'] == 'id':
-            yield slot_name, integer_value(each_meta['max'])
-        elif each_meta['type'] == 'embedding':
-            is_seq = each_meta['seq'] == 'sequence'
-            yield slot_name, integer_value(
-                len(each_meta['dict']),
-                seq_type=SequenceType.SEQUENCE
-                if is_seq else SequenceType.NO_SEQUENCE)
-        elif each_meta['type'] == 'one_hot_dense':
-            yield slot_name, dense_vector(len(each_meta['dict']))
diff --git a/demo/recommendation/data/config.json b/demo/recommendation/data/config.json
deleted file mode 100644
index f26e74ce47bb7843a571e6033f051c046b31f054..0000000000000000000000000000000000000000
--- a/demo/recommendation/data/config.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-  "user": {
-    "file": {
-      "name": "users.dat",
-      "delimiter": "::"
-    },
-    "fields": ["id", "gender", "age", "occupation"]
-  },
-  "movie": {
-    "file": {
-      "name": "movies.dat",
-      "delimiter": "::"
-    },
-    "fields": ["id", "title", "genres"]
-  }
-}
diff --git a/demo/recommendation/data/config_generator.py b/demo/recommendation/data/config_generator.py
deleted file mode 100644
index 4ca496a252dffc62ed62bb8f2a5ee1661a940580..0000000000000000000000000000000000000000
--- a/demo/recommendation/data/config_generator.py
+++ /dev/null
@@ -1,127 +0,0 @@
-#!/bin/env python2
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-config_generator.py
-
-Usage:
-    ./config_generator.py <config_file> [--output_format=<output_format>]
-    ./config_generator.py -h | --help
-
-Options:
-    -h --help                           Show this screen.
-    --output_format=<output_format>     Output Config format(json or yaml) [default: json].
-"""
-
-import json
-import docopt
-import copy
-
-DEFAULT_FILE = {"type": "split", "delimiter": ","}
-
-DEFAULT_FIELD = {
-    "id": {
-        "type": "id"
-    },
-    "gender": {
-        "name": "gender",
-        "type": "embedding",
-        "dict": {
-            "type": "char_based"
-        }
-    },
-    "age": {
-        "name": "age",
-        "type": "embedding",
-        "dict": {
-            "type": "whole_content",
-            "sort": True
-        }
-    },
-    "occupation": {
-        "name": "occupation",
-        "type": "embedding",
-        "dict": {
-            "type": "whole_content",
-            "sort": "true"
-        }
-    },
-    "title": {
-        "regex": {
-            "pattern": r"^(.*)\((\d+)\)$",
-            "group_id": 1,
-            "strip": True
-        },
-        "name": "title",
-        "type": {
-            "name": "embedding",
-            "seq_type": "sequence",
-        },
-        "dict": {
-            "type": "char_based"
-        }
-    },
-    "genres": {
-        "type": "one_hot_dense",
-        "dict": {
-            "type": "split",
-            "delimiter": "|"
-        },
-        "name": "genres"
-    }
-}
-
-
-def merge_dict(master_dict, slave_dict):
-    return dict(((k, master_dict.get(k) or slave_dict.get(k))
-                 for k in set(slave_dict) | set(master_dict)))
-
-
-def main(filename, fmt):
-    with open(filename, 'r') as f:
-        conf = json.load(f)
-        obj = dict()
-        for k in conf:
-            val = conf[k]
-            file_dict = val['file']
-            file_dict = merge_dict(file_dict, DEFAULT_FILE)
-
-            fields = []
-            for pos, field_key in enumerate(val['fields']):
-                assert isinstance(field_key, basestring)
-                field = copy.deepcopy(DEFAULT_FIELD[field_key])
-                field['pos'] = pos
-                fields.append(field)
-            obj[k] = {"file": file_dict, "fields": fields}
-    meta = {"meta": obj}
-    # print meta
-    if fmt == 'json':
-
-        def formatter(x):
-            import json
-            return json.dumps(x, indent=2)
-    elif fmt == 'yaml':
-
-        def formatter(x):
-            import yaml
-            return yaml.safe_dump(x, default_flow_style=False)
-    else:
-        raise NotImplementedError("Dump format %s is not implemented" % fmt)
-
-    print formatter(meta)
-
-
-if __name__ == '__main__':
-    args = docopt.docopt(__doc__, version="0.1.0")
-    main(args["<config_file>"], args["--output_format"])
diff --git a/demo/recommendation/data/meta_config.json b/demo/recommendation/data/meta_config.json
deleted file mode 100644
index cc6a046e271dd0faaa47eeb5a5bef6d3604113fe..0000000000000000000000000000000000000000
--- a/demo/recommendation/data/meta_config.json
+++ /dev/null
@@ -1,81 +0,0 @@
-{
-  "meta": {
-    "movie": {
-      "fields": [
-        {
-          "type": "id", 
-          "pos": 0
-        }, 
-        {
-          "regex": {
-            "pattern": "^(.*)\\((\\d+)\\)$", 
-            "group_id": 1, 
-            "strip": true
-          }, 
-          "type": {
-            "seq_type": "sequence", 
-            "name": "embedding"
-          }, 
-          "dict": {
-            "type": "char_based"
-          }, 
-          "name": "title", 
-          "pos": 1
-        }, 
-        {
-          "type": "one_hot_dense", 
-          "dict": {
-            "delimiter": "|", 
-            "type": "split"
-          }, 
-          "name": "genres", 
-          "pos": 2
-        }
-      ], 
-      "file": {
-        "delimiter": "::", 
-        "type": "split", 
-        "name": "movies.dat"
-      }
-    }, 
-    "user": {
-      "fields": [
-        {
-          "type": "id", 
-          "pos": 0
-        }, 
-        {
-          "type": "embedding", 
-          "dict": {
-            "type": "char_based"
-          }, 
-          "name": "gender", 
-          "pos": 1
-        }, 
-        {
-          "type": "embedding", 
-          "dict": {
-            "sort": true, 
-            "type": "whole_content"
-          }, 
-          "name": "age", 
-          "pos": 2
-        }, 
-        {
-          "type": "embedding", 
-          "dict": {
-            "sort": "true", 
-            "type": "whole_content"
-          }, 
-          "name": "occupation", 
-          "pos": 3
-        }
-      ], 
-      "file": {
-        "delimiter": "::", 
-        "type": "split", 
-        "name": "users.dat"
-      }
-    }
-  }
-}
diff --git a/demo/recommendation/data/meta_generator.py b/demo/recommendation/data/meta_generator.py
deleted file mode 100644
index 38e4679d266c331a751114cd13f0e3453016cf26..0000000000000000000000000000000000000000
--- a/demo/recommendation/data/meta_generator.py
+++ /dev/null
@@ -1,430 +0,0 @@
-#!/bin/env python2
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Preprocess Movielens dataset, to get movie/user object.
-
-Usage:
-    ./preprocess.py <dataset_dir> <binary_filename> [--config=<config_file>]
-    ./preprocess.py -h | --help
-
-Options:
-    -h --help               Show this screen.
-    --version               Show version.
-    --config=<config_file>  Get MetaData config file [default: config.json].
-"""
-import docopt
-import os
-import sys
-import re
-import collections
-
-try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
-
-
-class UniqueIDGenerator(object):
-    def __init__(self):
-        self.pool = collections.defaultdict(self.__next_id__)
-        self.next_id = 0
-
-    def __next_id__(self):
-        tmp = self.next_id
-        self.next_id += 1
-        return tmp
-
-    def __call__(self, k):
-        return self.pool[k]
-
-    def to_list(self):
-        ret_val = [None] * len(self.pool)
-        for k in self.pool.keys():
-            ret_val[self.pool[k]] = k
-        return ret_val
-
-
-class SortedIDGenerator(object):
-    def __init__(self):
-        self.__key_set__ = set()
-        self.dict = None
-
-    def scan(self, key):
-        self.__key_set__.add(key)
-
-    def finish_scan(self, compare=None, key=None, reverse=False):
-        self.__key_set__ = sorted(
-            list(self.__key_set__), cmp=compare, key=key, reverse=reverse)
-        self.dict = dict()
-        for idx, each_key in enumerate(self.__key_set__):
-            self.dict[each_key] = idx
-
-    def __call__(self, key):
-        return self.dict[key]
-
-    def to_list(self):
-        return self.__key_set__
-
-
-class SplitFileReader(object):
-    def __init__(self, work_dir, config):
-        assert isinstance(config, dict)
-        self.filename = config['name']
-        self.delimiter = config.get('delimiter', ',')
-        self.work_dir = work_dir
-
-    def read(self):
-        with open(os.path.join(self.work_dir, self.filename), 'r') as f:
-            for line in f:
-                line = line.strip()
-                if isinstance(self.delimiter, unicode):
-                    self.delimiter = str(self.delimiter)
-                yield line.split(self.delimiter)
-
-    @staticmethod
-    def create(work_dir, config):
-        assert isinstance(config, dict)
-        if config['type'] == 'split':
-            return SplitFileReader(work_dir, config)
-
-
-class IFileReader(object):
-    READERS = [SplitFileReader]
-
-    def read(self):
-        raise NotImplementedError()
-
-    @staticmethod
-    def create(work_dir, config):
-        for reader_cls in IFileReader.READERS:
-            val = reader_cls.create(work_dir, config)
-            if val is not None:
-                return val
-
-
-class IDFieldParser(object):
-    TYPE = 'id'
-
-    def __init__(self, config):
-        self.__max_id__ = -sys.maxint - 1
-        self.__min_id__ = sys.maxint
-        self.__id_count__ = 0
-
-    def scan(self, line):
-        idx = int(line)
-        self.__max_id__ = max(self.__max_id__, idx)
-        self.__min_id__ = min(self.__min_id__, idx)
-        self.__id_count__ += 1
-
-    def parse(self, line):
-        return int(line)
-
-    def meta_field(self):
-        return {
-            "is_key": True,
-            'max': self.__max_id__,
-            'min': self.__min_id__,
-            'count': self.__id_count__,
-            'type': 'id'
-        }
-
-
-class SplitEmbeddingDict(object):
-    def __init__(self, delimiter):
-        self.__id__ = UniqueIDGenerator()
-        self.delimiter = delimiter
-
-    def scan(self, multi):
-        for val in multi.split(self.delimiter):
-            self.__id__(val)
-
-    def parse(self, multi):
-        return map(self.__id__, multi.split(self.delimiter))
-
-    def meta_field(self):
-        return self.__id__.to_list()
-
-
-class EmbeddingFieldParser(object):
-    TYPE = 'embedding'
-
-    NO_SEQUENCE = "no_sequence"
-    SEQUENCE = "sequence"
-
-    class CharBasedEmbeddingDict(object):
-        def __init__(self, is_seq=True):
-            self.__id__ = UniqueIDGenerator()
-            self.is_seq = is_seq
-
-        def scan(self, s):
-            for ch in s:
-                self.__id__(ch)
-
-        def parse(self, s):
-            return map(self.__id__, s) if self.is_seq else self.__id__(s[0])
-
-        def meta_field(self):
-            return self.__id__.to_list()
-
-    class WholeContentDict(object):
-        def __init__(self, need_sort=True):
-            assert need_sort
-            self.__id__ = SortedIDGenerator()
-            self.__has_finished__ = False
-
-        def scan(self, txt):
-            self.__id__.scan(txt)
-
-        def meta_field(self):
-            if not self.__has_finished__:
-                self.__id__.finish_scan()
-                self.__has_finished__ = True
-            return self.__id__.to_list()
-
-        def parse(self, txt):
-            return self.__id__(txt)
-
-    def __init__(self, config):
-        try:
-            self.seq_type = config['type']['seq_type']
-        except TypeError:
-            self.seq_type = EmbeddingFieldParser.NO_SEQUENCE
-
-        if config['dict']['type'] == 'char_based':
-            self.dict = EmbeddingFieldParser.CharBasedEmbeddingDict(
-                self.seq_type == EmbeddingFieldParser.SEQUENCE)
-        elif config['dict']['type'] == 'split':
-            self.dict = SplitEmbeddingDict(config['dict'].get('delimiter', ','))
-        elif config['dict']['type'] == 'whole_content':
-            self.dict = EmbeddingFieldParser.WholeContentDict(config['dict'][
-                'sort'])
-        else:
-            print config
-            assert False
-
-        self.name = config['name']
-
-    def scan(self, s):
-        self.dict.scan(s)
-
-    def meta_field(self):
-        return {
-            'name': self.name,
-            'dict': self.dict.meta_field(),
-            'type': 'embedding',
-            'seq': self.seq_type
-        }
-
-    def parse(self, s):
-        return self.dict.parse(s)
-
-
-class OneHotDenseFieldParser(object):
-    TYPE = 'one_hot_dense'
-
-    def __init__(self, config):
-        if config['dict']['type'] == 'split':
-            self.dict = SplitEmbeddingDict(config['dict']['delimiter'])
-        self.name = config['name']
-
-    def scan(self, s):
-        self.dict.scan(s)
-
-    def meta_field(self):
-        # print self.dict.meta_field()
-        return {
-            'dict': self.dict.meta_field(),
-            'name': self.name,
-            'type': 'one_hot_dense'
-        }
-
-    def parse(self, s):
-        ids = self.dict.parse(s)
-        retv = [0.0] * len(self.dict.meta_field())
-        for idx in ids:
-            retv[idx] = 1.0
-        # print retv
-        return retv
-
-
-class FieldParserFactory(object):
-    PARSERS = [IDFieldParser, EmbeddingFieldParser, OneHotDenseFieldParser]
-
-    @staticmethod
-    def create(config):
-        if isinstance(config['type'], basestring):
-            config_type = config['type']
-        elif isinstance(config['type'], dict):
-            config_type = config['type']['name']
-
-        assert config_type is not None
-
-        for each_parser_cls in FieldParserFactory.PARSERS:
-            if config_type == each_parser_cls.TYPE:
-                return each_parser_cls(config)
-        print config
-
-
-class CompositeFieldParser(object):
-    def __init__(self, parser, extractor):
-        self.extractor = extractor
-        self.parser = parser
-
-    def scan(self, *args, **kwargs):
-        self.parser.scan(self.extractor.extract(*args, **kwargs))
-
-    def parse(self, *args, **kwargs):
-        return self.parser.parse(self.extractor.extract(*args, **kwargs))
-
-    def meta_field(self):
-        return self.parser.meta_field()
-
-
-class PositionContentExtractor(object):
-    def __init__(self, pos):
-        self.pos = pos
-
-    def extract(self, line):
-        assert isinstance(line, list)
-        return line[self.pos]
-
-
-class RegexPositionContentExtractor(PositionContentExtractor):
-    def __init__(self, pos, pattern, group_id, strip=True):
-        PositionContentExtractor.__init__(self, pos)
-        pattern = pattern.strip()
-        self.pattern = re.compile(pattern)
-        self.group_id = group_id
-        self.strip = strip
-
-    def extract(self, line):
-        line = PositionContentExtractor.extract(self, line)
-        match = self.pattern.match(line)
-        # print line, self.pattern.pattern, match
-        assert match is not None
-        txt = match.group(self.group_id)
-        if self.strip:
-            txt.strip()
-        return txt
-
-
-class ContentExtractorFactory(object):
-    def extract(self, line):
-        pass
-
-    @staticmethod
-    def create(config):
-        if 'pos' in config:
-            if 'regex' not in config:
-                return PositionContentExtractor(config['pos'])
-            else:
-                extra_args = config['regex']
-                return RegexPositionContentExtractor(
-                    pos=config['pos'], **extra_args)
-
-
-class MetaFile(object):
-    def __init__(self, work_dir):
-        self.work_dir = work_dir
-        self.obj = dict()
-
-    def parse(self, config):
-        config = config['meta']
-
-        ret_obj = dict()
-        for key in config.keys():
-            val = config[key]
-            assert 'file' in val
-            reader = IFileReader.create(self.work_dir, val['file'])
-            assert reader is not None
-            assert 'fields' in val and isinstance(val['fields'], list)
-            fields_config = val['fields']
-            field_parsers = map(MetaFile.__field_config_mapper__, fields_config)
-
-            for each_parser in field_parsers:
-                assert each_parser is not None
-
-            for each_block in reader.read():
-                for each_parser in field_parsers:
-                    each_parser.scan(each_block)
-
-            metas = map(lambda x: x.meta_field(), field_parsers)
-            # print metas
-            key_index = filter(
-                lambda x: x is not None,
-                map(lambda (idx, meta): idx if 'is_key' in meta and meta['is_key'] else None,
-                    enumerate(metas)))[0]
-
-            key_map = []
-            for i in range(min(key_index, len(metas))):
-                key_map.append(i)
-            for i in range(key_index + 1, len(metas)):
-                key_map.append(i)
-
-            obj = {'__meta__': {'raw_meta': metas, 'feature_map': key_map}}
-
-            for each_block in reader.read():
-                idx = field_parsers[key_index].parse(each_block)
-                val = []
-                for i, each_parser in enumerate(field_parsers):
-                    if i != key_index:
-                        val.append(each_parser.parse(each_block))
-                obj[idx] = val
-            ret_obj[key] = obj
-        self.obj = ret_obj
-        return ret_obj
-
-    @staticmethod
-    def __field_config_mapper__(conf):
-        assert isinstance(conf, dict)
-        extrator = ContentExtractorFactory.create(conf)
-        field_parser = FieldParserFactory.create(conf)
-        assert extrator is not None
-        assert field_parser is not None
-        return CompositeFieldParser(field_parser, extrator)
-
-    def dump(self, fp):
-        pickle.dump(self.obj, fp, pickle.HIGHEST_PROTOCOL)
-
-
-def preprocess(binary_filename, dataset_dir, config, **kwargs):
-    assert isinstance(config, str)
-    with open(config, 'r') as config_file:
-        file_loader = None
-        if config.lower().endswith('.yaml'):
-            import yaml
-            file_loader = yaml
-        elif config.lower().endswith('.json'):
-            import json
-            file_loader = json
-        config = file_loader.load(config_file)
-    meta = MetaFile(dataset_dir)
-    meta.parse(config)
-    with open(binary_filename, 'wb') as outf:
-        meta.dump(outf)
-
-
-if __name__ == '__main__':
-    args = docopt.docopt(__doc__, version='0.1.0')
-    kwargs = dict()
-    for key in args.keys():
-        if key != '--help':
-            param_name = key
-            assert isinstance(param_name, str)
-            param_name = param_name.replace('<', '')
-            param_name = param_name.replace('>', '')
-            param_name = param_name.replace('--', '')
-            kwargs[param_name] = args[key]
-    preprocess(**kwargs)
diff --git a/demo/recommendation/data/ml_data.sh b/demo/recommendation/data/ml_data.sh
deleted file mode 100755
index 2268d876389e0bdf5ead405e74d278d276626f82..0000000000000000000000000000000000000000
--- a/demo/recommendation/data/ml_data.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -ex
-cd "$(dirname "$0")"
-# download the dataset
-wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
-# unzip the dataset
-unzip ml-1m.zip
-# remove the unused zip file
-rm ml-1m.zip
diff --git a/demo/recommendation/data/split.py b/demo/recommendation/data/split.py
deleted file mode 100644
index be6869c22f04be1db0f8e9c35c73c851e4c490b0..0000000000000000000000000000000000000000
--- a/demo/recommendation/data/split.py
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/bin/env python2
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Separate movielens 1m dataset to train/test file.
-
-Usage:
-    ./separate.py <input_file> [--test_ratio=<test_ratio>] [--delimiter=<delimiter>]
-    ./separate.py -h | --help
-
-Options:
-    -h --help                       Show this screen.
-    --version                       Show version.
-    --test_ratio=<test_ratio>       Test ratio for separate [default: 0.1].
-    --delimiter=<delimiter>         File delimiter [default: ,].
-"""
-import docopt
-import collections
-import random
-
-
-def process(test_ratio, input_file, delimiter, **kwargs):
-    test_ratio = float(test_ratio)
-    rating_dict = collections.defaultdict(list)
-    with open(input_file, 'r') as f:
-        for line in f:
-            user_id = int(line.split(delimiter)[0])
-            rating_dict[user_id].append(line.strip())
-
-    with open(input_file + ".train", 'w') as train_file:
-        with open(input_file + ".test", 'w') as test_file:
-            for k in rating_dict.keys():
-                lines = rating_dict[k]
-                assert isinstance(lines, list)
-                random.shuffle(lines)
-                test_len = int(len(lines) * test_ratio)
-                for line in lines[:test_len]:
-                    print >> test_file, line
-
-                for line in lines[test_len:]:
-                    print >> train_file, line
-
-
-if __name__ == '__main__':
-    args = docopt.docopt(__doc__, version='0.1.0')
-    kwargs = dict()
-    for key in args.keys():
-        if key != '--help':
-            param_name = key
-            assert isinstance(param_name, str)
-            param_name = param_name.replace('<', '')
-            param_name = param_name.replace('>', '')
-            param_name = param_name.replace('--', '')
-            kwargs[param_name] = args[key]
-    process(**kwargs)
diff --git a/demo/recommendation/dataprovider.py b/demo/recommendation/dataprovider.py
deleted file mode 100755
index c4ff96d80e81926049c9a71d6d9d991c0b568c25..0000000000000000000000000000000000000000
--- a/demo/recommendation/dataprovider.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.PyDataProvider2 import *
-import common_utils  # parse
-
-
-def __list_to_map__(lst):
-    ret_val = dict()
-    for each in lst:
-        k, v = each
-        ret_val[k] = v
-    return ret_val
-
-
-def hook(settings, meta, **kwargs):
-    """
-    Init hook is invoked before process data. It will set obj.slots and store
-    data meta.
-
-    :param obj: global object. It will passed to process routine.
-    :type obj: object
-    :param meta: the meta file object, which passed from trainer_config. Meta
-                 file record movie/user features.
-    :param kwargs: unused other arguments.
-    """
-    del kwargs  # unused kwargs
-
-    # Header define slots that used for paddle.
-    #    first part is movie features.
-    #    second part is user features.
-    #    final part is rating score.
-    # header is a list of [USE_SEQ_OR_NOT?, SlotType]
-    movie_headers = list(common_utils.meta_to_header(meta, 'movie'))
-    settings.movie_names = [h[0] for h in movie_headers]
-    headers = movie_headers
-    user_headers = list(common_utils.meta_to_header(meta, 'user'))
-    settings.user_names = [h[0] for h in user_headers]
-    headers.extend(user_headers)
-    headers.append(("rating", dense_vector(1)))  # Score
-
-    # slot types.
-    settings.input_types = __list_to_map__(headers)
-    settings.meta = meta
-
-
-@provider(init_hook=hook, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, filename):
-    with open(filename, 'r') as f:
-        for line in f:
-            # Get a rating from file.
-            user_id, movie_id, score = map(int, line.split('::')[:-1])
-
-            # Scale score to [-5, +5]
-            score = float(score) * 2 - 5.0
-
-            # Get movie/user features by movie_id, user_id
-            movie_meta = settings.meta['movie'][movie_id]
-            user_meta = settings.meta['user'][user_id]
-
-            outputs = [('movie_id', movie_id - 1)]
-
-            # Then add movie features
-            for i, each_meta in enumerate(movie_meta):
-                outputs.append((settings.movie_names[i + 1], each_meta))
-
-            # Then add user id.
-            outputs.append(('user_id', user_id - 1))
-
-            # Then add user features.
-            for i, each_meta in enumerate(user_meta):
-                outputs.append((settings.user_names[i + 1], each_meta))
-
-            # Finally, add score
-            outputs.append(('rating', [score]))
-            # Return data to paddle
-            yield __list_to_map__(outputs)
diff --git a/demo/recommendation/evaluate.py b/demo/recommendation/evaluate.py
deleted file mode 100755
index 3afa7a1e9db5fefb1bbf5aaa174b8168afae4058..0000000000000000000000000000000000000000
--- a/demo/recommendation/evaluate.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/usr/bin/python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import sys
-import re
-import math
-
-
-def get_best_pass(log_filename):
-    with open(log_filename, 'r') as f:
-        text = f.read()
-        pattern = re.compile('Test.*? cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)',
-                             re.S)
-        results = re.findall(pattern, text)
-        sorted_results = sorted(results, key=lambda result: float(result[0]))
-        return sorted_results[0]
-
-
-log_filename = sys.argv[1]
-log = get_best_pass(log_filename)
-predict_error = math.sqrt(float(log[0])) / 2
-print 'Best pass is %s, error is %s, which means predict get error as %f' % (
-    log[1], log[0], predict_error)
-
-evaluate_pass = "output/pass-%s" % log[1]
-print "evaluating from pass %s" % evaluate_pass
diff --git a/demo/recommendation/evaluate.sh b/demo/recommendation/evaluate.sh
deleted file mode 100755
index 02b2857de028bc9c05d7ddd67012043b671b2764..0000000000000000000000000000000000000000
--- a/demo/recommendation/evaluate.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-function get_best_pass() {
-  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | sed  -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | sort | head -n 1
-}
-
-LOG=`get_best_pass log.txt`
-LOG=(${LOG})
-echo 'Best pass is '${LOG[1]}, ' error is '${LOG[0]}, 'which means predict get error as '`echo ${LOG[0]} | python -c 'import math; print math.sqrt(float(raw_input()))/2'`
-
-evaluate_pass="output/pass-${LOG[1]}"
-
-echo 'evaluating from pass '$evaluate_pass
diff --git a/demo/recommendation/prediction.py b/demo/recommendation/prediction.py
deleted file mode 100755
index 8ad993eab3a9f637cfff752bfedbbc62eaf3c8d5..0000000000000000000000000000000000000000
--- a/demo/recommendation/prediction.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/bin/env python2
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from py_paddle import swig_paddle, DataProviderConverter
-
-from common_utils import *
-from paddle.trainer.config_parser import parse_config
-
-try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
-import sys
-
-if __name__ == '__main__':
-    model_path = sys.argv[1]
-    swig_paddle.initPaddle('--use_gpu=0')
-    conf = parse_config("trainer_config.py", "is_predict=1")
-    network = swig_paddle.GradientMachine.createFromConfigProto(
-        conf.model_config)
-    assert isinstance(network, swig_paddle.GradientMachine)
-    network.loadParameters(model_path)
-    with open('./data/meta.bin', 'rb') as f:
-        meta = pickle.load(f)
-        headers = [h[1] for h in meta_to_header(meta, 'movie')]
-        headers.extend([h[1] for h in meta_to_header(meta, 'user')])
-        cvt = DataProviderConverter(headers)
-        while True:
-            movie_id = int(raw_input("Input movie_id: "))
-            user_id = int(raw_input("Input user_id: "))
-            movie_meta = meta['movie'][movie_id]  # Query Data From Meta.
-            user_meta = meta['user'][user_id]
-            data = [movie_id - 1]
-            data.extend(movie_meta)
-            data.append(user_id - 1)
-            data.extend(user_meta)
-            print "Prediction Score is %.2f" % (
-                (network.forwardTest(cvt.convert([data]))[0]['value'][0][0] + 5)
-                / 2)
diff --git a/demo/recommendation/preprocess.sh b/demo/recommendation/preprocess.sh
deleted file mode 100755
index eeb81ce3cb47e65c0aeb303e7571024ba82dad65..0000000000000000000000000000000000000000
--- a/demo/recommendation/preprocess.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-UNAME_STR=`uname`
-
-if [[ ${UNAME_STR} == 'Linux' ]]; then
-	SHUF_PROG='shuf'
-else
-	SHUF_PROG='gshuf'
-fi
-
-
-cd "$(dirname "$0")"
-delimiter='::'
-dir=ml-1m
-cd data
-echo 'generate meta config file'
-python config_generator.py config.json > meta_config.json
-echo 'generate meta file'
-python meta_generator.py $dir meta.bin --config=meta_config.json
-echo 'split train/test file'
-python split.py $dir/ratings.dat --delimiter=${delimiter} --test_ratio=0.1
-echo 'shuffle train file'
-${SHUF_PROG} $dir/ratings.dat.train > ratings.dat.train
-cp $dir/ratings.dat.test .
-echo "./data/ratings.dat.train" > train.list
-echo "./data/ratings.dat.test" > test.list
diff --git a/demo/recommendation/requirements.txt b/demo/recommendation/requirements.txt
deleted file mode 100644
index 1ea154584a428b6a389309f1f8def502e0aadfce..0000000000000000000000000000000000000000
--- a/demo/recommendation/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-PyYAML
-docopt
diff --git a/demo/recommendation/run.sh b/demo/recommendation/run.sh
deleted file mode 100755
index 22aef556082ba429e9ca7c6dd3ec72699b9dbcf4..0000000000000000000000000000000000000000
--- a/demo/recommendation/run.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-paddle train \
-    --config=trainer_config.py \
-    --save_dir=./output \
-    --use_gpu=false \
-    --trainer_count=4\
-    --test_all_data_in_one_period=true \
-    --log_period=100 \
-    --dot_period=1 \
-    --num_passes=50  2>&1 | tee 'log.txt'
-paddle usage -l log.txt -e $? -n "recommendation" >/dev/null 2>&1
diff --git a/demo/recommendation/trainer_config.py b/demo/recommendation/trainer_config.py
deleted file mode 100755
index 25f529d7d7c430f179107fb189ade34760ab309d..0000000000000000000000000000000000000000
--- a/demo/recommendation/trainer_config.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
-
-is_predict = get_config_arg('is_predict', bool, False)
-
-META_FILE = 'data/meta.bin'
-
-with open(META_FILE, 'rb') as f:
-    # load meta file
-    meta = pickle.load(f)
-
-settings(
-    batch_size=1600, learning_rate=1e-3, learning_method=RMSPropOptimizer())
-
-
-def construct_feature(name):
-    """
-    Construct movie/user features.
-
-    This method read from meta data. Then convert feature to neural network due
-    to feature type. The map relation as follow.
-
-    * id: embedding => fc
-    * embedding:
-        is_sequence:  embedding => context_projection => fc => pool
-        not sequence: embedding => fc
-    * one_hot_dense:  fc => fc
-
-    Then gather all features vector, and use a fc layer to combined them as
-    return.
-
-    :param name: 'movie' or 'user'
-    :type name: basestring
-    :return: combined feature output
-    :rtype: LayerOutput
-    """
-    __meta__ = meta[name]['__meta__']['raw_meta']
-    fusion = []
-    for each_meta in __meta__:
-        type_name = each_meta['type']
-        slot_name = each_meta.get('name', '%s_id' % name)
-        if type_name == 'id':
-            slot_dim = each_meta['max']
-            embedding = embedding_layer(
-                input=data_layer(
-                    slot_name, size=slot_dim), size=256)
-            fusion.append(fc_layer(input=embedding, size=256))
-        elif type_name == 'embedding':
-            is_seq = each_meta['seq'] == 'sequence'
-            slot_dim = len(each_meta['dict'])
-            din = data_layer(slot_name, slot_dim)
-            embedding = embedding_layer(input=din, size=256)
-            if is_seq:
-                fusion.append(
-                    text_conv_pool(
-                        input=embedding, context_len=5, hidden_size=256))
-            else:
-                fusion.append(fc_layer(input=embedding, size=256))
-        elif type_name == 'one_hot_dense':
-            slot_dim = len(each_meta['dict'])
-            hidden = fc_layer(input=data_layer(slot_name, slot_dim), size=256)
-            fusion.append(fc_layer(input=hidden, size=256))
-
-    return fc_layer(name="%s_fusion" % name, input=fusion, size=256)
-
-
-movie_feature = construct_feature("movie")
-user_feature = construct_feature("user")
-similarity = cos_sim(a=movie_feature, b=user_feature)
-if not is_predict:
-    outputs(mse_cost(input=similarity, label=data_layer('rating', size=1)))
-
-    define_py_data_sources2(
-        'data/train.list',
-        'data/test.list',
-        module='dataprovider',
-        obj='process',
-        args={'meta': meta})
-else:
-    outputs(similarity)
diff --git a/demo/semantic_role_labeling/.gitignore b/demo/semantic_role_labeling/.gitignore
deleted file mode 100644
index 65c9b674c7d1dad53b7d1c6ee1dcbdb72553888d..0000000000000000000000000000000000000000
--- a/demo/semantic_role_labeling/.gitignore
+++ /dev/null
@@ -1,14 +0,0 @@
-*.pyc
-train.log
-data/feature
-data/conll05st-release/
-data/src.dict
-data/test.wsj.props
-data/test.wsj.seq_pair
-data/test.wsj.words
-data/tgt.dict
-output
-data/emb
-data/targetDict.txt
-data/verbDict.txt
-data/wordDict.txt
diff --git a/demo/semantic_role_labeling/api_train_v2.py b/demo/semantic_role_labeling/api_train_v2.py
deleted file mode 100644
index 036cad4b0a32357bb42580ef577a1eba558be8fe..0000000000000000000000000000000000000000
--- a/demo/semantic_role_labeling/api_train_v2.py
+++ /dev/null
@@ -1,190 +0,0 @@
-import sys
-import math
-import numpy as np
-import paddle.v2 as paddle
-import paddle.v2.dataset.conll05 as conll05
-
-
-def db_lstm():
-    word_dict, verb_dict, label_dict = conll05.get_dict()
-    word_dict_len = len(word_dict)
-    label_dict_len = len(label_dict)
-    pred_len = len(verb_dict)
-
-    mark_dict_len = 2
-    word_dim = 32
-    mark_dim = 5
-    hidden_dim = 512
-    depth = 8
-
-    #8 features
-    def d_type(size):
-        return paddle.data_type.integer_value_sequence(size)
-
-    word = paddle.layer.data(name='word_data', type=d_type(word_dict_len))
-    predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len))
-
-    ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len))
-    ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len))
-    ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len))
-    ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len))
-    ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len))
-    mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len))
-
-    target = paddle.layer.data(name='target', type=d_type(label_dict_len))
-
-    default_std = 1 / math.sqrt(hidden_dim) / 3.0
-
-    emb_para = paddle.attr.Param(name='emb', initial_std=0., learning_rate=0.)
-    std_0 = paddle.attr.Param(initial_std=0.)
-    std_default = paddle.attr.Param(initial_std=default_std)
-
-    predicate_embedding = paddle.layer.embedding(
-        size=word_dim,
-        input=predicate,
-        param_attr=paddle.attr.Param(
-            name='vemb', initial_std=default_std))
-    mark_embedding = paddle.layer.embedding(
-        size=mark_dim, input=mark, param_attr=std_0)
-
-    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
-    emb_layers = [
-        paddle.layer.embedding(
-            size=word_dim, input=x, param_attr=emb_para) for x in word_input
-    ]
-    emb_layers.append(predicate_embedding)
-    emb_layers.append(mark_embedding)
-
-    hidden_0 = paddle.layer.mixed(
-        size=hidden_dim,
-        bias_attr=std_default,
-        input=[
-            paddle.layer.full_matrix_projection(
-                input=emb, param_attr=std_default) for emb in emb_layers
-        ])
-
-    mix_hidden_lr = 1e-3
-    lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0)
-    hidden_para_attr = paddle.attr.Param(
-        initial_std=default_std, learning_rate=mix_hidden_lr)
-
-    lstm_0 = paddle.layer.lstmemory(
-        input=hidden_0,
-        act=paddle.activation.Relu(),
-        gate_act=paddle.activation.Sigmoid(),
-        state_act=paddle.activation.Sigmoid(),
-        bias_attr=std_0,
-        param_attr=lstm_para_attr)
-
-    #stack L-LSTM and R-LSTM with direct edges
-    input_tmp = [hidden_0, lstm_0]
-
-    for i in range(1, depth):
-        mix_hidden = paddle.layer.mixed(
-            size=hidden_dim,
-            bias_attr=std_default,
-            input=[
-                paddle.layer.full_matrix_projection(
-                    input=input_tmp[0], param_attr=hidden_para_attr),
-                paddle.layer.full_matrix_projection(
-                    input=input_tmp[1], param_attr=lstm_para_attr)
-            ])
-
-        lstm = paddle.layer.lstmemory(
-            input=mix_hidden,
-            act=paddle.activation.Relu(),
-            gate_act=paddle.activation.Sigmoid(),
-            state_act=paddle.activation.Sigmoid(),
-            reverse=((i % 2) == 1),
-            bias_attr=std_0,
-            param_attr=lstm_para_attr)
-
-        input_tmp = [mix_hidden, lstm]
-
-    feature_out = paddle.layer.mixed(
-        size=label_dict_len,
-        bias_attr=std_default,
-        input=[
-            paddle.layer.full_matrix_projection(
-                input=input_tmp[0], param_attr=hidden_para_attr),
-            paddle.layer.full_matrix_projection(
-                input=input_tmp[1], param_attr=lstm_para_attr)
-        ], )
-
-    crf_cost = paddle.layer.crf(size=label_dict_len,
-                                input=feature_out,
-                                label=target,
-                                param_attr=paddle.attr.Param(
-                                    name='crfw',
-                                    initial_std=default_std,
-                                    learning_rate=mix_hidden_lr))
-
-    crf_dec = paddle.layer.crf_decoding(
-        name='crf_dec_l',
-        size=label_dict_len,
-        input=feature_out,
-        label=target,
-        param_attr=paddle.attr.Param(name='crfw'))
-
-    return crf_cost, crf_dec
-
-
-def load_parameter(file_name, h, w):
-    with open(file_name, 'rb') as f:
-        f.read(16)  # skip header.
-        return np.fromfile(f, dtype=np.float32).reshape(h, w)
-
-
-def main():
-    paddle.init(use_gpu=False, trainer_count=1)
-
-    # define network topology
-    crf_cost, crf_dec = db_lstm()
-
-    # create parameters
-    parameters = paddle.parameters.create([crf_cost, crf_dec])
-
-    # create optimizer
-    optimizer = paddle.optimizer.Momentum(
-        momentum=0,
-        learning_rate=2e-2,
-        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
-        model_average=paddle.optimizer.ModelAverage(
-            average_window=0.5, max_average_window=10000), )
-
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 100 == 0:
-                print "Pass %d, Batch %d, Cost %f, %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics)
-
-    trainer = paddle.trainer.SGD(cost=crf_cost,
-                                 parameters=parameters,
-                                 update_equation=optimizer)
-    parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32))
-
-    trn_reader = paddle.batch(
-        paddle.reader.shuffle(
-            conll05.test(), buf_size=8192), batch_size=10)
-
-    feeding = {
-        'word_data': 0,
-        'ctx_n2_data': 1,
-        'ctx_n1_data': 2,
-        'ctx_0_data': 3,
-        'ctx_p1_data': 4,
-        'ctx_p2_data': 5,
-        'verb_data': 6,
-        'mark_data': 7,
-        'target': 8
-    }
-
-    trainer.train(
-        reader=trn_reader,
-        event_handler=event_handler,
-        num_passes=10000,
-        feeding=feeding)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/demo/semantic_role_labeling/data/extract_dict_feature.py b/demo/semantic_role_labeling/data/extract_dict_feature.py
deleted file mode 100644
index da44111976a0dec68345fc139d0aa459ca9211c2..0000000000000000000000000000000000000000
--- a/demo/semantic_role_labeling/data/extract_dict_feature.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import os
-from optparse import OptionParser
-
-
-def extract_dict_features(pair_file, feature_file):
-
-    with open(pair_file) as fin, open(feature_file, 'w') as feature_out:
-        for line in fin:
-            sentence, predicate, labels = line.strip().split('\t')
-            sentence_list = sentence.split()
-            labels_list = labels.split()
-
-            verb_index = labels_list.index('B-V')
-
-            mark = [0] * len(labels_list)
-            if verb_index > 0:
-                mark[verb_index - 1] = 1
-                ctx_n1 = sentence_list[verb_index - 1]
-            else:
-                ctx_n1 = 'bos'
-
-            if verb_index > 1:
-                mark[verb_index - 2] = 1
-                ctx_n2 = sentence_list[verb_index - 2]
-            else:
-                ctx_n2 = 'bos'
-
-            mark[verb_index] = 1
-            ctx_0 = sentence_list[verb_index]
-
-            if verb_index < len(labels_list) - 1:
-                mark[verb_index + 1] = 1
-                ctx_p1 = sentence_list[verb_index + 1]
-            else:
-                ctx_p1 = 'eos'
-
-            if verb_index < len(labels_list) - 2:
-                mark[verb_index + 2] = 1
-                ctx_p2 = sentence_list[verb_index + 2]
-            else:
-                ctx_p2 = 'eos'
-
-
-            feature_str  = sentence + '\t' \
-                           + predicate + '\t' \
-                           + ctx_n2 + '\t' \
-                           + ctx_n1 + '\t' \
-                           + ctx_0 + '\t' \
-                           + ctx_p1 + '\t' \
-                           + ctx_p2 + '\t' \
-                           + ' '.join([str(i) for i in mark]) + '\t' \
-                           + labels
-
-            feature_out.write(feature_str + '\n')
-
-
-if __name__ == '__main__':
-
-    usage = '-p pair_file -f feature_file'
-    parser = OptionParser(usage)
-    parser.add_option('-p', dest='pair_file', help='the pair file')
-    parser.add_option('-f', dest='feature_file', help='the feature file')
-
-    (options, args) = parser.parse_args()
-
-    extract_dict_features(options.pair_file, options.feature_file)
diff --git a/demo/semantic_role_labeling/data/extract_pairs.py b/demo/semantic_role_labeling/data/extract_pairs.py
deleted file mode 100644
index 94a8488c16734eb1882d54f7ec36f4b9308c09d4..0000000000000000000000000000000000000000
--- a/demo/semantic_role_labeling/data/extract_pairs.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import os
-from optparse import OptionParser
-
-
-def read_labels(props_file):
-    '''
-    a sentence maybe has more than one verb, each verb has its label sequence
-    label[],  is a 3-dimension list. 
-    the first dim is to store all sentence's label seqs, len is the sentence number
-    the second dim is to store all label sequences for one sentences
-    the third dim is to store each label for one word
-    '''
-    labels = []
-    with open(props_file) as fin:
-        label_seqs_for_one_sentences = []
-        one_seg_in_file = []
-        for line in fin:
-            line = line.strip()
-            if line == '':
-                for i in xrange(len(one_seg_in_file[0])):
-                    a_kind_lable = [x[i] for x in one_seg_in_file]
-                    label_seqs_for_one_sentences.append(a_kind_lable)
-                labels.append(label_seqs_for_one_sentences)
-                one_seg_in_file = []
-                label_seqs_for_one_sentences = []
-            else:
-                part = line.split()
-                one_seg_in_file.append(part)
-    return labels
-
-
-def read_sentences(words_file):
-    sentences = []
-    with open(words_file) as fin:
-        s = ''
-        for line in fin:
-            line = line.strip()
-            if line == '':
-                sentences.append(s)
-                s = ''
-            else:
-                s += line + ' '
-    return sentences
-
-
-def transform_labels(sentences, labels):
-    sen_lab_pair = []
-    for i in xrange(len(sentences)):
-        if len(labels[i]) == 1:
-            continue
-        else:
-            verb_list = []
-            for x in labels[i][0]:
-                if x != '-':
-                    verb_list.append(x)
-
-            for j in xrange(1, len(labels[i])):
-                label_list = labels[i][j]
-                current_tag = 'O'
-                is_in_bracket = False
-                label_seq = []
-                verb_word = ''
-                for ll in label_list:
-                    if ll == '*' and is_in_bracket == False:
-                        label_seq.append('O')
-                    elif ll == '*' and is_in_bracket == True:
-                        label_seq.append('I-' + current_tag)
-                    elif ll == '*)':
-                        label_seq.append('I-' + current_tag)
-                        is_in_bracket = False
-                    elif ll.find('(') != -1 and ll.find(')') != -1:
-                        current_tag = ll[1:ll.find('*')]
-                        label_seq.append('B-' + current_tag)
-                        is_in_bracket = False
-                    elif ll.find('(') != -1 and ll.find(')') == -1:
-                        current_tag = ll[1:ll.find('*')]
-                        label_seq.append('B-' + current_tag)
-                        is_in_bracket = True
-                    else:
-                        print 'error:', ll
-                sen_lab_pair.append((sentences[i], verb_list[j - 1], label_seq))
-    return sen_lab_pair
-
-
-def write_file(sen_lab_pair, output_file):
-    with open(output_file, 'w') as fout:
-        for x in sen_lab_pair:
-            sentence = x[0]
-            label_seq = ' '.join(x[2])
-            assert len(sentence.split()) == len(x[2])
-            fout.write(sentence + '\t' + x[1] + '\t' + label_seq + '\n')
-
-
-if __name__ == '__main__':
-
-    usage = '-w words_file -p props_file -o output_file'
-    parser = OptionParser(usage)
-    parser.add_option('-w', dest='words_file', help='the words file')
-    parser.add_option('-p', dest='props_file', help='the props file')
-    parser.add_option('-o', dest='output_file', help='the output_file')
-    (options, args) = parser.parse_args()
-
-    sentences = read_sentences(options.words_file)
-    labels = read_labels(options.props_file)
-    sen_lab_pair = transform_labels(sentences, labels)
-
-    write_file(sen_lab_pair, options.output_file)
diff --git a/demo/semantic_role_labeling/data/get_data.sh b/demo/semantic_role_labeling/data/get_data.sh
deleted file mode 100755
index a0ef26a13b9a03392cb8b6207d6d21b7761e38e8..0000000000000000000000000000000000000000
--- a/demo/semantic_role_labeling/data/get_data.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-wget http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz
-wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt
-wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt 
-wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt 
-wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb
-tar -xzvf conll05st-tests.tar.gz
-rm conll05st-tests.tar.gz
-cp ./conll05st-release/test.wsj/words/test.wsj.words.gz  .
-cp ./conll05st-release/test.wsj/props/test.wsj.props.gz  . 
-gunzip test.wsj.words.gz
-gunzip test.wsj.props.gz
-
-python extract_pairs.py  -w test.wsj.words -p test.wsj.props -o test.wsj.seq_pair
-python extract_dict_feature.py -p test.wsj.seq_pair -f feature 
diff --git a/demo/semantic_role_labeling/data/test.list b/demo/semantic_role_labeling/data/test.list
deleted file mode 100644
index ec370e897a7811b572613150ccb6f665c3adb974..0000000000000000000000000000000000000000
--- a/demo/semantic_role_labeling/data/test.list
+++ /dev/null
@@ -1 +0,0 @@
-./data/feature
diff --git a/demo/semantic_role_labeling/data/train.list b/demo/semantic_role_labeling/data/train.list
deleted file mode 100644
index ec370e897a7811b572613150ccb6f665c3adb974..0000000000000000000000000000000000000000
--- a/demo/semantic_role_labeling/data/train.list
+++ /dev/null
@@ -1 +0,0 @@
-./data/feature
diff --git a/demo/semantic_role_labeling/dataprovider.py b/demo/semantic_role_labeling/dataprovider.py
deleted file mode 100644
index 360c57ea6283ca43986610abf1831742bfc0c3ef..0000000000000000000000000000000000000000
--- a/demo/semantic_role_labeling/dataprovider.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.PyDataProvider2 import *
-
-UNK_IDX = 0
-
-
-def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
-    settings.word_dict = word_dict
-    settings.label_dict = label_dict
-    settings.predicate_dict = predicate_dict
-
-    #all inputs are integral and sequential type
-    settings.slots = [
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(predicate_dict)), integer_value_sequence(2),
-        integer_value_sequence(len(label_dict))
-    ]
-
-
-def get_batch_size(yeild_data):
-    return len(yeild_data[0])
-
-
-@provider(
-    init_hook=hook,
-    should_shuffle=True,
-    calc_batch_size=get_batch_size,
-    can_over_batch_size=True,
-    cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_name):
-    with open(file_name, 'r') as fdata:
-        for line in fdata:
-            sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2,  mark, label = \
-                line.strip().split('\t')
-
-            words = sentence.split()
-            sen_len = len(words)
-            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
-
-            predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
-            ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
-            ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
-            ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
-            ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
-            ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
-
-            marks = mark.split()
-            mark_slot = [int(w) for w in marks]
-
-            label_list = label.split()
-            label_slot = [settings.label_dict.get(w) for w in label_list]
-            yield word_slot, ctx_n2_slot, ctx_n1_slot, \
-                  ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot, label_slot
diff --git a/demo/semantic_role_labeling/db_lstm.py b/demo/semantic_role_labeling/db_lstm.py
deleted file mode 100644
index 04e2a559b19bd4b9aec0242eb43edf6ab1e7624e..0000000000000000000000000000000000000000
--- a/demo/semantic_role_labeling/db_lstm.py
+++ /dev/null
@@ -1,218 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-import os
-import sys
-from paddle.trainer_config_helpers import *
-
-#file paths
-word_dict_file = './data/wordDict.txt'
-label_dict_file = './data/targetDict.txt'
-predicate_file = './data/verbDict.txt'
-train_list_file = './data/train.list'
-test_list_file = './data/test.list'
-
-is_test = get_config_arg('is_test', bool, False)
-is_predict = get_config_arg('is_predict', bool, False)
-
-if not is_predict:
-    #load dictionaries
-    word_dict = dict()
-    label_dict = dict()
-    predicate_dict = dict()
-    with open(word_dict_file, 'r') as f_word, \
-         open(label_dict_file, 'r') as f_label, \
-         open(predicate_file, 'r') as f_pre:
-        for i, line in enumerate(f_word):
-            w = line.strip()
-            word_dict[w] = i
-
-        for i, line in enumerate(f_label):
-            w = line.strip()
-            label_dict[w] = i
-
-        for i, line in enumerate(f_pre):
-            w = line.strip()
-            predicate_dict[w] = i
-
-    if is_test:
-        train_list_file = None
-
-    #define data provider
-    define_py_data_sources2(
-        train_list=train_list_file,
-        test_list=test_list_file,
-        module='dataprovider',
-        obj='process',
-        args={
-            'word_dict': word_dict,
-            'label_dict': label_dict,
-            'predicate_dict': predicate_dict
-        })
-
-    word_dict_len = len(word_dict)
-    label_dict_len = len(label_dict)
-    pred_len = len(predicate_dict)
-
-else:
-    word_dict_len = get_config_arg('dict_len', int)
-    label_dict_len = get_config_arg('label_len', int)
-    pred_len = get_config_arg('pred_len', int)
-
-############################## Hyper-parameters ##################################
-mark_dict_len = 2
-word_dim = 32
-mark_dim = 5
-hidden_dim = 512
-depth = 8
-
-########################### Optimizer #######################################
-
-settings(
-    batch_size=150,
-    learning_method=MomentumOptimizer(momentum=0),
-    learning_rate=2e-2,
-    regularization=L2Regularization(8e-4),
-    is_async=False,
-    model_average=ModelAverage(
-        average_window=0.5, max_average_window=10000), )
-
-####################################### network ##############################
-#8 features and 1 target
-word = data_layer(name='word_data', size=word_dict_len)
-predicate = data_layer(name='verb_data', size=pred_len)
-
-ctx_n2 = data_layer(name='ctx_n2_data', size=word_dict_len)
-ctx_n1 = data_layer(name='ctx_n1_data', size=word_dict_len)
-ctx_0 = data_layer(name='ctx_0_data', size=word_dict_len)
-ctx_p1 = data_layer(name='ctx_p1_data', size=word_dict_len)
-ctx_p2 = data_layer(name='ctx_p2_data', size=word_dict_len)
-mark = data_layer(name='mark_data', size=mark_dict_len)
-
-if not is_predict:
-    target = data_layer(name='target', size=label_dict_len)
-
-default_std = 1 / math.sqrt(hidden_dim) / 3.0
-
-emb_para = ParameterAttribute(name='emb', initial_std=0., learning_rate=0.)
-std_0 = ParameterAttribute(initial_std=0.)
-std_default = ParameterAttribute(initial_std=default_std)
-
-predicate_embedding = embedding_layer(
-    size=word_dim,
-    input=predicate,
-    param_attr=ParameterAttribute(
-        name='vemb', initial_std=default_std))
-mark_embedding = embedding_layer(
-    name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0)
-
-word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
-emb_layers = [
-    embedding_layer(
-        size=word_dim, input=x, param_attr=emb_para) for x in word_input
-]
-emb_layers.append(predicate_embedding)
-emb_layers.append(mark_embedding)
-
-hidden_0 = mixed_layer(
-    name='hidden0',
-    size=hidden_dim,
-    bias_attr=std_default,
-    input=[
-        full_matrix_projection(
-            input=emb, param_attr=std_default) for emb in emb_layers
-    ])
-
-mix_hidden_lr = 1e-3
-lstm_para_attr = ParameterAttribute(initial_std=0.0, learning_rate=1.0)
-hidden_para_attr = ParameterAttribute(
-    initial_std=default_std, learning_rate=mix_hidden_lr)
-
-lstm_0 = lstmemory(
-    name='lstm0',
-    input=hidden_0,
-    act=ReluActivation(),
-    gate_act=SigmoidActivation(),
-    state_act=SigmoidActivation(),
-    bias_attr=std_0,
-    param_attr=lstm_para_attr)
-
-#stack L-LSTM and R-LSTM with direct edges
-input_tmp = [hidden_0, lstm_0]
-
-for i in range(1, depth):
-
-    mix_hidden = mixed_layer(
-        name='hidden' + str(i),
-        size=hidden_dim,
-        bias_attr=std_default,
-        input=[
-            full_matrix_projection(
-                input=input_tmp[0], param_attr=hidden_para_attr),
-            full_matrix_projection(
-                input=input_tmp[1], param_attr=lstm_para_attr)
-        ])
-
-    lstm = lstmemory(
-        name='lstm' + str(i),
-        input=mix_hidden,
-        act=ReluActivation(),
-        gate_act=SigmoidActivation(),
-        state_act=SigmoidActivation(),
-        reverse=((i % 2) == 1),
-        bias_attr=std_0,
-        param_attr=lstm_para_attr)
-
-    input_tmp = [mix_hidden, lstm]
-
-feature_out = mixed_layer(
-    name='output',
-    size=label_dict_len,
-    bias_attr=std_default,
-    input=[
-        full_matrix_projection(
-            input=input_tmp[0], param_attr=hidden_para_attr),
-        full_matrix_projection(
-            input=input_tmp[1], param_attr=lstm_para_attr)
-    ], )
-
-if not is_predict:
-    crf_l = crf_layer(
-        name='crf',
-        size=label_dict_len,
-        input=feature_out,
-        label=target,
-        param_attr=ParameterAttribute(
-            name='crfw', initial_std=default_std, learning_rate=mix_hidden_lr))
-
-    crf_dec_l = crf_decoding_layer(
-        name='crf_dec_l',
-        size=label_dict_len,
-        input=feature_out,
-        label=target,
-        param_attr=ParameterAttribute(name='crfw'))
-
-    eval = sum_evaluator(input=crf_dec_l)
-
-    outputs(crf_l)
-
-else:
-    crf_dec_l = crf_decoding_layer(
-        name='crf_dec_l',
-        size=label_dict_len,
-        input=feature_out,
-        param_attr=ParameterAttribute(name='crfw'))
-
-    outputs(crf_dec_l)
diff --git a/demo/semantic_role_labeling/predict.py b/demo/semantic_role_labeling/predict.py
deleted file mode 100644
index 372fd090b6e8f08f5bb34697772c2e4976810595..0000000000000000000000000000000000000000
--- a/demo/semantic_role_labeling/predict.py
+++ /dev/null
@@ -1,193 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import numpy as np
-from optparse import OptionParser
-from py_paddle import swig_paddle, DataProviderConverter
-from paddle.trainer.PyDataProvider2 import integer_value_sequence
-from paddle.trainer.config_parser import parse_config
-"""
-Usage: run following command to show help message.
-  python predict.py -h
-"""
-UNK_IDX = 0
-
-
-class Prediction():
-    def __init__(self, train_conf, dict_file, model_dir, label_file,
-                 predicate_dict_file):
-        """
-        train_conf: trainer configure.
-        dict_file: word dictionary file name.
-        model_dir: directory of model.
-        """
-
-        self.dict = {}
-        self.labels = {}
-        self.predicate_dict = {}
-        self.labels_reverse = {}
-        self.load_dict_label(dict_file, label_file, predicate_dict_file)
-
-        len_dict = len(self.dict)
-        len_label = len(self.labels)
-        len_pred = len(self.predicate_dict)
-
-        conf = parse_config(
-            train_conf, 'dict_len=' + str(len_dict) + ',label_len=' +
-            str(len_label) + ',pred_len=' + str(len_pred) + ',is_predict=True')
-        self.network = swig_paddle.GradientMachine.createFromConfigProto(
-            conf.model_config)
-        self.network.loadParameters(model_dir)
-
-        slots = [
-            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
-            integer_value_sequence(len_pred), integer_value_sequence(2)
-        ]
-        self.converter = DataProviderConverter(slots)
-
-    def load_dict_label(self, dict_file, label_file, predicate_dict_file):
-        """
-        Load dictionary from self.dict_file.
-        """
-        for line_count, line in enumerate(open(dict_file, 'r')):
-            self.dict[line.strip()] = line_count
-
-        for line_count, line in enumerate(open(label_file, 'r')):
-            self.labels[line.strip()] = line_count
-            self.labels_reverse[line_count] = line.strip()
-
-        for line_count, line in enumerate(open(predicate_dict_file, 'r')):
-            self.predicate_dict[line.strip()] = line_count
-
-    def get_data(self, data_file):
-        """
-        Get input data of paddle format.
-        """
-        with open(data_file, 'r') as fdata:
-            for line in fdata:
-                sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = line.strip(
-                ).split('\t')
-                words = sentence.split()
-                sen_len = len(words)
-
-                word_slot = [self.dict.get(w, UNK_IDX) for w in words]
-                predicate_slot = [self.predicate_dict.get(predicate, UNK_IDX)
-                                  ] * sen_len
-                ctx_n2_slot = [self.dict.get(ctx_n2, UNK_IDX)] * sen_len
-                ctx_n1_slot = [self.dict.get(ctx_n1, UNK_IDX)] * sen_len
-                ctx_0_slot = [self.dict.get(ctx_0, UNK_IDX)] * sen_len
-                ctx_p1_slot = [self.dict.get(ctx_p1, UNK_IDX)] * sen_len
-                ctx_p2_slot = [self.dict.get(ctx_p2, UNK_IDX)] * sen_len
-
-                marks = mark.split()
-                mark_slot = [int(w) for w in marks]
-
-                yield word_slot, ctx_n2_slot, ctx_n1_slot, \
-                      ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot
-
-    def predict(self, data_file, output_file):
-        """
-        data_file: file name of input data.
-        """
-        input = self.converter(self.get_data(data_file))
-        output = self.network.forwardTest(input)
-        lab = output[0]["id"].tolist()
-
-        with open(data_file, 'r') as fin, open(output_file, 'w') as fout:
-            index = 0
-            for line in fin:
-                sen = line.split('\t')[0]
-                len_sen = len(sen.split())
-                line_labels = lab[index:index + len_sen]
-                index += len_sen
-                fout.write(sen + '\t' + ' '.join(
-                    [self.labels_reverse[i] for i in line_labels]) + '\n')
-
-
-def option_parser():
-    usage = (
-        "python predict.py -c config -w model_dir "
-        "-d word dictionary -l label_file -i input_file  -p pred_dict_file")
-    parser = OptionParser(usage="usage: %s [options]" % usage)
-    parser.add_option(
-        "-c",
-        "--tconf",
-        action="store",
-        dest="train_conf",
-        help="network config")
-    parser.add_option(
-        "-d",
-        "--dict",
-        action="store",
-        dest="dict_file",
-        help="dictionary file")
-    parser.add_option(
-        "-l",
-        "--label",
-        action="store",
-        dest="label_file",
-        default=None,
-        help="label file")
-    parser.add_option(
-        "-p",
-        "--predict_dict_file",
-        action="store",
-        dest="predict_dict_file",
-        default=None,
-        help="predict_dict_file")
-    parser.add_option(
-        "-i",
-        "--data",
-        action="store",
-        dest="data_file",
-        help="data file to predict")
-    parser.add_option(
-        "-w",
-        "--model",
-        action="store",
-        dest="model_path",
-        default=None,
-        help="model path")
-
-    parser.add_option(
-        "-o",
-        "--output_file",
-        action="store",
-        dest="output_file",
-        default=None,
-        help="output file")
-    return parser.parse_args()
-
-
-def main():
-    options, args = option_parser()
-    train_conf = options.train_conf
-    data_file = options.data_file
-    dict_file = options.dict_file
-    model_path = options.model_path
-    label_file = options.label_file
-    predict_dict_file = options.predict_dict_file
-    output_file = options.output_file
-
-    swig_paddle.initPaddle("--use_gpu=0")
-    predict = Prediction(train_conf, dict_file, model_path, label_file,
-                         predict_dict_file)
-    predict.predict(data_file, output_file)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/demo/semantic_role_labeling/predict.sh b/demo/semantic_role_labeling/predict.sh
deleted file mode 100755
index 873aad670d16803ce321ab60baabe9fe29ea64bf..0000000000000000000000000000000000000000
--- a/demo/semantic_role_labeling/predict.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-function get_best_pass() {
-  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
-  sed  -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
-  sort -n | head -n 1
-}   
-
-log=train.log
-LOG=`get_best_pass $log`
-LOG=(${LOG})
-best_model_path="output/pass-${LOG[1]}"
-
-config_file=db_lstm.py
-dict_file=./data/wordDict.txt
-label_file=./data/targetDict.txt 
-predicate_dict_file=./data/verbDict.txt
-input_file=./data/feature
-output_file=predict.res
- 
-python predict.py \
-     -c $config_file \
-     -w $best_model_path \
-     -l $label_file \
-     -p $predicate_dict_file  \
-     -d $dict_file \
-     -i $input_file \
-     -o $output_file
diff --git a/demo/semantic_role_labeling/test.sh b/demo/semantic_role_labeling/test.sh
deleted file mode 100755
index 095bbff2ea42627a13d8ebab436f5a05abc09743..0000000000000000000000000000000000000000
--- a/demo/semantic_role_labeling/test.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-function get_best_pass() {
-  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
-  sed  -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
-  sort -n | head -n 1
-}
-
-log=train.log
-LOG=`get_best_pass $log`
-LOG=(${LOG})
-evaluate_pass="output/pass-${LOG[1]}"
-
-echo 'evaluating from pass '$evaluate_pass
-model_list=./model.list
-touch $model_list | echo $evaluate_pass > $model_list
-
-paddle train \
-  --config=./db_lstm.py \
-  --model_list=$model_list \
-  --job=test \
-  --use_gpu=false \
-  --config_args=is_test=1 \
-  --test_all_data_in_one_period=1 \
-2>&1 | tee 'test.log'
-paddle usage -l test.log -e $? -n "semantic_role_labeling_test" >/dev/null 2>&1
diff --git a/demo/semantic_role_labeling/train.sh b/demo/semantic_role_labeling/train.sh
deleted file mode 100755
index eee14010d7b04a1b824f39090fa82fc532085e0d..0000000000000000000000000000000000000000
--- a/demo/semantic_role_labeling/train.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-paddle train \
-  --config=./db_lstm.py \
-  --use_gpu=0 \
-  --log_period=5000 \
-  --trainer_count=1 \
-  --show_parameter_stats_period=5000 \
-  --save_dir=./output \
-  --num_passes=10000 \
-  --average_test_period=10000000 \
-  --init_model_path=./data \
-  --load_missing_parameter_strategy=rand \
-  --test_all_data_in_one_period=1 \
-  2>&1 | tee 'train.log'
-paddle usage -l train.log -e $? -n "semantic_role_labeling_train" >/dev/null 2>&1
diff --git a/demo/sentiment/.gitignore b/demo/sentiment/.gitignore
deleted file mode 100644
index bf2a9ab1ce3c937bf06179074cd952dc53591dfd..0000000000000000000000000000000000000000
--- a/demo/sentiment/.gitignore
+++ /dev/null
@@ -1,11 +0,0 @@
-data/aclImdb
-data/imdb
-data/pre-imdb
-data/mosesdecoder-master
-logs/
-model_output
-dataprovider_copy_1.py
-model.list
-test.log
-train.log
-*.pyc
diff --git a/demo/sentiment/data/get_imdb.sh b/demo/sentiment/data/get_imdb.sh
deleted file mode 100755
index 7600af6fbb900ee845702f1297779c1f0ed9bf84..0000000000000000000000000000000000000000
--- a/demo/sentiment/data/get_imdb.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -e
-set -x
-
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-cd $DIR
-
-#download the dataset
-echo "Downloading aclImdb..."
-#http://ai.stanford.edu/%7Eamaas/data/sentiment/
-wget http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
-
-echo "Downloading mosesdecoder..."
-#https://github.com/moses-smt/mosesdecoder
-wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
-
-#extract package
-echo "Unzipping..."
-tar -zxvf aclImdb_v1.tar.gz
-unzip master.zip
-
-#move train and test set to imdb_data directory 
-#in order to process when traing
-mkdir -p imdb/train
-mkdir -p imdb/test
-
-cp -r aclImdb/train/pos/ imdb/train/pos
-cp -r aclImdb/train/neg/ imdb/train/neg
-
-cp -r aclImdb/test/pos/ imdb/test/pos
-cp -r aclImdb/test/neg/ imdb/test/neg
-
-#remove compressed package
-rm aclImdb_v1.tar.gz
-rm master.zip
-
-echo "Done."
diff --git a/demo/sentiment/dataprovider.py b/demo/sentiment/dataprovider.py
deleted file mode 100755
index 4b7f5d0e504aef3884a04cbed8c16503a4079772..0000000000000000000000000000000000000000
--- a/demo/sentiment/dataprovider.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer.PyDataProvider2 import *
-
-
-def hook(settings, dictionary, **kwargs):
-    settings.word_dict = dictionary
-    settings.input_types = [
-        integer_value_sequence(len(settings.word_dict)), integer_value(2)
-    ]
-    settings.logger.info('dict len : %d' % (len(settings.word_dict)))
-
-
-@provider(init_hook=hook)
-def process(settings, file_name):
-    with open(file_name, 'r') as fdata:
-        for line_count, line in enumerate(fdata):
-            label, comment = line.strip().split('\t\t')
-            label = int(label)
-            words = comment.split()
-            word_slot = [
-                settings.word_dict[w] for w in words if w in settings.word_dict
-            ]
-            if not word_slot:
-                continue
-            yield word_slot, label
diff --git a/demo/sentiment/predict.py b/demo/sentiment/predict.py
deleted file mode 100755
index 64c78e0d6b9297e7a321a4f070517593b0bfe332..0000000000000000000000000000000000000000
--- a/demo/sentiment/predict.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os, sys
-import numpy as np
-from optparse import OptionParser
-from py_paddle import swig_paddle, DataProviderConverter
-from paddle.trainer.PyDataProvider2 import integer_value_sequence
-from paddle.trainer.config_parser import parse_config
-"""
-Usage: run following command to show help message.
-  python predict.py -h
-"""
-
-
-class SentimentPrediction():
-    def __init__(self, train_conf, dict_file, model_dir=None, label_file=None):
-        """
-        train_conf: trainer configure.
-        dict_file: word dictionary file name.
-        model_dir: directory of model.
-        """
-        self.train_conf = train_conf
-        self.dict_file = dict_file
-        self.word_dict = {}
-        self.dict_dim = self.load_dict()
-        self.model_dir = model_dir
-        if model_dir is None:
-            self.model_dir = os.path.dirname(train_conf)
-
-        self.label = None
-        if label_file is not None:
-            self.load_label(label_file)
-
-        conf = parse_config(train_conf, "is_predict=1")
-        self.network = swig_paddle.GradientMachine.createFromConfigProto(
-            conf.model_config)
-        self.network.loadParameters(self.model_dir)
-        input_types = [integer_value_sequence(self.dict_dim)]
-        self.converter = DataProviderConverter(input_types)
-
-    def load_dict(self):
-        """
-        Load dictionary from self.dict_file.
-        """
-        for line_count, line in enumerate(open(self.dict_file, 'r')):
-            self.word_dict[line.strip().split('\t')[0]] = line_count
-        return len(self.word_dict)
-
-    def load_label(self, label_file):
-        """
-        Load label.
-        """
-        self.label = {}
-        for v in open(label_file, 'r'):
-            self.label[int(v.split('\t')[1])] = v.split('\t')[0]
-
-    def get_index(self, data):
-        """
-        transform word into integer index according to the dictionary.
-        """
-        words = data.strip().split()
-        word_slot = [self.word_dict[w] for w in words if w in self.word_dict]
-        return word_slot
-
-    def batch_predict(self, data_batch):
-        input = self.converter(data_batch)
-        output = self.network.forwardTest(input)
-        prob = output[0]["value"]
-        labs = np.argsort(-prob)
-        for idx, lab in enumerate(labs):
-            if self.label is None:
-                print("predicting label is %d" % (lab[0]))
-            else:
-                print("predicting label is %s" % (self.label[lab[0]]))
-
-
-def option_parser():
-    usage = "python predict.py -n config -w model_dir -d dictionary -i input_file "
-    parser = OptionParser(usage="usage: %s [options]" % usage)
-    parser.add_option(
-        "-n",
-        "--tconf",
-        action="store",
-        dest="train_conf",
-        help="network config")
-    parser.add_option(
-        "-d",
-        "--dict",
-        action="store",
-        dest="dict_file",
-        help="dictionary file")
-    parser.add_option(
-        "-b",
-        "--label",
-        action="store",
-        dest="label",
-        default=None,
-        help="dictionary file")
-    parser.add_option(
-        "-c",
-        "--batch_size",
-        type="int",
-        action="store",
-        dest="batch_size",
-        default=1,
-        help="the batch size for prediction")
-    parser.add_option(
-        "-w",
-        "--model",
-        action="store",
-        dest="model_path",
-        default=None,
-        help="model path")
-    return parser.parse_args()
-
-
-def main():
-    options, args = option_parser()
-    train_conf = options.train_conf
-    batch_size = options.batch_size
-    dict_file = options.dict_file
-    model_path = options.model_path
-    label = options.label
-    swig_paddle.initPaddle("--use_gpu=0")
-    predict = SentimentPrediction(train_conf, dict_file, model_path, label)
-
-    batch = []
-    for line in sys.stdin:
-        words = predict.get_index(line)
-        if words:
-            batch.append([words])
-        else:
-            print('All the words in [%s] are not in the dictionary.' % line)
-        if len(batch) == batch_size:
-            predict.batch_predict(batch)
-            batch = []
-    if len(batch) > 0:
-        predict.batch_predict(batch)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/demo/sentiment/predict.sh b/demo/sentiment/predict.sh
deleted file mode 100755
index c72a8e8641516543ef267fcb4b448630246d1e8d..0000000000000000000000000000000000000000
--- a/demo/sentiment/predict.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-#Note the default model is pass-00002, you shold make sure the model path
-#exists or change the mode path.
-model=model_output/pass-00002/
-config=trainer_config.py
-label=data/pre-imdb/labels.list
-cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \
-     --tconf=$config\
-     --model=$model \
-     --label=$label \
-     --dict=./data/pre-imdb/dict.txt \
-     --batch_size=1
diff --git a/demo/sentiment/preprocess.py b/demo/sentiment/preprocess.py
deleted file mode 100755
index 29b3682b747c66574590de5ea70574981cc536bb..0000000000000000000000000000000000000000
--- a/demo/sentiment/preprocess.py
+++ /dev/null
@@ -1,359 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-import random
-import operator
-import numpy as np
-from subprocess import Popen, PIPE
-from os.path import join as join_path
-from optparse import OptionParser
-
-from paddle.utils.preprocess_util import *
-"""
-Usage: run following command to show help message.
-  python preprocess.py -h 
-"""
-
-
-def save_dict(dict, filename, is_reverse=True):
-    """
-    Save dictionary into file.
-    dict:   input dictionary.
-    filename: output file name, string.
-    is_reverse: True, descending order by value.
-                False, ascending order by value.
-    """
-    f = open(filename, 'w')
-    for k, v in sorted(dict.items(), key=operator.itemgetter(1),\
-                       reverse=is_reverse):
-        f.write('%s\t%s\n' % (k, v))
-    f.close()
-
-
-def tokenize(sentences):
-    """
-    Use tokenizer.perl to tokenize input sentences.
-    tokenizer.perl is tool of Moses.
-    sentences : a list of input sentences.
-    return: a list of processed text.
-    """
-    dir = './data/mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
-    tokenizer_cmd = [dir, '-l', 'en', '-q', '-']
-    assert isinstance(sentences, list)
-    text = "\n".join(sentences)
-    tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
-    tok_text, _ = tokenizer.communicate(text)
-    toks = tok_text.split('\n')[:-1]
-    return toks
-
-
-def read_lines(path):
-    """
-    path: String, file path.
-    return a list of sequence.
-    """
-    seqs = []
-    with open(path, 'r') as f:
-        for line in f.readlines():
-            line = line.strip()
-            if len(line):
-                seqs.append(line)
-    return seqs
-
-
-class SentimentDataSetCreate():
-    """
-    A class to process data for sentiment analysis task.
-    """
-
-    def __init__(self,
-                 data_path,
-                 output_path,
-                 use_okenizer=True,
-                 multi_lines=False):
-        """
-        data_path: string, traing and testing dataset path
-        output_path: string, output path, store processed dataset
-        multi_lines: whether a file has multi lines.
-                     In order to shuffle fully, it needs to read all files into
-                     memory, then shuffle them if one file has multi lines.
-        """
-        self.output_path = output_path
-        self.data_path = data_path
-
-        self.train_dir = 'train'
-        self.test_dir = 'test'
-
-        self.train_list = "train.list"
-        self.test_list = "test.list"
-
-        self.label_list = "labels.list"
-        self.classes_num = 0
-
-        self.batch_size = 50000
-        self.batch_dir = 'batches'
-
-        self.dict_file = "dict.txt"
-        self.dict_with_test = False
-        self.dict_size = 0
-        self.word_count = {}
-
-        self.tokenizer = use_okenizer
-        self.overwrite = False
-
-        self.multi_lines = multi_lines
-
-        self.train_dir = join_path(data_path, self.train_dir)
-        self.test_dir = join_path(data_path, self.test_dir)
-        self.train_list = join_path(output_path, self.train_list)
-        self.test_list = join_path(output_path, self.test_list)
-        self.label_list = join_path(output_path, self.label_list)
-        self.dict_file = join_path(output_path, self.dict_file)
-
-    def data_list(self, path):
-        """
-        create dataset from path
-        path: data path
-        return: data list
-        """
-        label_set = get_label_set_from_dir(path)
-        data = []
-        for lab_name in label_set.keys():
-            file_paths = list_files(join_path(path, lab_name))
-            for p in file_paths:
-                data.append({"label"  : label_set[lab_name],\
-                             "seq_path": p})
-        return data, label_set
-
-    def create_dict(self, data):
-        """
-        create dict for input data.
-        data: list, [sequence, sequnce, ...]
-        """
-        for seq in data:
-            for w in seq.strip().lower().split():
-                if w not in self.word_count:
-                    self.word_count[w] = 1
-                else:
-                    self.word_count[w] += 1
-
-    def create_dataset(self):
-        """
-        create file batches and dictionary of train data set.
-        If the self.overwrite is false and train.list already exists in
-        self.output_path, this function will not create and save file
-        batches from the data set path.
-        return: dictionary size, class number.
-        """
-        out_path = self.output_path
-        if out_path and not os.path.exists(out_path):
-            os.makedirs(out_path)
-
-        # If self.overwrite is false or self.train_list has existed,
-        # it will not process dataset.
-        if not (self.overwrite or not os.path.exists(self.train_list)):
-            print "%s already exists." % self.train_list
-            return
-
-        # Preprocess train data.
-        train_data, train_lab_set = self.data_list(self.train_dir)
-        print "processing train set..."
-        file_lists = self.save_data(train_data, "train", self.batch_size, True,
-                                    True)
-        save_list(file_lists, self.train_list)
-
-        # If have test data path, preprocess test data.
-        if os.path.exists(self.test_dir):
-            test_data, test_lab_set = self.data_list(self.test_dir)
-            assert (train_lab_set == test_lab_set)
-            print "processing test set..."
-            file_lists = self.save_data(test_data, "test", self.batch_size,
-                                        False, self.dict_with_test)
-            save_list(file_lists, self.test_list)
-
-        # save labels set.
-        save_dict(train_lab_set, self.label_list, False)
-        self.classes_num = len(train_lab_set.keys())
-
-        # save dictionary.
-        save_dict(self.word_count, self.dict_file, True)
-        self.dict_size = len(self.word_count)
-
-    def save_data(self,
-                  data,
-                  prefix="",
-                  batch_size=50000,
-                  is_shuffle=False,
-                  build_dict=False):
-        """
-        Create batches for a Dataset object.
-        data: the Dataset object to process.
-        prefix: the prefix of each batch.
-        batch_size: number of data in each batch.
-        build_dict: whether to build dictionary for data
-
-        return: list of batch names
-        """
-        if is_shuffle and self.multi_lines:
-            return self.save_data_multi_lines(data, prefix, batch_size,
-                                              build_dict)
-
-        if is_shuffle:
-            random.shuffle(data)
-        num_batches = int(math.ceil(len(data) / float(batch_size)))
-        batch_names = []
-        for i in range(num_batches):
-            batch_name = join_path(self.output_path,
-                                   "%s_part_%03d" % (prefix, i))
-            begin = i * batch_size
-            end = min((i + 1) * batch_size, len(data))
-            # read a batch of data
-            label_list, data_list = self.get_data_list(begin, end, data)
-            if build_dict:
-                self.create_dict(data_list)
-            self.save_file(label_list, data_list, batch_name)
-            batch_names.append(batch_name)
-
-        return batch_names
-
-    def get_data_list(self, begin, end, data):
-        """
-        begin: int, begining index of data.
-        end: int, ending index of data.
-        data: a list of {"seq_path": seqquence path, "label": label index}
-
-        return a list of label and a list of sequence.
-        """
-        label_list = []
-        data_list = []
-        for j in range(begin, end):
-            seqs = read_lines(data[j]["seq_path"])
-            lab = int(data[j]["label"])
-            #File may have multiple lines.
-            for seq in seqs:
-                data_list.append(seq)
-                label_list.append(lab)
-        if self.tokenizer:
-            data_list = tokenize(data_list)
-        return label_list, data_list
-
-    def save_data_multi_lines(self,
-                              data,
-                              prefix="",
-                              batch_size=50000,
-                              build_dict=False):
-        """
-        In order to shuffle fully, there is no need to load all data if
-        each file only contains one sample, it only needs to shuffle list
-        of file name. But one file contains multi lines, each line is one
-        sample. It needs to read all data into memory to shuffle fully.
-        This interface is mainly for data containning multi lines in each
-        file, which consumes more memory if there is a great mount of data.
-
-        data: the Dataset object to process.
-        prefix: the prefix of each batch.
-        batch_size: number of data in each batch.
-        build_dict: whether to build dictionary for data
-
-        return: list of batch names
-        """
-        assert self.multi_lines
-        label_list = []
-        data_list = []
-
-        # read all data
-        label_list, data_list = self.get_data_list(0, len(data), data)
-        if build_dict:
-            self.create_dict(data_list)
-
-        length = len(label_list)
-        perm_list = np.array([i for i in xrange(length)])
-        random.shuffle(perm_list)
-
-        num_batches = int(math.ceil(length / float(batch_size)))
-        batch_names = []
-        for i in range(num_batches):
-            batch_name = join_path(self.output_path,
-                                   "%s_part_%03d" % (prefix, i))
-            begin = i * batch_size
-            end = min((i + 1) * batch_size, length)
-            sub_label = [label_list[perm_list[i]] for i in range(begin, end)]
-            sub_data = [data_list[perm_list[i]] for i in range(begin, end)]
-            self.save_file(sub_label, sub_data, batch_name)
-            batch_names.append(batch_name)
-
-        return batch_names
-
-    def save_file(self, label_list, data_list, filename):
-        """
-        Save data into file.
-        label_list: a list of int value.
-        data_list: a list of sequnece.
-        filename: output file name.
-        """
-        f = open(filename, 'w')
-        print "saving file: %s" % filename
-        for lab, seq in zip(label_list, data_list):
-            f.write('%s\t\t%s\n' % (lab, seq))
-        f.close()
-
-
-def option_parser():
-    parser = OptionParser(usage="usage: python preprcoess.py "\
-                                "-i data_dir [options]")
-    parser.add_option(
-        "-i",
-        "--data",
-        action="store",
-        dest="input",
-        help="Input data directory.")
-    parser.add_option(
-        "-o",
-        "--output",
-        action="store",
-        dest="output",
-        default=None,
-        help="Output directory.")
-    parser.add_option(
-        "-t",
-        "--tokenizer",
-        action="store",
-        dest="use_tokenizer",
-        default=True,
-        help="Whether to use tokenizer.")
-    parser.add_option("-m", "--multi_lines", action="store",
-                      dest="multi_lines", default=False,
-                      help="If input text files have multi lines and they "\
-                           "need to be shuffled, you should set -m True,")
-    return parser.parse_args()
-
-
-def main():
-    options, args = option_parser()
-    data_dir = options.input
-    output_dir = options.output
-    use_tokenizer = options.use_tokenizer
-    multi_lines = options.multi_lines
-    if output_dir is None:
-        outname = os.path.basename(options.input)
-        output_dir = join_path(os.path.dirname(data_dir), 'pre-' + outname)
-    data_creator = SentimentDataSetCreate(data_dir, output_dir, use_tokenizer,
-                                          multi_lines)
-    data_creator.create_dataset()
-
-
-if __name__ == '__main__':
-    main()
diff --git a/demo/sentiment/preprocess.sh b/demo/sentiment/preprocess.sh
deleted file mode 100755
index 19ec34d4f016365d18db01ddec559d26202b19c6..0000000000000000000000000000000000000000
--- a/demo/sentiment/preprocess.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-echo "Start to preprcess..."
-
-data_dir="./data/imdb"
-python preprocess.py -i $data_dir
-
-echo "Done."
diff --git a/demo/sentiment/sentiment_net.py b/demo/sentiment/sentiment_net.py
deleted file mode 100644
index a01577ca5ae025b7bec67c6d54c7dbd931dbee74..0000000000000000000000000000000000000000
--- a/demo/sentiment/sentiment_net.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from os.path import join as join_path
-
-from paddle.trainer_config_helpers import *
-
-
-def sentiment_data(data_dir=None,
-                   is_test=False,
-                   is_predict=False,
-                   train_list="train.list",
-                   test_list="test.list",
-                   dict_file="dict.txt"):
-    """
-    Predefined data provider for sentiment analysis.
-    is_test: whether this config is used for test.
-    is_predict: whether this config is used for prediction.
-    train_list: text file name, containing a list of training set.
-    test_list: text file name, containing a list of testing set.
-    dict_file: text file name, containing dictionary.
-    """
-    dict_dim = len(open(join_path(data_dir, "dict.txt")).readlines())
-    class_dim = len(open(join_path(data_dir, 'labels.list')).readlines())
-    if is_predict:
-        return dict_dim, class_dim
-
-    if data_dir is not None:
-        train_list = join_path(data_dir, train_list)
-        test_list = join_path(data_dir, test_list)
-        dict_file = join_path(data_dir, dict_file)
-
-    train_list = train_list if not is_test else None
-    word_dict = dict()
-    with open(dict_file, 'r') as f:
-        for i, line in enumerate(open(dict_file, 'r')):
-            word_dict[line.split('\t')[0]] = i
-
-    define_py_data_sources2(
-        train_list,
-        test_list,
-        module="dataprovider",
-        obj="process",
-        args={'dictionary': word_dict})
-
-    return dict_dim, class_dim
-
-
-def bidirectional_lstm_net(input_dim,
-                           class_dim=2,
-                           emb_dim=128,
-                           lstm_dim=128,
-                           is_predict=False):
-    data = data_layer("word", input_dim)
-    emb = embedding_layer(input=data, size=emb_dim)
-    bi_lstm = bidirectional_lstm(input=emb, size=lstm_dim)
-    dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
-    output = fc_layer(input=dropout, size=class_dim, act=SoftmaxActivation())
-
-    if not is_predict:
-        lbl = data_layer("label", 1)
-        outputs(classification_cost(input=output, label=lbl))
-    else:
-        outputs(output)
-
-
-def stacked_lstm_net(input_dim,
-                     class_dim=2,
-                     emb_dim=128,
-                     hid_dim=512,
-                     stacked_num=3,
-                     is_predict=False):
-    """
-    A Wrapper for sentiment classification task.
-    This network uses bi-directional recurrent network,
-    consisting three LSTM layers. This configure is referred to
-    the paper as following url, but use fewer layrs.
-        http://www.aclweb.org/anthology/P15-1109
-
-    input_dim: here is word dictionary dimension.
-    class_dim: number of categories.
-    emb_dim: dimension of word embedding.
-    hid_dim: dimension of hidden layer.
-    stacked_num: number of stacked lstm-hidden layer.
-    is_predict: is predicting or not.
-                Some layers is not needed in network when predicting.
-    """
-    hid_lr = 1e-3
-    assert stacked_num % 2 == 1
-
-    layer_attr = ExtraLayerAttribute(drop_rate=0.5)
-    fc_para_attr = ParameterAttribute(learning_rate=hid_lr)
-    lstm_para_attr = ParameterAttribute(initial_std=0., learning_rate=1.)
-    para_attr = [fc_para_attr, lstm_para_attr]
-    bias_attr = ParameterAttribute(initial_std=0., l2_rate=0.)
-    relu = ReluActivation()
-    linear = LinearActivation()
-
-    data = data_layer("word", input_dim)
-    emb = embedding_layer(input=data, size=emb_dim)
-
-    fc1 = fc_layer(input=emb, size=hid_dim, act=linear, bias_attr=bias_attr)
-    lstm1 = lstmemory(
-        input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
-
-    inputs = [fc1, lstm1]
-    for i in range(2, stacked_num + 1):
-        fc = fc_layer(
-            input=inputs,
-            size=hid_dim,
-            act=linear,
-            param_attr=para_attr,
-            bias_attr=bias_attr)
-        lstm = lstmemory(
-            input=fc,
-            reverse=(i % 2) == 0,
-            act=relu,
-            bias_attr=bias_attr,
-            layer_attr=layer_attr)
-        inputs = [fc, lstm]
-
-    fc_last = pooling_layer(input=inputs[0], pooling_type=MaxPooling())
-    lstm_last = pooling_layer(input=inputs[1], pooling_type=MaxPooling())
-    output = fc_layer(
-        input=[fc_last, lstm_last],
-        size=class_dim,
-        act=SoftmaxActivation(),
-        bias_attr=bias_attr,
-        param_attr=para_attr)
-
-    if is_predict:
-        outputs(output)
-    else:
-        outputs(classification_cost(input=output, label=data_layer('label', 1)))
diff --git a/demo/sentiment/test.sh b/demo/sentiment/test.sh
deleted file mode 100755
index 85c4f3ccfc3ede23fcf701769b9701ecbf57c789..0000000000000000000000000000000000000000
--- a/demo/sentiment/test.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-function get_best_pass() {
-  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
-  sed  -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
-  sort -n | head -n 1
-}
-
-log=train.log
-LOG=`get_best_pass $log`
-LOG=(${LOG})
-evaluate_pass="model_output/pass-${LOG[1]}"
-
-echo 'evaluating from pass '$evaluate_pass
-
-model_list=./model.list
-touch $model_list | echo $evaluate_pass > $model_list
-net_conf=trainer_config.py
-paddle train --config=$net_conf \
-             --model_list=$model_list \
-             --job=test \
-             --use_gpu=false \
-             --trainer_count=4 \
-             --config_args=is_test=1 \
-             2>&1 | tee 'test.log'
-paddle usage -l test.log -e $? -n "sentiment_test" >/dev/null 2>&1
diff --git a/demo/sentiment/train.sh b/demo/sentiment/train.sh
deleted file mode 100755
index 14620f733bf03444e5ba3b3b792dfbed6146ecde..0000000000000000000000000000000000000000
--- a/demo/sentiment/train.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-config=trainer_config.py
-output=./model_output
-paddle train --config=$config \
-             --save_dir=$output \
-             --job=train \
-             --use_gpu=false \
-             --trainer_count=4 \
-             --num_passes=10 \
-             --log_period=10 \
-             --dot_period=20 \
-             --show_parameter_stats_period=100 \
-             --test_all_data_in_one_period=1 \
-             2>&1 | tee 'train.log'
-paddle usage -l train.log -e $? -n "sentiment_train" >/dev/null 2>&1
diff --git a/demo/sentiment/train_v2.py b/demo/sentiment/train_v2.py
deleted file mode 100644
index 1c856556bd0cb32f60eba322469b3621c37e1349..0000000000000000000000000000000000000000
--- a/demo/sentiment/train_v2.py
+++ /dev/null
@@ -1,159 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import paddle.v2 as paddle
-
-
-def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128):
-    data = paddle.layer.data("word",
-                             paddle.data_type.integer_value_sequence(input_dim))
-    emb = paddle.layer.embedding(input=data, size=emb_dim)
-    conv_3 = paddle.networks.sequence_conv_pool(
-        input=emb, context_len=3, hidden_size=hid_dim)
-    conv_4 = paddle.networks.sequence_conv_pool(
-        input=emb, context_len=4, hidden_size=hid_dim)
-    output = paddle.layer.fc(input=[conv_3, conv_4],
-                             size=class_dim,
-                             act=paddle.activation.Softmax())
-    lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
-    cost = paddle.layer.classification_cost(input=output, label=lbl)
-    return cost
-
-
-def stacked_lstm_net(input_dim,
-                     class_dim=2,
-                     emb_dim=128,
-                     hid_dim=512,
-                     stacked_num=3):
-    """
-    A Wrapper for sentiment classification task.
-    This network uses bi-directional recurrent network,
-    consisting three LSTM layers. This configure is referred to
-    the paper as following url, but use fewer layrs.
-        http://www.aclweb.org/anthology/P15-1109
-
-    input_dim: here is word dictionary dimension.
-    class_dim: number of categories.
-    emb_dim: dimension of word embedding.
-    hid_dim: dimension of hidden layer.
-    stacked_num: number of stacked lstm-hidden layer.
-    """
-    assert stacked_num % 2 == 1
-
-    layer_attr = paddle.attr.Extra(drop_rate=0.5)
-    fc_para_attr = paddle.attr.Param(learning_rate=1e-3)
-    lstm_para_attr = paddle.attr.Param(initial_std=0., learning_rate=1.)
-    para_attr = [fc_para_attr, lstm_para_attr]
-    bias_attr = paddle.attr.Param(initial_std=0., l2_rate=0.)
-    relu = paddle.activation.Relu()
-    linear = paddle.activation.Linear()
-
-    data = paddle.layer.data("word",
-                             paddle.data_type.integer_value_sequence(input_dim))
-    emb = paddle.layer.embedding(input=data, size=emb_dim)
-
-    fc1 = paddle.layer.fc(input=emb,
-                          size=hid_dim,
-                          act=linear,
-                          bias_attr=bias_attr)
-    lstm1 = paddle.layer.lstmemory(
-        input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
-
-    inputs = [fc1, lstm1]
-    for i in range(2, stacked_num + 1):
-        fc = paddle.layer.fc(input=inputs,
-                             size=hid_dim,
-                             act=linear,
-                             param_attr=para_attr,
-                             bias_attr=bias_attr)
-        lstm = paddle.layer.lstmemory(
-            input=fc,
-            reverse=(i % 2) == 0,
-            act=relu,
-            bias_attr=bias_attr,
-            layer_attr=layer_attr)
-        inputs = [fc, lstm]
-
-    fc_last = paddle.layer.pooling(
-        input=inputs[0], pooling_type=paddle.pooling.Max())
-    lstm_last = paddle.layer.pooling(
-        input=inputs[1], pooling_type=paddle.pooling.Max())
-    output = paddle.layer.fc(input=[fc_last, lstm_last],
-                             size=class_dim,
-                             act=paddle.activation.Softmax(),
-                             bias_attr=bias_attr,
-                             param_attr=para_attr)
-
-    lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
-    cost = paddle.layer.classification_cost(input=output, label=lbl)
-    return cost
-
-
-if __name__ == '__main__':
-    # init
-    paddle.init(use_gpu=False)
-
-    #data
-    print 'load dictionary...'
-    word_dict = paddle.dataset.imdb.word_dict()
-    dict_dim = len(word_dict)
-    class_dim = 2
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000),
-        batch_size=100)
-    test_reader = paddle.batch(
-        lambda: paddle.dataset.imdb.test(word_dict), batch_size=100)
-
-    feeding = {'word': 0, 'label': 1}
-
-    # network config
-    # Please choose the way to build the network
-    # by uncommenting the corresponding line.
-    cost = convolution_net(dict_dim, class_dim=class_dim)
-    # cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3)
-
-    # create parameters
-    parameters = paddle.parameters.create(cost)
-
-    # create optimizer
-    adam_optimizer = paddle.optimizer.Adam(
-        learning_rate=2e-3,
-        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
-        model_average=paddle.optimizer.ModelAverage(average_window=0.5))
-
-    # End batch and end pass event handler
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 100 == 0:
-                print "\nPass %d, Batch %d, Cost %f, %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics)
-            else:
-                sys.stdout.write('.')
-                sys.stdout.flush()
-        if isinstance(event, paddle.event.EndPass):
-            result = trainer.test(reader=test_reader, feeding=feeding)
-            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
-
-    # create trainer
-    trainer = paddle.trainer.SGD(cost=cost,
-                                 parameters=parameters,
-                                 update_equation=adam_optimizer)
-
-    trainer.train(
-        reader=train_reader,
-        event_handler=event_handler,
-        feeding=feeding,
-        num_passes=2)
diff --git a/demo/sentiment/trainer_config.py b/demo/sentiment/trainer_config.py
deleted file mode 100644
index f1cadaa728ac58107e15f77b5994d31da088caf7..0000000000000000000000000000000000000000
--- a/demo/sentiment/trainer_config.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from sentiment_net import *
-from paddle.trainer_config_helpers import *
-
-# whether this config is used for test
-is_test = get_config_arg('is_test', bool, False)
-# whether this config is used for prediction
-is_predict = get_config_arg('is_predict', bool, False)
-
-data_dir = "./data/pre-imdb"
-dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict)
-
-################## Algorithm Config #####################
-
-settings(
-    batch_size=128,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    model_average=ModelAverage(0.5),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
-
-#################### Network Config ######################
-stacked_lstm_net(
-    dict_dim, class_dim=class_dim, stacked_num=3, is_predict=is_predict)
-# bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict)
diff --git a/demo/seqToseq/.gitignore b/demo/seqToseq/.gitignore
deleted file mode 100644
index 21cec2c2c1f3422cbb0ad133281dc1ecdd076a96..0000000000000000000000000000000000000000
--- a/demo/seqToseq/.gitignore
+++ /dev/null
@@ -1,17 +0,0 @@
-data/wmt14
-data/pre-wmt14
-data/wmt14_model
-data/paraphrase
-data/pre-paraphrase
-data/paraphrase_model
-translation/gen.log
-translation/gen_result
-translation/train.log
-paraphrase/train.log
-dataprovider_copy_1.py
-translation/thirdparty.tgz
-translation/thirdparty/train.conf
-translation/thirdparty/dataprovider.py
-translation/thirdparty/seqToseq_net.py
-translation/thirdparty/*.dict
-*.pyc
diff --git a/demo/seqToseq/api_train_v2.py b/demo/seqToseq/api_train_v2.py
deleted file mode 100644
index 3072c375123a2713c655b09fb28001960c9ab64d..0000000000000000000000000000000000000000
--- a/demo/seqToseq/api_train_v2.py
+++ /dev/null
@@ -1,214 +0,0 @@
-import sys
-
-import paddle.v2 as paddle
-
-
-def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
-    ### Network Architecture
-    word_vector_dim = 512  # dimension of word vector
-    decoder_size = 512  # dimension of hidden unit in GRU Decoder network
-    encoder_size = 512  # dimension of hidden unit in GRU Encoder network
-
-    beam_size = 3
-    max_length = 250
-
-    #### Encoder
-    src_word_id = paddle.layer.data(
-        name='source_language_word',
-        type=paddle.data_type.integer_value_sequence(source_dict_dim))
-    src_embedding = paddle.layer.embedding(
-        input=src_word_id,
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
-    src_forward = paddle.networks.simple_gru(
-        input=src_embedding, size=encoder_size)
-    src_backward = paddle.networks.simple_gru(
-        input=src_embedding, size=encoder_size, reverse=True)
-    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
-
-    #### Decoder
-    with paddle.layer.mixed(size=decoder_size) as encoded_proj:
-        encoded_proj += paddle.layer.full_matrix_projection(
-            input=encoded_vector)
-
-    backward_first = paddle.layer.first_seq(input=src_backward)
-
-    with paddle.layer.mixed(
-            size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
-        decoder_boot += paddle.layer.full_matrix_projection(
-            input=backward_first)
-
-    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
-
-        decoder_mem = paddle.layer.memory(
-            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
-
-        context = paddle.networks.simple_attention(
-            encoded_sequence=enc_vec,
-            encoded_proj=enc_proj,
-            decoder_state=decoder_mem)
-
-        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
-            decoder_inputs += paddle.layer.full_matrix_projection(
-                input=current_word)
-
-        gru_step = paddle.layer.gru_step(
-            name='gru_decoder',
-            input=decoder_inputs,
-            output_mem=decoder_mem,
-            size=decoder_size)
-
-        with paddle.layer.mixed(
-                size=target_dict_dim,
-                bias_attr=True,
-                act=paddle.activation.Softmax()) as out:
-            out += paddle.layer.full_matrix_projection(input=gru_step)
-        return out
-
-    decoder_group_name = "decoder_group"
-    group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
-    group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
-    group_inputs = [group_input1, group_input2]
-
-    if not is_generating:
-        trg_embedding = paddle.layer.embedding(
-            input=paddle.layer.data(
-                name='target_language_word',
-                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
-            size=word_vector_dim,
-            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-        group_inputs.append(trg_embedding)
-
-        # For decoder equipped with attention mechanism, in training,
-        # target embeding (the groudtruth) is the data input,
-        # while encoded source sequence is accessed to as an unbounded memory.
-        # Here, the StaticInput defines a read-only memory
-        # for the recurrent_group.
-        decoder = paddle.layer.recurrent_group(
-            name=decoder_group_name,
-            step=gru_decoder_with_attention,
-            input=group_inputs)
-
-        lbl = paddle.layer.data(
-            name='target_language_next_word',
-            type=paddle.data_type.integer_value_sequence(target_dict_dim))
-        cost = paddle.layer.classification_cost(input=decoder, label=lbl)
-
-        return cost
-    else:
-        # In generation, the decoder predicts a next target word based on
-        # the encoded source sequence and the last generated target word.
-
-        # The encoded source sequence (encoder's output) must be specified by
-        # StaticInput, which is a read-only memory.
-        # Embedding of the last generated word is automatically gotten by
-        # GeneratedInputs, which is initialized by a start mark, such as <s>,
-        # and must be included in generation.
-
-        trg_embedding = paddle.layer.GeneratedInputV2(
-            size=target_dict_dim,
-            embedding_name='_target_language_embedding',
-            embedding_size=word_vector_dim)
-        group_inputs.append(trg_embedding)
-
-        beam_gen = paddle.layer.beam_search(
-            name=decoder_group_name,
-            step=gru_decoder_with_attention,
-            input=group_inputs,
-            bos_id=0,
-            eos_id=1,
-            beam_size=beam_size,
-            max_length=max_length)
-
-        return beam_gen
-
-
-def main():
-    paddle.init(use_gpu=False, trainer_count=1)
-    is_generating = False
-
-    # source and target dict dim.
-    dict_size = 30000
-    source_dict_dim = target_dict_dim = dict_size
-
-    # train the network
-    if not is_generating:
-        cost = seqToseq_net(source_dict_dim, target_dict_dim)
-        parameters = paddle.parameters.create(cost)
-
-        # define optimize method and trainer
-        optimizer = paddle.optimizer.Adam(
-            learning_rate=5e-5,
-            regularization=paddle.optimizer.L2Regularization(rate=8e-4))
-        trainer = paddle.trainer.SGD(cost=cost,
-                                     parameters=parameters,
-                                     update_equation=optimizer)
-        # define data reader
-        wmt14_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.wmt14.train(dict_size), buf_size=8192),
-            batch_size=5)
-
-        # define event_handler callback
-        def event_handler(event):
-            if isinstance(event, paddle.event.EndIteration):
-                if event.batch_id % 10 == 0:
-                    print "\nPass %d, Batch %d, Cost %f, %s" % (
-                        event.pass_id, event.batch_id, event.cost,
-                        event.metrics)
-                else:
-                    sys.stdout.write('.')
-                    sys.stdout.flush()
-
-        # start to train
-        trainer.train(
-            reader=wmt14_reader, event_handler=event_handler, num_passes=2)
-
-    # generate a english sequence to french
-    else:
-        # use the first 3 samples for generation
-        gen_creator = paddle.dataset.wmt14.gen(dict_size)
-        gen_data = []
-        gen_num = 3
-        for item in gen_creator():
-            gen_data.append((item[0], ))
-            if len(gen_data) == gen_num:
-                break
-
-        beam_gen = seqToseq_net(source_dict_dim, target_dict_dim, is_generating)
-        # get the pretrained model, whose bleu = 26.92
-        parameters = paddle.dataset.wmt14.model()
-        # prob is the prediction probabilities, and id is the prediction word. 
-        beam_result = paddle.infer(
-            output_layer=beam_gen,
-            parameters=parameters,
-            input=gen_data,
-            field=['prob', 'id'])
-
-        # get the dictionary
-        src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
-
-        # the delimited element of generated sequences is -1,
-        # the first element of each generated sequence is the sequence length
-        seq_list = []
-        seq = []
-        for w in beam_result[1]:
-            if w != -1:
-                seq.append(w)
-            else:
-                seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
-                seq = []
-
-        prob = beam_result[0]
-        beam_size = 3
-        for i in xrange(gen_num):
-            print "\n*******************************************************\n"
-            print "src:", ' '.join(
-                [src_dict.get(w) for w in gen_data[i][0]]), "\n"
-            for j in xrange(beam_size):
-                print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
-
-
-if __name__ == '__main__':
-    main()
diff --git a/demo/seqToseq/data/paraphrase_data.sh b/demo/seqToseq/data/paraphrase_data.sh
deleted file mode 100755
index e6497c91286d44b5ef3b66c5f824e36a09728720..0000000000000000000000000000000000000000
--- a/demo/seqToseq/data/paraphrase_data.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-set -x
-
-# download the in-house paraphrase dataset
-wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/paraphrase.tar.gz
-
-# untar the dataset
-tar -zxvf paraphrase.tar.gz
-rm paraphrase.tar.gz
diff --git a/demo/seqToseq/data/paraphrase_model.sh b/demo/seqToseq/data/paraphrase_model.sh
deleted file mode 100755
index d0e7f214a38c4dad0fdf7c10ba3b76eb0ab40f06..0000000000000000000000000000000000000000
--- a/demo/seqToseq/data/paraphrase_model.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-set -x
-
-dim=32
-pretrained_dir='../../model_zoo/embedding/'
-preModel=$pretrained_dir'model_'$dim'.emb'
-preDict=$pretrained_dir'baidu.dict'
-
-usrDict_dir='pre-paraphrase/'
-srcDict=$usrDict_dir'src.dict'
-trgDict=$usrDict_dir'trg.dict'
-
-usrModel_dir='paraphrase_model/'
-mkdir $usrModel_dir
-srcModel=$usrModel_dir'_source_language_embedding'
-trgModel=$usrModel_dir'_target_language_embedding'
-
-echo 'extract desired parameters based on user dictionary'
-script=$pretrained_dir'extract_para.py'
-python $script --preModel $preModel --preDict $preDict \
-          --usrModel $srcModel --usrDict $srcDict -d $dim
-python $script --preModel $preModel --preDict $preDict \
-          --usrModel $trgModel --usrDict $trgDict -d $dim
diff --git a/demo/seqToseq/data/wmt14_data.sh b/demo/seqToseq/data/wmt14_data.sh
deleted file mode 100755
index 43f67168d2a876ba5401e0f8490a88adac9c5551..0000000000000000000000000000000000000000
--- a/demo/seqToseq/data/wmt14_data.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-set -x
-mkdir wmt14
-cd wmt14
-
-# download the dataset
-wget http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz
-wget http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz
-
-# untar the dataset
-tar -zxvf bitexts.tgz
-tar -zxvf dev+test.tgz
-gunzip bitexts.selected/*
-mv bitexts.selected train
-rm bitexts.tgz
-rm dev+test.tgz
-
-# separate the dev and test dataset
-mkdir test gen
-mv dev/ntst1213.* test
-mv dev/ntst14.* gen 
-rm -rf dev
-
-set +x
-# rename the suffix, .fr->.src, .en->.trg
-for dir in train test gen
-do 
-  filelist=`ls $dir`
-  cd $dir
-  for file in $filelist
-  do 
-    if [ ${file##*.} = "fr" ]; then
-      mv $file ${file/%fr/src}
-    elif [ ${file##*.} = 'en' ]; then
-      mv $file ${file/%en/trg}
-    fi
-  done
-  cd ..
-done
diff --git a/demo/seqToseq/data/wmt14_model.sh b/demo/seqToseq/data/wmt14_model.sh
deleted file mode 100755
index c4b55b90a3eb98f94e0eb3be028c6de1ef57326b..0000000000000000000000000000000000000000
--- a/demo/seqToseq/data/wmt14_model.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-set -x
-
-# download the pretrained model
-wget http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz
-
-# untar the model
-tar -zxvf wmt14_model.tar.gz
-rm wmt14_model.tar.gz 
diff --git a/demo/seqToseq/dataprovider.py b/demo/seqToseq/dataprovider.py
deleted file mode 100755
index c2b49804be582d7d0bc3ef6332741be03936eb24..0000000000000000000000000000000000000000
--- a/demo/seqToseq/dataprovider.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.PyDataProvider2 import *
-
-UNK_IDX = 2
-START = "<s>"
-END = "<e>"
-
-
-def hook(settings, src_dict_path, trg_dict_path, is_generating, file_list,
-         **kwargs):
-    # job_mode = 1: training mode
-    # job_mode = 0: generating mode
-    settings.job_mode = not is_generating
-
-    def fun(dict_path):
-        out_dict = dict()
-        with open(dict_path, "r") as fin:
-            out_dict = {
-                line.strip(): line_count
-                for line_count, line in enumerate(fin)
-            }
-        return out_dict
-
-    settings.src_dict = fun(src_dict_path)
-    settings.trg_dict = fun(trg_dict_path)
-
-    settings.logger.info("src dict len : %d" % (len(settings.src_dict)))
-
-    if settings.job_mode:
-        settings.slots = {
-            'source_language_word':
-            integer_value_sequence(len(settings.src_dict)),
-            'target_language_word':
-            integer_value_sequence(len(settings.trg_dict)),
-            'target_language_next_word':
-            integer_value_sequence(len(settings.trg_dict))
-        }
-        settings.logger.info("trg dict len : %d" % (len(settings.trg_dict)))
-    else:
-        settings.slots = {
-            'source_language_word':
-            integer_value_sequence(len(settings.src_dict)),
-            'sent_id':
-            integer_value_sequence(len(open(file_list[0], "r").readlines()))
-        }
-
-
-def _get_ids(s, dictionary):
-    words = s.strip().split()
-    return [dictionary[START]] + \
-           [dictionary.get(w, UNK_IDX) for w in words] + \
-           [dictionary[END]]
-
-
-@provider(init_hook=hook, pool_size=50000)
-def process(settings, file_name):
-    with open(file_name, 'r') as f:
-        for line_count, line in enumerate(f):
-            line_split = line.strip().split('\t')
-            if settings.job_mode and len(line_split) != 2:
-                continue
-            src_seq = line_split[0]  # one source sequence
-            src_ids = _get_ids(src_seq, settings.src_dict)
-
-            if settings.job_mode:
-                trg_seq = line_split[1]  # one target sequence
-                trg_words = trg_seq.split()
-                trg_ids = [settings.trg_dict.get(w, UNK_IDX) for w in trg_words]
-
-                # remove sequence whose length > 80 in training mode
-                if len(src_ids) > 80 or len(trg_ids) > 80:
-                    continue
-                trg_ids_next = trg_ids + [settings.trg_dict[END]]
-                trg_ids = [settings.trg_dict[START]] + trg_ids
-                yield {
-                    'source_language_word': src_ids,
-                    'target_language_word': trg_ids,
-                    'target_language_next_word': trg_ids_next
-                }
-            else:
-                yield {'source_language_word': src_ids, 'sent_id': [line_count]}
diff --git a/demo/seqToseq/paraphrase/train.conf b/demo/seqToseq/paraphrase/train.conf
deleted file mode 100644
index be79c5e771c0e864fd1776cedb3ef37c997b6df6..0000000000000000000000000000000000000000
--- a/demo/seqToseq/paraphrase/train.conf
+++ /dev/null
@@ -1,33 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-sys.path.append("..")
-
-from seqToseq_net import *
-
-is_generating = False
-### Data Definiation
-train_conf = seq_to_seq_data(data_dir = "./data/pre-paraphrase",
-                             is_generating = is_generating)
-
-### Algorithm Configuration
-settings(
-      learning_method = AdamOptimizer(),
-      batch_size = 50,
-      learning_rate = 5e-4)
-
-### Network Architecture
-gru_encoder_decoder(train_conf, is_generating, word_vector_dim = 32)
diff --git a/demo/seqToseq/paraphrase/train.sh b/demo/seqToseq/paraphrase/train.sh
deleted file mode 100755
index 9bb6dbdb1d4c5e35bfb31855e0331f0250a69a20..0000000000000000000000000000000000000000
--- a/demo/seqToseq/paraphrase/train.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-cd ..
-
-paddle train \
-    --config='paraphrase/train.conf' \
-    --save_dir='paraphrase/model' \
-    --init_model_path='data/paraphrase_model' \
-    --load_missing_parameter_strategy=rand \
-    --use_gpu=false \
-    --num_passes=16 \
-    --show_parameter_stats_period=100 \
-    --trainer_count=4 \
-    --log_period=10 \
-    --dot_period=5 \
-    2>&1 | tee 'paraphrase/train.log'
-paddle usage -l 'paraphrase/train.log' -e $? -n "seqToseq_paraphrase_train" >/dev/null 2>&1
diff --git a/demo/seqToseq/preprocess.py b/demo/seqToseq/preprocess.py
deleted file mode 100755
index 03f371331a0755e5939e457f4bdfb1770b8dad88..0000000000000000000000000000000000000000
--- a/demo/seqToseq/preprocess.py
+++ /dev/null
@@ -1,219 +0,0 @@
-#!/bin/env python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Example:
-    python preprocess.py -i INPUT [-d DICTSIZE] [-m]
-
-Options:
-    -h, --help     show this help message and exit
-    -i INPUT       input original dataset path
-    -d DICTSIZE    specified word count of dictionary
-    -m --mergeDict merge source and target dictionary
-"""
-import os
-import sys
-
-import string
-from optparse import OptionParser
-from paddle.utils.preprocess_util import save_list, DatasetCreater
-
-
-class SeqToSeqDatasetCreater(DatasetCreater):
-    """
-    A class to process data for sequence to sequence application.
-    """
-
-    def __init__(self, data_path, output_path):
-        """
-        data_path: the path to store the train data, test data and gen data
-        output_path: the path to store the processed dataset
-        """
-        DatasetCreater.__init__(self, data_path)
-        self.gen_dir_name = 'gen'
-        self.gen_list_name = 'gen.list'
-        self.output_path = output_path
-
-    def concat_file(self, file_path, file1, file2, output_path, output):
-        """
-        Concat file1 and file2 to be one output file 
-        The i-th line of output = i-th line of file1 + '\t' + i-th line of file2
-        file_path: the path to store file1 and file2
-        output_path: the path to store output file
-        """
-        file1 = os.path.join(file_path, file1)
-        file2 = os.path.join(file_path, file2)
-        output = os.path.join(output_path, output)
-        if not os.path.exists(output):
-            os.system('paste ' + file1 + ' ' + file2 + ' > ' + output)
-
-    def cat_file(self, dir_path, suffix, output_path, output):
-        """
-        Cat all the files in dir_path with suffix to be one output file 
-        dir_path: the base directory to store input file
-        suffix: suffix of file name
-        output_path: the path to store output file
-        """
-        cmd = 'cat '
-        file_list = os.listdir(dir_path)
-        file_list.sort()
-        for file in file_list:
-            if file.endswith(suffix):
-                cmd += os.path.join(dir_path, file) + ' '
-        output = os.path.join(output_path, output)
-        if not os.path.exists(output):
-            os.system(cmd + '> ' + output)
-
-    def build_dict(self, file_path, dict_path, dict_size=-1):
-        """ 
-        Create the dictionary for the file, Note that
-        1. Valid characters include all printable characters
-        2. There is distinction between uppercase and lowercase letters
-        3. There is 3 special token: 
-           <s>: the start of a sequence
-           <e>: the end of a sequence
-           <unk>: a word not included in dictionary
-        file_path: the path to store file 
-        dict_path: the path to store dictionary
-        dict_size: word count of dictionary
-                   if is -1, dictionary will contains all the words in file 
-        """
-        if not os.path.exists(dict_path):
-            dictory = dict()
-            with open(file_path, "r") as fdata:
-                for line in fdata:
-                    line = line.split('\t')
-                    for line_split in line:
-                        words = line_split.strip().split()
-                        for word in words:
-                            if word not in dictory:
-                                dictory[word] = 1
-                            else:
-                                dictory[word] += 1
-            output = open(dict_path, "w+")
-            output.write('<s>\n<e>\n<unk>\n')
-            count = 3
-            for key, value in sorted(
-                    dictory.items(), key=lambda d: d[1], reverse=True):
-                output.write(key + "\n")
-                count += 1
-                if count == dict_size:
-                    break
-            self.dict_size = count
-
-    def create_dataset(self,
-                       dict_size=-1,
-                       mergeDict=False,
-                       suffixes=['.src', '.trg']):
-        """
-        Create seqToseq dataset 
-        """
-        # dataset_list and dir_list has one-to-one relationship
-        train_dataset = os.path.join(self.data_path, self.train_dir_name)
-        test_dataset = os.path.join(self.data_path, self.test_dir_name)
-        gen_dataset = os.path.join(self.data_path, self.gen_dir_name)
-        dataset_list = [train_dataset, test_dataset, gen_dataset]
-
-        train_dir = os.path.join(self.output_path, self.train_dir_name)
-        test_dir = os.path.join(self.output_path, self.test_dir_name)
-        gen_dir = os.path.join(self.output_path, self.gen_dir_name)
-        dir_list = [train_dir, test_dir, gen_dir]
-
-        # create directory
-        for dir in dir_list:
-            if not os.path.exists(dir):
-                os.mkdir(dir)
-
-        # checkout dataset should be parallel corpora
-        suffix_len = len(suffixes[0])
-        for dataset in dataset_list:
-            file_list = os.listdir(dataset)
-            if len(file_list) % 2 == 1:
-                raise RuntimeError("dataset should be parallel corpora")
-            file_list.sort()
-            for i in range(0, len(file_list), 2):
-                if file_list[i][:-suffix_len] != file_list[i + 1][:-suffix_len]:
-                    raise RuntimeError(
-                        "source and target file name should be equal")
-
-        # cat all the files with the same suffix in dataset
-        for suffix in suffixes:
-            for dataset in dataset_list:
-                outname = os.path.basename(dataset) + suffix
-                self.cat_file(dataset, suffix, dataset, outname)
-
-        # concat parallel corpora and create file.list
-        print 'concat parallel corpora for dataset'
-        id = 0
-        list = ['train.list', 'test.list', 'gen.list']
-        for dataset in dataset_list:
-            outname = os.path.basename(dataset)
-            self.concat_file(dataset, outname + suffixes[0],
-                             outname + suffixes[1], dir_list[id], outname)
-            save_list([os.path.join(dir_list[id], outname)],
-                      os.path.join(self.output_path, list[id]))
-            id += 1
-
-        # build dictionary for train data
-        dict = ['src.dict', 'trg.dict']
-        dict_path = [
-            os.path.join(self.output_path, dict[0]),
-            os.path.join(self.output_path, dict[1])
-        ]
-        if mergeDict:
-            outname = os.path.join(train_dir, train_dataset.split('/')[-1])
-            print 'build src dictionary for train data'
-            self.build_dict(outname, dict_path[0], dict_size)
-            print 'build trg dictionary for train data'
-            os.system('cp ' + dict_path[0] + ' ' + dict_path[1])
-        else:
-            outname = os.path.join(train_dataset, self.train_dir_name)
-            for id in range(0, 2):
-                suffix = suffixes[id]
-                print 'build ' + suffix[1:] + ' dictionary for train data'
-                self.build_dict(outname + suffix, dict_path[id], dict_size)
-        print 'dictionary size is', self.dict_size
-
-
-def main():
-    usage = "usage: \n" \
-            "python %prog -i INPUT [-d DICTSIZE] [-m]"
-    parser = OptionParser(usage)
-    parser.add_option(
-        "-i", action="store", dest="input", help="input original dataset path")
-    parser.add_option(
-        "-d",
-        action="store",
-        dest="dictsize",
-        help="specified word count of dictionary")
-    parser.add_option(
-        "-m",
-        "--mergeDict",
-        action="store_true",
-        dest="mergeDict",
-        help="merge source and target dictionary")
-    (options, args) = parser.parse_args()
-    if options.input[-1] == os.path.sep:
-        options.input = options.input[:-1]
-    outname = os.path.basename(options.input)
-    output_path = os.path.join(os.path.dirname(options.input), 'pre-' + outname)
-    dictsize = int(options.dictsize) if options.dictsize else -1
-    if not os.path.exists(output_path):
-        os.mkdir(output_path)
-        data_creator = SeqToSeqDatasetCreater(options.input, output_path)
-        data_creator.create_dataset(dictsize, options.mergeDict)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py
deleted file mode 100644
index 3d1f86ec3b7eda4fceaf3a1e406e3d0a1a4a2f60..0000000000000000000000000000000000000000
--- a/demo/seqToseq/seqToseq_net.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# edit-mode: -*- python -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import os
-from paddle.trainer_config_helpers import *
-
-
-def seq_to_seq_data(data_dir,
-                    is_generating,
-                    dict_size=30000,
-                    train_list='train.list',
-                    test_list='test.list',
-                    gen_list='gen.list',
-                    gen_result='gen_result'):
-    """
-    Predefined seqToseq train data provider for application
-    is_generating: whether this config is used for generating
-    dict_size: word count of dictionary
-    train_list: a text file containing a list of training data
-    test_list: a text file containing a list of testing data
-    gen_list: a text file containing a list of generating data
-    gen_result: a text file containing generating result
-    """
-    src_lang_dict = os.path.join(data_dir, 'src.dict')
-    trg_lang_dict = os.path.join(data_dir, 'trg.dict')
-
-    if is_generating:
-        train_list = None
-        test_list = os.path.join(data_dir, gen_list)
-    else:
-        train_list = os.path.join(data_dir, train_list)
-        test_list = os.path.join(data_dir, test_list)
-
-    define_py_data_sources2(
-        train_list,
-        test_list,
-        module="dataprovider",
-        obj="process",
-        args={
-            "src_dict_path": src_lang_dict,
-            "trg_dict_path": trg_lang_dict,
-            "is_generating": is_generating
-        })
-
-    return {
-        "src_dict_path": src_lang_dict,
-        "trg_dict_path": trg_lang_dict,
-        "gen_result": gen_result
-    }
-
-
-def gru_encoder_decoder(data_conf,
-                        is_generating,
-                        word_vector_dim=512,
-                        encoder_size=512,
-                        decoder_size=512,
-                        beam_size=3,
-                        max_length=250,
-                        error_clipping=50):
-    """
-    A wrapper for an attention version of GRU Encoder-Decoder network
-    is_generating: whether this config is used for generating
-    encoder_size: dimension of hidden unit in GRU Encoder network
-    decoder_size: dimension of hidden unit in GRU Decoder network
-    word_vector_dim: dimension of word vector
-    beam_size: expand width in beam search
-    max_length: a stop condition of sequence generation
-    """
-    for k, v in data_conf.iteritems():
-        globals()[k] = v
-    source_dict_dim = len(open(src_dict_path, "r").readlines())
-    target_dict_dim = len(open(trg_dict_path, "r").readlines())
-    gen_trans_file = gen_result
-
-    src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
-    src_embedding = embedding_layer(
-        input=src_word_id,
-        size=word_vector_dim,
-        param_attr=ParamAttr(name='_source_language_embedding'))
-    src_forward = simple_gru(
-        input=src_embedding,
-        size=encoder_size,
-        naive=True,
-        gru_layer_attr=ExtraLayerAttribute(
-            error_clipping_threshold=error_clipping))
-    src_backward = simple_gru(
-        input=src_embedding,
-        size=encoder_size,
-        reverse=True,
-        naive=True,
-        gru_layer_attr=ExtraLayerAttribute(
-            error_clipping_threshold=error_clipping))
-    encoded_vector = concat_layer(input=[src_forward, src_backward])
-
-    with mixed_layer(size=decoder_size) as encoded_proj:
-        encoded_proj += full_matrix_projection(input=encoded_vector)
-
-    backward_first = first_seq(input=src_backward)
-    with mixed_layer(
-            size=decoder_size,
-            act=TanhActivation(), ) as decoder_boot:
-        decoder_boot += full_matrix_projection(input=backward_first)
-
-    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
-        decoder_mem = memory(
-            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
-
-        context = simple_attention(
-            encoded_sequence=enc_vec,
-            encoded_proj=enc_proj,
-            decoder_state=decoder_mem, )
-
-        with mixed_layer(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += full_matrix_projection(input=context)
-            decoder_inputs += full_matrix_projection(input=current_word)
-
-        gru_step = gru_step_naive_layer(
-            name='gru_decoder',
-            input=decoder_inputs,
-            output_mem=decoder_mem,
-            size=decoder_size,
-            layer_attr=ExtraLayerAttribute(
-                error_clipping_threshold=error_clipping))
-
-        with mixed_layer(
-                size=target_dict_dim, bias_attr=True,
-                act=SoftmaxActivation()) as out:
-            out += full_matrix_projection(input=gru_step)
-        return out
-
-    decoder_group_name = "decoder_group"
-    group_inputs = [
-        StaticInput(
-            input=encoded_vector, is_seq=True), StaticInput(
-                input=encoded_proj, is_seq=True)
-    ]
-
-    if not is_generating:
-        trg_embedding = embedding_layer(
-            input=data_layer(
-                name='target_language_word', size=target_dict_dim),
-            size=word_vector_dim,
-            param_attr=ParamAttr(name='_target_language_embedding'))
-        group_inputs.append(trg_embedding)
-
-        # For decoder equipped with attention mechanism, in training,
-        # target embeding (the groudtruth) is the data input,
-        # while encoded source sequence is accessed to as an unbounded memory.
-        # Here, the StaticInput defines a read-only memory
-        # for the recurrent_group.
-        decoder = recurrent_group(
-            name=decoder_group_name,
-            step=gru_decoder_with_attention,
-            input=group_inputs)
-
-        lbl = data_layer(name='target_language_next_word', size=target_dict_dim)
-        cost = classification_cost(input=decoder, label=lbl)
-        outputs(cost)
-    else:
-        # In generation, the decoder predicts a next target word based on
-        # the encoded source sequence and the last generated target word.
-
-        # The encoded source sequence (encoder's output) must be specified by
-        # StaticInput, which is a read-only memory.
-        # Embedding of the last generated word is automatically gotten by
-        # GeneratedInputs, which is initialized by a start mark, such as <s>,
-        # and must be included in generation.
-
-        trg_embedding = GeneratedInput(
-            size=target_dict_dim,
-            embedding_name='_target_language_embedding',
-            embedding_size=word_vector_dim)
-        group_inputs.append(trg_embedding)
-
-        beam_gen = beam_search(
-            name=decoder_group_name,
-            step=gru_decoder_with_attention,
-            input=group_inputs,
-            bos_id=0,
-            eos_id=1,
-            beam_size=beam_size,
-            max_length=max_length)
-
-        seqtext_printer_evaluator(
-            input=beam_gen,
-            id_input=data_layer(
-                name="sent_id", size=1),
-            dict_file=trg_dict_path,
-            result_file=gen_trans_file)
-        outputs(beam_gen)
diff --git a/demo/seqToseq/translation/eval_bleu.sh b/demo/seqToseq/translation/eval_bleu.sh
deleted file mode 100755
index 54c2ed237e93adb3456dbe62f75626d36c2d90bc..0000000000000000000000000000000000000000
--- a/demo/seqToseq/translation/eval_bleu.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-gen_file=$1
-beam_size=$2
-
-# find top1 generating result
-top1=$(printf '%s_top1.txt' `basename $gen_file .txt`)
-if [ $beam_size -eq 1 ]; then
-    awk -F "\t" '{sub(" <e>","",$2);sub(" ","",$2);print $2}' $gen_file >$top1
-else
-    awk 'BEGIN{
-        FS="\t";
-        OFS="\t";
-        read_pos = 2} {
-        if (NR == read_pos){
-            sub(" <e>","",$3);
-            sub(" ","",$3);
-            print $3;
-            read_pos += (2 + res_num);
-      }}' res_num=$beam_size $gen_file >$top1
-fi 
-
-# evalute bleu value
-bleu_script=multi-bleu.perl
-standard_res=../data/wmt14/gen/ntst14.trg
-bleu_res=`perl $bleu_script $standard_res <$top1`
-
-echo $bleu_res
-rm $top1
diff --git a/demo/seqToseq/translation/gen.conf b/demo/seqToseq/translation/gen.conf
deleted file mode 100644
index e9bea4e4559ff31ad83c4474e91de7e7acc77e9f..0000000000000000000000000000000000000000
--- a/demo/seqToseq/translation/gen.conf
+++ /dev/null
@@ -1,36 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-sys.path.append("..")
-
-from seqToseq_net import *
-
-# whether this config is used for generating
-is_generating = True
-
-### Data Definiation
-gen_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14", 
-                           is_generating = is_generating,
-                           gen_result = "./translation/gen_result")
-
-### Algorithm Configuration
-settings(
-      learning_method = AdamOptimizer(),
-      batch_size = 1,
-      learning_rate = 0)
-
-### Network Architecture
-gru_encoder_decoder(gen_conf, is_generating)
diff --git a/demo/seqToseq/translation/gen.sh b/demo/seqToseq/translation/gen.sh
deleted file mode 100755
index 64b78f5e9654e7b206740f92e224e0164108c9f1..0000000000000000000000000000000000000000
--- a/demo/seqToseq/translation/gen.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-cd ..
-
-paddle train \
-    --job=test \
-    --config='translation/gen.conf' \
-    --save_dir='data/wmt14_model' \
-    --use_gpu=false \
-    --num_passes=13 \
-    --test_pass=12 \
-    --trainer_count=1 \
-    2>&1 | tee 'translation/gen.log'
-paddle usage -l 'translation/gen.log' -e $? -n "seqToseq_translation_gen" >/dev/null 2>&1
diff --git a/demo/seqToseq/translation/moses_bleu.sh b/demo/seqToseq/translation/moses_bleu.sh
deleted file mode 100755
index 2f230d7f4c736da003966fbdb277f6b8b1ec952c..0000000000000000000000000000000000000000
--- a/demo/seqToseq/translation/moses_bleu.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-set -x
-echo "Downloading multi-bleu.perl"
-wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl --no-check-certificate
diff --git a/demo/seqToseq/translation/train.conf b/demo/seqToseq/translation/train.conf
deleted file mode 100644
index 72b7ccdbb95dbda8f06674079db9a3257bb31622..0000000000000000000000000000000000000000
--- a/demo/seqToseq/translation/train.conf
+++ /dev/null
@@ -1,36 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-sys.path.append("..")
-
-from seqToseq_net import *
-
-# whether this config is used for generating
-is_generating = False
-
-### Data Definiation
-data_dir  = "./data/pre-wmt14"
-train_conf = seq_to_seq_data(data_dir = data_dir, 
-                             is_generating = is_generating)
-
-### Algorithm Configuration
-settings(
-    learning_method = AdamOptimizer(),
-    batch_size = 50,
-    learning_rate = 5e-4)
-
-### Network Architecture
-gru_encoder_decoder(train_conf, is_generating)
diff --git a/demo/seqToseq/translation/train.sh b/demo/seqToseq/translation/train.sh
deleted file mode 100755
index b0ec9854b118cbb9ed39d6bed0cdd845403926a4..0000000000000000000000000000000000000000
--- a/demo/seqToseq/translation/train.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-cd ..
-
-paddle train \
---config='translation/train.conf' \
---save_dir='translation/model' \
---use_gpu=false \
---num_passes=16 \
---show_parameter_stats_period=100 \
---trainer_count=4 \
---log_period=10 \
---dot_period=5 \
-2>&1 | tee 'translation/train.log'
-paddle usage -l 'translation/train.log' -e $? -n "seqToseq_translation_train" >/dev/null 2>&1
diff --git a/demo/word2vec/api_train_v2.py b/demo/word2vec/api_train_v2.py
deleted file mode 100644
index c0940f0e56eafa22f8aeb7052c0ddc79d8862917..0000000000000000000000000000000000000000
--- a/demo/word2vec/api_train_v2.py
+++ /dev/null
@@ -1,100 +0,0 @@
-import gzip
-import math
-
-import paddle.v2 as paddle
-
-embsize = 32
-hiddensize = 256
-N = 5
-
-
-def wordemb(inlayer):
-    wordemb = paddle.layer.embedding(
-        input=inlayer,
-        size=embsize,
-        param_attr=paddle.attr.Param(
-            name="_proj",
-            initial_std=0.001,
-            learning_rate=1,
-            l2_rate=0,
-            sparse_update=True))
-    return wordemb
-
-
-def main():
-    # for local training
-    cluster_train = False
-
-    if not cluster_train:
-        paddle.init(use_gpu=False, trainer_count=1)
-    else:
-        paddle.init(
-            use_gpu=False,
-            trainer_count=2,
-            port=7164,
-            ports_num=1,
-            ports_num_for_sparse=1,
-            num_gradient_servers=1)
-    word_dict = paddle.dataset.imikolov.build_dict()
-    dict_size = len(word_dict)
-    firstword = paddle.layer.data(
-        name="firstw", type=paddle.data_type.integer_value(dict_size))
-    secondword = paddle.layer.data(
-        name="secondw", type=paddle.data_type.integer_value(dict_size))
-    thirdword = paddle.layer.data(
-        name="thirdw", type=paddle.data_type.integer_value(dict_size))
-    fourthword = paddle.layer.data(
-        name="fourthw", type=paddle.data_type.integer_value(dict_size))
-    nextword = paddle.layer.data(
-        name="fifthw", type=paddle.data_type.integer_value(dict_size))
-
-    Efirst = wordemb(firstword)
-    Esecond = wordemb(secondword)
-    Ethird = wordemb(thirdword)
-    Efourth = wordemb(fourthword)
-
-    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
-    hidden1 = paddle.layer.fc(input=contextemb,
-                              size=hiddensize,
-                              act=paddle.activation.Sigmoid(),
-                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
-                              bias_attr=paddle.attr.Param(learning_rate=2),
-                              param_attr=paddle.attr.Param(
-                                  initial_std=1. / math.sqrt(embsize * 8),
-                                  learning_rate=1))
-    predictword = paddle.layer.fc(input=hidden1,
-                                  size=dict_size,
-                                  bias_attr=paddle.attr.Param(learning_rate=2),
-                                  act=paddle.activation.Softmax())
-
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 100 == 0:
-                with gzip.open("batch-" + str(event.batch_id) + ".tar.gz",
-                               'w') as f:
-                    trainer.save_parameter_to_tar(f)
-                result = trainer.test(
-                    paddle.batch(
-                        paddle.dataset.imikolov.test(word_dict, N), 32))
-                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics,
-                    result.metrics)
-
-    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
-
-    parameters = paddle.parameters.create(cost)
-    adagrad = paddle.optimizer.AdaGrad(
-        learning_rate=3e-3,
-        regularization=paddle.optimizer.L2Regularization(8e-4))
-    trainer = paddle.trainer.SGD(cost,
-                                 parameters,
-                                 adagrad,
-                                 is_local=not cluster_train)
-    trainer.train(
-        paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
-        num_passes=30,
-        event_handler=event_handler)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index 154cfe24432f3e43ed724a45273b4a582b45f73d..1efa74ecda4170332d96603ca2253c68468474f9 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -207,6 +207,11 @@ trans_full_matrix_projection
 Aggregate Layers
 ================
 
+AggregateLevel
+--------------
+..  autoclass:: paddle.v2.layer.AggregateLevel
+    :noindex:
+
 ..  _api_v2.layer_pooling:
 
 pooling
@@ -248,6 +253,11 @@ block_expand
 
 ..  _api_v2.layer_expand:
 
+ExpandLevel
+-----------
+..  autoclass:: paddle.v2.layer.ExpandLevel
+    :noindex:
+
 expand
 ------
 ..  autoclass:: paddle.v2.layer.expand
diff --git a/doc/design/build_system/README.md b/doc/design/build_system/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..310739f37ae48934afe1d042e87efef85b98f1fc
--- /dev/null
+++ b/doc/design/build_system/README.md
@@ -0,0 +1,107 @@
+A few months ago when we were trying to replace CMake with Bazel, @emailweixu suggested that we rewrite those handy Bazel functions using CMake. Now it seems that it's the right time to get this done, as we are facing problems from the porting of Majel and the development of the new parameter server using Go and C++.
+
+Here are some initial thoughts. Your comments are welcome!
+
+### Required CMake Function
+
+I think we need only the following few CMake functions to make a project description lean and clean:
+
+| C++ | CUDA C++ | Go |
+|---|---|---|
+| cc_library | nv_library | go_library |
+| cc_binary | nv_binary | go_binary |
+| cc_test | nv_test | go_test |
+
+- The `_library` functions generate  .a files from source code.
+- The `_binary` functions generate executable binary files.
+- The `_test` functions generate executable unit test files. They work like `_binary` but link `-lgtest` and `-lgtest_main`.
+
+The difference between `nv_` functions and `cc_` functions is that the former use `nvcc` instead of the system-default C++ compiler.
+
+Both `nv_` and `cc_` functions enable C++11 (-std=c++11).
+
+Also,
+
+- to describe external dependencies, we need `external_library`.
+- to build shared libraries, we need `shared_library`.
+
+### An Example Project
+
+Suppose that we have aforementioned functions defined in our `/cmake` directory.  The following example `CMakeLists.txt` describes a project including the following source files:
+
+- tensor.h
+- tensor.cc
+- tensor_test.cc
+- ops.h
+- ops.cu
+- ops_test.cu
+- api.go
+- api_test.go
+
+Suppose that ops.cu depends on CUDNN.
+
+```cmake
+# cc_binary parses tensor.cc and figures out that the target also depends
+# on tensor.h.
+cc_binary(tensor
+  SRCS
+  tensor.cc)
+
+# The dependency on target tensor implies that if any of
+# tensor{.h,.cc,_test.cc} is changed, tensor_test needs to be re-built.
+cc_test(tensor_test
+  SRCS
+  tensor_test.cc
+  DEPS
+  tensor)
+
+# I don't have a clear idea what parameters external_library needs to
+# have.  @gangliao as a CMake expert would have better ideas.
+external_library(cudnn
+  ....)
+
+# Suppose that ops.cu depends on external target CUDNN.  Also, ops.cu
+# includes global functions that take Tensor as their parameters, so
+# ops depends on tensor.  This implies that if any of tensor.{h,cc},
+# ops.{h,cu} is changed, ops needs to be re-built.
+nv_library(ops
+  SRCS
+  ops.cu
+  DEPS
+  tensor
+  cudnn)  # cudnn is defined above.
+
+nv_test(ops_test
+  SRCS
+  ops_test.cu
+  DEPS
+  ops)
+
+# Because api.go defines a Go wrapper to ops and tensor, it depends on
+# both.  This implies that if any of tensor.{h,cc}, ops.{h,cu}, or
+# api.go is changed, api needs to be re-built.
+go_library(api
+  SRCS
+  api.go
+  DEPS
+  tensor # Because ops depends on tensor, this line is optional.
+  ops)
+
+go_test(api_test
+  SRCS
+  api_test.go
+  DEPS
+  api)
+
+
+# This builds libapi.so.  shared_library might use CMake target
+# api_shared so as to distinguish it from the above target api.
+shared_library(api
+  DEPS
+  api)
+
+```
+
+### Implementation
+
+As the above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph.  It also uses this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`.
diff --git a/doc/design/cluster_train/master_server.md b/doc/design/cluster_train/master_server.md
index bb8307652587b4dc56cd668a3a5e64722734d194..4bf3c506f101361875043f8bfd97972b8c981a22 100644
--- a/doc/design/cluster_train/master_server.md
+++ b/doc/design/cluster_train/master_server.md
@@ -10,7 +10,7 @@ A dataset is a list of files in *RecordIO* format. A RecordIO file consists of c
 
 ## Task Queue
 
-As mentioned in [distributed training design doc](./README.md), a *task* is a data shard that the master server assigns to the trainer process to train on. A task consists of one or multiple *blocks* from one or multiple files. The master server maintains *task queues* to track the training progress.
+As mentioned in [distributed training design doc](./README.md), a *task* is a data shard that the master server assigns to the trainer process to train on. A task consists of one or multiple *chunks* from one or multiple files. The master server maintains *task queues* to track the training progress.
 
 ### Task Queue Creation
 
@@ -21,23 +21,23 @@ As mentioned in [distributed training design doc](./README.md), a *task* is a da
 	func (m *RPCServer) ReportDataset(Paths []string, dummy *int) error {
 	}
 	```
-1. The master server will scan through each RecordIO file to generate the *block index* and know how many blocks does each file have. A block can be referenced by the file path and the index of the block within the file. The block index is in memory data structure that enables fast access to each block, and the index of the block with the file is an integer start from 0, representing the n-th block within the file.
+1. The master server will scan through each RecordIO file to generate the *chunk index* and know how many chunks each file has. A chunk can be referenced by the file path and the index of the chunk within the file. The chunk index is an in-memory data structure that enables fast access to each chunk, and the index of the chunk within the file is an integer starting from 0, representing the n-th chunk within the file.
 
-	The definition of the block is:
+	The definition of the chunk is:
 	```go
-	type Block struct {
-		Idx   int // index of the block within the file
+	type Chunk struct {
+		Idx   int // index of the chunk within the file
 		Path  string
-		Index recordio.Index // block index
+		Index recordio.Index // chunk index
 	}
 	```
-1. Blocks are grouped into tasks, and tasks are filled into the todo queue. The pending queue and the done queue are initialized with no element.
+1. Chunks are grouped into tasks, and tasks are filled into the todo queue. The pending queue and the done queue are initialized with no element.
 
 	The definition of the task is:
 	```go
 	type Task struct {
 		Index  int
-		Blocks []Block
+		Chunks []Chunk
 	}
 	```
 
diff --git a/doc/design/cluster_train/pserver_client.md b/doc/design/cluster_train/pserver_client.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3e4079010490b69db1de28157f0cab80cad2381
--- /dev/null
+++ b/doc/design/cluster_train/pserver_client.md
@@ -0,0 +1,158 @@
+# Design Doc: The Client Library of Parameter Server
+
+For an overview of trainer's role, please refer to [distributed training design doc](README.md). In this design doc, we will discuss the parameter server's client library, which will manage communication with parameter servers. The library will be implemented in [Go](https://golang.org/) and made available as a static or dynamic library with a C header file.
+
+## Parameter Partition
+
+Each parameter will be partitioned into parameter blocks to make the parameters evenly distributed on parameter servers. The partition is done automatically by the client library. The *sparse parameter* requires slightly different treatment:
+
+### Sparse Parameter
+
+The sparse parameter is a parameter that is updated sparsely. The name is somewhat misleading: it does not have a sparse representation; it has the same representation as a dense vector.
+
+Because a sparse parameter is updated sparsely, the trainer will have to partition the sparse parameter. Because the parameter server will merge all sparse parameter shards into the same file when saving the parameter, the shards need a special naming convention:
+
+If a sparse parameter is partitioned into n shards, they should be named as:
+
+```text
+name:sparse-0
+name:sparse-1
+...
+name:sparse-n-1
+```
+
+The library is unaware of the partition, and treats each parameter independently. Only when saving parameters will the parameter servers merge the sparse parameters according to the naming convention.
+
+## Model Optimization Using Gradients
+
+There are two ways to perform model optimization using gradients:
+
+- On Client
+
+  The client does multiple steps of forward and backward update. In each step, the gradients are calculated and a new model is generated. After some steps, the client will calculate the difference between the newest model and the old model at step 0. The difference will be updated to parameter servers. Parameter servers will just update parameters using the difference without any optimization using gradients (such as Adam and L1 regularization).
+
+- On Parameter Server
+
+  The client will send accumulated gradients to parameter servers, the parameter server will do the optimization using gradients.
+
+## L1 and L2 Regularization
+
+PaddlePaddle allows L1 or L2 regularizations to be specified per parameter, so when the trainer initializes the parameter it needs to include a parameter configuration when L1 or L2 regularization is necessary.
+
+## Parameter Initialization
+
+The parameters on parameter servers need to be initialized. To provide maximum flexibility, the trainer will initialize the parameters. Only one trainer will do the initialization, the other trainers will wait for the completion of initialization and get the parameters from the parameter servers.
+
+### Trainer Selection
+
+To select the trainer for initialization, every trainer will try to get a distributed lock, whoever owns the lock will do the initialization. As illustrated below:
+
+<img src="./src/init_lock.png">
+
+### Trainer Selection Process
+
+The trainer selection process is encapsulated in the C API function:
+```c
+int paddle_begin_init_params(paddle_pserver_client* client, const char* config_proto);
+```
+The selected trainer's call to `paddle_begin_init_params` will return 1, and the other trainers' calls to `paddle_begin_init_params` will return 0. `paddle_get_params` will be blocked until initialization is completed. As illustrated below:
+
+<img src="./src/pserver_init.png">
+
+## C Interface
+
+```c
+typedef enum {
+  PADDLE_ELEMENT_TYPE_INT32   = 0,
+  PADDLE_ELEMENT_TYPE_UINT32  = 1,
+  PADDLE_ELEMENT_TYPE_INT64   = 2,
+  PADDLE_ELEMENT_TYPE_UINT64  = 3,
+  PADDLE_ELEMENT_TYPE_FLOAT32 = 4,
+  PADDLE_ELEMENT_TYPE_FLOAT64 = 5,
+} paddle_element_type;
+
+typedef struct {
+  char*               name;
+  paddle_element_type element_type;
+  void*               content;
+  int                 content_len;
+} paddle_parameter, paddle_gradient;
+
+typedef struct paddle_pserver_client paddle_pserver_client;
+
+paddle_pserver_client* paddle_new_pserver_client();
+void paddle_pserver_client_release(paddle_pserver_client* client);
+
+/**
+ * @brief paddle_begin_init_params begins to initialize parameters on
+ * parameter servers.
+ *
+ * paddle_begin_init_params will be called from multiple trainers,
+ * only one trainer will be selected to initialize the parameters on
+ * parameter servers. Other trainers need to get the initialized
+ * parameters from parameter servers using @paddle_get_params.
+ *
+ * @return 1 if the trainer is selected to initialize parameter
+ * servers, otherwise 0.
+ */
+int paddle_begin_init_params(paddle_pserver_client* client);
+
+/**
+ * @brief paddle_init_param initializes the parameter on parameter
+ * servers.
+ *
+ * @param param the parameter to initialize.
+ * @param param_config_proto the configuration for the parameter.
+ * @param config_len the length of param_config_proto
+ * @return 0 if successful, otherwise -1. On failure, the trainer
+ * needs to restart the entire initialization process (starting from
+ * @paddle_begin_init_params). Or simply exit the program and wait for
+ * the cluster management system to restart the trainer.
+ */
+int paddle_init_param(paddle_pserver_client* client, paddle_parameter param, const unsigned char* param_config_proto, int config_len);
+
+/**
+ * @brief paddle_finish_init_params tells parameter servers client has
+ * sent all parameters to parameter servers as initialization.
+ *
+ * @return 0 if successful, otherwise -1. On failure, the trainer
+ * needs to restart the entire initialization process (starting from
+ * @paddle_begin_init_params). Or simply exit the program and wait for
+ * the cluster management system to restart the trainer.
+ */
+int paddle_finish_init_params(paddle_pserver_client* client);
+
+/**
+ * @brief paddle_send_grads sends gradients to parameter servers for
+ * updating parameters.
+ *
+ * @param grads the array of gradients to send.
+ * @param len the length of the gradient array.
+ * @param learning_rate the learning rate for the gradients.
+ * @return 0 if successful, otherwise -1.
+ */
+int paddle_send_grads(paddle_pserver_client* client, const paddle_gradient* grads, int len);
+
+/**
+ * @brief paddle_get_params gets parameters from parameter servers.
+ *
+ * paddle_get_params will block until parameters are initialized on
+ * the parameter servers.
+ *
+ * @param names the array of names of the parameters to get.
+ * @param dst the destination array of parameters to save to.
+ * @param len the length of the names array and the paddle_parameter
+ * array.
+ * @return 0 if successful, otherwise -1.
+ */
+int paddle_get_params(paddle_pserver_client* client, const char** names, paddle_parameter* dst, int len);
+
+/**
+ * @brief paddle_save_model instructs parameter servers to save the
+ * parameters to the given path.
+ *
+ * @param path the path to save parameters.
+ * @return 0 if successful, otherwise -1.
+ */
+int paddle_save_model(paddle_pserver_client* client, const char* path);
+```
diff --git a/doc/design/cluster_train/src/init_lock.graffle b/doc/design/cluster_train/src/init_lock.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..fa9149f21b1311eed48ef72ec55e556559d0fc94
Binary files /dev/null and b/doc/design/cluster_train/src/init_lock.graffle differ
diff --git a/doc/design/cluster_train/src/init_lock.png b/doc/design/cluster_train/src/init_lock.png
new file mode 100644
index 0000000000000000000000000000000000000000..92404ee6d6c0f9a7727952bae3c869ba338ecd7f
Binary files /dev/null and b/doc/design/cluster_train/src/init_lock.png differ
diff --git a/doc/design/cluster_train/src/pserver_init.graffle b/doc/design/cluster_train/src/pserver_init.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..5f3f1f52be8aa7f9049a8fcd6b7c93c8560c1676
Binary files /dev/null and b/doc/design/cluster_train/src/pserver_init.graffle differ
diff --git a/doc/design/cluster_train/src/pserver_init.png b/doc/design/cluster_train/src/pserver_init.png
new file mode 100644
index 0000000000000000000000000000000000000000..dfe491ff98dd7db1c336093c80964a260df2cd90
Binary files /dev/null and b/doc/design/cluster_train/src/pserver_init.png differ
diff --git a/doc/design/cluster_train/src/submit-job.graffle b/doc/design/cluster_train/src/submit-job.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..677cdfb6d9a32168bf71729eb841fa1ca0dd31d6
Binary files /dev/null and b/doc/design/cluster_train/src/submit-job.graffle differ
diff --git a/doc/design/cluster_train/src/submit-job.png b/doc/design/cluster_train/src/submit-job.png
new file mode 100644
index 0000000000000000000000000000000000000000..3046a460a7ba708079e88a560debaa215a694680
Binary files /dev/null and b/doc/design/cluster_train/src/submit-job.png differ
diff --git a/doc/design/cluster_train/submit-job.md b/doc/design/cluster_train/submit-job.md
new file mode 100644
index 0000000000000000000000000000000000000000..8377d5489dc64bd2fdc5bb4f7bc737e7b489000d
--- /dev/null
+++ b/doc/design/cluster_train/submit-job.md
@@ -0,0 +1,127 @@
+# Submit a Distributed Training Job
+
+The user can submit a distributed training job with Python code, rather than with a command-line interface.
+
+## Runtime Environment On Kubernetes
+
+For a distributed training job, there are two Docker images, called the *runtime Docker image* and the *base Docker image*. The runtime Docker image is the Docker image that gets scheduled by Kubernetes to run during training. The base Docker image is for building the runtime Docker image.
+
+### Base Docker Image
+
+Usually, the base Docker image is the PaddlePaddle production Docker image, which includes the paddle binary files and the Python package. Of course, users can specify any image name hosted on any Docker registry to which they have access.
+
+### Runtime Docker Image
+
+The trainer package that the user uploads and some Python dependencies are packaged into a runtime Docker image based on the base Docker image.
+
+- Handle Python Dependencies
+
+  You need to provide a `requirements.txt` file in your `trainer-package` folder. Example:
+
+  ```txt
+  pillow
+  protobuf==3.1.0
+  ```
+  See more [details](https://pip.readthedocs.io/en/1.1/requirements.html) about requirements files. An example project looks like:
+  ```bash
+    paddle_example
+      |-quick_start
+        |-trainer.py
+        |-dataset.py
+        |-requirements.txt
+  ```
+
+## Submit Distributed Training Job With Python Code
+<img src="./src/submit-job.png" width="800">
+
+- `paddle.job.dist_train()` will call the Job Server API `/v1/packages` to upload the trainer package and save them on CephFS, and then call `/v1/trainer/job` to submit the PaddlePaddle distributed job.
+- `/v1/trainer/job` will start a building job for preparing the runtime Docker image. When the building job is finished, Job Server will submit the PaddlePaddle distributed job to Kubernetes.
+- *NOTE*: For the first version, we will not prepare the runtime Docker image, instead, the package is uploaded to Paddle Cloud, and Paddle Cloud will mount the package in a temporary folder into the base Docker image. We will not support custom Python dependencies in the first version as well.
+
+You can call `paddle.job.dist_train` and provide distributed training configuration as the parameters:
+```python
+paddle.job.dist_train(
+  trainer=dist_trainer(),
+  paddle_job=PaddleJob(
+    job_name = "paddle-cloud",
+    entry_point = "python %s"%__file__,
+    trainer_package = "/example/word2vec",
+    image = "yancey1989/paddle-job",
+    trainers = 10,
+    pservers = 3,
+    trainer_cpu = 1,
+    trainer_gpu = 1,
+    trainer_mem = "10G",
+    pserver_cpu = 1,
+    pserver_mem = "2G"
+  ))
+```
+
+The parameter `trainer` of `paddle.job.dist_train` is a function and you can implement it as follows:
+```python
+def dist_trainer():
+  def trainer_creator():
+    trainer = paddle.v2.trainer.SGD(...)
+    trainer.train(...)
+  return trainer_creator
+```
+
+The pseudo code of `paddle.job.dist_train` is as follows:
+```python
+def dist_train(trainer, paddle_job):
+  # if the code is running on cloud, set RUNNING_ON_CLOUD=YES
+  if os.getenv("RUNNING_ON_CLOUD", "NO") == "NO":
+    #submit the paddle job
+    paddle_job.submit()
+  else:
+    #start the training
+    trainer()
+```
+### PaddleJob Parameters
+parameter | type | explanation
+ --- | --- | ---
+job_name | str | the unique name for the training job
+entry_point | str | entry point for startup trainer process
+trainer_package | str | path of the trainer package file, to which the user must have access
+image|str|the [base image](#base-docker-image) for building the [runtime image](#runtime-docker-image)
+pservers|int| Parameter Server process count
+trainers|int| Trainer process count
+pserver_cpu|int| CPU count for each Parameter Server process
+pserver_mem|str| memory allocated for each Parameter Server process, a plain integer using one of these suffixes: E, P, T, G, M, K
+trainer_cpu|int| CPU count for each Trainer process
+trainer_mem|str| memory allocated for each Trainer process, a plain integer using one of these suffixes: E, P, T, G, M, K
+trainer_gpu|int| GPU count for each Trainer process, if you only want CPU, do not set this parameter
+
+### Deploy Parameter Server, Trainer and Master Process
+  - Deploy PaddlePaddle Parameter Server processes, it's a Kubernetes ReplicaSet.
+  - Deploy PaddlePaddle Trainer processes, it's a Kubernetes Job.
+  - Deploy PaddlePaddle Master processes, it's a Kubernetes ReplicaSet.
+
+## Job Server
+
+- RESTful API
+
+  Job server provides RESTful HTTP API for receiving the trainer package and displaying
+  PaddlePaddle job related information.
+  - `POST   /v1/package` receive the trainer package and save them on CephFS
+  - `POST   /v1/trainer/job` submit a trainer job
+  - `GET    /v1/jobs/` list all jobs
+  - `GET    /v1/jobs/<job-name>` the status of a job
+  - `DELETE /v1/jobs/<job-name>` delete a job
+  - `GET    /v1/version` job server version
+
+- Build Runtime Docker Image on Kubernetes
+
+  `paddle.job.dist_train` will upload the trainer package to Job Server, save them on the distributed filesystem, and then start up a job for building the runtime Docker image that gets scheduled by Kubernetes to run during training.
+
+  There are some benefits for building runtime Docker image on JobServer:
+  - On Paddle Cloud, users will run the trainer code in a Jupyter Notebook, which is a Kubernetes Pod. If we wanted to execute `docker build` in the Pod, we would have to mount the host's `docker.sock` into the Pod, so the user's code would connect to the host's Docker Engine directly, which is not safe.
+  - Users only need to upload the training package files; they do not need to install Docker Engine or a Docker registry as dependencies.
+  - If we want to switch to another image type, such as RKT, users do not need to care about it.
+
+- Deploy Parameter Server, Trainer and Master Processes
+
+  `POST /v1/trainer/job` receives the distributed training parameters, and deploys the job as follows:
+  - Deploy PaddlePaddle Parameter Server processes, it's a Kubernetes ReplicaSet.
+  - Deploy PaddlePaddle Trainer processes, it's a Kubernetes Job.
+  - Deploy PaddlePaddle Master processes, it's a Kubernetes ReplicaSet.
diff --git a/doc/design/file_manager/README.md b/doc/design/file_manager/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3df10d801e568834729f902aace483d033340e2d
--- /dev/null
+++ b/doc/design/file_manager/README.md
@@ -0,0 +1,87 @@
+# FileManager设计文档
+## 目标
+在本文档中，我们设计说明了名为FileManager的系统，方便用户上传自己的训练数据以进行分布式训练。
+
+主要功能包括：
+
+- 提供常用的命令行管理命令管理文件和目录
+- 支持大文件的断点上传、下载  
+
+## 名词解释
+- PFS：是`Paddlepaddle cloud File System`的缩写，是对用户文件存储空间的抽象，与之相对的是local filesystem。目前我们用CephFS来搭建。
+- [CephFS](http://docs.ceph.com/docs/master/cephfs/)：一个POSIX兼容的文件系统。
+- Chunk：逻辑上文件分块的单位。
+
+## 模块
+### 架构图
+<img src="./src/filemanager.png" width="900">
+
+### PFSClient
+- 功能： 详细设计[link](./pfs/pfsclient.md)
+	- 提供用户管理文件的命令
+	- 需要可以跨平台执行
+
+- 双向验证   
+	PFSClient需要和Ingress之间做双向验证<sup>[tls](#tls)</sup>，所以用户需要首先在`cloud.paddlepaddle.org`上注册一下，申请用户空间，并且把系统生成的CA(certificate authority)、Key、CRT(CA signed certificate)下载到本地，然后才能使用PFSClient。
+		
+### [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/)
+- 功能：  
+	提供七层协议的反向代理、基于粘性会话的负载均衡功能。
+	
+- 透传用户身份的办法  
+	Ingress需要把PFSClient的身份信息传给PFSServer，配置的方法参考[link](http://www.integralist.co.uk/posts/clientcertauth.html#3)
+
+### PFSServer
+PFSServer提供RESTful API接口，接收处理PFSClient端的文件管理请求，并且把结果返回PFSClient端。
+
+RESTful API
+
+- /api/v1/files
+	- `GET /api/v1/files`: Get metadata of files or directories.
+	- `POST /api/v1/files`: Create files or directories.
+	- `PATCH /api/v1/files`: Update files or directories.
+	- `DELETE /api/v1/files`: Delete files or directories.
+
+- /api/v1/file/chunks
+	- `GET /api/v1/file/chunks`: Get chunks' metadata of a file.
+
+- /api/v1/storage/files
+	- `GET /api/v1/storage/files`: Download files or directories.
+	- `POST /api/v1/storage/files`: Upload files or directories.
+
+- /api/v1/storage/file/chunks
+	- `GET /api/v1/storage/file/chunks`: Download chunks's data.
+	- `POST /api/v1/storage/file/chunks`: Upload chunks's data.
+
+## 文件传输优化
+
+### 分块文件传输
+用户文件可能是比较大的，上传到Cloud或者下载到本地的时间可能比较长，而且在传输的过程中也可能出现网络不稳定的情况。为了应对以上的问题，我们提出了Chunk的概念，一个Chunk由所在的文件偏移、数据、数据长度及校验值组成。文件的上传和下载都是通过对Chunk的操作来实现的。由于Chunk比较小（默认256K），完成一个传输动作的时间也比较短，不容易出错。PFSClient需要在传输完毕最后一个Chunk的时候检查destination文件的MD5值是否和source文件一致。
+
+一个典型的Chunk如下所示：
+
+```
+type Chunk struct {
+	fileOffset int64
+	checksum uint32
+	len     uint32
+	data    []byte
+}
+```  
+
+### 生成sparse文件
+当destination文件不存在或者大小和source文件不一致时，可以用[Fallocate](https://golang.org/pkg/syscall/#Fallocate)生成sparse文件，然后就可以并发写入多个Chunk。
+
+### 覆盖不一致的部分
+文件传输的关键在于需要PFSClient端对比source和destination的文件Chunks的checksum是否保持一致，不一致的由PFSClient下载或者传输Chunk完成。这样已经传输成功的部分就不用重新传输了。
+
+## 用户使用流程
+参考[link](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md)
+
+## 框架生成
+用[swagger](https://github.com/swagger-api/swagger-codegen)生成PFSClient和PFSServer的框架部分，以便我们可以把更多的精力放到逻辑本身上。
+
+## 参考文档
+- <a name=tls></a>[TLS complete guide](https://github.com/k8sp/tls/blob/master/tls.md)
+- [aws.s3](http://docs.aws.amazon.com/cli/latest/reference/s3/)
+- [linux man document](https://linux.die.net/man/)
diff --git a/doc/design/file_manager/pfs/pfsclient.md b/doc/design/file_manager/pfs/pfsclient.md
new file mode 100644
index 0000000000000000000000000000000000000000..56bc70c54bbc92b78d66e04fb495b1300cf8ebe0
--- /dev/null
+++ b/doc/design/file_manager/pfs/pfsclient.md
@@ -0,0 +1,129 @@
+# PFSClient
+
+## Description
+The `pfs` command is a Command Line Interface to manage your files on PaddlePaddle Cloud
+
+## Synopsis
+```
+paddle [options] pfs <subcommand> [parameters]
+```
+
+## Options
+```
+--profile (string)
+	Use a specific profile from your credential file.
+
+--help (string)
+	Display more information about command
+
+--version
+	Output version information and exit
+
+--debug
+	Show detailed debugging log	
+	
+--only-show-errors (boolean) 
+	Only errors and warnings are displayed. All other output is suppressed.
+```
+
+## Path Arguments
+When using a command, we need to specify path arguments. There are two path argument types: `localpath` and `pfspath`.  
+
+A `pfspath` begins with `/pfs`, e.g. `/pfs/$DATACENTER/home/$USER/folder`.
+
+[Here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md#上传训练文件) is how to config datacenters.
+
+## Order of Path Arguments
+Commonly, if there are two path arguments, the first is the source, and the second is the destination.
+
+## Subcommands
+- rm - remove files or directories
+
+```
+Synopsis:
+	rm [-r] [-v] <PFSPath> ...
+
+Options:
+	-r 
+		Remove directories and their contents recursively 
+	-v      
+		Cause rm to be verbose, showing files after they are removed.
+	
+Examples:
+	paddle pfs rm /pfs/$DATACENTER/home/$USER/file
+	paddle pfs rm -r /pfs/$DATACENTER/home/$USER/folder
+```
+- mv - move (rename) files
+
+```
+Synopsis:
+	mv [-f | -n] [-v] <LocalPath> <PFSPath>
+	mv [-f | -n] [-v] <LocalPath> ... <PFSPath>
+	mv [-f | -n] [-v] <PFSPath> <LocalPath> 
+	mv [-f | -n] [-v] <PFSPath> ... <LocalPath> 
+	mv [-f | -n] [-v] <PFSPath> <PFSPath> 
+	mv [-f | -n] [-v] <PFSPath> ... <PFSPath> 
+	
+Options:
+	-f      
+		Do not prompt for confirmation before overwriting the destination path.  (The -f option overrides previous -n options.)
+	-n      
+		Do not overwrite an existing file.  (The -n option overrides previous -f options.)
+	-v      
+		Cause mv to be verbose, showing files after they are moved.
+		
+Examples:
+	paddle pfs mv ./text1.txt /pfs/$DATACENTER/home/$USER/text1.txt
+```
+- cp - copy files or directories
+
+```
+Synopsis:
+	cp [-r] [-f | -n] [-v] [--preserve--links] <LocalPath> <PFSPath>
+	cp [-r] [-f | -n] [-v] [--preserve--links] <LocalPath> ... <PFSPath>
+	cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> <LocalPath> 
+	cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> ... <LocalPath>
+	cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> <PFSPath> 
+	cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> ... <PFSPath>
+
+Options:
+	-r
+   		Copy directories recursively
+   	-f      
+		Do not prompt for confirmation before overwriting the destination path.  (The -f option overrides previous -n options.)
+	-n      
+		Do not overwrite an existing file.  (The -n option overrides previous -f options.)
+	-v      
+		Cause cp to be verbose, showing files after they are copied.
+	--preserve--links
+	   Preserve links when copying links
+	   
+Examples:
+	paddle pfs cp ./file /pfs/$DATACENTER/home/$USER/file
+	paddle pfs cp /pfs/$DATACENTER/home/$USER/file ./file
+```
+- ls - list files
+
+```
+Synopsis:
+	ls [-R] <PFSPath> ...
+	
+Options:
+	-R
+   		List directory(ies) recursively
+
+Examples:
+	paddle pfs ls  /pfs/$DATACENTER/home/$USER/file
+	paddle pfs ls  /pfs/$DATACENTER/home/$USER/folder
+```
+
+- mkdir - mkdir directory(ies)
+Create intermediate directory(ies) as required.
+
+```
+Synopsis:
+	mkdir <PFSPath> ...
+
+Examples:
+	paddle pfs mkdir  /pfs/$DATACENTER/home/$USER/folder
+```
diff --git a/doc/design/file_manager/src/filemanager.graffle b/doc/design/file_manager/src/filemanager.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..7861a33072bc1908f69d12b37c20491dd8663103
Binary files /dev/null and b/doc/design/file_manager/src/filemanager.graffle differ
diff --git a/doc/design/file_manager/src/filemanager.png b/doc/design/file_manager/src/filemanager.png
new file mode 100644
index 0000000000000000000000000000000000000000..8139a19f5722f56d3c211f3ab0d3982f751134b9
Binary files /dev/null and b/doc/design/file_manager/src/filemanager.png differ
diff --git a/doc/design/parameters_in_cpp.md b/doc/design/parameters_in_cpp.md
new file mode 100644
index 0000000000000000000000000000000000000000..b6f99bc7d9d6fafacb0a4bcff806b65d9aef98cc
--- /dev/null
+++ b/doc/design/parameters_in_cpp.md
@@ -0,0 +1,41 @@
+# Design Doc: The C++ Class `Parameters`
+
+`Parameters` is a concept we designed in Paddle V2 API. `Parameters` is a container of parameters, and makes it possible for Paddle to share parameters between topologies. We described usages of `Parameter` in [api.md](./api.md).
+
+We used Python to implement Parameters when designing V2 API before. There are several defects for current implementation:
+* We just use `memcpy` to share Parameters between topologies, but this is very inefficient. 
+* We did not implement sharing Parameters while training. We just trigger `memcpy` when training starts.
+
+It is necessary that we implement Parameters on the C++ side. However, this could require a code refactoring of Paddle, because Paddle was designed for training only one topology before, i.e., each GradientMachine contains its Parameter as a data member. In the current Paddle implementation, there are three concepts associated with `Parameters`:
+
+1. `paddle::Parameter`. A `Parameters` is a container for `paddle::Parameter`.
+It is evident that we should use `paddle::Parameter` when developing `Parameters`.
+However, the `Parameter` class contains many functions and does not have a clear interface.
+It contains `create/store Parameter`, `serialize/deserialize`, `optimize(i.e SGD)`, `randomize/zero`.
+When developing `Parameters`, we only use the `create/store Parameter` functionality.
+We should extract functionalities of Parameter into many classes to clean Paddle CPP implementation.
+
+2. `paddle::GradientMachine` and its sub-classes, e.g., `paddle::MultiGradientMachine`, `paddle::NeuralNetwork`.
+We should pass `Parameters` to `paddle::GradientMachine` when `forward/backward` to avoid `memcpy` between topologies.
+Also, we should handle multi-GPU/CPU training, because `forward` and `backward` would perform on multi-GPUs and multi-CPUs.
+`Parameters` should dispatch the parameter value to each device, and gather the parameter gradient from each device.
+
+3. `paddle::ParameterUpdater`. The ParameterUpdater is used to update parameters in Paddle. 
+So `Parameters` should be used by `paddle::ParameterUpdater`, and `paddle::ParameterUpdater` should optimize `Parameters` (by SGD).
+
+
+The step-by-step approach for implementing Parameters in Paddle C++ core is listed below. Each step should be a PR and could be merged into Paddle one by one.
+
+1. Clean `paddle::Parameter` interface. Extract the functionalities of `paddle::Parameter` to prepare for the implementation of Parameters.
+
+2. Implement a `Parameters` class. It just stores the `paddle::Parameter` inside. Make `GradientMachine` use `Parameters` as a class member.
+
+3. Make `Parameters` support Multi-CPU and Multi-GPU training to prepare for sharing `Parameter` between topologies.
+Because we need to share `Parameters` between topologies, it is `Parameters`'s responsibility to exchange Parameters between GPUs.
+`GradientMachine` should not handle how to exchange Parameters because `GradientMachine` is only used to train one topology and we need to support training many topologies in Paddle, i.e., there could be many GradientMachines using one `Parameters`.
+   * We should use a global function to exchange Parameters between GPUs, not a member function in `Parameters`. The `MultiGradientMachine` invokes this function, passing `Parameters` as its input.
+   * The MultiGradientMachine contains many functionalities. Extracting the Parameters exchanging logic could make MultiGradientMachine clearer and simpler.
+
+4. Make `Parameters` as an argument for `forward/backward` function, not a data member for `GradientMachine`. For example, `forward` could be `forward(const Parameters& params, ...)` and `backward` could be `backward(Parameters* params, ...)`. After this step, Paddle could share `Parameters` between topologies.
+
+5. `ParameterUpdater` is invoked by `GradientMachine` and `Trainer`, but it updates `Parameters`. At the end of this code refactoring, we could change `ParameterUpdater` to use `Parameters` directly to make `ParameterUpdater`'s implementation clear.
diff --git a/doc/design/speech/README.MD b/doc/design/speech/README.MD
new file mode 100644
index 0000000000000000000000000000000000000000..7304650e628dba210488cd2dc4836318b5383b2a
--- /dev/null
+++ b/doc/design/speech/README.MD
@@ -0,0 +1,155 @@
+# DeepSpeech2 on PaddlePaddle: Design Doc 
+
+We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine,  on PaddlePaddle. For the first-stage plan, we have the following short-term goals:
+
+- Release a basic distributed implementation of DS2 on PaddlePaddle.
+- Contribute a chapter of Deep Speech to PaddlePaddle Book.
+
+Intensive system optimization and low-latency inference library (details in \[[1](#references)\]) are not yet covered in this first-stage plan.
+
+## Table of Contents
+
+- [Tasks](#tasks)
+- [Task Dependency](#task-dependency)
+- [Design Details](#design-details)
+    - [Overview](#overview)
+    - [Row Convolution](#row-convolution)
+    - [Beam Search With CTC and LM](#beam-search-with-ctc-and-lm)
+- [Future Work](#future-work)
+- [References](#references)
+
+## Tasks
+
+We roughly break down the project into 14 tasks:
+
+1. Develop an **audio data provider**:
+	- Json filelist generator.
+	- Audio file format transformer.
+	- Spectrogram feature extraction, power normalization etc.
+	- Batch data reader with SortaGrad.
+	- Data augmentation (optional).
+	- Prepare (one or more) public English data sets & baseline.
+2. Create a **simplified DS2 model configuration**:
+   - With only fixed-length (by padding) audio sequences (otherwise need *Task 3*).
+	- With only bidirectional-GRU (otherwise need *Task 4*).
+	- With only greedy decoder (otherwise need *Task 5, 6*).
+3. Develop to support **variable-shaped** dense-vector (image) batches of input data.
+   - Update `DenseScanner` in `dataprovider_converter.py`, etc.
+4. Develop a new **lookahead-row-convolution layer** (See \[[1](#references)\] for details):
+   - Lookahead convolution windows.
+   - Within-row convolution, without kernels shared across rows.
+5. Build KenLM **language model** (5-gram) for beam search decoder:
+   - Use KenLM toolkit.
+   - Prepare the corpus & train the model.
+   - Create inference interfaces (for Task 6).
+6. Develop a **beam search decoder** with CTC + LM + WORDCOUNT:
+   - Beam search with CTC.
+   - Beam search with external custom scorer (e.g. LM).
+   - Try to design a more general beam search interface.
+7. Develop a **Word Error Rate evaluator**:
+   - update `ctc_error_evaluator`(CER) to support WER.
+8. Prepare internal dataset for Mandarin (optional):
+    - Dataset, baseline, evaluation details.
+    - Particular data preprocessing for Mandarin.
+    - Might need cooperating with the Speech Department.
+9. Create **standard DS2 model configuration**:
+   - With variable-length audio sequences (need *Task 3*).
+	- With unidirectional-GRU + row-convolution (need *Task 4*).
+	- With CTC-LM beam search decoder (need *Task 5, 6*).
+10. Make it run perfectly on **clusters**.
+11. Experiments and **benchmarking** (for accuracy, not efficiency):
+    - With public English dataset.
+    - With internal (Baidu) Mandarin dataset (optional).
+12. Time **profiling** and optimization.
+13. Prepare **docs**.
+14. Prepare PaddlePaddle **Book** chapter with a simplified version.
+
+## Task Dependency
+
+Tasks parallelizable within phases:
+
+Roadmap     | Description                               | Parallelizable Tasks 
+----------- | :------------------------------------     | :--------------------
+Phase I	    | Simplified model & components             | *Task 1* ~ *Task 8*
+Phase II    | Standard model & benchmarking & profiling | *Task 9* ~ *Task 12*
+Phase III   | Documentation                             | *Task 13* ~ *Task 14*
+
+Issue for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcomed!
+
+## Design Details
+
+### Overview
+
+Traditional **ASR** (Automatic Speech Recognition) pipelines require great human efforts devoted to elaborately tuning multiple hand-engineered components (e.g. audio feature design, acoustic model, pronunciation model and language model etc.). **Deep Speech 2** (**DS2**) \[[1](#references)\], however, trains such ASR models in an end-to-end manner, replacing most intermediate modules with only a single deep network architecture. With scaling up both the data and model sizes, DS2 achieves a very significant performance boost.
+
+Please read Deep Speech 2 \[[1](#references),[2](#references)\] paper for more background knowledge.
+
+The classical DS2 network contains 15 layers (from bottom to top):
+
+- **Two** data layers (audio spectrogram, transcription text)
+- **Three** 2D convolution layers
+- **Seven** uni-directional simple-RNN layers
+- **One** lookahead row convolution layer
+- **One** fully-connected layer
+- **One** CTC-loss layer
+
+<div align="center">
+<img src="image/ds2_network.png" width=350><br/>
+Figure 1. Architecture of Deep Speech 2 Network.
+</div>
+
+We don't have to stick to this 2-3-7-1-1-1 depth \[[2](#references)\]. Similar networks with different depths might also work well. As in \[[1](#references)\], authors use a different depth (e.g. 2-2-3-1-1-1) for final experiments.
+
+Key ingredients about the layers:
+
+- **Data Layers**: 
+   - Frame sequences data of audio **spectrogram** (with FFT).
+   - Token sequences data of **transcription** text (labels). 
+   - These two types of sequences do not have the same lengths, thus a CTC-loss layer is required.
+- **2D Convolution Layers**: 
+   - Not only temporal convolution, but also **frequency convolution**. Like a 2D image convolution, but with a variable dimension (i.e. temporal dimension).
+   - With striding for only the first convolution layer.
+   - No pooling for all convolution layers.
+- **Uni-directional RNNs** 
+	- Uni-directional + row convolution: for low-latency inference.
+	- Bi-directional + without row convolution: if we don't care about the inference latency.
+- **Row convolution**:
+	- For looking only a few steps ahead into the feature, instead of looking into a whole sequence in bi-directional RNNs.
+	- Not necessary with bi-directional RNNs. 
+	- "**Row**" means convolutions are done within each frequency dimension (row), and no convolution kernels shared across.
+- **Batch Normalization Layers**:
+   - Added to all above layers (except for data and loss layer).
+   - Sequence-wise normalization for RNNs: BatchNorm only performed on input-state projection and not state-state projection, for efficiency consideration.
+ 
+
+Required Components                     | PaddlePaddle Support                      | Need to Develop
+:-------------------------------------  | :--------------------------------------   | :-----------------------
+Data Layer I (Spectrogram)	            | Not supported yet.                        |  TBD (Task 3)
+Data Layer II (Transcription)           | `paddle.data_type.integer_value_sequence` | -
+2D Convolution Layer                    | `paddle.layer.image_conv_layer`           | -
+DataType Converter (vec2seq)            | `paddle.layer.block_expand`               | -
+Bi-/Uni-directional RNNs                | `paddle.layer.recurrent_group`            | -
+Row Convolution Layer                   | Not supported yet.                        | TBD (Task 4)
+CTC-loss Layer                          | `paddle.layer.warp_ctc`                   | -
+Batch Normalization Layer               | `paddle.layer.batch_norm`                 | -
+CTC-Beam search                         | Not supported yet.                        | TBD (Task 6)
+
+### Row Convolution
+
+TODO by Assignees
+
+### Beam Search with CTC and LM
+
+TODO by Assignees
+
+## Future Work
+
+- Efficiency Improvement
+- Accuracy Improvement
+- Low-latency Inference Library
+- Large-scale benchmarking
+
+## References
+
+1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
+2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). 	arXiv:1512.02595.
diff --git a/doc/design/speech/image/ds2_network.png b/doc/design/speech/image/ds2_network.png
new file mode 100644
index 0000000000000000000000000000000000000000..1a5b2184d47928cc2849d5a7c8ea2d8cf5337e11
Binary files /dev/null and b/doc/design/speech/image/ds2_network.png differ
diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst
index df5e172252277a881480cd2816eb901b711abe6b..c14160d55ec8fdb9fc552da33f3a3dac13c1a764 100644
--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
@@ -232,7 +232,19 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 
 用户需要指定本机上Python的路径：``<exc_path>``, ``<lib_path>``, ``<inc_path>``
 
-10. A protocol message was rejected because it was too big
+11. CMake源码编译，Paddle版本号为0.0.0
+--------------------------------------
+
+如果运行 :code:`paddle version`, 出现 :code:`PaddlePaddle 0.0.0`；或者运行 :code:`cmake ..`，出现
+
+..  code-block:: bash
+
+    CMake Warning at cmake/version.cmake:20 (message):
+      Cannot add paddle version from git tag
+          
+那么用户需要拉取所有的远程分支到本机，命令为 :code:`git fetch upstream`，然后重新cmake即可。
+
+12. A protocol message was rejected because it was too big
 ----------------------------------------------------------
 
 如果在训练NLP相关模型时，出现以下错误：
@@ -270,7 +282,7 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 
 完整源码可参考 `seqToseq <https://github.com/PaddlePaddle/Paddle/tree/develop/demo/seqToseq>`_ 示例。
 
-11. 如何指定GPU设备
+13. 如何指定GPU设备
 -------------------
 
 例如机器上有4块GPU，编号从0开始，指定使用2、3号GPU：
@@ -288,7 +300,7 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
         paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
 
 
-12. 训练过程中出现 :code:`Floating point exception`, 训练因此退出怎么办?
+14. 训练过程中出现 :code:`Floating point exception`, 训练因此退出怎么办?
 ------------------------------------------------------------------------
 
 Paddle二进制在运行时捕获了浮点数异常，只要出现浮点数异常(即训练过程中出现NaN或者Inf)，立刻退出。浮点异常通常的原因是浮点数溢出、除零等问题。
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index da2d4234658b6ea4730346e721437cc1633c4362..87c286a1af75e08313813f1373ea03b85d4af523 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -12,13 +12,13 @@ PaddlePaddle需要的所有编译工具。把编译出来的PaddlePaddle也打
 像，称为生产镜像，里面涵盖了PaddlePaddle运行所需的所有环境。每次
 PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以及开发镜像。运
 行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。我们会在
-`dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_ 提供最新
+`dockerhub.com <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ 提供最新
 的Docker镜像，可以在"tags"标签下找到最新的Paddle镜像版本。为了方便在国
 内的开发者下载Docker镜像，我们提供了国内的镜像服务器供大家使用。如果您
 在国内，请把文档里命令中的paddlepaddle/paddle替换成
 docker.paddlepaddle.org/paddle。
 
-1. 开发镜像：:code:`paddlepaddle/paddle:<version>-dev`
+1. 开发镜像：:code:`paddlepaddle/paddle:0.10.0-dev`
 
    这个镜像包含了Paddle相关的开发工具以及编译和运行环境。用户可以使用开发镜像代替配置本地环境，完成开发，编译，发布，
    文档编写等工作。由于不同的Paddle的版本可能需要不同的依赖和工具，所以如果需要自行配置开发环境需要考虑版本的因素。
@@ -37,13 +37,13 @@ docker.paddlepaddle.org/paddle。
 
    .. code-block:: bash
 
-      docker run -it --rm paddlepaddle/paddle:<version>-dev /bin/bash
+      docker run -it --rm paddlepaddle/paddle:0.10.0-dev /bin/bash
 
    或者，可以以后台进程方式运行容器：
 
    .. code-block:: bash
 
-      docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:<version>-dev
+      docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0-dev
 
    然后用密码 :code:`root` SSH进入容器：
 
@@ -73,7 +73,7 @@ docker.paddlepaddle.org/paddle。
 
    .. code-block:: bash
 
-      nvidia-docker run -it --rm paddledev/paddle:0.10.0rc1-gpu /bin/bash
+      nvidia-docker run -it --rm paddledev/paddle:0.10.0-gpu /bin/bash
 
    注意: 如果使用nvidia-docker存在问题，你也许可以尝试更老的方法，具体如下，但是我们并不推荐这种方法。：
 
@@ -81,7 +81,7 @@ docker.paddlepaddle.org/paddle。
 
       export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
       export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-      docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:<version>-gpu
+      docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0-gpu
 
 3. 运行以及发布您的AI程序
 
@@ -98,7 +98,7 @@ docker.paddlepaddle.org/paddle。
       nvidia-docker run -it -v $PWD:/work paddle /work/a.py
 
 
-   这里`a.py`包含的所有依赖假设都可以在Paddle的运行容器中。如果需要包含更多的依赖、或者需要发布您的应用的镜像，可以编写`Dockerfile`使用`FROM paddledev/paddle:<version>`
+   这里`a.py`包含的所有依赖假设都可以在Paddle的运行容器中。如果需要包含更多的依赖、或者需要发布您的应用的镜像，可以编写`Dockerfile`使用`FROM paddledev/paddle:0.10.0`
    创建和发布自己的AI程序镜像。
 
 运行PaddlePaddle Book
@@ -177,7 +177,7 @@ Paddle的Docker开发镜像带有一个通过 `woboq code browser
 
 .. code-block:: bash
 
-   docker run -d --name paddle-cpu-doc paddle:<version>-dev
+   docker run -d --name paddle-cpu-doc paddle:0.10.0-dev
    docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
 
 接着我们就能够打开浏览器在 http://localhost:8088/paddle/ 浏览代码。
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index 03df497506099d2fb758bd0ab437d2c082f2b537..b6fd3329b273aabe80edd5f1ff064a311648b3c2 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -23,7 +23,7 @@ Docker is simple as long as we understand a few basic concepts:
 
   .. code-block:: bash
 		  
-     docker pull paddlepaddle/paddle:0.10.0rc2
+     docker pull paddlepaddle/paddle:0.10.0
 
   to download a Docker image, paddlepaddle/paddle in this example,
   from Dockerhub.com.
@@ -35,7 +35,7 @@ Docker is simple as long as we understand a few basic concepts:
 
   .. code-block:: bash
 
-     docker run paddlepaddle/paddle:0.10.0rc2
+     docker run paddlepaddle/paddle:0.10.0
 
   to start a container to run a Docker image, paddlepaddle/paddle in this example.
 
@@ -62,7 +62,7 @@ of PaddlePaddle, we release both of them. Production image includes
 CPU-only version and a CUDA GPU version and their no-AVX versions.
 
 We put the docker images on `dockerhub.com
-<https://hub.docker.com/r/paddledev/paddle/>`_. You can find the
+<https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_. You can find the
 latest versions under "tags" tab at dockerhub.com. If you are in
 China, you can use our Docker image registry mirror to speed up the
 download process. To use it, please replace all paddlepaddle/paddle in
@@ -89,7 +89,7 @@ the commands to docker.paddlepaddle.org/paddle.
 
    .. code-block:: bash
 
-      docker run -it --rm paddlepaddle/paddle:0.10.0rc2 /bin/bash
+      docker run -it --rm paddlepaddle/paddle:0.10.0 /bin/bash
 
    Above method work with the GPU image too -- the recommended way is
    using `nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_.
@@ -101,7 +101,7 @@ the commands to docker.paddlepaddle.org/paddle.
 
    .. code-block:: bash
 
-      nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash
+      nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0-gpu /bin/bash
 
 2. development image :code:`paddlepaddle/paddle:<version>-dev`
 
@@ -149,13 +149,13 @@ Run the program using docker:
 
 .. code-block:: bash
 
-   docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 python /workspace/example.py
+   docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 python /workspace/example.py
 
 Or if you are using GPU for training:
 
 .. code-block:: bash
 
-   nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu python /workspace/example.py
+   nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu python /workspace/example.py
 
 Above commands will start a docker container by running :code:`python
 /workspace/example.py`. It will stop once :code:`python
@@ -166,7 +166,7 @@ run PaddlePaddle program interactively:
 
 .. code-block:: bash
 
-   docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 /bin/bash
+   docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 /bin/bash
    # now we are inside docker container
    cd /workspace
    python example.py
@@ -175,7 +175,7 @@ Running with GPU is identical:
 
 .. code-block:: bash
 
-   nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash
+   nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu /bin/bash
    # now we are inside docker container
    cd /workspace
    python example.py
diff --git a/doc/howto/cross_compiling/cross_compiling_for_android_cn.md b/doc/howto/cross_compiling/cross_compiling_for_android_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..90dc84718c9ce1374cda6022de177afeeb60279d
--- /dev/null
+++ b/doc/howto/cross_compiling/cross_compiling_for_android_cn.md
@@ -0,0 +1,75 @@
+# 构建Android平台上的PaddlePaddle库
+
+用户可通过交叉编译的方式，在用户熟悉的开发平台（Linux，Mac OS X和Windows）上编译Android平台上适用的PaddlePaddle库。
+本文档将以Linux x86-64平台为例，介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。
+
+## 准备交叉编译环境
+
+从源码交叉编译PaddlePaddle，用户需要提前准备好交叉编译环境。Android平台上使用的C/C++交叉编译工具链为[Android NDK](https://developer.android.com/ndk/downloads/index.html?hl=zh-cn)，用户可自行前往下载预编译好的版本，也可通过以下命令获取：
+
+```bash
+wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip
+unzip -q android-ndk-r14b-linux-x86_64.zip
+```
+
+Android NDK中包含了所有Android API级别、所有架构（arm/arm64/x86/mips）需要用到的编译工具和系统库。用户可根据自己的编译目标架构、所需支持的最低Android API级别，构建[独立工具链](https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn)。
+比如：
+
+```bash
+your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
+        --arch=arm --platform=android-21 --install-dir=your/path/to/my_standalone_toolchain
+```
+
+此命令将在your/path/to/my_standalone_toolchain目录生成一套编译工具链，面向架构为32位ARM架构，支持的最小的Android API级别为21，使用的编译器为arm-linux-androideabi-gcc (GCC) 4.9。
+
+注意：**PaddlePaddle要求使用的编译工具链所支持的Android API级别不小于21**。
+
+## 配置交叉编译参数
+
+CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置，PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/android.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake)，以提供一些默认的编译器和编译参数相关配置。注意，从CMake 3.7版本开始，CMake官方对Android平台的交叉编译提供了通用的支持。PaddlePaddle若检测到用户使用的CMake版本不低于3.7时，将会将用户传进来的配置参数传递CMake系统，交由CMake系统本身来处理。有关参数配置的详细说明见[cmake-toolchains](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling)。
+
+交叉编译Android版本的PaddlePaddle库时，有一些必须配置的参数：
+- `CMAKE_SYSTEM_NAME`，CMake编译的目标平台，必须设置为`Android`。在设置`CMAKE_SYSTEM_NAME=Android`后，PaddlePaddle的CMake系统才认为是在交叉编译Android系统的版本，并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及Android所需`arm_soft_fp_abi`分支的目标机版OpenBLAS库。此外，还会强制设置一些PaddlePaddle参数的值（`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`）。
+- `WITH_C_API`，必须设置为`ON`。在Android平台上只支持使用C-API来预测。
+- `WITH_SWIG_PY`，必须设置为`OFF`。在Android平台上不支持通过swig调用来训练或者预测。
+
+Android平台可选配置参数：
+
+- `ANDROID_STANDALONE_TOOLCHAIN`，独立工具链所在的绝对路径，或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动推导和设置需要使用的交叉编译器、sysroot、以及Android API级别；否则，用户需要在cmake时手动设置这些值。无默认值。
+- `ANDROID_ABI`，目标架构ABI。目前只支持`armeabi-v7a`，默认值为`armeabi-v7a`。
+- `ANDROID_NATIVE_API_LEVEL`，工具链的Android API级别。若没有显式设置，PaddlePaddle将根据`ANDROID_STANDALONE_TOOLCHAIN`的值自动推导得到。
+- `ANDROID_ARM_MODE`，是否使用ARM模式。可设置`ON/OFF`，默认值为`ON`。
+- `ANDROID_ARM_NEON`，是否使用NEON指令。目前必须设置成`ON`，默认值为`ON`。
+
+其他配置参数：
+
+- `HOST_C/CXX_COMPILER`，宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值；若环境变量`CC`没有设置，则设置成`cc`编译器。
+
+一种常用的cmake配置如下：
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=Android \
+      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/my_standalone_toolchain \
+      -DANDROID_ABI=armeabi-v7a \
+      -DANDROID_ARM_NEON=ON \
+      -DANDROID_ARM_MODE=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小，可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`；若希望最快的执行速度，则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`来影响PaddlePaddle的编译过程。
+
+## 编译和安装
+
+CMake配置完成后，执行以下命令，PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle预测库。
+
+```bash
+make
+make install
+```
+
+注意：如果你曾经在源码目录下编译过其他平台的PaddlePaddle库，请先使用`rm -rf`命令删除`third_party`目录和`build`目录，以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。
+
+执行完安装命令后，`your/path/to/install`目录中会包含`include`和`lib`目录，其中`include`中包含C-API的头文件，`lib`中包含一个Android版本的库。至此，PaddlePaddle库已经安装完成，用户可将`your/path/to/install`目录下的生成文件用于深度学习相关Android App中，调用方法见C-API文档。
diff --git a/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md b/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..085b5dda1615a9af918b59870db460fcc5acdcca
--- /dev/null
+++ b/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md
@@ -0,0 +1,65 @@
+# 构建Raspberry Pi平台上的PaddlePaddle库
+
+对于Raspberry Pi系统，用户可通过ssh等方式登录到Raspberry Pi系统上，按照[源码编译PaddlePaddle](http://www.paddlepaddle.org/doc_cn/getstarted/build_and_install/cmake/build_from_source_cn.html)相关文档所述，直接编译Raspberry Pi平台上适用的PaddlePaddle库。
+
+用户也可以在自己熟悉的开发平台上，通过交叉编译的方式来编译。这篇文档将以Linux x86-64平台为例，介绍交叉编译Raspberry Pi平台上适用的PaddlePaddle的方法和步骤。
+
+## 准备交叉编译环境
+
+从源码交叉编译PaddlePaddle，用户需要提前准备好交叉编译环境。用户可自行前往[github](https://github.com/raspberrypi/tools)下载Raspberry Pi平台使用的C/C++交叉编译工具链，也可通过以下命令获取：
+
+```bash
+git clone https://github.com/raspberrypi/tools.git
+```
+
+该github仓库中包含若干个预编译好的、针对不同平台的编译工具。宿主机是Linux x86-64环境，则需选用`arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64`下的作为编译工具，所使用的编译器为arm-linux-gnueabihf-gcc 4.8.3。
+
+注意，该编译工具链需要系统glibc支持2.14以上。
+
+## 配置交叉编译参数
+
+CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置，PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/raspberry_pi.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake)，以提供一些默认的编译器和编译参数相关配置。
+
+交叉编译Raspberry Pi版本PaddlePaddle库时，有一些必须配置的参数：
+
+- `CMAKE_SYSTEM_NAME`，CMake编译的目标平台，必须配置为`RPi`。在设置`CMAKE_SYSTEM_NAME=RPi`后，PaddlePaddle的CMake系统才认为是在交叉编译Raspberry Pi系统的版本，并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及目标机版OpenBLAS库。
+
+Raspberry Pi平台可选配置参数：
+
+- `RPI_TOOLCHAIN`，编译工具链所在的绝对路径，或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动设置需要使用的交叉编译器；否则，用户需要在cmake时手动设置这些值。无默认值。
+- `RPI_ARM_NEON`，是否使用NEON指令。目前必须设置成`ON`，默认值为`ON`。
+
+其他配置参数：
+
+- `HOST_C/CXX_COMPILER`，宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值；若环境变量`CC`没有设置，则设置成`cc`编译器。
+
+cmake参数如下：
+
+```
+cmake -DCMAKE_SYSTEM_NAME=RPi \
+      -DRPI_TOOLCHAIN=your/path/to/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64 \
+      -DRPI_ARM_NEON=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_GPU=OFF \
+      -DWITH_C_API=ON \
+      -DWITH_PYTHON=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小，可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`；若希望最快的执行速度，则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`来影响PaddlePaddle的编译过程。
+
+## 编译和安装
+
+CMake配置完成后，执行以下命令，PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle。
+
+```bash
+make
+make install
+```
+
+注意：如果你曾经在源码目录下编译过其他平台的PaddlePaddle库，请先使用`rm -rf`命令删除`third_party`目录和`build`目录，以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。
+
+执行完安装命令后，由于上一步cmake配置中`WITH_C_API`设置为`ON`，`your/path/to/install`目录中会包含`include`和`lib`目录，其中`include`中包含C-API的头文件，`lib`中包含一个Raspberry Pi版本的库。
+
+更多的编译配置见[源码编译PaddlePaddle](http://www.paddlepaddle.org/doc_cn/getstarted/build_and_install/cmake/build_from_source_cn.html)相关文档。
diff --git a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
index 79048e92482851af6c2dd7d055868ebcaa7a298b..e05173c2006ff47ecb6ca5a4fe1502de750acc59 100644
--- a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
+++ b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
@@ -28,17 +28,17 @@ pooling 的使用示例如下，详细见 :ref:`api_v2.layer_pooling` 配置API
 
         seq_pool = pooling(input=layer,
                            pooling_type=pooling.Max(),
-                           agg_level=AggregateLevel.EACH_SEQUENCE)
+                           agg_level=AggregateLevel.TO_SEQUENCE)
         
 - `pooling_type` 目前支持两种，分别是：pooling.Max()和pooling.Avg()。
 
-- `agg_level=AggregateLevel.EACH_TIMESTEP` 时（默认值）：
+- `agg_level=AggregateLevel.TO_NO_SEQUENCE` 时（默认值）：
 
   - 作用：双层序列经过运算变成一个0层序列，或单层序列经过运算变成一个0层序列
   - 输入：一个双层序列，或一个单层序列
   - 输出：一个0层序列，即整个输入序列（单层或双层）的平均值（或最大值）
 
-- `agg_level=AggregateLevel.EACH_SEQUENCE` 时：
+- `agg_level=AggregateLevel.TO_SEQUENCE` 时：
 
   - 作用：一个双层序列经过运算变成一个单层序列
   - 输入：必须是一个双层序列
@@ -52,15 +52,15 @@ last_seq 的使用示例如下（ :ref:`api_v2.layer_first_seq` 类似），详
 ..	code-block:: bash
 
         last = last_seq(input=layer,
-                        agg_level=AggregateLevel.EACH_SEQUENCE)
+                        agg_level=AggregateLevel.TO_SEQUENCE)
         
-- `agg_level=AggregateLevel.EACH_TIMESTEP` 时（默认值）：
+- `agg_level=AggregateLevel.TO_NO_SEQUENCE` 时（默认值）：
 
   - 作用：一个双层序列经过运算变成一个0层序列，或一个单层序列经过运算变成一个0层序列
   - 输入：一个双层序列或一个单层序列
   - 输出：一个0层序列，即整个输入序列（双层或者单层）最后一个，或第一个元素。
 
-- `agg_level=AggregateLevel.EACH_SEQUENCE` 时：
+- `agg_level=AggregateLevel.TO_SEQUENCE` 时：
   - 作用：一个双层序列经过运算变成一个单层序列
   - 输入：必须是一个双层序列
   - 输出：一个单层序列，其中每个元素是双层序列中每个subseq最后一个（或第一个）元素。
@@ -74,9 +74,9 @@ expand 的使用示例如下，详细见 :ref:`api_v2.layer_expand` 配置API。
 
         ex = expand(input=layer1,
                     expand_as=layer2,
-                    expand_level=ExpandLevel.FROM_TIMESTEP)
+                    expand_level=ExpandLevel.FROM_NO_SEQUENCE)
         
-- `expand_level=ExpandLevel.FROM_TIMESTEP` 时（默认值）：
+- `expand_level=ExpandLevel.FROM_NO_SEQUENCE` 时（默认值）：
 
   - 作用：一个0层序列经过运算扩展成一个单层序列，或者一个双层序列
   - 输入：layer1必须是一个0层序列，是待扩展的数据；layer2 可以是一个单层序列，或者是一个双层序列，提供扩展的长度信息
diff --git a/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst b/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst
index 96e52b910a22576fd75c9d4e1bef6e2cf74bc84f..efdc44455ea4dc81a87b4d4fc8a81e78b15cb06a 100644
--- a/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst
+++ b/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst
@@ -81,7 +81,7 @@
   
   * 在本例中，我们将原始数据的每一组，通过\ :code:`recurrent_group`\ 进行拆解，拆解成的每一句话再通过一个LSTM网络。这和单层RNN的配置是等价的。
 
-* 与单层RNN的配置类似，我们只需要使用LSTM encode成的最后一个向量。所以对\ :code:`recurrent_group`\ 进行了\ :code:`last_seq`\ 操作。但和单层RNN不同，我们是对每一个子序列取最后一个元素，因此\ :code:`agg_level=AggregateLevel.EACH_SEQUENCE`\ 。
+* 与单层RNN的配置类似，我们只需要使用LSTM encode成的最后一个向量。所以对\ :code:`recurrent_group`\ 进行了\ :code:`last_seq`\ 操作。但和单层RNN不同，我们是对每一个子序列取最后一个元素，因此\ :code:`agg_level=AggregateLevel.TO_SEQUENCE`\ 。
 
 * 至此，\ :code:`lstm_last`\ 便和单层RNN配置中的\ :code:`lstm_last`\ 具有相同的结果了。
 
diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md
index 775938612e8d213b92e2eb69dae805838dc5ae96..699390145226ec2b65fdf5122db187e1d30d669e 100644
--- a/doc/howto/dev/contribute_to_paddle_cn.md
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
@@ -7,6 +7,7 @@
 - 确保编译器选项 `WITH_STYLE_CHECK` 已打开，并且编译能通过代码样式检查。
 - 所有代码必须具有单元测试。
 - 通过所有单元测试。
+- 请遵守[提交代码的一些约定](#提交代码的一些约定)。
 
 以下教程将指导您提交代码。
 ## [Fork](https://help.github.com/articles/fork-a-repo/)
@@ -83,7 +84,7 @@ no changes added to commit (use "git add" and/or "git commit -a")
 ➜  docker build -t paddle:dev .
 ```
 
-随后可以用这个开发镜像开build PaddlePaddle的源码。比如如果要build一个不依赖GPU，但是支持AVX指令集，并且包括unit tests的PaddlePaddle，可以：
+随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU，但是支持AVX指令集，并且包括unit tests的PaddlePaddle，可以：
 
 ```bash
 ➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev
@@ -217,3 +218,22 @@ upstream
 ```
 
 至此，我们就完成了一次代码贡献的过程。
+
+## 提交代码的一些约定
+
+为了使评审人在评审代码时更好地专注于代码本身，请您每次提交代码时，遵守以下约定：
+1. 请保证Travis-CI 中单元测试能顺利通过。如果没过，说明提交的代码存在问题，评审人一般不做评审。
+2. 提交Pull Request前：
+   - 请注意commit的数量：
+     - 原因：如果仅仅修改一个文件但提交了十几个commit，每个commit只做了少量的修改，这会给评审人带来很大困扰。评审人需要逐一查看每个commit才能知道做了哪些修改，且不排除commit之间的修改存在相互覆盖的情况。
+     - 建议：每次提交时，保持尽量少的commit，可以通过`git commit --amend`补充上次的commit。对已经Push到远程仓库的多个commit，可以参考[squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed)。
+   - 请注意每个commit的名称：应能反映当前commit的内容，不能太随意。
+3. 如果解决了某个Issue的问题，请在该Pull Request的**第一个**评论框中加上：`fix #issue_number`，这样当该Pull Request被合并后，会自动关闭对应的Issue。关键词包括：close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved，请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。
+
+此外，在回复评审人意见时，请您遵守以下约定：
+1. 评审人的每个意见都必须回复（这是开源社区的基本礼貌，别人帮了忙，应该说谢谢）：
+   - 对评审意见同意且按其修改完的，给个简单的`Done`即可；
+   - 对评审意见不同意的，请给出您自己的反驳理由。
+2. 如果评审意见比较多：
+   - 请给出总体的修改情况。
+   - 请采用[start a review](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/)进行回复，而非直接回复的方式。原因是每个回复都会发送一封邮件，会造成邮件灾难。
diff --git a/doc/howto/dev/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md
index 9b0d3e83c0dc264650eda73e6801c60a75439b4a..40d1eb62d722244139cc84eb170c190d988f5626 100644
--- a/doc/howto/dev/contribute_to_paddle_en.md
+++ b/doc/howto/dev/contribute_to_paddle_en.md
@@ -4,9 +4,9 @@ We sincerely appreciate your contributions. You can use fork and pull request
 workflow to merge your code.
 
 ## Code Requirements
-- Your code must be fully documented by
-  [doxygen](http://www.stack.nl/~dimitri/doxygen/) style.
-- Make sure the compiler option WITH\_STYLE\_CHECK is on and the compiler
+- Your code comments must follow the
+  [Doxygen](http://www.stack.nl/~dimitri/doxygen/) style.
+- Make sure the compiler option `WITH_STYLE_CHECK` is on and the compiler
   passes the code style check.
 - All code must have unit test.
 - Pass all unit tests.
@@ -20,32 +20,25 @@ It's just that simple.
 
 ## Clone
 
-Paddle is currently using [git-flow branching model](http://nvie.com/posts/a-successful-git-branching-model/).
-The **develop** is the main branch, and other user's branches are feature branches.
+Clone remote repository.
 
-Once you've created a fork, you can use your favorite git client to clone your
-repo or just head straight to the command line:
-
-```shell
-# Clone your fork to your local machine
-git clone --branch develop https://github.com/USERNAME/Paddle.git
-```
-If your repository doesn't contain **develop** branch, just create it by your own.
-
-```shell
-git clone https://github.com/USERNAME/Paddle.git Paddle
-cd Paddle
-git checkout -b develop  # create develop branch.
-git remote add upstream https://github.com/PaddlePaddle/Paddle.git  # add upstream to baidu/Paddle
-git pull upstream develop  # update to upstream
+```bash
+➜  git clone https://github.com/USERNAME/Paddle
+➜  cd Paddle
 ```
 
-Then you can start to develop by making a local developement branch
+## Create a local branch
+
+Paddle is currently using [Git-flow branching model](http://nvie.com/posts/a-successful-git-branching-model/).
 
-```shell
-git checkout -b MY_COOL_STUFF_BRANCH
+All feature and bug-fix development should be done on a new branch, which is generally created from the `develop` branch.
+
+```bash
+➜  git checkout -b my-cool-stuff
 ```
 
+Before the checkout, you need to keep the current branch directory clean, otherwise the untracked file will be brought to the new branch, which can be inspected by `git status`.
+
 ## Using `pre-commit` hook
 
 Paddle developers use [pre-commit](http://pre-commit.com/) tool to manage git
@@ -58,89 +51,169 @@ To use [pre-commit](http://pre-commit.com/), you should install it by
 `pip install pre-commit`, and currently, Paddle uses `clang-format` to format
 c/cpp sources. Please make sure clang-format 3.8+ installed.
 
-Then just run `pre-commit install` in your Paddle clone directory. When you
-commit your code, the pre-commit hook will check the local code if there is
+Install and run it as follows:
+
+```bash
+➜  pip install pre-commit
+➜  pre-commit install
+```
+
+When you commit your code, the pre-commit hook will check the local code if there is
 anything not suitable to commit, and so on.
 
+## Start to develop
+
+In this tutorial, we delete a line in README.md and create a new file.
+
+We can use `git status` to inspect the changes of current directory, `git diff` to see difference.
+
+```bash
+➜  git status
+On branch test
+Changes not staged for commit:
+  (use "git add <file>..." to update what will be committed)
+  (use "git checkout -- <file>..." to discard changes in working directory)
+
+	modified:   README.md
+
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+
+	test
+
+no changes added to commit (use "git add" and/or "git commit -a")
+```
+## Build and Test
+
+We package PaddlePaddle's compile environment into a Docker image, called the develop image and named `paddle:dev`. It contains all the compiling tools that PaddlePaddle needs.
+
+If you want to build the develop image, just run:
+
+```bash
+➜  docker build -t paddle:dev .
+```
+
+Then we can use the develop image to build PaddlePaddle source. For example:
+
+```bash
+➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev
+```
+
+The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated.
+
+Then we can generate the production image by copying the compiled PaddlePaddle program into the image by
+
+```bash
+➜  docker build -t paddle:prod -f build/Dockerfile .
+```
+
+Finally, run the unit tests:
+
+```bash
+➜  docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
+```
+
+For more details, you can read [this doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst).
+
 ## Commit
 
-Commit your changes by following command lines:
+Next we discard the changes to the README.md file and then commit our changes with the following commands:
+
+```bash
+➜  git checkout -- README.md
+➜  git status
+On branch test
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+
+	test
+
+nothing added to commit but untracked files present (use "git add" to track)
+➜  git add test
+```
 
-```shell
-# show the working tree status
-git status
-# add modified files
-git add xx
-env EDITOR=vim git commit  # You can write your comments by vim/nano/emacs.
+We should write a description of each commit by `git commit` to allow others to know
+the changes in these files.
+
+```bash
+➜  git commit
+CRLF end-lines remover...............................(no files to check)Skipped
+yapf.................................................(no files to check)Skipped
+Check for added large files..............................................Passed
+Check for merge conflicts................................................Passed
+Check for broken symlinks................................................Passed
+Detect Private Key...................................(no files to check)Skipped
+Fix End of Files.....................................(no files to check)Skipped
+clang-formater.......................................(no files to check)Skipped
+[my-cool-stuff c703c041] add test file
+ 1 file changed, 0 insertions(+), 0 deletions(-)
+ create mode 100644 test
 ```
-The first line of commit infomation is the title. The second and later lines
-are the details if any.
 
 ## Keeping Fork Up to Date
 
 Before pull your request, you should sync your code from the latest PaddlePaddle.
 To do this, you'll need to add a remote at first:
 
-```shell
-# see the current configured remote repository
-git remote -v
-# add upstream repository
-git remote add upstream https://github.com/PaddlePaddle/Paddle.git
-# verify the new upstream
-git remote -v
+```bash
+➜  git remote add upstream https://github.com/PaddlePaddle/Paddle
+➜  git remote
+origin
+upstream
 ```
 
 Update your fork with the latest upstream changes:
 
-```shell
-git pull --rebase upstream develop
+```bash
+➜  git fetch upstream
+➜  git pull upstream develop
 ```
 
-If there are no unique commits locally, git will simply perform a fast-forward.
-However, if you have been making changes (in the vast majority of cases you
-probably shouldn't be), you may have to deal with conflicts.
-
 Now, your local master branch is up-to-date with everything modified upstream.
 
 ## Push to GitHub
 
-```shell
+```bash
 # push to your repository in Github
-git push -u origin MY_COOL_STUFF_BRANCH  # create remote branch MY_COOL_STUFF_BRANCH to origin.
+➜  git push origin my-cool-stuff
 ```
 
-## Pull Request
+## Create an issue and a Pull Request
+
+Create an Issue to describe the problem and record its number.
 
 Go to the page for your fork on GitHub, select your development branch,
-and click the **pull request button**.
-
-## Update your pull request with the lastest version
-
-During the code review, your pull request may become stale because new commits in
-baidu/Paddle. GitHub allows autmotic update if there is no conflict. You can do this
-by clicking the "Update Branch" button in your pull request page. However, in the case
-of conflict, you need to do the update manually. You need to do the following on
-your local repository:
-```shell
-git checkout MY_COOL_STUFF_BRANCH
-git pull upstream develop
-# You may need to resolve the conflict according to the git prompt.
-# Make and test your code.
-git push origin MY_COOL_STUFF_BRANCH
+and click the `New pull request`.
+
+<img width="295" alt="screen shot 2017-04-26 at 9 09 28 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436054/a6d98c66-2ac4-11e7-9cb1-18dd13150230.png">
+
+Then select the target branch:
+
+<img width="750" alt="screen shot 2017-04-26 at 9 11 52 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436139/f83b1e6c-2ac4-11e7-8c0e-add499023c46.png">
+
+We can add `resolve #Issue number` in the PR description to close the issue automatically after the PR is merged. More details in <https://help.github.com/articles/closing-issues-via-commit-messages/>.
+
+Then wait for the review. If changes are requested, repeat the steps above to update the corresponding branch on origin.
+
+## Delete origin branch
+
+After the PR is merged into the main repository, we can delete the remote branch on the PR page.
+
+<img width="775" alt="screen shot 2017-04-26 at 9 18 24 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436457/e4cdd472-2ac5-11e7-9272-badc76c4a23e.png">
+
+Or just run:
+
+```bash
+➜  git push origin :my-cool-stuff
 ```
-Now your Pull Request is updated with the latest version.
 
-## Revise your pull request
+## Delete local branch
 
-When you revise your pull request according to reviewer's comments, please use 'git commit' instead of 'git commit --amend' to commit your changes so that the reviewers can see the difference between the new pull requrest and the old pull request.
+Finally, we delete local branch:
 
-The possible commands are
+```bash
+➜  git checkout develop 
 
-```shell
-git checkout MY_COOL_STUFF_BRANCH
-git pull upstream develop   # update local to newest code base.
-# May be some conflicts will occured.
-# And develop your cool stuff
-env EDITOR=vim git commit  # add your revise log
-git push origin MY_COOL_STUFF_BRANCH
+# delete my-cool-stuff branch
+➜  git branch -D my-cool-stuff
 ```
diff --git a/go/cmake/CMakeDetermineGoCompiler.cmake b/go/cmake/CMakeDetermineGoCompiler.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..a9bb6906c7440782bd648bb7505a548248a11bb0
--- /dev/null
+++ b/go/cmake/CMakeDetermineGoCompiler.cmake
@@ -0,0 +1,63 @@
+# Locate the Go toolchain for CMake's Go language support.
+#
+# Resolution order:
+#   1. A pre-set CMAKE_Go_COMPILER is kept untouched.
+#   2. The GO_COMPILER environment variable (may carry extra arguments).
+#   3. A search for `go` in Go_BIN_PATH and the default program paths.
+# The result is written into CMakeGoCompiler.cmake in CMAKE_PLATFORM_INFO_DIR.
+if(NOT CMAKE_Go_COMPILER)
+  # Quoted so an unset/empty GO_COMPILER still yields a well-formed condition.
+  if(NOT "$ENV{GO_COMPILER}" STREQUAL "")
+    get_filename_component(CMAKE_Go_COMPILER_INIT $ENV{GO_COMPILER} PROGRAM PROGRAM_ARGS CMAKE_Go_FLAGS_ENV_INIT)
+
+    if(CMAKE_Go_FLAGS_ENV_INIT)
+      set(CMAKE_Go_COMPILER_ARG1 "${CMAKE_Go_FLAGS_ENV_INIT}" CACHE STRING "First argument to Go compiler")
+    endif()
+
+    # Quoted so the check stays well-formed even if the lookup above is empty.
+    if(NOT EXISTS "${CMAKE_Go_COMPILER_INIT}")
+      message(SEND_ERROR "Could not find compiler set in environment variable GO_COMPILER:\n$ENV{GO_COMPILER}.")
+    endif()
+
+  endif()
+
+  set(Go_BIN_PATH
+    $ENV{GOPATH}
+    $ENV{GOROOT}
+    $ENV{GOROOT}/../bin
+    $ENV{GO_COMPILER}
+    /usr/bin
+    /usr/local/bin
+    )
+
+  if(CMAKE_Go_COMPILER_INIT)
+    set(CMAKE_Go_COMPILER ${CMAKE_Go_COMPILER_INIT} CACHE PATH "Go Compiler")
+  else()
+    find_program(CMAKE_Go_COMPILER
+      NAMES go
+      PATHS ${Go_BIN_PATH}
+    )
+    if(CMAKE_Go_COMPILER)
+      execute_process(COMMAND "${CMAKE_Go_COMPILER}" version
+        OUTPUT_VARIABLE GOLANG_VERSION
+        OUTPUT_STRIP_TRAILING_WHITESPACE)
+      # Dots are escaped and the patch component is optional, so both
+      # "go1.8" and "go1.8.3" are recognised.
+      string(REGEX MATCH "go[0-9]+\\.[0-9]+(\\.[0-9]+)?[ /A-Za-z0-9]*" VERSION "${GOLANG_VERSION}")
+      message("-- The Golang compiler identification is ${VERSION}")
+      message("-- Check for working Golang compiler: ${CMAKE_Go_COMPILER}")
+    else()
+      message(WARNING "Could not find a Go compiler; set GO_COMPILER or CMAKE_Go_COMPILER.")
+    endif()
+  endif()
+
+endif()
+
+mark_as_advanced(CMAKE_Go_COMPILER)
+
+# CMAKE_MODULE_PATH is a list and may hold several directories; the template
+# lives next to this module, so resolve it relative to this file instead.
+configure_file(${CMAKE_CURRENT_LIST_DIR}/CMakeGoCompiler.cmake.in
+  ${CMAKE_PLATFORM_INFO_DIR}/CMakeGoCompiler.cmake @ONLY)
+
+set(CMAKE_Go_COMPILER_ENV_VAR "GO_COMPILER")
diff --git a/go/cmake/CMakeGoCompiler.cmake.in b/go/cmake/CMakeGoCompiler.cmake.in
new file mode 100644
index 0000000000000000000000000000000000000000..a71f08e064656fbaad8cfa77aea6f216515712ef
--- /dev/null
+++ b/go/cmake/CMakeGoCompiler.cmake.in
@@ -0,0 +1,8 @@
+set(CMAKE_Go_COMPILER "@CMAKE_Go_COMPILER@")  # @...@ is substituted by configure_file(... @ONLY)
+set(CMAKE_Go_COMPILER_LOADED 1)
+
+set(CMAKE_Go_SOURCE_FILE_EXTENSIONS go)  # *.go files are treated as Go sources
+set(CMAKE_Go_LINKER_PREFERENCE 40)
+set(CMAKE_Go_OUTPUT_EXTENSION .o)
+set(CMAKE_Go_OUTPUT_EXTENSION_REPLACE 1)
+set(CMAKE_Go_COMPILER_ENV_VAR "GO_COMPILER")  # environment variable that may name the compiler
diff --git a/go/cmake/CMakeGoInformation.cmake b/go/cmake/CMakeGoInformation.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..ba51ac93fcd429478f324b66bd5129d94ea2a8f4
--- /dev/null
+++ b/go/cmake/CMakeGoInformation.cmake
@@ -0,0 +1,13 @@
+# Build rules for the custom "Go" language. Either rule can be overridden by
+# defining the variable before this module is loaded.
+#
+# The <CMAKE_Go_COMPILER> placeholder is used instead of a bare `go` so the
+# rules run the toolchain stored in CMAKE_Go_COMPILER rather than whichever
+# `go` happens to be first on PATH at build time.
+if(NOT CMAKE_Go_COMPILE_OBJECT)
+  set(CMAKE_Go_COMPILE_OBJECT "<CMAKE_Go_COMPILER> tool compile -l -N -o <OBJECT> <SOURCE> ")
+endif()
+
+if(NOT CMAKE_Go_LINK_EXECUTABLE)
+  set(CMAKE_Go_LINK_EXECUTABLE "<CMAKE_Go_COMPILER> tool link -o <TARGET> <OBJECTS>  ")
+endif()
diff --git a/go/cmake/CMakeTestGoCompiler.cmake b/go/cmake/CMakeTestGoCompiler.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..b9891b015baced05b51e34dba562fd98a84fe14c
--- /dev/null
+++ b/go/cmake/CMakeTestGoCompiler.cmake
@@ -0,0 +1 @@
+set(CMAKE_Go_COMPILER_WORKS 1 CACHE INTERNAL "")  # no test build is attempted; the flag is set unconditionally
diff --git a/go/cmake/flags.cmake b/go/cmake/flags.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..a167c432a920e9ee93878603f3b946e8593412f6
--- /dev/null
+++ b/go/cmake/flags.cmake
@@ -0,0 +1,45 @@
+# Setting Paddle Compile Flags
+include(CheckCXXCompilerFlag)
+include(CheckCCompilerFlag)
+include(CheckCXXSymbolExists)
+include(CheckTypeSize)
+
+function(CheckCompilerCXX11Flag)  # aborts configuration when the C++ compiler is older than the versions listed below
+    if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+        if("${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 4.8)  # quoted: an empty version must not break if()
+            message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
+        endif()
+    elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
+        # Apple Clang is a different compiler than upstream Clang and has different version numbers.
+        # https://gist.github.com/yamaya/2924292
+        if(APPLE)  # cmake < 3.0 compiler id "Clang" on Mac OS X
+            if("${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 5.1)
+                message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.")
+            endif()
+        else()
+            if ("${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 3.3)
+                message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
+            endif()
+        endif()
+    endif()
+endfunction()
+
+CheckCompilerCXX11Flag()
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")  # appended unconditionally once the version check has passed
+
+# Common gpu architectures: Kepler, Maxwell
+foreach(capability 30 35 50)
+      list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
+endforeach()
+
+if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0")  # i.e. CUDA_VERSION >= 7.0
+      list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52")
+endif()
+
+# Modern gpu architectures: Pascal
+if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")  # i.e. CUDA_VERSION >= 8.0
+      list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
+endif()
+
+set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})  # the generated -gencode flags go in front of existing ones
diff --git a/go/cmake/golang.cmake b/go/cmake/golang.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..d38d06de2348821b21109f7dc708314da81111c5
--- /dev/null
+++ b/go/cmake/golang.cmake
@@ -0,0 +1,73 @@
+# Helpers for building Go code with CMake.
+#
+# All go commands issued below run with GOPATH pointing at a directory inside
+# the build tree.
+set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
+file(MAKE_DIRECTORY ${GOPATH})
+set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle")
+file(MAKE_DIRECTORY ${PADDLE_IN_GOPATH})
+
+# GO_LIBRARY(<name> <build_type> [<dependency>...])
+#
+# Builds the *.go files of the current source directory into
+# ${CMAKE_CURRENT_BINARY_DIR}/lib<name>.a when <build_type> is STATIC
+# (-buildmode=c-archive), otherwise into lib<name>.so / lib<name>.dylib
+# (-buildmode=c-shared). Extra arguments become dependencies of the <name>
+# target. Shared libraries also get an install rule.
+#
+# NOTE(review): the goGet and copyPaddle targets have fixed names, so a second
+# call in the same build tree would try to create them again -- confirm before
+# using this for more than one library.
+function(GO_LIBRARY NAME BUILD_TYPE)
+  if(BUILD_TYPE STREQUAL "STATIC")
+    set(BUILD_MODE -buildmode=c-archive)
+    set(LIB_NAME "lib${NAME}.a")
+  else()
+    set(BUILD_MODE -buildmode=c-shared)
+    if(APPLE)
+      set(LIB_NAME "lib${NAME}.dylib")
+    else()
+      set(LIB_NAME "lib${NAME}.so")
+    endif()
+  endif()
+
+  file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
+  file(RELATIVE_PATH rel ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
+
+  # find Paddle directory.
+  get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
+  get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)
+  get_filename_component(PADDLE_DIR ${PARENT_DIR} DIRECTORY)
+
+  # automatically get all dependencies specified in the source code
+  # for given target.
+  add_custom_target(goGet env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get -d ${rel}/...)
+
+  # make a symlink that references Paddle inside $GOPATH, so go get
+  # will use the local changes in Paddle rather than checkout Paddle
+  # in github.
+  add_custom_target(copyPaddle
+    COMMAND ln -sf ${PADDLE_DIR} ${PADDLE_IN_GOPATH})
+  add_dependencies(goGet copyPaddle)
+
+  # OUTPUT_DIR is not set anywhere in this module; default it so the stamp
+  # path below does not collapse to "/.timestamp" in the filesystem root.
+  if(NOT OUTPUT_DIR)
+    set(OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+  endif()
+
+  # Nothing creates the stamp file, so this rule runs on every build.
+  add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
+    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
+    -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
+    ${CMAKE_GO_FLAGS} ${GO_SOURCE}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    VERBATIM)
+
+  add_custom_target(${NAME} ALL DEPENDS ${OUTPUT_DIR}/.timestamp ${ARGN})
+  add_dependencies(${NAME} goGet)
+
+  if(NOT BUILD_TYPE STREQUAL "STATIC")
+    install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME} DESTINATION bin)
+  endif()
+endfunction()
diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go
new file mode 100644
index 0000000000000000000000000000000000000000..d1f3d7d76c438670faf6677b01e790c5ebe1f2cb
--- /dev/null
+++ b/go/cmd/master/master.go
@@ -0,0 +1,93 @@
+package main
+
+import (
+	"fmt"
+	"net"
+	"net/http"
+	"net/rpc"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/namsral/flag"
+
+	"github.com/PaddlePaddle/Paddle/go/master"
+	"github.com/PaddlePaddle/recordio"
+)
+
+func main() {
+	port := flag.Int("port", 8080, "port of the master server.")
+	dataset := flag.String("training_dataset", "", "dataset: comma separated path to RecordIO paths, supports glob patterns.")
+	faultTolerance := flag.Bool("fault_tolerance", false, "enable fault tolerance (requires etcd).")
+	taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timeout duration.") // misspelled flag name kept so existing command lines keep working
+	taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timeout count for each task before it being declared failed task.")
+	chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.")
+	flag.Parse()
+
+	if *dataset == "" {
+		panic("no dataset specified.")
+	}
+
+	if *faultTolerance {
+		panic("fault tolerance not implemented.")
+	}
+
+	var chunks []master.Chunk
+	var paths []string
+	ss := strings.Split(*dataset, ",")
+	fmt.Println(ss)
+	for _, s := range ss { // every comma separated entry may itself be a glob pattern
+		match, err := filepath.Glob(s)
+		if err != nil {
+			panic(err)
+		}
+		paths = append(paths, match...)
+	}
+
+	if len(paths) == 0 {
+		panic("no valid dataset specified.")
+	}
+
+	// Chunk.Idx is the position of the chunk inside its own file, so it restarts at 0 for every path.
+	for _, path := range paths {
+		f, err := os.Open(path)
+		if err != nil {
+			panic(err)
+		}
+
+		index, err := recordio.LoadIndex(f)
+		if err != nil {
+			panic(err)
+		}
+		f.Close()
+
+		count := index.NumChunks()
+		for i := 0; i < count; i++ {
+			chunk := master.Chunk{
+				Idx:   i,
+				Path:  path,
+				Index: *index.ChunkIndex(i),
+			}
+			chunks = append(chunks, chunk)
+		}
+	}
+
+	s := master.NewService(chunks, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
+	err := rpc.Register(s)
+	if err != nil {
+		panic(err)
+	}
+
+	rpc.HandleHTTP()
+	l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
+	if err != nil {
+		panic(err)
+	}
+
+	err = http.Serve(l, nil)
+	if err != nil {
+		panic(err)
+	}
+}
diff --git a/go/cmd/pserver/.gitignore b/go/cmd/pserver/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..fffd9adc4fde9681ad2a58fcf594d20bdd86ab45
--- /dev/null
+++ b/go/cmd/pserver/.gitignore
@@ -0,0 +1 @@
+pserver
diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go
new file mode 100644
index 0000000000000000000000000000000000000000..f0be251c2471cc9ddc069f040417b5181a78c058
--- /dev/null
+++ b/go/cmd/pserver/pserver.go
@@ -0,0 +1,34 @@
+package main
+
+import (
+	"net"
+	"net/http"
+	"net/rpc"
+	"strconv"
+
+	"github.com/namsral/flag"
+
+	"github.com/PaddlePaddle/Paddle/go/pserver"
+)
+
+func main() {
+	port := flag.Int("port", 0, "port of the pserver")
+	flag.Parse()
+
+	// expose the parameter server service through net/rpc's default server.
+	if err := rpc.Register(pserver.NewService()); err != nil {
+		panic(err)
+	}
+	rpc.HandleHTTP()
+
+	addr := ":" + strconv.Itoa(*port)
+	listener, err := net.Listen("tcp", addr)
+	if err != nil {
+		panic(err)
+	}
+
+	// http.Serve only returns when serving fails.
+	if err := http.Serve(listener, nil); err != nil {
+		panic(err)
+	}
+}
diff --git a/go/master/service.go b/go/master/service.go
new file mode 100644
index 0000000000000000000000000000000000000000..ab17a62f3854c1e32d731037fcc9857260d03781
--- /dev/null
+++ b/go/master/service.go
@@ -0,0 +1,178 @@
+package master
+
+import (
+	"errors"
+	"log"
+	"sync"
+	"time"
+
+	"github.com/PaddlePaddle/recordio"
+)
+
+const (
+	targetTaskCount = 300
+)
+
+// errors
+var (
+	ErrNoMoreTask          = errors.New("no more task for current pass")
+	ErrPendingTaskNotFound = errors.New("pending task not found")
+)
+
+// Service is the master server service.
+type Service struct {
+	timeoutDur time.Duration // delay between handing out a task and its timeout check
+	timeoutMax int           // a task whose timeout count exceeds this is moved to the failed queue
+
+	mu         sync.Mutex // held while taskQueues is read or modified
+	taskQueues taskQueues
+}
+
+// Recover is meant to recover service state from etcd; it is not implemented yet and returns (nil, nil).
+func Recover() (*Service, error) {
+	// TODO(helin): recover from snapshot state from etcd.
+	return nil, nil
+}
+
+// partition groups chunks into consecutive tasks of chunksPerTask chunks
+// each (the last task may be shorter) and numbers the tasks from 0.
+// A chunksPerTask that is zero or negative is treated as 1.
+func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
+	// a non-positive task size falls back to one chunk per task.
+	size := chunksPerTask
+	if size <= 0 {
+		size = 1
+	}
+
+	var tasks []taskEntry
+	for start := 0; start < len(chunks); start += size {
+		end := start + size
+		if end > len(chunks) {
+			end = len(chunks)
+		}
+
+		// copy the window so that the task does not alias the
+		// caller's slice; task IDs follow the window order.
+		var entry taskEntry
+		entry.Task.ID = len(tasks)
+		entry.Task.Chunks = append(entry.Task.Chunks, chunks[start:end]...)
+		tasks = append(tasks, entry)
+	}
+
+	return tasks
+}
+
+// NewService creates a new service whose todo queue holds the given chunks
+// partitioned into tasks of chunksPerTask chunks each.
+func NewService(chunks []Chunk, chunksPerTask int, timeoutDur time.Duration, timeoutMax int) *Service {
+	// every task starts in the todo queue; nothing is pending yet.
+	queues := taskQueues{
+		Todo:    partition(chunks, chunksPerTask),
+		Pending: make(map[int]taskEntry),
+	}
+	return &Service{timeoutDur: timeoutDur, timeoutMax: timeoutMax, taskQueues: queues}
+}
+
+// Chunk is a chunk of data consisted of several data instances.
+type Chunk struct {
+	Idx   int // index of the chunk within the file
+	Path  string
+	Index recordio.Index // block index
+}
+
+// Task is the basic unit of data instances assigned to trainers.
+type Task struct {
+	ID     int
+	Chunks []Chunk
+}
+
+type taskEntry struct {
+	Epoch      int // incremented each time GetTask hands the task out
+	NumTimeout int // timeouts counted so far; set back to 0 by TaskFinished
+	Task       Task
+}
+
+type taskQueues struct {
+	Todo    []taskEntry       // tasks waiting to be handed out by GetTask
+	Pending map[int]taskEntry // map from task ID to task entry
+	Done    []taskEntry       // tasks reported through TaskFinished
+	Failed  []Task            // tasks whose timeout count exceeded Service.timeoutMax
+}
+
+// snapshot currently does nothing and returns nil; it *must* be called with s.mu being held.
+func (s *Service) snapshot() error {
+	// TODO(helin): snapshot state on etcd.
+	return nil
+}
+
+// GetTask moves the first todo task to the pending set, returns it through task and schedules its timeout check.
+func (s *Service) GetTask(dummy int, task *Task) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if len(s.taskQueues.Todo) == 0 {
+		return ErrNoMoreTask
+	}
+
+	t := s.taskQueues.Todo[0]
+	t.Epoch++
+	s.taskQueues.Todo = s.taskQueues.Todo[1:]
+	s.taskQueues.Pending[t.Task.ID] = t
+	err := s.snapshot()
+	if err != nil {
+		return err
+	}
+	*task = t.Task // fill in the RPC reply; without this the caller receives an empty Task
+
+	time.AfterFunc(s.timeoutDur, func(taskID int, epoch int) func() {
+		return func() {
+			s.mu.Lock()
+			defer s.mu.Unlock()
+
+			t, ok := s.taskQueues.Pending[taskID]
+			if !ok {
+				return
+			}
+
+			if t.Epoch != epoch {
+				// new epoch: the task was handed out again after this check was scheduled.
+				return
+			}
+
+			defer func() {
+				err := s.snapshot()
+				if err != nil {
+					log.Println(err)
+				}
+			}()
+
+			delete(s.taskQueues.Pending, t.Task.ID)
+
+			t.NumTimeout++
+			if t.NumTimeout > s.timeoutMax {
+				s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task)
+				return
+			}
+
+			s.taskQueues.Todo = append(s.taskQueues.Todo, t)
+		}
+	}(t.Task.ID, t.Epoch))
+	return nil
+}
+
+// TaskFinished tells the service that the pending task with the given ID
+// has been completed; it is moved from the pending set to the done queue.
+func (s *Service) TaskFinished(taskID int, dummy *int) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	entry, pending := s.taskQueues.Pending[taskID]
+	if !pending {
+		return ErrPendingTaskNotFound
+	}
+	delete(s.taskQueues.Pending, taskID)
+	// a finished task starts over with a clean timeout count.
+	entry.NumTimeout = 0
+	s.taskQueues.Done = append(s.taskQueues.Done, entry)
+	return s.snapshot()
+}
diff --git a/go/master/service_internal_test.go b/go/master/service_internal_test.go
new file mode 100644
index 0000000000000000000000000000000000000000..bc435b505c014ca13ed5fc16b33a21ebb089a3b7
--- /dev/null
+++ b/go/master/service_internal_test.go
@@ -0,0 +1,37 @@
+package master
+
+import "testing"
+
+func TestPartitionCount(t *testing.T) {
+	// each case: number of chunks, chunks per task, expected task count.
+	cases := []struct {
+		numChunks int
+		perTask   int
+		want      int
+	}{
+		{100, 5, 20},
+		// a trailing partial task is kept.
+		{101, 5, 21},
+		{101, 1, 101},
+		// zero chunks per task behaves like one chunk per task.
+		{101, 0, 101},
+	}
+
+	for _, c := range cases {
+		cs := make([]Chunk, c.numChunks)
+		ts := partition(cs, c.perTask)
+		if len(ts) != c.want {
+			t.Error(len(ts))
+		}
+	}
+}
+
+func TestPartionIndex(t *testing.T) {
+	cs := make([]Chunk, 100)
+	ts := partition(cs, 20)
+	for i := range ts {
+		if ts[i].Task.ID != i {
+			t.Error(ts[i], i)
+		}
+	}
+}
diff --git a/go/pserver/cclient/CMakeLists.txt b/go/pserver/cclient/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c017d7465611373309c6c60141fed864f5ccfb5d
--- /dev/null
+++ b/go/pserver/cclient/CMakeLists.txt
@@ -0,0 +1,13 @@
+cmake_minimum_required(VERSION 3.0)
+
+get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
+get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)  # PARENT_DIR is now two levels above this directory
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PARENT_DIR}/cmake")  # set before project() so the CMake*Go*.cmake language modules can be located
+
+project(cxx_go C Go)
+
+include(golang)  # provides go_library()
+include(flags)
+
+go_library(client STATIC)  # STATIC selects -buildmode=c-archive and the output name libclient.a
+add_subdirectory(test)
diff --git a/go/pserver/cclient/cclient.go b/go/pserver/cclient/cclient.go
new file mode 100644
index 0000000000000000000000000000000000000000..0b4aa79806b72f4608230d2216d1741389913d95
--- /dev/null
+++ b/go/pserver/cclient/cclient.go
@@ -0,0 +1,260 @@
+package main
+
+/*
+#include <stdlib.h>
+#include <string.h>
+typedef enum {
+  PADDLE_ELEMENT_TYPE_INT32   = 0,
+  PADDLE_ELEMENT_TYPE_UINT32  = 1,
+  PADDLE_ELEMENT_TYPE_INT64   = 2,
+  PADDLE_ELEMENT_TYPE_UINT64  = 3,
+  PADDLE_ELEMENT_TYPE_FLOAT32 = 4,
+  PADDLE_ELEMENT_TYPE_FLOAT64 = 5,
+} paddle_element_type;
+
+typedef struct {
+  char*               name;
+  paddle_element_type element_type;
+  unsigned char*      content;
+  int                 content_len;
+} paddle_parameter, paddle_gradient;
+
+static inline void paddle_release_param(paddle_parameter* param) {
+  if (param != NULL) {
+    if (param->name != NULL) {
+      free(param->name);
+    }
+
+    if (param->content != NULL) {
+      free(param->content);
+    }
+
+    free(param);
+  }
+}
+
+typedef int client;
+*/
+import "C"
+
+import (
+	"log"
+	"strings"
+	"sync"
+	"unsafe"
+
+	"github.com/PaddlePaddle/Paddle/go/pserver"
+)
+
+var nullPtr = unsafe.Pointer(uintptr(0))
+var mu sync.Mutex
+var handleMap = make(map[C.client]*pserver.Client)
+var curHandle C.client
+
+func add(c *pserver.Client) C.client {
+	mu.Lock()
+	defer mu.Unlock()
+	client := curHandle
+	curHandle++
+	handleMap[client] = c
+	return client
+}
+
+func get(client C.client) *pserver.Client {
+	mu.Lock()
+	defer mu.Unlock()
+	return handleMap[client]
+}
+
+func remove(client C.client) *pserver.Client {
+	mu.Lock()
+	defer mu.Unlock()
+	h := handleMap[client]
+	delete(handleMap, client)
+	return h
+}
+
+func cArrayToSlice(p unsafe.Pointer, len int) []byte {
+	if p == nullPtr {
+		return nil
+	}
+
+	// create a Go slice backed by a C array, reference:
+	// https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
+	//
+	// Go garbage collector will not interact with this data, need
+	// to be freed properly.
+	return (*[1 << 30]byte)(p)[:len:len] // no copy is made; length and capacity are both len
+}
+
+type selector bool
+
+func (s selector) Select() bool {
+	return bool(s)
+}
+
+type lister []pserver.Server
+
+func (l lister) List() []pserver.Server {
+	return l
+}
+
+//export paddle_new_pserver_client
+func paddle_new_pserver_client(addrs *C.char, selected int) C.client {
+	// addrs is a comma separated list; the position in the list is the server index.
+	addrList := strings.Split(C.GoString(addrs), ",")
+	servers := make([]pserver.Server, 0, len(addrList))
+	for idx, addr := range addrList {
+		servers = append(servers, pserver.Server{Index: idx, Addr: addr})
+	}
+
+	isSelected := selector(selected != 0)
+	return add(pserver.NewClient(lister(servers), len(addrList), isSelected))
+}
+
+//export paddle_new_etcd_pserver_client
+func paddle_new_etcd_pserver_client(etcd_addr *C.char) C.client {
+	// TODO(helin): fault tolerant pserver client using etcd.
+	panic("not implemented.")
+}
+
+//export paddle_pserver_client_release
+func paddle_pserver_client_release(client C.client) {
+	remove(client)
+}
+
+//export paddle_begin_init_params
+func paddle_begin_init_params(client C.client) C.int {
+	c := get(client)
+	if selected := c.BeginInitParams(); selected {
+		return 1
+	}
+	return 0
+}
+
+//export paddle_init_param
+func paddle_init_param(client C.client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int {
+	et := pserver.ElementType(param.element_type)
+	name := C.GoString(param.name)
+	content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len))
+	pc := pserver.ParameterWithConfig{
+		Param:  pserver.Parameter{Name: name, ElementType: et, Content: content},
+		Config: cArrayToSlice(param_config, int(config_len)),
+	}
+	c := get(client)
+	err := c.InitParam(pc)
+	if err != nil {
+		log.Println(err)
+		return -1
+	}
+
+	return 0
+}
+
+//export paddle_finish_init_params
+func paddle_finish_init_params(client C.client) C.int {
+	c := get(client)
+	err := c.FinishInitParams()
+	if err != nil {
+		log.Println(err)
+		return -1
+	}
+
+	return 0
+}
+
+//export paddle_send_grads
+func paddle_send_grads(client C.client, grads *C.paddle_gradient, total C.int) C.int {
+	var gs []pserver.Gradient
+	for i := 0; i < int(total); i++ {
+		grad := (*C.paddle_gradient)(unsafe.Pointer((uintptr(unsafe.Pointer(grads)) + uintptr(i)*unsafe.Sizeof(*grads))))
+		et := pserver.ElementType(grad.element_type)
+		name := C.GoString(grad.name)
+		content := cArrayToSlice(unsafe.Pointer(grad.content), int(grad.content_len))
+		gs = append(gs, pserver.Gradient{Name: name, ElementType: et, Content: content})
+	}
+
+	c := get(client)
+	err := c.SendGrads(gs)
+	if err != nil {
+		log.Println(err)
+		return -1
+	}
+
+	return 0
+}
+
+//export paddle_get_params
+func paddle_get_params(client C.client, names **C.char, dst **C.paddle_parameter, total C.int) C.int {
+	var ns []string
+	for i := 0; i < int(total); i++ {
+		name := *(**C.char)(unsafe.Pointer((uintptr(unsafe.Pointer(names)) + uintptr(i)*unsafe.Sizeof(*names))))
+		ns = append(ns, C.GoString(name))
+	}
+	c := get(client)
+	ps, err := c.GetParams(ns)
+	if err != nil {
+		log.Println(err)
+		return -1
+	}
+
+	for i := 0; i < int(total) && i < len(ps); i++ {
+		p := ps[i]
+		slot := (**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
+		param := *slot
+		nameReady, contentAllocated := false, false
+
+		if unsafe.Pointer(param) == nullPtr {
+			// publish the new parameter through dst[i]; otherwise the caller never sees it and it leaks.
+			param = (*C.paddle_parameter)(C.calloc(1, C.size_t(unsafe.Sizeof(*param))))
+			*slot = param
+		} else {
+			if unsafe.Pointer(param.name) != nullPtr {
+				if n := C.GoString(param.name); n != p.Name {
+					log.Println("Warning: the pre-allocated parameter name does not match the parameter name, it will be freed.", n, p.Name)
+					C.free(unsafe.Pointer(param.name))
+				} else {
+					nameReady = true
+				}
+			}
+
+			if unsafe.Pointer(param.content) != nullPtr {
+				if int(param.content_len) == len(p.Content) {
+					contentAllocated = true
+				} else {
+					log.Println("Warning: the pre-allocated content len does not match parameter content len, the pre-allocated content will be freed.", param.content_len, len(p.Content))
+					C.free(unsafe.Pointer(param.content))
+				}
+			}
+		}
+
+		if !nameReady {
+			param.name = C.CString(p.Name)
+		}
+		if !contentAllocated {
+			param.content = (*C.uchar)(C.malloc(C.size_t(len(p.Content))))
+		}
+		if len(p.Content) > 0 { // &p.Content[0] panics on an empty slice
+			C.memcpy(unsafe.Pointer(param.content), unsafe.Pointer(&p.Content[0]), C.size_t(len(p.Content)))
+		}
+		param.content_len = C.int(len(p.Content))
+		param.element_type = C.paddle_element_type(p.ElementType)
+	}
+
+	return 0
+}
+
+//export paddle_save_model
+func paddle_save_model(client C.client, path *C.char) C.int {
+	p := C.GoString(path)
+	c := get(client)
+	err := c.Save(p)
+	if err != nil {
+		log.Println(err)
+		return -1
+	}
+
+	return 0
+}
+
+func main() {} // Required but ignored
diff --git a/go/pserver/cclient/test/CMakeLists.txt b/go/pserver/cclient/test/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..16f84648c1de3a8fdb4595c00bdb7608a152ded2
--- /dev/null
+++ b/go/pserver/cclient/test/CMakeLists.txt
@@ -0,0 +1,11 @@
+cmake_minimum_required(VERSION 3.0)
+
+include_directories(${CMAKE_BINARY_DIR})  # presumably where libclient.h is generated next to libclient.a -- confirm
+
+add_executable(main main.c)
+add_dependencies(main client)
+
+if(APPLE)  # append instead of overwrite so linker flags supplied by the user survive
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -framework CoreFoundation -framework Security")
+endif()
+target_link_libraries(main ${CMAKE_BINARY_DIR}/libclient.a)
diff --git a/go/pserver/cclient/test/main.c b/go/pserver/cclient/test/main.c
new file mode 100644
index 0000000000000000000000000000000000000000..f75a2110b947520dfec1265e56eaf2ba7ac3b51b
--- /dev/null
+++ b/go/pserver/cclient/test/main.c
@@ -0,0 +1,70 @@
+#include <stdio.h>
+
+#include "libclient.h"
+
+void fail() {
+  // TODO(helin): fix: gtest using cmake is not working, using this
+  // hacky way for now.
+  printf("test failed.\n");
+  exit(-1);
+}
+
+int main() {
+  char addr[] = "localhost:3000";
+  client c = paddle_new_pserver_client(addr, 1);
+retry:
+  if (paddle_begin_init_params(c)) {
+    paddle_parameter param;
+    char name_a[] = "param_a";
+    char name_b[] = "param_b";
+    unsigned char content[] = {0x00, 0x11, 0x22};
+    param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+    param.name = name_a;
+    param.content = content;
+    param.content_len = 3;
+    if (paddle_init_param(c, param, NULL, 0) != 0) {
+      goto retry;
+    }
+    param.element_type = PADDLE_ELEMENT_TYPE_INT32;
+    param.name = name_b;
+    param.content = content;
+    param.content_len = 3;
+    if (paddle_init_param(c, param, NULL, 0) != 0) {
+      goto retry;
+    }
+    if (paddle_finish_init_params(c) != 0) {
+      goto retry;
+    }
+  } else {
+    fail();
+  }
+
+  unsigned char content[] = {0x00, 0x11, 0x22};
+  paddle_gradient grads[2] = {
+      {"param_a", PADDLE_ELEMENT_TYPE_INT32, content, 3},
+      {"param_b", PADDLE_ELEMENT_TYPE_FLOAT32, content, 3}};
+
+  if (paddle_send_grads(c, grads, 2) != 0) {  // the client API returns 0 on success, -1 on error
+    fail();
+  }
+
+  paddle_parameter* params[2] = {NULL, NULL};
+  char* names[] = {"param_a", "param_b"};
+  if (paddle_get_params(c, names, params, 2) != 0) {
+    fail();
+  }
+
+  // get parameters again by reusing the allocated parameter buffers.
+  if (paddle_get_params(c, names, params, 2) != 0) {
+    fail();
+  }
+
+  paddle_release_param(params[0]);
+  paddle_release_param(params[1]);
+
+  if (paddle_save_model(c, "/tmp/") != 0) {
+    fail();
+  }
+
+  return 0;
+}
diff --git a/go/pserver/client.go b/go/pserver/client.go
new file mode 100644
index 0000000000000000000000000000000000000000..f8bd0aa59f30ec7e2b2d318929af96135d3128ed
--- /dev/null
+++ b/go/pserver/client.go
@@ -0,0 +1,232 @@
+package pserver
+
+import (
+	"hash/fnv"
+	"log"
+	"sort"
+	"time"
+
+	"github.com/PaddlePaddle/Paddle/go/pserver/internal/connection"
+)
+
+// TODO(helin): add RPC call retry logic
+
+// Selector selects if the client should initialize parameter servers.
+type Selector interface {
+	Select() bool
+}
+
+// Server is the identification of a parameter Server.
+type Server struct {
+	Index int
+	Addr  string
+}
+
+// Lister lists currently available parameter servers.
+type Lister interface {
+	List() []Server
+}
+
+// Client is the client to parameter servers.
+type Client struct {
+	sel      Selector
+	pservers []*connection.Conn
+}
+
+// NewClient creates a new client.
+func NewClient(l Lister, pserverNum int, sel Selector) *Client {
+	c := &Client{sel: sel}
+	c.pservers = make([]*connection.Conn, pserverNum)
+	for i := 0; i < pserverNum; i++ {
+		c.pservers[i] = connection.New()
+	}
+	go c.monitorPservers(l, pserverNum)
+	return c
+}
+
+// monitorPservers monitors pserver addresses, and updates connection
+// when the address changes.
+func (c *Client) monitorPservers(l Lister, pserverNum int) {
+	knownServers := make([]Server, pserverNum)
+	ticker := time.NewTicker(10 * time.Second)
+	monitor := func() {
+		curServers := make([]Server, pserverNum)
+		for _, s := range l.List() {
+			if s.Index < 0 || s.Index >= pserverNum {
+				log.Println("ignoring pserver with out of range index:", s.Index)
+				continue
+			}
+			curServers[s.Index] = s
+		}
+
+		for i := range knownServers {
+			if knownServers[i].Addr != curServers[i].Addr {
+				err := c.pservers[i].Connect(curServers[i].Addr)
+				if err != nil {
+					log.Println(err)
+					// keep the last known addr so that the connect is retried on the next tick.
+					curServers[i].Addr = knownServers[i].Addr
+				}
+			}
+		}
+
+		knownServers = curServers
+	}
+
+	monitor()
+	for range ticker.C {
+		monitor()
+	}
+}
+
+// BeginInitParams begins to initialize parameters on parameter
+// servers.
+//
+// BeginInitParams will be called from multiple trainers, only one
+// trainer will be selected to initialize the parameters on parameter
+// servers. Other trainers will be blocked until the initialization is
+// done, and they need to get the initialized parameters from
+// parameter servers using GetParams.
+func (c *Client) BeginInitParams() bool {
+	return c.sel.Select()
+}
+
+// InitParam initializes the parameter on parameter servers.
+func (c *Client) InitParam(paramWithConfigs ParameterWithConfig) error {
+	var dummy int
+	return c.pservers[c.partition(paramWithConfigs.Param.Name)].Call("Service.InitParam", paramWithConfigs, &dummy)
+}
+
+// FinishInitParams tells parameter servers client has sent all
+// parameters to parameter servers as initialization.
+func (c *Client) FinishInitParams() error {
+	for _, p := range c.pservers {
+		var dummy int
+		err := p.Call("Service.FinishInitParams", dummy, &dummy)
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// SendGrads sends gradients to parameter servers for updating
+// parameters. One RPC is issued per gradient, all of them concurrently;
+// the first error received is returned without waiting for the rest.
+func (c *Client) SendGrads(grads []Gradient) error {
+	// buffered so that senders never block, even after an early return.
+	errCh := make(chan error, len(grads))
+	for _, g := range grads {
+		go func(g Gradient) {
+			var dummy int
+			err := c.pservers[c.partition(g.Name)].Call("Service.SendGrad", g, &dummy)
+			errCh <- err
+		}(g)
+	}
+
+	// receive exactly one result per gradient. Ranging over the channel
+	// instead would block forever when grads is empty, because nothing
+	// is ever sent and the channel is never closed.
+	for range grads {
+		if err := <-errCh; err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+type result struct {
+	idx   int
+	param Parameter
+	err   error
+}
+
+type results []result
+
+func (r results) Len() int {
+	return len(r)
+}
+
+func (r results) Less(i int, j int) bool {
+	return r[i].idx < r[j].idx
+}
+
+func (r results) Swap(i int, j int) {
+	r[i], r[j] = r[j], r[i]
+}
+
+// GetParams gets parameters from parameter servers. The returned
+// parameters are in the same order as names; an empty names slice yields
+// an empty result.
+func (c *Client) GetParams(names []string) ([]Parameter, error) {
+	rCh := make(chan result, len(names))
+
+	for idx, name := range names {
+		go func(name string, idx int) {
+			var parameter Parameter
+			err := c.pservers[c.partition(name)].Call("Service.GetParam", name, &parameter)
+			rCh <- result{idx: idx, param: parameter, err: err}
+		}(name, idx)
+	}
+
+	// receive exactly one result per name; ranging over the never-closed
+	// channel would block forever for an empty names slice.
+	rs := make(results, 0, len(names))
+	for range names {
+		r := <-rCh
+		if r.err != nil {
+			return nil, r.err
+		}
+		rs = append(rs, r)
+	}
+	// results arrive in completion order, restore the request order.
+	sort.Sort(rs)
+
+	ps := make([]Parameter, len(rs))
+	for i := range rs {
+		ps[i] = rs[i].param
+	}
+
+	return ps, nil
+}
+
+// Save asks every parameter server to save its parameters to the given
+// path.
+//
+// The servers are asked one after another; the first error encountered is
+// returned, but only after all servers have been asked.
+func (c *Client) Save(path string) error {
+	// the calls are sequential, so errors are collected directly; a
+	// client without parameter servers returns nil instead of blocking.
+	var firstErr error
+	for _, p := range c.pservers {
+		var dummy int
+		err := p.Call("Service.Save", path, &dummy)
+		if err != nil && firstErr == nil {
+			firstErr = err
+		}
+	}
+
+	if firstErr != nil {
+		return firstErr
+	}
+
+	// TODO(helin): there will be many files under path, need to
+	// merge them into a single file.
+	return nil
+}
+
+func strHash(s string) uint32 {
+	h := fnv.New32a()
+	h.Write([]byte(s))
+	return h.Sum32()
+}
+
+// TODO(helin): now partition only select which parameter server to
+// send the entire parameter. We need to partition a parameter into
+// small blocks and send to different parameter servers.
+func (c *Client) partition(key string) int {
+	return int(strHash(key) % uint32(len(c.pservers)))
+}
diff --git a/go/pserver/client_test.go b/go/pserver/client_test.go
new file mode 100644
index 0000000000000000000000000000000000000000..a9a0948a51a31a1c7393f716e3dfc436dbf919af
--- /dev/null
+++ b/go/pserver/client_test.go
@@ -0,0 +1,123 @@
+package pserver_test
+
+import (
+	"net"
+	"net/http"
+	"net/rpc"
+	"strconv"
+	"strings"
+	"testing"
+
+	"github.com/PaddlePaddle/Paddle/go/pserver"
+)
+
+const numPserver = 10
+
+var port [numPserver]int
+
+func init() {
+	for i := 0; i < numPserver; i++ {
+		l, err := net.Listen("tcp", ":0")
+		if err != nil {
+			panic(err)
+		}
+
+		ss := strings.Split(l.Addr().String(), ":")
+		p, err := strconv.Atoi(ss[len(ss)-1])
+		if err != nil {
+			panic(err)
+		}
+		port[i] = p
+
+		go func(l net.Listener) {
+			s := pserver.NewService()
+			server := rpc.NewServer()
+			err := server.Register(s)
+			if err != nil {
+				panic(err)
+			}
+
+			mux := http.NewServeMux()
+			mux.Handle(rpc.DefaultRPCPath, server)
+			err = http.Serve(l, mux)
+			if err != nil {
+				panic(err)
+			}
+		}(l)
+	}
+}
+
+type selector bool
+
+func (s selector) Select() bool {
+	return bool(s)
+}
+
+type lister []pserver.Server
+
+func (l lister) List() []pserver.Server {
+	return l
+}
+
+func TestClientFull(t *testing.T) {
+	servers := make([]pserver.Server, numPserver)
+	for i := 0; i < numPserver; i++ {
+		servers[i] = pserver.Server{Index: i, Addr: ":" + strconv.Itoa(port[i])}
+	}
+	c := pserver.NewClient(lister(servers), len(servers), selector(true))
+	selected := c.BeginInitParams()
+	if !selected {
+		t.Fatal("should be selected.")
+	}
+
+	const numParameter = 100
+	for i := 0; i < numParameter; i++ {
+		var p pserver.Parameter
+		p.Name = "p_" + strconv.Itoa(i)
+		p.ElementType = pserver.Float32
+		p.Content = make([]byte, (i+1)*100)
+		err := c.InitParam(pserver.ParameterWithConfig{Param: p})
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	err := c.FinishInitParams()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	var grads []pserver.Gradient
+	for i := 0; i < numParameter/2; i++ { // gradients are sent for the first half of the parameters only
+		var g pserver.Gradient
+		g.Name = "p_" + strconv.Itoa(i)
+		g.ElementType = pserver.Float32
+		g.Content = make([]byte, (i+1)*100)
+		grads = append(grads, g)
+	}
+
+	err = c.SendGrads(grads)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	names := make([]string, numParameter)
+	for i := 0; i < numParameter; i++ {
+		names[i] = "p_" + strconv.Itoa(i)
+	}
+
+	params, err := c.GetParams(names)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if len(names) != len(params) {
+		t.Fatalf("parameter size not match, need: %d, have: %d", len(names), len(params))
+	}
+
+	for i := range params { // GetParams must return the parameters in request order
+		if names[i] != params[i].Name {
+			t.Fatalf("order of returned parameters does not match the request: parameter name: %s, required name: %s", params[i].Name, names[i])
+		}
+	}
+}
diff --git a/go/pserver/internal/connection/conn.go b/go/pserver/internal/connection/conn.go
new file mode 100644
index 0000000000000000000000000000000000000000..1c04f117254054741b7d45fb16462b5ce84a2aea
--- /dev/null
+++ b/go/pserver/internal/connection/conn.go
@@ -0,0 +1,84 @@
+package connection
+
+import (
+	"errors"
+	"net/rpc"
+	"sync"
+)
+
+// TODO(helin): add TCP re-connect logic
+
+// Conn is a connection to a parameter server
+type Conn struct {
+	mu       sync.Mutex
+	client   *rpc.Client
+	waitConn chan struct{}
+}
+
+// New creates a new connection.
+func New() *Conn {
+	c := &Conn{}
+	return c
+}
+
+// Connect connects the connection to an address.
+func (c *Conn) Connect(addr string) error {
+	c.mu.Lock()
+	if c.client != nil {
+		err := c.client.Close()
+		if err != nil {
+			c.mu.Unlock()
+			return err
+		}
+
+		c.client = nil
+	}
+	c.mu.Unlock() // the lock is not held while dialing
+
+	client, err := rpc.DialHTTP("tcp", addr)
+	if err != nil {
+		return err
+	}
+
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	if c.client != nil {
+		_ = client.Close() // lost the race: release our connection instead of leaking it
+		return errors.New("client already set from a concurrent goroutine")
+	}
+	c.client = client
+	if c.waitConn != nil {
+		close(c.waitConn) // wake every Call blocked waiting for a connection
+		c.waitConn = nil
+	}
+
+	return nil
+}
+
+// Call makes an RPC call.
+//
+// Call blocks until the connection to the remote RPC service has
+// been established.
+func (c *Conn) Call(serviceMethod string, args interface{}, reply interface{}) error {
+	c.mu.Lock()
+	client := c.client
+	var waitCh chan struct{}
+	if client == nil {
+		if c.waitConn != nil {
+			waitCh = c.waitConn
+		} else {
+			waitCh = make(chan struct{})
+			c.waitConn = waitCh
+		}
+	}
+	c.mu.Unlock()
+
+	if waitCh != nil {
+		// wait until a new connection is established, then retry
+		<-waitCh
+		return c.Call(serviceMethod, args, reply)
+	}
+
+	return client.Call(serviceMethod, args, reply)
+}
diff --git a/go/pserver/optimizer.c b/go/pserver/optimizer.c
new file mode 100644
index 0000000000000000000000000000000000000000..b8da3ec9592053e3efe00e69d73a8ae259a30a2f
--- /dev/null
+++ b/go/pserver/optimizer.c
@@ -0,0 +1,52 @@
+#include <stdlib.h>
+
+#include "optimizer.h"
+
+typedef int (*update_func)(void*, void*, paddle_element_type, const void*, int);
+typedef void (*release_func)(void*);
+
+typedef struct paddle_optimizer {
+  update_func update;
+  release_func release;
+  void* optimizer;
+} paddle_optimizer;
+
+void paddle_release_optimizer(paddle_optimizer* o) {
+  if (o != NULL) o->release(o->optimizer);  // tolerate NULL, like free()
+  free(o);
+}
+
+int paddle_update_parameter(paddle_optimizer* o,
+                            void* buffer,
+                            paddle_element_type element_type,
+                            const void* gradient,
+                            int num_bytes) {
+  return o->update(o->optimizer, buffer, element_type, gradient, num_bytes);
+}
+
+typedef struct { double learning_rate; } SGD_optimizer;
+
+int update_SGD(void* optimizer,
+               void* buffer,
+               paddle_element_type element_type,
+               const void* gradient,
+               int num_bytes) {
+  SGD_optimizer* o = (SGD_optimizer*)optimizer;
+  // TODO
+  return 0;
+}
+
+void release_SGD(void* optimizer) {
+  // No nested heap data, but the SGD_optimizer itself was malloc'ed on creation.
+  free(optimizer);
+}
+
+paddle_optimizer* paddle_create_SGD_optimizer(double learning_rate) {
+  SGD_optimizer* impl = (SGD_optimizer*)malloc(sizeof(SGD_optimizer));
+  impl->learning_rate = learning_rate;
+  paddle_optimizer* opt = (paddle_optimizer*)malloc(sizeof(paddle_optimizer));
+  opt->update = update_SGD;
+  opt->release = release_SGD;
+  opt->optimizer = impl;
+  return opt;
+}
diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go
new file mode 100644
index 0000000000000000000000000000000000000000..417f8c509388055028bd46e42501741298308193
--- /dev/null
+++ b/go/pserver/optimizer.go
@@ -0,0 +1,51 @@
+package pserver
+
+/*
+#include "optimizer.h"
+*/
+import "C"
+import (
+	"fmt"
+	"unsafe"
+)
+
+type optimizerType int
+
+const (
+	sgd optimizerType = iota
+)
+
+var nullPtr = unsafe.Pointer(uintptr(0))
+
+type optimizer struct {
+	opt *C.struct_paddle_optimizer
+}
+
+func newOptimizer(t optimizerType, learning_rate float64) *optimizer {
+	o := &optimizer{}
+	o.opt = C.paddle_create_SGD_optimizer(C.double(learning_rate))
+	return o
+}
+
+func (o *optimizer) UpdateParameter(p Parameter, g Gradient) error {
+	switch {
+	case len(p.Content) != len(g.Content):
+		return fmt.Errorf("Name: %s, parameter and gradient length not match, parameter: %d, gradient: %d", p.Name, len(p.Content), len(g.Content))
+	case p.ElementType != g.ElementType:
+		return fmt.Errorf("Name: %s, parameter and gradient element type not match, parameter: %v, gradient: %v", p.Name, p.ElementType, g.ElementType)
+	case len(p.Content) == 0:
+		return nil // nothing to update; &p.Content[0] below would panic on an empty slice
+	}
+	r := C.paddle_update_parameter(o.opt, unsafe.Pointer(&p.Content[0]), C.paddle_element_type(p.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content)))
+	if r != 0 {
+		return fmt.Errorf("optimizer update returned error code: %d", r)
+	}
+	return nil
+}
+
+func (o *optimizer) Cleanup() {
+	if unsafe.Pointer(o.opt) != nullPtr {
+		C.paddle_release_optimizer(o.opt)
+		o.opt = (*C.struct_paddle_optimizer)(nullPtr)
+	}
+}
diff --git a/go/pserver/optimizer.h b/go/pserver/optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7e3ff0530035f2cec4359a97d3e8ff81362d363
--- /dev/null
+++ b/go/pserver/optimizer.h
@@ -0,0 +1,22 @@
+#ifndef PADDLE_PSERVER_OPTIMIZER_H
+#define PADDLE_PSERVER_OPTIMIZER_H
+
+typedef enum {
+  PADDLE_ELEMENT_TYPE_INT32 = 0,
+  PADDLE_ELEMENT_TYPE_UINT32 = 1,
+  PADDLE_ELEMENT_TYPE_INT64 = 2,
+  PADDLE_ELEMENT_TYPE_UINT64 = 3,
+  PADDLE_ELEMENT_TYPE_FLOAT32 = 4,
+  PADDLE_ELEMENT_TYPE_FLOAT64 = 5,
+} paddle_element_type;
+
+struct paddle_optimizer;
+struct paddle_optimizer* paddle_create_SGD_optimizer(double learning_rate);
+void paddle_release_optimizer(struct paddle_optimizer* o);
+int paddle_update_parameter(struct paddle_optimizer* o,
+                            void* buffer,
+                            paddle_element_type element_type,
+                            const void* gradient,
+                            int num_bytes);
+
+#endif /* PADDLE_PSERVER_OPTIMIZER_H */
diff --git a/go/pserver/optimizer_test.go b/go/pserver/optimizer_test.go
new file mode 100644
index 0000000000000000000000000000000000000000..64d6d092aa1864fbca012214ced5e03e157d4a4c
--- /dev/null
+++ b/go/pserver/optimizer_test.go
@@ -0,0 +1,8 @@
+package pserver
+
+import "testing"
+
+func TestSGDCreateRelease(t *testing.T) {
+	o := newOptimizer(sgd, 1)
+	o.Cleanup()
+}
diff --git a/go/pserver/service.go b/go/pserver/service.go
new file mode 100644
index 0000000000000000000000000000000000000000..d5787b9708bb15629a6e6290ffc97ee9885bc8b8
--- /dev/null
+++ b/go/pserver/service.go
@@ -0,0 +1,139 @@
+package pserver
+
+import (
+	"errors"
+	"fmt"
+	"sync"
+)
+
+// ElementType is the type of elements of a Parameter.
+type ElementType int
+
+var ErrAlreadyInitialized = errors.New("pserver already initialized")
+var ErrUninitialized = errors.New("pserver not fully initialized")
+
+// Supported element types
+const (
+	Int32 ElementType = iota
+	UInt32
+	Int64
+	UInt64
+	Float32
+	Float64
+)
+
+// Parameter is a piece of data to sync with the parameter server.
+type Parameter struct {
+	Name        string
+	ElementType ElementType
+	Content     []byte
+}
+
+// ParameterWithConfig contains the parameter and the configuration.
+type ParameterWithConfig struct {
+	Param  Parameter
+	Config []byte // parameter configuration in Proto Buffer format
+}
+
+// Gradient is the gradient of the parameter.
+type Gradient Parameter
+
+// Service is the RPC service for pserver.
+type Service struct {
+	initialized chan struct{}
+
+	mu       sync.Mutex
+	opt      *optimizer
+	paramMap map[string]Parameter
+}
+
+// NewService creates a new service.
+func NewService() *Service {
+	s := &Service{opt: newOptimizer(sgd, 0.01)}
+	s.paramMap = make(map[string]Parameter)
+	s.initialized = make(chan struct{})
+	return s
+}
+
+// InitParam initializes a parameter.
+func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) error {
+	select {
+	case <-s.initialized:
+		return ErrAlreadyInitialized
+	default:
+	}
+
+	// TODO(helin): parse parameter config
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// TODO(helin): check if paramWithConfigs.Param.Content is
+	// properly memory aligned, if not, make copy to a memory
+	// aligned region.
+	s.paramMap[paramWithConfigs.Param.Name] = paramWithConfigs.Param
+	return nil
+}
+
+// FinishInitParams marks parameter initialization as finished; it fails if already finished.
+func (s *Service) FinishInitParams(dummy0 int, dummy1 *int) error {
+	s.mu.Lock() // serialize check-and-close: a concurrent double close would panic
+	defer s.mu.Unlock()
+	select {
+	case <-s.initialized:
+		return ErrAlreadyInitialized
+	default:
+	}
+	close(s.initialized)
+	return nil
+}
+
+// SendGrad sends gradient to parameter servers for parameter
+// optimization.
+func (s *Service) SendGrad(g Gradient, dummy *int) error {
+	select {
+	case <-s.initialized:
+	default:
+		return ErrUninitialized
+	}
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	p, ok := s.paramMap[g.Name]
+	if !ok {
+		return fmt.Errorf("parameter: %s does not exist", g.Name)
+	}
+
+	return s.opt.UpdateParameter(p, g)
+}
+
+// GetParam gets a parameter from the parameter server, blocking until initialization has finished.
+func (s *Service) GetParam(name string, parameter *Parameter) error {
+	<-s.initialized
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	p, ok := s.paramMap[name]
+	if !ok {
+		return fmt.Errorf("parameter: %s does not exist", name)
+	}
+
+	// The parameter content (a byte slice) may change
+	// during RPC serialization due to writes from other
+	// goroutines; we allow it since mini-batch based deep
+	// learning optimization methods are stochastic in
+	// nature. This race condition is allowed deliberately
+	// to save the program from making a copy of the
+	// parameter content.
+	*parameter = p
+	return nil
+}
+
+// Save tells the parameter server to save parameters.
+func (s *Service) Save(path string, dummy *int) error {
+	<-s.initialized
+
+	// TODO
+	return nil
+}
diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go
new file mode 100644
index 0000000000000000000000000000000000000000..4c9fac4536e09013916aadb26af3a86a5a775b4f
--- /dev/null
+++ b/go/pserver/service_test.go
@@ -0,0 +1,149 @@
+package pserver_test
+
+import (
+	"reflect"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/PaddlePaddle/Paddle/go/pserver"
+)
+
+func TestFull(t *testing.T) {
+	s := pserver.NewService()
+	var p pserver.Parameter
+	p.Name = "param_a"
+	p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
+	p.ElementType = pserver.Int32
+	var dummy int
+	err := s.InitParam(pserver.ParameterWithConfig{p, nil}, &dummy)
+	if err != nil {
+		t.FailNow()
+	}
+
+	var p1 pserver.Parameter
+	p1.Name = "param_b"
+	p1.Content = []byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
+	p1.ElementType = pserver.Float32
+	err = s.InitParam(pserver.ParameterWithConfig{p1, nil}, &dummy)
+	if err != nil {
+		t.FailNow()
+	}
+
+	err = s.FinishInitParams(0, &dummy)
+	if err != nil {
+		t.FailNow()
+	}
+
+	var param pserver.Parameter
+	err = s.GetParam("param_b", &param)
+	if err != nil {
+		t.FailNow()
+	}
+
+	if !reflect.DeepEqual(param, p1) {
+		t.FailNow()
+	}
+
+	g1, g2 := pserver.Gradient(p1), pserver.Gradient(p)
+	err = s.SendGrad(g1, &dummy)
+	if err != nil {
+		t.FailNow()
+	}
+	err = s.SendGrad(g2, &dummy)
+
+	if err != nil {
+		t.FailNow()
+	}
+
+	var param1 pserver.Parameter
+	err = s.GetParam("param_a", &param1)
+	if err != nil {
+		t.FailNow()
+	}
+
+	// don't compare content, since it's already changed by
+	// gradient update.
+	param1.Content = nil
+	p.Content = nil
+
+	if !reflect.DeepEqual(param1, p) {
+		t.FailNow()
+	}
+}
+
+func TestMultipleInit(t *testing.T) {
+	s := pserver.NewService()
+	var dummy int
+	err := s.FinishInitParams(0, &dummy)
+	if err != nil {
+		t.FailNow()
+	}
+
+	err = s.FinishInitParams(0, &dummy)
+	if err != pserver.ErrAlreadyInitialized {
+		t.FailNow()
+	}
+}
+
+func TestUninitialized(t *testing.T) {
+	s := pserver.NewService()
+	var dummy int
+	err := s.SendGrad(pserver.Gradient{}, &dummy)
+	if err != pserver.ErrUninitialized {
+		t.FailNow()
+	}
+}
+
+func TestBlockUntilInitialized(t *testing.T) {
+	s := pserver.NewService()
+	ch := make(chan struct{}, 2)
+	var wg sync.WaitGroup
+	wg.Add(1)
+	go func() {
+		var param pserver.Parameter
+		err := s.GetParam("param_a", &param)
+		if err != nil {
+			t.Error(err) // FailNow must only be called from the test goroutine
+		}
+		wg.Done()
+		ch <- struct{}{}
+	}()
+
+	wg.Add(1)
+	go func() {
+		var dummy int
+		err := s.Save("", &dummy)
+		if err != nil {
+			t.Error(err) // FailNow here would also skip wg.Done and hang wg.Wait
+		}
+		wg.Done()
+		ch <- struct{}{}
+	}()
+
+	time.Sleep(50 * time.Millisecond)
+
+	select {
+	case <-ch:
+		// some function returned before initialization is completed.
+		t.FailNow()
+	default:
+	}
+
+	var p pserver.Parameter
+	p.Name = "param_a"
+	p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
+	p.ElementType = pserver.Int32
+	var dummy int
+	err := s.InitParam(pserver.ParameterWithConfig{Param: p}, &dummy)
+	if err != nil {
+		t.FailNow()
+	}
+
+	err = s.FinishInitParams(0, &dummy)
+	if err != nil {
+		t.FailNow()
+	}
+
+	wg.Wait()
+}
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index c6fd9cc54ae3a671c5bdcf54cbaa873c59280694..9898dc083ebb1783a0e2ddd12afaa9c3d5a79e98 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -9,7 +9,12 @@ add_subdirectory(pserver)
 add_subdirectory(trainer)
 add_subdirectory(scripts)
 
-find_package(boost QUIET)
+# Do not build go directory until go cmake is working smoothly.
+# if(CMAKE_Go_COMPILER)
+#   add_subdirectory(go)
+# endif()
+
+find_package(Boost QUIET)
 
 if(Boost_FOUND)
   include_directories(${Boost_INCLUDE_DIRS})
diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp
index d49b189e253f7a0792fe3f1fe7c8fdbb7071acd4..c6f9106912c475dda76b4c11e0793cc5a9f78d3f 100644
--- a/paddle/api/Arguments.cpp
+++ b/paddle/api/Arguments.cpp
@@ -151,4 +151,24 @@ int64_t Arguments::getBatchSize(size_t idx) const throw(RangeError) {
   return a.getBatchSize();
 }
 
+void Arguments::setSlotFrameHeight(size_t idx, size_t h) throw(RangeError) {
+  auto& a = m->getArg(idx);
+  a.setFrameHeight(h);
+}
+
+void Arguments::setSlotFrameWidth(size_t idx, size_t w) throw(RangeError) {
+  auto& a = m->getArg(idx);
+  a.setFrameWidth(w);
+}
+
+size_t Arguments::getSlotFrameHeight(size_t idx) const throw(RangeError) {
+  auto& a = m->getArg(idx);
+  return a.getFrameHeight();
+}
+
+size_t Arguments::getSlotFrameWidth(size_t idx) const throw(RangeError) {
+  auto& a = m->getArg(idx);
+  return a.getFrameWidth();
+}
+
 void* Arguments::getInternalArgumentsPtr() const { return &m->outputs; }
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index 1cec77c0cae6ffbf7a1ca22092e8e41a6f9f0fc5..e147659566dba6cfbfd677e3b616bdaa4a73485c 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -26,10 +26,7 @@ FILE(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py)
 SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON)
 
 SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR})
-SET(CMAKE_CXX_FLAGS "-std=c++11 -fPIC -Wall")
-IF(WITH_COVERAGE)
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
-ENDIF(WITH_COVERAGE)
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality -Wno-missing-field-initializers -Wno-self-assign")
 
 SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS
     paddle_parameter
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index d51204012171c9887acd5f578f913143182efe36..da0f157abd68c73c45f498cf9ef2726aac67c95b 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -454,6 +454,25 @@ public:
                                         IVector* vec) throw(RangeError);
   void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError);
 
+  /**
+   * Set the frame height of the idx-th Argument.
+   *
+   * @param idx The index of the Argument.
+   * @param h The height value.
+   */
+  void setSlotFrameHeight(size_t idx, size_t h) throw(RangeError);
+
+  /**
+   * Set the frame width of the idx-th Argument.
+   *
+   * @param idx The index of the Argument.
+   * @param w The width value.
+   */
+  void setSlotFrameWidth(size_t idx, size_t w) throw(RangeError);
+
+  size_t getSlotFrameHeight(size_t idx = 0) const throw(RangeError);
+  size_t getSlotFrameWidth(size_t idx = 0) const throw(RangeError);
+
   float sum() const;
 
 private:
diff --git a/paddle/api/test/testArguments.py b/paddle/api/test/testArguments.py
index 9fe44de94ea6ddb71d2dfbb2243fc86ede0d0531..4d40ffec9a030bf756a515266b2c33915fcc4e10 100644
--- a/paddle/api/test/testArguments.py
+++ b/paddle/api/test/testArguments.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from py_paddle import swig_paddle
+import numpy as np
 import unittest
 
 
@@ -36,6 +37,17 @@ class TestArguments(unittest.TestCase):
         np_arr = iv.toNumpyArrayInplace()
         self.assertEqual(np_arr.shape, (6, ))
 
+    def test_arguments_shape(self):
+        h, w = 4, 6
+        v = np.random.rand(2, h * w)
+        m = swig_paddle.Matrix.createDense(v.flatten(), 2, h * w)
+        args = swig_paddle.Arguments.createArguments(1)
+        args.setSlotValue(0, m)
+        args.setSlotFrameHeight(0, h)
+        args.setSlotFrameWidth(0, w)
+        self.assertEqual(args.getSlotFrameHeight(), h)
+        self.assertEqual(args.getSlotFrameWidth(), w)
+
 
 if __name__ == '__main__':
     swig_paddle.initPaddle("--use_gpu=0")
diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt
index 1b52a79cebb1210b09fc9f30282bfd799a35dcf9..206f512563466d40e9ad1db0ddb4753ffb6bf55a 100644
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -58,10 +58,16 @@ target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR}
 link_paddle_exe(paddle_capi_shared)
 
 # install library & headers.
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library} DESTINATION lib)
 install(FILES ${CAPI_HEADERS} DESTINATION include/paddle)
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle)
-install(TARGETS paddle_capi_shared DESTINATION lib)
+if(ANDROID)
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library}
+          DESTINATION lib/${ANDROID_ABI})
+  install(TARGETS paddle_capi_shared DESTINATION lib/${ANDROID_ABI})
+else(ANDROID)
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library} DESTINATION lib)
+  install(TARGETS paddle_capi_shared DESTINATION lib)
+endif(ANDROID)
 
 # this variable used for unittest
 set(PADDLE_CAPI_INC_PATH
diff --git a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh b/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
index 9c49a4bd2083794e98b099b25944bedec3d5a2ff..aaa24325514812eda33309660ba85c3ceece770e 100644
--- a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
@@ -17,10 +17,9 @@ limitations under the License. */
 
 #include <stdio.h>
 #include "hl_base.h"
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#include "hl_neon_matrix_kernel.cuh"
-#else
-#include "hl_sse_matrix_kernel.cuh"
+
+#ifndef __CUDA_ARCH__
+#include "hl_cpu_matrix_kernel_detail.cuh"
 #endif
 
 /**
@@ -114,35 +113,6 @@ void hl_cpu_apply_quaternary_op(Op op,
   }
 }
 
-template <class Agg, class Op, class Saver>
-void hl_matrix_row_op(Agg agg, Op op, Saver sv,
-                      int dimM, int dimN,
-                      real *dst, int ld,
-                      real *A, int lda) {
-  for (int i = 0; i < dimM; i++) {
-    real tmp = agg.init();
-    for (int j = 0; j < dimN; j++) {
-        tmp = agg(tmp, op(A[i * lda + j]));
-    }
-    dst[i*ld] = sv(dst[i*ld], tmp);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_matrix_row_op(Agg agg, Op op, Saver sv,
-                      int dimM, int dimN,
-                      real *dst, int ld,
-                      real *A, int lda,
-                      real *B, int ldb) {
-  for (int i = 0; i < dimM; i++) {
-    real tmp = agg.init();
-    for (int j = 0; j < dimN; j++) {
-        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
-    }
-    dst[i*ld] = sv(dst[i*ld], tmp);
-  }
-}
-
 template <class Agg, class Op, class Saver>
 void hl_cpu_matrix_row_op(Agg agg, Op op, Saver sv,
                           int dimM, int dimN,
diff --git a/paddle/cuda/include/hl_sse_matrix_kernel.cuh b/paddle/cuda/include/hl_cpu_matrix_kernel_detail.cuh
similarity index 89%
rename from paddle/cuda/include/hl_sse_matrix_kernel.cuh
rename to paddle/cuda/include/hl_cpu_matrix_kernel_detail.cuh
index 9e50580669d2d4523dda239e90b4ed18a9214e2f..85ca836fdc46682195ac29a1ebf2237c28fc3311 100644
--- a/paddle/cuda/include/hl_sse_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_cpu_matrix_kernel_detail.cuh
@@ -13,26 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 
-#ifndef HL_SSE_MATRIX_KERNEL_CUH_
-#define HL_SSE_MATRIX_KERNEL_CUH_
+#ifndef HL_MATRIX_KERNEL_DETAIL_CUH_
+#define HL_MATRIX_KERNEL_DETAIL_CUH_
 
 #include "hl_matrix_type.cuh"
 
-#define VECTOR_SIZE     16
-
-#ifndef PADDLE_TYPE_DOUBLE
-/* number of float in vector */
-#define     VECTOR_LEN      4
-#define     VECTOR_SET      _mm_set_ps1
-#else
-#if   defined(__APPLE__) || defined(__OSX__)
-#define     _mm_set_pd1     _mm_set1_pd
-#endif
-/* number of double in vector */
-#define     VECTOR_LEN      2
-#define     VECTOR_SET      _mm_set_pd1
-#endif
-
 inline bool hl_check_align(size_t size) {
   return !(size & (VECTOR_SIZE - 1));
 }
@@ -41,27 +26,63 @@ inline bool hl_check_align(void *ptr) {
   return hl_check_align(reinterpret_cast<size_t>(ptr));
 }
 
-#ifndef PADDLE_TYPE_DOUBLE
-template <class Agg>
-inline real hl_agg_op(Agg agg, vecType mm) {
-  __m128 lo = _mm_unpacklo_ps(mm, mm);
-  __m128 hi = _mm_unpackhi_ps(mm, mm);
-  __m128 tmp1 = agg.vecOp(lo, hi);
-  __m128 tmp2 = _mm_movehl_ps(tmp1, tmp1);
-  __m128 ret = agg.vecOp(tmp1, tmp2);
+template <class Agg, class Op, class Saver>
+void hl_matrix_row_op(Agg agg, Op op, Saver sv,
+                      int dimM, int dimN,
+                      real *dst, int ld,
+                      real *A, int lda) {
+  for (int i = 0; i < dimM; i++) {
+    real tmp = agg.init();
+    for (int j = 0; j < dimN; j++) {
+        tmp = agg(tmp, op(A[i * lda + j]));
+    }
+    dst[i*ld] = sv(dst[i*ld], tmp);
+  }
+}
 
-  return _mm_cvtss_f32(ret);
+template <class Agg, class Op, class Saver>
+void hl_matrix_row_op(Agg agg, Op op, Saver sv,
+                      int dimM, int dimN,
+                      real *dst, int ld,
+                      real *A, int lda,
+                      real *B, int ldb) {
+  for (int i = 0; i < dimM; i++) {
+    real tmp = agg.init();
+    for (int j = 0; j < dimN; j++) {
+        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
+    }
+    dst[i*ld] = sv(dst[i*ld], tmp);
+  }
 }
-#else
-template <class Agg>
-inline real hl_agg_op(Agg agg, vecType mm) {
-  __m128d lo = _mm_unpacklo_pd(mm, mm);
-  __m128d hi = _mm_unpackhi_pd(mm, mm);
-  __m128d ret = agg.vecOp(lo, hi);
-
-  return _mm_cvtsd_f64(ret);
+
+template <class Agg, class Op, class Saver>
+void hl_matrix_column_op(Agg agg, Op op, Saver sv,
+                         int dimM, int dimN,
+                         real *dst,
+                         real *A, int lda) {
+  for (int j = 0; j < dimN; j++) {
+    real tmp = agg.init();
+    for (int i = 0; i < dimM; i++) {
+        tmp = agg(tmp, op(A[i * lda + j]));
+    }
+    dst[j] = sv(dst[j], tmp);
+  }
+}
+
+template <class Agg, class Op, class Saver>
+void hl_matrix_column_op(Agg agg, Op op, Saver sv,
+                         int dimM, int dimN,
+                         real *dst,
+                         real *A, int lda,
+                         real *B, int ldb) {
+  for (int j = 0; j < dimN; j++) {
+    real tmp = agg.init();
+    for (int i = 0; i < dimM; i++) {
+        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
+    }
+    dst[j] = sv(dst[j], tmp);
+  }
 }
-#endif
 
 template <class Agg, class Op, class Saver>
 void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
@@ -118,35 +139,6 @@ void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
   }
 }
 
-template <class Agg, class Op, class Saver>
-void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-                         int dimM, int dimN,
-                         real *dst,
-                         real *A, int lda) {
-  for (int j = 0; j < dimN; j++) {
-    real tmp = agg.init();
-    for (int i = 0; i < dimM; i++) {
-        tmp = agg(tmp, op(A[i * lda + j]));
-    }
-    dst[j] = sv(dst[j], tmp);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-                         int dimM, int dimN,
-                         real *dst,
-                         real *A, int lda,
-                         real *B, int ldb) {
-  for (int j = 0; j < dimN; j++) {
-    real tmp = agg.init();
-    for (int i = 0; i < dimM; i++) {
-        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
-    }
-    dst[j] = sv(dst[j], tmp);
-  }
-}
-
 /*
  * MaxRow greater than or equal dimN
  * dimN is multiples of VECTOR_LEN
@@ -315,4 +307,4 @@ void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
   }
 }
 
-#endif /* HL_SSE_MATRIX_KERNEL_CUH_ */
+#endif /* HL_MATRIX_KERNEL_DETAIL_CUH_ */
diff --git a/paddle/cuda/include/hl_cpu_scalar.cuh b/paddle/cuda/include/hl_cpu_scalar.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..93043cd4bc030ef525d5bcf8d83196f2ce92eec6
--- /dev/null
+++ b/paddle/cuda/include/hl_cpu_scalar.cuh
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef HL_CPU_SCALAR_CUH_
+#define HL_CPU_SCALAR_CUH_
+
+#define VECTOR_SIMD false
+#define VECTOR_SET  hl_vec_set
+
+#ifndef PADDLE_TYPE_DOUBLE
+/* size of float */
+#define VECTOR_SIZE 4
+#else
+/* size of double */
+#define VECTOR_SIZE 8
+#endif
+
+typedef real vecType;
+
+/* Consider a real as a vector */
+#define VECTOR_LEN  1
+
+template <class Agg>
+inline real hl_agg_op(Agg agg, vecType mm) {
+  return mm;
+}
+
+INLINE real hl_vec_set(const real r) {
+  return r;
+}
+
+INLINE real hl_vec_classification_error(const real a,
+                                        const real b,
+                                        const real p,
+                                        const real r) {
+  return ((a > p) == (b > p)) ? 0.0f : 1.0f;
+}
+
+#endif  // HL_CPU_SCALAR_CUH_
diff --git a/paddle/cuda/include/hl_cpu_simd_neon.cuh b/paddle/cuda/include/hl_cpu_simd_neon.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..0b1cf4abdc4d5ef2a640c75587308f7f082b854b
--- /dev/null
+++ b/paddle/cuda/include/hl_cpu_simd_neon.cuh
@@ -0,0 +1,73 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef HL_CPU_SIMD_NEON_CUH_
+#define HL_CPU_SIMD_NEON_CUH_
+
+#include <arm_neon.h>
+
+#define VECTOR_SIMD true
+#define VECTOR_SIZE 16
+#define VECTOR_SET  hl_vec_set
+
+#ifndef PADDLE_TYPE_DOUBLE
+
+typedef float32x4_t vecType;
+
+/* number of float in vector */
+#define VECTOR_LEN  4
+
+template <class Agg>
+inline real hl_agg_op(Agg agg, vecType mm) {
+  float32x4_t rev = vrev64q_f32(mm);        // [m1, m0, m3, m2]
+  float32x4_t tmp1 = agg.vecOp(mm, rev);    // lanes 0,1: m0 op m1; lanes 2,3: m2 op m3
+  float32x2_t hi = vget_high_f32(tmp1);
+  float32x2_t lo = vget_low_f32(tmp1);
+  float32x4_t tmp2 = vcombine_f32(hi, lo);  // tmp1 with its two halves swapped
+  float32x4_t ret = agg.vecOp(tmp1, tmp2);  // every lane now holds the full reduction
+
+  return vgetq_lane_f32(ret, 0);
+}
+
+inline float32x4_t hl_vec_set(const real f) {
+  return vdupq_n_f32(f);
+}
+
+inline float32x4_t hl_vec_classification_error(const float32x4_t a,
+                                               const float32x4_t b,
+                                               const float32x4_t p,
+                                               const float32x4_t r) {
+  uint32x4_t tmp1 = vcgtq_f32(a, p);
+  uint32x4_t tmp2 = vcgtq_f32(b, p);
+  uint32x4_t tmp3 = veorq_u32(tmp1, tmp2);
+  return vcvtq_f32_u32(vandq_u32(tmp3, vcvtq_u32_f32(r)));
+}
+
+#else
+
+#ifdef __aarch64__
+typedef float64x2_t vecType;
+
+/* number of double in vector */
+#define VECTOR_LEN  2
+#define VECTOR_SET  vdupq_n_f64
+
+#error To be implemented
+#else
+#error NEON instructions does not support double precision
+#endif  // __aarch64__
+
+#endif
+
+#endif  // HL_CPU_SIMD_NEON_CUH_
diff --git a/paddle/cuda/include/hl_cpu_simd_sse.cuh b/paddle/cuda/include/hl_cpu_simd_sse.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..a104b626220f473324fc3c99e7cd305c3e86f3db
--- /dev/null
+++ b/paddle/cuda/include/hl_cpu_simd_sse.cuh
@@ -0,0 +1,94 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef HL_CPU_SIMD_SSE_CUH_
+#define HL_CPU_SIMD_SSE_CUH_
+
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#define VECTOR_SIMD true
+#define VECTOR_SIZE 16
+#define VECTOR_SET  hl_vec_set
+
+#ifndef PADDLE_TYPE_DOUBLE
+
+typedef __m128  vecType;
+
+/* number of float in vector */
+#define VECTOR_LEN  4
+
+template <class Agg>
+inline real hl_agg_op(Agg agg, vecType mm) {
+  __m128 lo = _mm_unpacklo_ps(mm, mm);
+  __m128 hi = _mm_unpackhi_ps(mm, mm);
+  __m128 tmp1 = agg.vecOp(lo, hi);
+  __m128 tmp2 = _mm_movehl_ps(tmp1, tmp1);
+  __m128 ret = agg.vecOp(tmp1, tmp2);
+
+  return _mm_cvtss_f32(ret);
+}
+
+inline __m128 hl_vec_set(const real f) {
+  return _mm_set_ps1(f);
+}
+
+inline __m128 hl_vec_classification_error(const __m128 a,
+                                          const __m128 b,
+                                          const __m128 p,
+                                          const __m128 r) {
+  __m128 tmp1 = _mm_cmpgt_ps(a, p);
+  __m128 tmp2 = _mm_cmpgt_ps(b, p);
+  __m128 tmp3 = _mm_xor_ps(tmp1, tmp2);
+  return _mm_and_ps(tmp3, r);
+}
+
+#else
+
+typedef __m128d vecType;
+
+/* number of double in vector */
+#define VECTOR_LEN  2
+
+template <class Agg>
+inline real hl_agg_op(Agg agg, vecType mm) {
+  __m128d lo = _mm_unpacklo_pd(mm, mm);
+  __m128d hi = _mm_unpackhi_pd(mm, mm);
+  __m128d ret = agg.vecOp(lo, hi);
+
+  return _mm_cvtsd_f64(ret);
+}
+
+inline __m128d hl_vec_set(const real d) {
+#if defined(__APPLE__) || defined(__OSX__)
+  return _mm_set1_pd(d);
+#else
+  return _mm_set_pd1(d);
+#endif
+}
+
+inline __m128d hl_vec_classification_error(const __m128d a,
+                                           const __m128d b,
+                                           const __m128d p,
+                                           const __m128d r) {
+  __m128d tmp1 = _mm_cmpgt_pd(a, p);
+  __m128d tmp2 = _mm_cmpgt_pd(b, p);
+  __m128d tmp3 = _mm_xor_pd(tmp1, tmp2);
+  return _mm_and_pd(tmp3, r);
+}
+
+#endif
+
+#endif  // HL_CPU_SIMD_SSE_CUH_
diff --git a/paddle/cuda/include/hl_matrix_base.cuh b/paddle/cuda/include/hl_matrix_base.cuh
index 8b755c1095c2c4fdb7e74d8cddc948e6a6af380b..53fdb47ec9c05f5cf85d0956176ad9abf6d656f9 100644
--- a/paddle/cuda/include/hl_matrix_base.cuh
+++ b/paddle/cuda/include/hl_matrix_base.cuh
@@ -18,26 +18,6 @@ limitations under the License. */
 
 #include "hl_matrix_type.cuh"
 
-#ifdef __CUDA_ARCH__
-/**
- * CUDA kernel inline function
- */
-#define INLINE   __device__ inline
-#else
-/**
- * CPP inline function
- */
-#define INLINE   inline
-#endif
-
-#ifndef PADDLE_TYPE_DOUBLE
-#define     DEVICE_FMAX     fmaxf
-#define     DEVICE_FMIN     fminf
-#else
-#define     DEVICE_FMAX     fmax
-#define     DEVICE_FMIN     fmin
-#endif
-
 class BaseOp {
 public:
   static const bool sse = false;
@@ -66,10 +46,8 @@ typedef BaseOp SSESquaredDiff;
 typedef BaseOp SSEFirst;
 typedef BaseOp SSESecond;
 typedef BaseOp SSEClassificationError;
-#elif defined(__ARM__NEON__) || defined(__ARM_NEON)
-#include "hl_matrix_base_neon.cuh"
 #else
-#include "hl_matrix_base_sse.cuh"
+#include "hl_matrix_base_detail.cuh"
 #endif
 
 namespace aggregate {
@@ -124,7 +102,7 @@ public:
   add2(const real s1, const real s2)
     : SSEAdd2(s1, s2), p1(s1), p2(s2) {}
   INLINE real operator()(const real a, const real b) const {
-     return p1 * a + p2 * b;
+    return p1 * a + p2 * b;
   }
 };
 
diff --git a/paddle/cuda/include/hl_matrix_base_detail.cuh b/paddle/cuda/include/hl_matrix_base_detail.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..de1fd17d524a486cc15af721731d2e815f17263e
--- /dev/null
+++ b/paddle/cuda/include/hl_matrix_base_detail.cuh
@@ -0,0 +1,153 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef HL_MATRIX_BASE_DETAIL_CUH_
+#define HL_MATRIX_BASE_DETAIL_CUH_
+
+#include "hl_matrix_type.cuh"
+#include "hl_tensor_ops.h"
+
+namespace aggregate {
+class SSESum {
+public:
+  static const bool sse = VECTOR_SIMD;
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
+    return hppl::binary::add<vecType>()(a, b);
+  }
+};
+
+class SSEMax {
+public:
+  static const bool sse = VECTOR_SIMD;
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
+    return hppl::binary::max<vecType>()(a, b);
+  }
+};
+
+class SSEMin {
+public:
+  static const bool sse = VECTOR_SIMD;
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
+    return hppl::binary::min<vecType>()(a, b);
+  }
+};
+}  // namespace aggregate
+
+namespace base {
+namespace unary {
+class SSEIdentity {
+public:
+  static const bool sse = VECTOR_SIMD;
+  INLINE vecType vecOp(const vecType a) const {
+    return a;
+  }
+};
+}  // namespace unary
+
+namespace binary {
+class SSEAdd {
+public:
+  static const bool sse = VECTOR_SIMD;
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
+    return hppl::binary::add<vecType>()(a, b);
+  }
+};
+
+class SSEAdd2 {
+public:
+  static const bool sse = VECTOR_SIMD;
+  const real p1;
+  const real p2;
+  vecType mp1;
+  vecType mp2;
+
+public:
+  SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) {
+    mp1 = hl_vec_set(p1);
+    mp2 = hl_vec_set(p2);
+  }
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
+    return hppl::binary::add_scale<vecType>(mp1, mp2)(a, b);
+  }
+};
+
+class SSESub {
+public:
+  static const bool sse = VECTOR_SIMD;
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
+    return hppl::binary::sub<vecType>()(a, b);
+  }
+};
+
+class SSEMul {
+public:
+  static const bool sse = VECTOR_SIMD;
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
+    return hppl::binary::mul<vecType>()(a, b);
+  }
+};
+
+class SSEDiv {
+public:
+  static const bool sse = VECTOR_SIMD;
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
+    return hppl::binary::div<vecType>()(a, b);
+  }
+};
+
+class SSESquaredDiff {
+public:
+  static const bool sse = VECTOR_SIMD;
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
+    vecType tmp = hppl::binary::sub<vecType>()(a, b);
+    return hppl::binary::mul<vecType>()(tmp, tmp);
+  }
+};
+
+class SSEFirst {
+public:
+  static const bool sse = VECTOR_SIMD;
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
+    return a;
+  }
+};
+
+class SSESecond {
+public:
+  static const bool sse = VECTOR_SIMD;
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
+    return b;
+  }
+};
+
+class SSEClassificationError {
+public:
+  static const bool sse = VECTOR_SIMD;
+  const real p;
+  vecType mp;
+  vecType result;
+
+public:
+  explicit SSEClassificationError(const real s) : p(s) {
+    mp = hl_vec_set(p);
+    result = hl_vec_set(1.0f);
+  }
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
+    return hl_vec_classification_error(a, b, mp, result);
+  }
+};
+}  // namespace binary
+}  // namespace base
+
+#endif /* HL_MATRIX_BASE_DETAIL_CUH_ */
diff --git a/paddle/cuda/include/hl_matrix_base_neon.cuh b/paddle/cuda/include/hl_matrix_base_neon.cuh
deleted file mode 100644
index e13019f5ee24ad600005c99678426ee3808b0e54..0000000000000000000000000000000000000000
--- a/paddle/cuda/include/hl_matrix_base_neon.cuh
+++ /dev/null
@@ -1,161 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_MATRIX_BASE_NEON_CUH_
-#define HL_MATRIX_BASE_NEON_CUH_
-
-namespace aggregate {
-class SSESum {
-public:
-  static const bool sse = true;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
-    return vaddq_f32(a, b);
-  }
-};
-
-class SSEMax {
-public:
-  static const bool sse = true;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
-    return vmaxq_f32(a, b);
-  }
-};
-
-class SSEMin {
-public:
-  static const bool sse = true;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
-    return vminq_f32(a, b);
-  }
-};
-}  // namespace aggregate
-
-namespace base {
-namespace unary {
-class SSEIdentity {
-public:
-  static const bool sse = true;
-  INLINE float32x4_t vecOp(const float32x4_t a) const {
-    return a;
-  }
-};
-}  // namespace unary
-
-namespace binary {
-class SSEAdd {
-public:
-  static const bool sse = true;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
-    return vaddq_f32(a, b);
-  }
-};
-
-class SSEAdd2 {
-public:
-  static const bool sse = true;
-  const real p1;
-  const real p2;
-  float32x4_t mp1;
-  float32x4_t mp2;
-
-public:
-  SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) {
-    mp1 = vdupq_n_f32(p1);
-    mp2 = vdupq_n_f32(p2);
-  }
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
-    float32x4_t tmp1, tmp2;
-    tmp1 = vmulq_f32(mp1, a);
-    tmp2 = vmulq_f32(mp2, b);
-    return vaddq_f32(tmp1, tmp2);
-  }
-};
-
-class SSESub {
-public:
-  static const bool sse = true;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
-    return vsubq_f32(a, b);
-  }
-};
-
-class SSEMul {
-public:
-  static const bool sse = true;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
-    return vmulq_f32(a, b);
-  }
-};
-
-class SSEDiv {
-public:
-  static const bool sse = true;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
-    float32x4_t tmp;
-    tmp = vrecpeq_f32(b);
-    return vmulq_f32(a, tmp);
-  }
-};
-
-class SSESquaredDiff {
-public:
-  static const bool sse = true;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
-    float32x4_t tmp;
-    tmp = vsubq_f32(a, b);
-    return vmulq_f32(tmp, tmp);
-  }
-};
-
-class SSEFirst {
-public:
-  static const bool sse = true;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
-    return a;
-  }
-};
-
-class SSESecond {
-public:
-  static const bool sse = true;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
-    return b;
-  }
-};
-
-class SSEClassificationError {
-public:
-  static const bool sse = true;
-  const real p;
-  float32x4_t mp;
-  uint32x4_t result;
-
-public:
-  explicit SSEClassificationError(const real s) : p(s) {
-    mp = vdupq_n_f32(p);
-    result = vdupq_n_u32(1);
-  }
-  // TODO: to be check
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
-    uint32x4_t tmp1 = vcgtq_f32(a, mp);
-    uint32x4_t tmp2 = vcgtq_f32(b, mp);
-    uint32x4_t tmp3 = veorq_u32(tmp1, tmp2);
-    return vcvtq_f32_u32(vandq_u32(tmp3, result));
-  }
-};
-}  // namespace binary
-}  // namespace base
-
-#endif /* HL_MATRIX_BASE_NEON_CUH_ */
diff --git a/paddle/cuda/include/hl_matrix_base_sse.cuh b/paddle/cuda/include/hl_matrix_base_sse.cuh
deleted file mode 100644
index db6c9cca03a8974a15cd2e7fbaf73033e3a57f4b..0000000000000000000000000000000000000000
--- a/paddle/cuda/include/hl_matrix_base_sse.cuh
+++ /dev/null
@@ -1,211 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_MATRIX_BASE_SSE_CUH_
-#define HL_MATRIX_BASE_SSE_CUH_
-
-namespace aggregate {
-class SSESum {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_add_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_add_pd(a, b);
-  }
-};
-
-class SSEMax {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_max_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_max_pd(a, b);
-  }
-};
-
-class SSEMin {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_min_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_min_pd(a, b);
-  }
-};
-}  // namespace aggregate
-
-namespace base {
-namespace unary {
-class SSEIdentity {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a) const {
-    return a;
-  }
-  INLINE __m128d vecOp(const __m128d a) const {
-    return a;
-  }
-};
-}  // namespace unary
-
-namespace binary {
-class SSEAdd {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_add_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_add_pd(a, b);
-  }
-};
-
-class SSEAdd2 {
-public:
-  static const bool sse = true;
-  const real p1;
-  const real p2;
-  union {__m128 f; __m128d d;} mp1;
-  union {__m128 f; __m128d d;} mp2;
-
-public:
-  SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) {
-    if (sizeof(real) == sizeof(float)) {
-      mp1.f = _mm_set1_ps(p1);
-      mp2.f = _mm_set1_ps(p2);
-    } else {
-      mp1.d = _mm_set1_pd(p1);
-      mp2.d = _mm_set1_pd(p2);
-    }
-  }
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    __m128 tmp1, tmp2;
-    tmp1 = _mm_mul_ps(mp1.f, a);
-    tmp2 = _mm_mul_ps(mp2.f, b);
-    return _mm_add_ps(tmp1, tmp2);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    __m128d tmp1, tmp2;
-    tmp1 = _mm_mul_pd(mp1.d, a);
-    tmp2 = _mm_mul_pd(mp2.d, b);
-    return _mm_add_pd(tmp1, tmp2);
-  }
-};
-
-class SSESub {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_sub_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_sub_pd(a, b);
-  }
-};
-
-class SSEMul {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_mul_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_mul_pd(a, b);
-  }
-};
-
-class SSEDiv {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_div_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_div_pd(a, b);
-  }
-};
-
-class SSESquaredDiff {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_mul_ps(_mm_sub_ps(a, b), _mm_sub_ps(a, b));
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_mul_pd(_mm_sub_pd(a, b), _mm_sub_pd(a, b));
-  }
-};
-
-class SSEFirst {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return a;
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return a;
-  }
-};
-
-class SSESecond {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return b;
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return b;
-  }
-};
-
-class SSEClassificationError {
-public:
-  static const bool sse = true;
-  const real p;
-  union {__m128 f; __m128d d;} mp;
-  union {__m128 f; __m128d d;} result;
-
-public:
-  explicit SSEClassificationError(const real s) : p(s) {
-    if (sizeof(real) == sizeof(float)) {
-      mp.f = _mm_set1_ps(p);
-      result.f = _mm_set1_ps(1.0f);
-    } else {
-      mp.d = _mm_set1_pd(p);
-      result.d = _mm_set1_pd(1.0);
-    }
-  }
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    __m128 tmp1 = _mm_cmpgt_ps(a, mp.f);
-    __m128 tmp2 = _mm_cmpgt_ps(b, mp.f);
-    __m128 tmp3 = _mm_xor_ps(tmp1, tmp2);
-    return _mm_and_ps(tmp3, result.f);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    __m128d tmp1 = _mm_cmpgt_pd(a, mp.d);
-    __m128d tmp2 = _mm_cmpgt_pd(b, mp.d);
-    __m128d tmp3 = _mm_xor_pd(tmp1, tmp2);
-    return _mm_and_pd(tmp3, result.d);
-  }
-};
-}  // namespace binary
-}  // namespace base
-
-#endif /* HL_MATRIX_BASE_SSE_CUH_ */
diff --git a/paddle/cuda/include/hl_matrix_type.cuh b/paddle/cuda/include/hl_matrix_type.cuh
index f965ba966793f6f6eea0ad3606f60553fe904dda..e18235219bd9f78dd87a92d448cb290d9a4904a1 100644
--- a/paddle/cuda/include/hl_matrix_type.cuh
+++ b/paddle/cuda/include/hl_matrix_type.cuh
@@ -17,35 +17,35 @@ limitations under the License. */
 
 #include "hl_base.h"
 
-#if defined(__CUDA_ARCH__)
+#ifdef __CUDA_ARCH__
+/**
+ * CUDA kernel inline function
+ */
+#define INLINE   __device__ inline
+#else
+/**
+ * CPP inline function
+ */
+#define INLINE   inline
+#endif
+
+#ifdef __CUDA_ARCH__
 #include <vector_types.h>
 #ifndef PADDLE_TYPE_DOUBLE
 typedef float4 vecType;
 #else
 typedef double2 vecType;
 #endif
-#elif (defined  __ARM_NEON) || (defined __ARM_NEON__)
-#include <arm_neon.h>
-#ifndef PADDLE_TYPE_DOUBLE
-typedef float32x4_t  vecType;
-#else
-#error NEON instructions does not support double precision
-#endif
+#elif defined(__SSE3__)
+#include "hl_cpu_simd_sse.cuh"
+#define PADDLE_USE_SSE3
+#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && !defined(__NVCC__)
+// Currently nvcc does not support NEON intrinsics.
+// TODO: Extract the SIMD intrinsic implementations from .cu files.
+#include "hl_cpu_simd_neon.cuh"
+#define PADDLE_USE_NEON
 #else
-#include <mmintrin.h>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-#ifndef PADDLE_TYPE_DOUBLE
-typedef __m128  vecType;
-#else
-typedef __m128d vecType;
-#endif
-#endif
-
-#ifdef __CUDA_ARCH__
-#define INLINE   __device__ inline
-#else
-#define INLINE   inline
+#include "hl_cpu_scalar.cuh"
 #endif
 
 #endif  // HL_MATRIX_TYPE_CUH_
diff --git a/paddle/cuda/include/hl_neon_matrix_kernel.cuh b/paddle/cuda/include/hl_neon_matrix_kernel.cuh
deleted file mode 100644
index 7b4e5b00079b66d0a46a1344a43f41962cf50f10..0000000000000000000000000000000000000000
--- a/paddle/cuda/include/hl_neon_matrix_kernel.cuh
+++ /dev/null
@@ -1,299 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_NEON_MATRIX_KERNEL_CUH_
-#define HL_NEON_MATRIX_KERNEL_CUH_
-
-#include "hl_matrix_type.cuh"
-
-#define VECTOR_SIZE     16
-
-/* number of float in vector */
-#define     VECTOR_LEN      4
-#define     VECTOR_SET      vdupq_n_f32
-
-inline bool hl_check_align(size_t size) {
-  return !(size & (VECTOR_SIZE - 1));
-}
-
-inline bool hl_check_align(void *ptr) {
-  return hl_check_align(reinterpret_cast<size_t>(ptr));
-}
-
-template <class Agg>
-inline real hl_agg_op(Agg agg, vecType mm) {
-  float32x4_t rev = vrev64q_f32(mm);
-  float32x4_t tmp1 = agg.vecOp(rev, rev);
-  float32x2_t lo = vget_high_f32(rev);
-  float32x2_t hi = vget_low_f32(rev);
-  float32x4_t tmp2 = vcombine_f32(hi, lo);
-  float32x4_t ret = agg.vecOp(tmp1, tmp2);
-
-  return vgetq_lane_f32(ret, 0);
-}
-
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda) {
-  for (int i = 0; i < dimM; i++, A += lda) {
-    vecType mm = VECTOR_SET(agg.init());
-    vecType *a = (vecType*)(A);
-    for (int j = 0; j < dimN / VECTOR_LEN; j++, a++) {
-      mm = agg.vecOp(mm, op.vecOp(*a));
-    }
-
-    int rem = dimN % VECTOR_LEN;
-    if (rem) {
-      real tmp = hl_agg_op(agg, mm);
-      real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
-      for (int j = 0; j < rem; j++) {
-        tmp = agg(tmp, op(a[j]));
-      }
-      dst[i*ld] = sv(dst[i*ld], tmp);
-    } else {
-      dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
-    }
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda,
-                          real *B, int ldb) {
-  for (int i = 0; i < dimM; i++, A += lda, B += ldb) {
-    vecType mm = VECTOR_SET(agg.init());
-    vecType *a = (vecType*)(A);
-    vecType *b = (vecType*)(B);
-    for (int j = 0; j < dimN / VECTOR_LEN; j++, a++, b++) {
-        mm = agg.vecOp(mm, op.vecOp(*a, *b));
-    }
-
-    int rem = dimN % VECTOR_LEN;
-    if (rem) {
-      real tmp = hl_agg_op(agg, mm);
-      real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
-      real *b = B + (dimN / VECTOR_LEN) * VECTOR_LEN;
-      for (int j = 0; j < rem; j++) {
-          tmp = agg(tmp, op(a[j], b[j]));
-      }
-      dst[i*ld] = sv(dst[i*ld], tmp);
-    } else {
-        dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
-    }
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-                         int dimM, int dimN,
-                         real *dst,
-                         real *A, int lda) {
-  for (int j = 0; j < dimN; j++) {
-    real tmp = agg.init();
-    for (int i = 0; i < dimM; i++) {
-        tmp = agg(tmp, op(A[i * lda + j]));
-    }
-    dst[j] = sv(dst[j], tmp);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-                         int dimM, int dimN,
-                         real *dst,
-                         real *A, int lda,
-                         real *B, int ldb) {
-  for (int j = 0; j < dimN; j++) {
-    real tmp = agg.init();
-    for (int i = 0; i < dimM; i++) {
-        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
-    }
-    dst[j] = sv(dst[j], tmp);
-  }
-}
-
-/*
- * MaxRow greater than or equal dimN
- * dimN is multiples of VECTOR_LEN
- * so rem <= MaxRow / VECTOR_LEN
- */
-template <int MaxRow, class Agg, class Op, class Saver>
-void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
-                               int dimM, int dimN,
-                               real *dst,
-                               real *A, int lda) {
-  vecType mm[MaxRow / VECTOR_LEN];
-  for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
-    mm[n] = VECTOR_SET(agg.init());
-  }
-
-  for (int i = 0; i < dimM; i++) {
-    vecType *a = (vecType*)(A + i * lda);
-    for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-      mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
-    }
-  }
-
-  vecType *result = (vecType*)(dst);
-  for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-    result[n] = sv.vecOp(result[n], mm[n]);
-  }
-
-  int rem = dimN % VECTOR_LEN;
-  if (rem) {
-    A += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda);
-  }
-}
-
-/*
- * dimN is multiples of VECTOR_LEN
- * dimN greater than Step
- */
-template <int Step, class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda) {
-  for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step) {
-    vecType mm[Step / VECTOR_LEN];
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      mm[n] = VECTOR_SET(agg.init());
-    }
-
-    for (int i = 0; i < dimM; i++) {
-      vecType *a = (vecType*)(A + i * lda);
-      for (int n = 0; n < Step / VECTOR_LEN; n++) {
-        mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
-      }
-    }
-
-    vecType *result = (vecType*)(dst);
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      result[n] = sv.vecOp(result[n], mm[n]);
-    }
-  }
-
-  int remRow = dimN % Step;
-  if (remRow) {
-    hl_sse_column_op_with_rem<Step>(agg, op, sv, dimM, remRow, dst, A, lda);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda) {
-  if (dimN <= 16) {
-    hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda);
-  } else if (dimN <= 32) {
-    hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda);
-  } else if (dimN <= 1024 || dimM <= 512) {
-    hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda);
-  } else {
-    hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda);
-  }
-}
-
-template <int MaxRow, class Agg, class Op, class Saver>
-void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
-                               int dimM, int dimN,
-                               real *dst,
-                               real *A, int lda,
-                               real *B, int ldb) {
-  vecType mm[MaxRow / VECTOR_LEN];
-  for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
-    mm[n] = VECTOR_SET(agg.init());
-  }
-
-  for (int i = 0; i < dimM; i++) {
-    vecType *a = (vecType*)(A + i * lda);
-    vecType *b = (vecType*)(B + i * ldb);
-    for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-      mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
-    }
-  }
-
-  vecType *result = (vecType*)(dst);
-  for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-    result[n] = sv.vecOp(result[n], mm[n]);
-  }
-
-  int rem = dimN % VECTOR_LEN;
-  if (rem) {
-    A += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    B += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda, B, ldb);
-  }
-}
-
-template <int Step, class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda,
-                             real *B, int ldb) {
-  for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step, B += Step) {
-    vecType mm[Step / VECTOR_LEN];
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      mm[n] = VECTOR_SET(agg.init());
-    }
-
-    for (int i = 0; i < dimM; i++) {
-      vecType *a = (vecType*)(A + i * lda);
-      vecType *b = (vecType*)(B + i * ldb);
-      for (int n = 0; n < Step / VECTOR_LEN; n++) {
-        mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
-      }
-    }
-
-    vecType *result = (vecType*)(dst);
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      result[n] = sv.vecOp(result[n], mm[n]);
-    }
-  }
-
-  int remRow = dimN % Step;
-  if (remRow) {
-    hl_sse_column_op_with_rem<Step>(
-        agg, op, sv, dimM, remRow, dst, A, lda, B, ldb);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda,
-                             real *B, int ldb) {
-  if (dimN <= 16) {
-    hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else if (dimN <= 32) {
-    hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else if (dimN <= 1024 || dimM <= 512) {
-    hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else {
-    hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  }
-}
-
-#endif /* HL_NEON_MATRIX_KERNEL_CUH_ */
diff --git a/paddle/cuda/include/hl_tensor_ops.h b/paddle/cuda/include/hl_tensor_ops.h
index 7945b98201b1812790fb0d53123e9ee007640485..93d38b7d2299d994cde0934213668a525bffa80c 100644
--- a/paddle/cuda/include/hl_tensor_ops.h
+++ b/paddle/cuda/include/hl_tensor_ops.h
@@ -328,6 +328,208 @@ public:
   INLINE T operator()(const T a, const T b) const { return a < b ? b : a; }
 };
 
+#ifdef PADDLE_USE_SSE3
+#ifndef PADDLE_TYPE_DOUBLE
+template <>
+class add<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_add_ps(a, b);
+  }
+};
+
+template <>
+class add_scale<__m128> {
+private:
+  const __m128 p1;
+  const __m128 p2;
+
+public:
+  INLINE add_scale(const __m128 s1, const __m128 s2) : p1(s1), p2(s2) {}
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_add_ps(_mm_mul_ps(p1, a), _mm_mul_ps(p2, b));
+  }
+};
+
+template <>
+class sub<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_sub_ps(a, b);
+  }
+};
+
+template <>
+class mul<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_mul_ps(a, b);
+  }
+};
+
+template <>
+class div<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_div_ps(a, b);
+  }
+};
+
+template <>
+class min<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_min_ps(a, b);
+  }
+};
+
+template <>
+class max<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_max_ps(a, b);
+  }
+};
+#else
+template <>
+class add<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_add_pd(a, b);
+  }
+};
+
+template <>
+class add_scale<__m128d> {
+private:
+  const __m128d p1;
+  const __m128d p2;
+
+public:
+  INLINE add_scale(const __m128d s1, const __m128d s2) : p1(s1), p2(s2) {}
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_add_pd(_mm_mul_pd(p1, a), _mm_mul_pd(p2, b));
+  }
+};
+
+template <>
+class sub<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_sub_pd(a, b);
+  }
+};
+
+template <>
+class mul<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_mul_pd(a, b);
+  }
+};
+
+template <>
+class div<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_div_pd(a, b);
+  }
+};
+
+template <>
+class min<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_min_pd(a, b);
+  }
+};
+
+template <>
+class max<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_max_pd(a, b);
+  }
+};
+#endif  // PADDLE_TYPE_DOUBLE
+#endif  // PADDLE_USE_SSE3
+
+#ifdef PADDLE_USE_NEON
+#ifndef PADDLE_TYPE_DOUBLE
+template <>
+class add<float32x4_t> {  // NEON specialization: lane-wise float addition
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vaddq_f32(a, b);  // lane-wise sum, same as add<__m128>
+  }
+};
+
+template <>
+class add_scale<float32x4_t> {
+private:
+  const float32x4_t p1;
+  const float32x4_t p2;
+
+public:
+  INLINE add_scale(const float32x4_t s1, const float32x4_t s2)
+      : p1(s1), p2(s2) {}
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vaddq_f32(vmulq_f32(p1, a), vmulq_f32(p2, b));
+  }
+};
+
+template <>
+class sub<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vsubq_f32(a, b);
+  }
+};
+
+template <>
+class mul<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vmulq_f32(a, b);
+  }
+};
+
+template <>
+class div<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    float32x4_t tmp = vrecpeq_f32(b);  // reciprocal *estimate* of b
+    return vmulq_f32(a, tmp);  // NOTE(review): approximate a / b; confirm OK
+  }
+};
+
+template <>
+class min<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vminq_f32(a, b);
+  }
+};
+
+template <>
+class max<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vmaxq_f32(a, b);
+  }
+};
+#else
+#error To be implemented
+#endif  // PADDLE_TYPE_DOUBLE
+#endif  // PADDLE_USE_NEON
+
 }  // namespace binary
 }  // namespace hppl
 
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index c541b72e104bf2b81e2ac222d4af13ea2f90d289..a40530f41313be27dc1c2606501c6c00bed11c8b 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -396,6 +396,44 @@ Error __must_check backward(Argument& act) {
 }
 END_DEFINE_ACTIVATION(exponential)
 
+/**
+ * @brief Reciprocal Activation.
+ * \f[
+ * f(z) = 1/z
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(reciprocal)
+Error __must_check forward(Argument& act) {
+  act.value->reciprocal2();  // presumably in-place value = 1 / value
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  act.grad->dotMulSquare(*act.value);  // presumably grad *= value^2
+  act.grad->neg();  // sign flip: d(1/z)/dz = -1/z^2 = -(f(z))^2
+  return Error();
+}
+END_DEFINE_ACTIVATION(reciprocal)
+
+/**
+ * @brief Square Root Activation.
+ * \f[
+ * f(z) = sqrt(z)
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(sqrt)
+Error __must_check forward(Argument& act) {
+  act.value->sqrt2();  // presumably in-place value = sqrt(value)
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  act.grad->dotDiv(*act.grad, *act.value);  // presumably grad = grad / value
+  act.grad->mulScalar(0.5);  // d sqrt(z)/dz = 0.5 / sqrt(z) = 0.5 / f(z)
+  return Error();
+}
+END_DEFINE_ACTIVATION(sqrt)
+
 /**
  * @brief Logarithm Activation.
  * \f[
diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp
index 13f02e51fe9e3831103982130bfdaa3255e1d174..1658282f3a5f79b128ce8685e92fd5cf9db2e41a 100644
--- a/paddle/gserver/evaluators/ChunkEvaluator.cpp
+++ b/paddle/gserver/evaluators/ChunkEvaluator.cpp
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/math/Vector.h"
+#include "paddle/utils/StringUtil.h"
 
 #include "Evaluator.h"
 
@@ -74,6 +75,7 @@ class ChunkEvaluator : public Evaluator {
   std::vector<Segment> labelSegments_;
   std::vector<Segment> outputSegments_;
   std::set<int> excludedChunkTypes_;
+  mutable std::unordered_map<std::string, real> values_;
 
 public:
   virtual void init(const EvaluatorConfig& config) {
@@ -121,11 +123,9 @@ public:
   }
 
   virtual void printStats(std::ostream& os) const {
-    double precision = (double)numCorrect_ / numOutputSegments_;
-    double recall = (double)numCorrect_ / numLabelSegments_;
-    double f1 =
-        !numCorrect_ ? 0 : 2 * precision * recall / (precision + recall);
-    os << config_.name() << "=" << f1 << " true_chunks=" << numLabelSegments_
+    storeLocalValues();
+    os << config_.name() << "=" << values_["F1-score"]
+       << " true_chunks=" << numLabelSegments_
        << " result_chunks=" << numOutputSegments_
        << " correct_chunks=" << numCorrect_;
   }
@@ -243,6 +243,46 @@ public:
     if (tag == tagSingle_) return true;
     return false;
   }
+
+  // Appends "<evaluator name>.<metric>" for every key currently in values_.
+  void getNames(std::vector<std::string>* names) {
+    storeLocalValues();  // ensures precision / recall / F1-score keys exist
+    names->reserve(names->size() + values_.size());
+    for (auto it = values_.begin(); it != values_.end(); ++it) {
+      names->push_back(config_.name() + "." + it->first);
+    }
+  }
+
+  // get value by field name; only the text after the last '.' is the key
+  real getValue(const std::string& name, Error* err) const {
+    storeLocalValues();
+    // rfind() yields npos when there is no '.', and npos + 1 wraps to 0, so
+    // the whole name is used; no back() on a possibly empty split result.
+    auto it = values_.find(name.substr(name.rfind('.') + 1));
+    if (it == values_.end()) {  // not found
+      *err = Error("No such key %s", name.c_str());
+      return 0.0f;
+    }
+
+    return it->second;
+  }
+
+  // get type of evaluator
+  std::string getTypeImpl() const { return "chunk"; }
+
+private:
+  void storeLocalValues() const {  // refreshes the mutable values_ cache
+    CHECK_GE(numOutputSegments_, 0);
+    CHECK_GE(numLabelSegments_, 0);
+    double precision =  // 0 when there are no output segments (avoids x/0)
+        !numOutputSegments_ ? 0 : (double)numCorrect_ / numOutputSegments_;
+    double recall =  // 0 when there are no label segments (avoids x/0)
+        !numLabelSegments_ ? 0 : (double)numCorrect_ / numLabelSegments_;
+    values_["precision"] = precision;
+    values_["recall"] = recall;
+    values_["F1-score"] =  // 0 when nothing was correct (avoids 0/0)
+        !numCorrect_ ? 0 : 2 * precision * recall / (precision + recall);
+  }
 };
 
 REGISTER_EVALUATOR(chunk, ChunkEvaluator);
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index 69d5830dd2a1afb93948eacec1cb4309cf8c6109..6bfdea3c6e3f7cb80b620564f8229d954d773f04 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -217,10 +217,10 @@ void SmoothL1CostLayer::forwardImp(Matrix& output,
     targetCpu->copyFrom(target);
     outputCpu->copyFrom(output);
     labelCpu->copyFrom(*label.value);
-    targetCpu->smoothL1(*outputCpu, *labelCpu);
+    targetCpu->smoothL1(*outputCpu, *labelCpu, 1.0);
     target.copyFrom(*targetCpu);
   } else {
-    target.smoothL1(output, *label.value);
+    target.smoothL1(output, *label.value, 1.0);
   }
 }
 
@@ -238,10 +238,10 @@ void SmoothL1CostLayer::backwardImp(Matrix& output,
     outputGCpu->copyFrom(outputG);
     outputCpu->copyFrom(output);
     labelCpu->copyFrom(*label.value);
-    outputGCpu->smoothL1Bp(*outputCpu, *labelCpu);
+    outputGCpu->smoothL1Bp(*outputCpu, *labelCpu, 1.0);
     outputG.copyFrom(*outputGCpu);
   } else {
-    outputG.smoothL1Bp(output, *label.value);
+    outputG.smoothL1Bp(output, *label.value, 1.0);
   }
 }
 
diff --git a/paddle/gserver/tests/sequence_nest_layer_group.conf b/paddle/gserver/tests/sequence_nest_layer_group.conf
index c01b95f7a29ae73c2b3ccd5b56ad1d316cbc72ec..71ef53d08a2cea070806afb2c65ef15c4dd28f31 100644
--- a/paddle/gserver/tests/sequence_nest_layer_group.conf
+++ b/paddle/gserver/tests/sequence_nest_layer_group.conf
@@ -59,7 +59,7 @@ lstm_nest_group = recurrent_group(
     input=SubsequenceInput(emb_group), step=lstm_group, name="lstm_nest_group")
 # hasSubseq ->(seqlastins) seq
 lstm_last = last_seq(
-    input=lstm_nest_group, agg_level=AggregateLevel.EACH_SEQUENCE)
+    input=lstm_nest_group, agg_level=AggregateLevel.TO_SEQUENCE)
 
 # seq ->(expand) hasSubseq
 lstm_expand = expand_layer(
@@ -71,7 +71,7 @@ lstm_expand = expand_layer(
 lstm_average = pooling_layer(
     input=lstm_expand,
     pooling_type=AvgPooling(),
-    agg_level=AggregateLevel.EACH_SEQUENCE)
+    agg_level=AggregateLevel.TO_SEQUENCE)
 
 with mixed_layer(
         size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
diff --git a/paddle/majel/CMakeLists.txt b/paddle/majel/CMakeLists.txt
index 0c91fa72da4e5f7b19d0f3141799729e8a6375d3..93e5e2c22f0eb5797c635efd8ca34ffb74c03311 100644
--- a/paddle/majel/CMakeLists.txt
+++ b/paddle/majel/CMakeLists.txt
@@ -1,34 +1,8 @@
-cmake_minimum_required(VERSION 3.0)
+cc_library(place SRCS place.cc)
+cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 
-if(GTEST_INCLUDE_DIR AND GTEST_LIBRARIES)
-    message("-- Found gtest (include: ${GTEST_INCLUDE_DIR}, library: ${GTEST_LIBRARIES})")
-else()
-    # find #include <majel/xx.h>
-    get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
-    include_directories(${PARENT_DIR})
+cc_library(ddim SRCS ddim.cc)
+cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 
-    # find cmake directory modules
-    get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)
-    set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PARENT_DIR}/cmake")
-
-    # enable c++11
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-
-    # enable gtest
-    set(THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/third_party)
-    set(WITH_TESTING ON)
-    include(external/gtest)
-endif()
-
-########################### Build Majel #############################
-set(MAJEL_CXX_FILES place.cpp)
-set(MAJEL_CUDA_FILES "")
-
-if(CUDA_FOUND)
-    cuda_add_library(majel ${MAJEL_CUDA_FILES} ${MAJEL_CXX_FILES})
-else()
-    add_library(majel ${MAJEL_CXX_FILES})
-endif()
-#####################################################################
-
-add_subdirectory(test)
+nv_test(cuda_test SRCS cuda_test.cu)
+nv_test(dim_test SRCS dim_test.cu DEPS ddim)
diff --git a/paddle/majel/README.md b/paddle/majel/README.md
index 5539853056797284ca1fa5ef5ab16fa0059907f0..7a80816d8e4ffa3a9462f3d9b87eff0f048466aa 100644
--- a/paddle/majel/README.md
+++ b/paddle/majel/README.md
@@ -93,34 +93,97 @@ typedef boost::variant<
 
 Because `variant` may be thought of as "multi-type, single value", we can utilize it to implement unified interfaces for PaddlePaddle.
 
-## implement Tensor in Paddle
+`DDim` plays two roles in Majel. First, it indicates the size of a tensor. For example, we can construct a new `DArray` in the following way:
+ 
+ ```c++
+ DArray arr = make_darray(make_ddim({2,3}), 0.0f);
+ ```
+ This means that `arr` is a two-dimensional tensor, i.e. a matrix. The size of its first dimension is 2 and that of the second is 3. All elements of `arr` are initialized to 0.0.
+ 
+ The second role of `DDim` is as a tensor index. For example, if we want to access the value in the 1st row and 2nd column of `arr` and set it to 1.0, we can write:
+
+ ```c++
+ arr[make_ddim({0, 1})] = 1.0;
+ ```
+
+## Implement Tensor in Paddle
+
+We want to create a Tensor class to replace Vector and Matrix, and to support high-dimensional data. The operations on Tensor are implemented in both CPU and GPU. We also want to make sure that the Tensor interface is friendly to its callers.
+
+Tensor is only responsible for describing computation. It does not take charge of the memory allocation policy, the handles of CUDA library contexts (e.g. cublasHandle, cudnnHandle), or the dispatching of CUDA kernels. Paddle already implements the initialization and resource management of the hardware.
 
 Before writing code, please make sure you already look through Majel Source Code and grabbed the design philosophy of `DArray` in Majel.
 
-To assign subtasks to our colleagues, we have to discuss how to divide it to independent subtasks.
 
-- [ ] 1. First, we need to consider the third-party dependencies in Majel.
+### Memory Management
+`Allocation` manages a block of memory on a device (CPU/GPU). We use `Place` to describe the memory location. The details of memory allocation and deallocation are implemented in `Allocator` and `DeAllocator`. Related low-level APIs such as `hl_malloc_device()` and `hl_malloc_host()` are provided by Paddle.
+
+### Dim and Array
+#### Dim
+
+`Dim` describes the dimension information of an array.
+
+`DDimVar` is an alias of a specialization of the `boost::variant` class template.
+
+`DDim` is introduced to represent a dynamically sized dimension.
 
-    Majel heavily use `boost.variant`, but we don't want to integrate `boost` into PaddlePaddle. It's better to replace boost using the lightweight implementation. https://github.com/mapbox/variant Mapbox variant has the same speedy performance of `boost::variant `but is faster to compile, results in smaller binaries, and has no dependencies.
+For example:
 
-> @gangliao
+```
+Dim<2> d1 = make_dim(3, 3);
+DDim d2 = make_ddim({1, 2, 3});
+```
+
+You must specify a concrete number of dimensions for `Dim`, whereas `DDim` can represent a dynamically sized dimension.
+#### Array
+
+`Array` represents a tensor with a specific type and size.
 
-- [ ] 2. Re-implement `Place` and `Allocation/Memory`
+`DArrayVar` is an alias of a specialization of the `boost::variant` class template.
 
-    I found @wangkuiyi submitted a pull request includes `Place`. @gangliao and @qijun could re-implement `Allocation`, because we have the GPU development experience before joining Paddle team.
+`DArray` is introduced to represent a dynamically typed array.
+
+For example:
+
+```
+Array<float, 2> a1(Dim<2>(2, 2));
+DArray a2 = make_darray(make_ddim({3, 4}), 0.0, CpuPlace());
+```
 
-> @wangkuiyi @gangliao @qijun
+You must specify the type and dimension of an `Array`, whereas `DArray` can represent a dynamically typed array.
 
-- [ ] 3. Re-implement `Dim`.
 
-    `Dim` is an excellent implementation in Majel. 
+Please refer to the `Learn from Majel` section for more details.
 
-> ???
+### ArrayView
 
-- [ ] 4. Re-implement `Array/Tensor`.
+`ViewIterator` is a class template which implements basic iterator operation, including increment(++), decrement(--), dereference(*), equality comparisons(==) and so on.
+
+`ArrayView` is an encapsulation of `Array`, which introduces extra iterator methods, such as `begin()` and `end()`. The `begin()` method returns an iterator pointing to the first element in the ArrayView, and the `end()` method returns an iterator pointing to the past-the-end element in the ArrayView.
+
+`ArrayView` makes visiting and manipulating an array more efficient, flexible and safe.
+
+
+A global function `make_view` is provided to transform an array to corresponding arrayview.
+
+```
+template<typename T, int D>
+ArrayView<T, D> make_view(const Array<T, D>& in) {
+    return in;
+}
+```
+
+A global function `make_iterator` is provided to make iterator of an array.
+
+```
+template<typename T, int D>
+ViewIterator<ArrayView<T, D>> make_iterator(const Array<T, D>& in, Dim<D> idx) {
+    return make_iterator(make_view(in), idx);
+}
+```
 
-> Prerequisites: 1 - 3
+### Basic Operations
 
-- [ ] 5. Re-implement fundamental operators for `Array/Tensor`.
+The operations that manipulate DArray are defined as global functions, such as `ones`, `zeros`, `reshape`, `gemm` and so on.
 
-> Prerequisites: 1 - 4
+An array is transformed into an `ArrayView` and then passed to the operation, which is launched on a specific device (CPU/GPU).
diff --git a/paddle/majel/cuda_test.cu b/paddle/majel/cuda_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4067dda2f19f7661722d8a14a27c7b32ed6afc92
--- /dev/null
+++ b/paddle/majel/cuda_test.cu
@@ -0,0 +1,59 @@
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include "gtest/gtest.h"
+
+#define CHECK_ERR(x) /* x: cudaError_t; exit on failure */ \
+  if ((x) != cudaSuccess) {                                \
+    fprintf(stderr,                                        \
+            "%s in %s at line %d\n",                       \
+            cudaGetErrorString(x),                         \
+            __FILE__,                                      \
+            __LINE__);                                     \
+    exit(-1);                                              \
+  }
+
+__global__ void vecAdd(float *d_A, float *d_B, float *d_C, int n) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;  // global thread index
+  if (i < n) {  // threads beyond the n elements do nothing
+    d_C[i] = d_A[i] + d_B[i];
+  }
+}
+
+TEST(Cuda, Equality) {
+  int n = 10;  // element count; matches the fixed array sizes below
+  // Memory allocation for h_A, h_B and h_C (in the host)
+  float h_A[10] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0};
+  float h_B[10] = {0.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0};
+  float h_C[10];
+  float *d_A, *d_B, *d_C;  // NOTE(review): never cudaFree'd in this test
+  cudaError_t err;
+  // Memory allocation for d_A, d_B and d_C (in the device)
+  err = cudaMalloc((void **)&d_A, sizeof(float) * n);
+  CHECK_ERR(err);
+
+  err = cudaMalloc((void **)&d_B, sizeof(float) * n);
+  CHECK_ERR(err);
+
+  err = cudaMalloc((void **)&d_C, sizeof(float) * n);
+  CHECK_ERR(err);
+
+  // Copying memory to device
+  err = cudaMemcpy(d_A, h_A, sizeof(float) * n, cudaMemcpyHostToDevice);
+  CHECK_ERR(err);
+
+  err = cudaMemcpy(d_B, h_B, sizeof(float) * n, cudaMemcpyHostToDevice);
+  CHECK_ERR(err);
+
+  // Calling the kernel
+  vecAdd<<<ceil(n / 256.0), 256>>>(d_A, d_B, d_C, n);  // 1 block x 256 threads
+
+  // Copying results back to host
+  err = cudaMemcpy(h_C, d_C, sizeof(float) * n, cudaMemcpyDeviceToHost);
+  CHECK_ERR(err);
+
+  EXPECT_EQ(h_C[0], 1.0);  // 1.0 + 0.0
+  for (int i = 1; i < n - 1; ++i) {
+    EXPECT_EQ(h_C[i], 11.0);  // every middle pair sums to 11
+  }
+  EXPECT_EQ(h_C[9], 1.0);  // 0.0 + 1.0
+}
diff --git a/paddle/majel/ddim.cc b/paddle/majel/ddim.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f32408ed53074234873ec0ea8ee7f4e449e5e908
--- /dev/null
+++ b/paddle/majel/ddim.cc
@@ -0,0 +1,222 @@
+#include "paddle/majel/ddim.h"
+
+namespace majel {
+
+///@cond HIDDEN
+
+template <int i>
+Dim<i> make_dim(const int* d) {
+  return Dim<i>(*d, make_dim<i - 1>(d + 1));
+}
+
+template <>
+Dim<1> make_dim<1>(const int* d) {
+  return Dim<1>(*d);
+}
+
+void make_ddim(DDim& ddim, const int* dims, int n) {
+  switch (n) {
+    case 1:
+      ddim = make_dim<1>(dims);
+      break;
+    case 2:
+      ddim = make_dim<2>(dims);
+      break;
+    case 3:
+      ddim = make_dim<3>(dims);
+      break;
+    case 4:
+      ddim = make_dim<4>(dims);
+      break;
+    case 5:
+      ddim = make_dim<5>(dims);
+      break;
+    case 6:
+      ddim = make_dim<6>(dims);
+      break;
+    case 7:
+      ddim = make_dim<7>(dims);
+      break;
+    case 8:
+      ddim = make_dim<8>(dims);
+      break;
+    case 9:
+      ddim = make_dim<9>(dims);
+      break;
+    default:
+      throw std::invalid_argument(
+          "Dynamic dimensions must have between [1, 9] dimensions.");
+  }
+}
+
+///@endcond
+
+DDim make_ddim(std::initializer_list<int> dims) {
+  DDim result(make_dim(0));
+  make_ddim(result, dims.begin(), dims.size());
+  return result;
+}
+
+DDim make_ddim(const std::vector<int>& dims) {
+  DDim result(make_dim(0));  // placeholder, overwritten below
+  make_ddim(result, dims.data(), dims.size());  // data(): safe when empty
+  return result;
+}
+
+///@cond HIDDEN
+// XXX For some reason, putting this in an anonymous namespace causes errors
+class DynamicMutableIndexer : public boost::static_visitor<int&> {
+public:
+  DynamicMutableIndexer(int idx) : idx_(idx) {}
+
+  template <int D>
+  int& operator()(Dim<D>& dim) const {
+    return dim[idx_];
+  }
+
+private:
+  int idx_;
+};
+
+class DynamicConstIndexer : public boost::static_visitor<int> {
+public:
+  DynamicConstIndexer(int idx) : idx_(idx) {}
+
+  template <int D>
+  int operator()(const Dim<D>& dim) const {
+    return dim[idx_];
+  }
+
+private:
+  int idx_;
+};
+
+///@endcond
+
+int& DDim::operator[](int idx) {
+  return boost::apply_visitor(DynamicMutableIndexer(idx), var);
+}
+
+int DDim::operator[](int idx) const {
+  return boost::apply_visitor(DynamicConstIndexer(idx), var);
+}
+
+bool DDim::operator==(DDim d) const {
+  if (var.which() != d.getVar().which()) {
+    return false;
+  } else {
+    std::vector<int> v1 = vectorize(*this);
+    std::vector<int> v2 = vectorize(d);
+
+    for (unsigned int i = 0; i < v1.size(); i++) {
+      if (v1[i] != v2[i]) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+}
+
+bool DDim::operator!=(DDim d) const { return !(*this == d); }
+
+DDim DDim::operator+(DDim d) const {
+  // Element-wise sum; both operands must have the same number of dimensions.
+  const std::vector<int> lhs = vectorize(*this);
+  const std::vector<int> rhs = vectorize(d);
+  if (lhs.size() != rhs.size()) {  // assert() would vanish under NDEBUG
+    throw std::invalid_argument("DDim operands must have the same arity.");
+  }
+
+  std::vector<int> sum(lhs.size());
+  for (size_t i = 0; i < lhs.size(); ++i) {
+    sum[i] = lhs[i] + rhs[i];
+  }
+  return make_ddim(sum);
+}
+
+DDim DDim::operator*(DDim d) const {
+  // Element-wise product; both operands must have the same number of dims.
+  const std::vector<int> lhs = vectorize(*this);
+  const std::vector<int> rhs = vectorize(d);
+  if (lhs.size() != rhs.size()) {  // assert() would vanish under NDEBUG
+    throw std::invalid_argument("DDim operands must have the same arity.");
+  }
+
+  std::vector<int> prod(lhs.size());
+  for (size_t i = 0; i < lhs.size(); ++i) {
+    prod[i] = lhs[i] * rhs[i];
+  }
+  return make_ddim(prod);
+}
+
+int get(const DDim& ddim, int idx) { return ddim[idx]; }
+
+void set(DDim& ddim, int idx, int value) { ddim[idx] = value; }
+
+///@cond HIDDEN
+struct VectorizeVisitor : public boost::static_visitor<> {
+  std::vector<int>& vector;
+
+  VectorizeVisitor(std::vector<int>& v) : vector(v) {}
+
+  template <typename T>
+  void operator()(const T& t) {
+    vector.push_back(t.head);
+    this->operator()(t.tail);
+  }
+
+  void operator()(const Dim<1>& t) { vector.push_back(t.head); }
+};
+///@endcond
+
+std::vector<int> vectorize(const DDim& ddim) {
+  std::vector<int> result;
+  VectorizeVisitor visitor(result);
+  boost::apply_visitor(visitor, ddim);
+  return result;
+}
+
+ssize_t product(const DDim& ddim) {
+  ssize_t result = 1;
+  std::vector<int> v = vectorize(ddim);
+  for (auto i : v) {
+    result *= i;
+  }
+  return result;
+}
+
+///\cond HIDDEN
+
+struct ArityVisitor : boost::static_visitor<int> {
+  template <int D>
+  int operator()(Dim<D>) const {
+    return D;
+  }
+};
+
+///\endcond
+
+int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); }
+
+///\cond HIDDEN
+
+struct DDimPrinter : boost::static_visitor<void> {
+  std::ostream& os;
+  DDimPrinter(std::ostream& os_) : os(os_) {}
+
+  template <typename T>
+  void operator()(const T& t) {
+    os << t;
+  }
+};
+
+///\endcond
+
+std::ostream& operator<<(std::ostream& os, const majel::DDim& ddim) {
+  DDimPrinter printer(os);
+  boost::apply_visitor(printer, ddim);
+  return os;
+}
+
+}  // namespace majel
diff --git a/paddle/majel/ddim.h b/paddle/majel/ddim.h
new file mode 100644
index 0000000000000000000000000000000000000000..7be756f8c098ba5aa3a5ff4380c90f4b90a55bb7
--- /dev/null
+++ b/paddle/majel/ddim.h
@@ -0,0 +1,109 @@
+#pragma once
+
+#include <boost/variant.hpp>
+#include <initializer_list>
+#include <stdexcept>
+#include <vector>
+
+#include "paddle/majel/dim.h"
+
+namespace majel {
+
+namespace {
+typedef boost::variant<Dim<1>,
+                       Dim<2>,
+                       Dim<3>,
+                       Dim<4>,
+                       Dim<5>,
+                       Dim<6>,
+                       Dim<7>,
+                       Dim<8>,
+                       Dim<9>>
+    DDimVar;
+}
+
+/**
+ * \brief A dynamically sized dimension.
+ *
+ * The number of dimensions must be between [1, 9].
+ */
+struct DDim {
+  DDimVar var;  // holds exactly one of Dim<1> ... Dim<9>
+
+  DDim() : var(Dim<1>()) {}  // default: a default-constructed Dim<1>
+
+  template <int D>
+  DDim(const Dim<D>& in) : var(in) {}
+
+  template <int D>
+  DDim& operator=(const Dim<D>& in) {
+    var = in;
+    return *this;
+  }
+
+  int& operator[](int idx);
+  int operator[](int idx) const;
+
+  template <typename Visitor>
+  typename Visitor::result_type apply_visitor(Visitor& visitor) {
+    return var.apply_visitor(visitor);
+  }
+
+  template <typename Visitor>
+  typename Visitor::result_type apply_visitor(Visitor& visitor) const {
+    return var.apply_visitor(visitor);
+  }
+
+  DDimVar getVar() const { return var; }  // copy; callable on const DDim too
+
+  bool operator==(DDim d) const;
+
+  bool operator!=(DDim d) const;
+
+  DDim operator+(DDim d) const;
+
+  DDim operator*(DDim d) const;
+};
+
+/**
+ * \brief Make a DDim from std::vector<int>
+ *
+ * \param dims A vector of ints. Must be sized between [1, 9]
+ */
+DDim make_ddim(const std::vector<int>& dims);
+
+/**
+ * \brief Make a DDim from an initializer list
+ *
+ * \param dims An initializer list of ints. Must be sized between [1, 9]
+ *
+ */
+DDim make_ddim(std::initializer_list<int> dims);
+
+int get(const DDim& dim, int idx);
+void set(DDim& dim, int idx, int val);
+
+std::vector<int> vectorize(const DDim& ddim);
+
+ssize_t product(const DDim& ddim);
+
+/**
+ * \brief How many dimensions does this DDim have?
+ *
+ * \param ddim Dynamic dimension to inspect
+ */
+
+int arity(const DDim& ddim);
+
+std::ostream& operator<<(std::ostream&, const majel::DDim&);
+
+}  // namespace majel
+
+namespace boost {
+
+template <typename T>
+T get(const majel::DDim& in) {
+  return boost::get<T>(in.var);
+}
+
+}  // namespace boost
diff --git a/paddle/majel/ddim_test.cc b/paddle/majel/ddim_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a5b8a7c4d26740c1c4169547e76a0cf5558facc0
--- /dev/null
+++ b/paddle/majel/ddim_test.cc
@@ -0,0 +1,65 @@
+//#include <stdexcept>
+//#include <unittest/unittest.h>
+#include <sstream>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/majel/ddim.h"
+
+TEST(DDim, Equality) {
+  // construct a DDim from an initialization list
+  majel::DDim ddim = majel::make_ddim({9, 1, 5});
+  EXPECT_EQ(ddim[0], 9);
+  EXPECT_EQ(ddim[1], 1);
+  EXPECT_EQ(ddim[2], 5);
+
+  // construct a DDim from a vector (check vddim itself, not ddim again)
+  std::vector<int> vec({9, 1, 5});
+  majel::DDim vddim = majel::make_ddim(vec);
+  EXPECT_EQ(vddim[0], 9);
+  EXPECT_EQ(vddim[1], 1);
+  EXPECT_EQ(vddim[2], 5);
+
+  // mutate a DDim
+  ddim[1] = 2;
+  EXPECT_EQ(ddim[1], 2);
+  majel::set(ddim, 0, 6);
+  EXPECT_EQ(majel::get(ddim, 0), 6);
+
+  // vectorize a DDim
+  std::vector<int> res_vec = majel::vectorize(vddim);
+  EXPECT_EQ(res_vec[0], 9);
+  EXPECT_EQ(res_vec[1], 1);
+  EXPECT_EQ(res_vec[2], 5);
+  majel::Dim<3> d(3, 2, 1);
+  res_vec = majel::vectorize(majel::DDim(d));
+  EXPECT_EQ(res_vec[0], 3);
+  EXPECT_EQ(res_vec[1], 2);
+  EXPECT_EQ(res_vec[2], 1);
+
+  // add two DDims: {6, 2, 5} + {9, 1, 5}
+  majel::DDim ddim_sum = ddim + vddim;
+  EXPECT_EQ(ddim_sum[0], 15);
+  EXPECT_EQ(ddim_sum[1], 3);
+  EXPECT_EQ(ddim_sum[2], 10);
+
+  // multiply two DDims: {6, 2, 5} * {9, 1, 5}
+  majel::DDim ddim_mul = ddim * vddim;
+  EXPECT_EQ(ddim_mul[0], 54);
+  EXPECT_EQ(ddim_mul[1], 2);
+  EXPECT_EQ(ddim_mul[2], 25);
+
+  // arity of a DDim
+  EXPECT_EQ(majel::arity(ddim), 3);
+
+  // product of a DDim
+  EXPECT_EQ(majel::product(vddim), 45);
+}
+
+TEST(DDim, Print) {
+  // print a DDim
+  std::stringstream ss;
+  majel::DDim ddim = majel::make_ddim({2, 3, 4});
+  ss << ddim;
+  EXPECT_EQ("2, 3, 4", ss.str());
+}
diff --git a/paddle/majel/detail/cuda_assert.h b/paddle/majel/detail/cuda_assert.h
new file mode 100644
index 0000000000000000000000000000000000000000..9490d0ae3eff01bdb4403de710b7bfd878e87f03
--- /dev/null
+++ b/paddle/majel/detail/cuda_assert.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+
+#if defined(__APPLE__) && defined(__CUDA_ARCH__) && !defined(NDEBUG)
+#include <stdio.h>
+#define MAJEL_ASSERT(e)                                                       \
+  do {                                                                        \
+    if (!(e)) {                                                               \
+      printf(                                                                 \
+          "%s:%d Assertion `%s` failed.\n", __FILE__, __LINE__, TOSTRING(e)); \
+      asm("trap;");                                                           \
+    }                                                                         \
+  } while (0)
+
+#define MAJEL_ASSERT_MSG(e, m)                      \
+  do {                                              \
+    if (!(e)) {                                     \
+      printf("%s:%d Assertion `%s` failed (%s).\n", \
+             __FILE__,                              \
+             __LINE__,                              \
+             TOSTRING(e),                           \
+             m);                                    \
+      asm("trap;");                                 \
+    }                                               \
+  } while (0)
+#else
+#include <assert.h>
+#define MAJEL_ASSERT(e) assert(e)
+#define MAJEL_ASSERT_MSG(e, m) assert((e) && (m))
+#endif
diff --git a/paddle/majel/detail/hostdevice.h b/paddle/majel/detail/hostdevice.h
new file mode 100644
index 0000000000000000000000000000000000000000..e7de86b7b2f75d206e730ec409bbee5d0a08942e
--- /dev/null
+++ b/paddle/majel/detail/hostdevice.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#ifdef __CUDACC__
+#define HOSTDEVICE __host__ __device__
+#define HOST __host__
+#else
+#define HOSTDEVICE
+#define HOST
+#endif
diff --git a/paddle/majel/dim.h b/paddle/majel/dim.h
new file mode 100644
index 0000000000000000000000000000000000000000..c4b0c6aea683384d4657dd5db6f419b9e1108704
--- /dev/null
+++ b/paddle/majel/dim.h
@@ -0,0 +1,451 @@
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <type_traits>
+
+#include "paddle/majel/detail/cuda_assert.h"
+#include "paddle/majel/detail/hostdevice.h"
+
+namespace majel {
+
+// Statically sized, statically indexed dimension
+template <int i>
+struct Dim {
+  static constexpr int dimensions = i;
+
+  template <typename... Args>
+  HOSTDEVICE Dim(int _head, Args... _tail) : head(_head), tail(_tail...) {
+    static_assert(sizeof...(_tail) == i - 1,
+                  "Dim initialized with the wrong number of parameters");
+  }
+
+  HOSTDEVICE
+  Dim(int _head, const Dim<i - 1>& _tail) : head(_head), tail(_tail) {}
+
+  HOSTDEVICE
+  Dim() : head(0), tail() {}
+
+  /** Construct a Dim from a linear index and size.  Uses Fortran order
+   * indexing. */
+  HOSTDEVICE
+  Dim(int idx, const Dim<i>& size)
+      : head(idx % size.head), tail(idx / size.head, size.tail) {}
+
+  /** Construct a Dim with each dimension set to the given index */
+  HOSTDEVICE
+  Dim(int idx) : head(idx), tail(idx) {}
+
+  HOSTDEVICE
+  bool operator==(const Dim<i>& o) const {
+    return (head == o.head) && (tail == o.tail);
+  }
+
+  HOSTDEVICE
+  bool operator!=(const Dim<i>& o) const { return !(*this == o); }
+
+  HOSTDEVICE
+  int& operator[](int idx);
+  HOSTDEVICE
+  int operator[](int idx) const;
+
+  HOST std::string to_string() const;
+
+  int head;
+  Dim<i - 1> tail;
+};
+
+// Base case specialization
+template <>
+struct Dim<1> {
+  static constexpr int dimensions = 1;
+
+  HOSTDEVICE
+  Dim(int _head) : head(_head) {}
+
+  HOSTDEVICE
+  Dim() : head(0) {}
+
+  HOSTDEVICE
+  Dim(int idx, const Dim<1>& size) : head(idx) {
+#ifndef __CUDA_ARCH__
+    if (idx >= size.head) {
+      throw std::invalid_argument("Index out of range.");
+    }
+#else
+    MAJEL_ASSERT(idx < size.head);
+#endif
+  }
+
+  HOSTDEVICE
+  bool operator==(const Dim<1>& o) const { return (head == o.head); }
+
+  HOSTDEVICE
+  bool operator!=(const Dim<1>& o) const { return !(*this == o); }
+
+  HOSTDEVICE
+  int& operator[](int idx);
+  HOSTDEVICE
+  int operator[](int idx) const;
+
+  int head;
+};
+
+namespace {
+
+// Helper for accessing Dim classes
+template <int i>
+struct DimGetter {
+  // Return a copy if Dim is const
+  template <typename D>
+  HOSTDEVICE static int impl(const D& d) {
+    return DimGetter<i - 1>::impl(d.tail);
+  }
+  // Return a reference if Dim is mutable
+  template <typename D>
+  HOSTDEVICE static int& impl(D& d) {
+    return DimGetter<i - 1>::impl(d.tail);
+  }
+};
+
+// Eureka! We found the element!
+template <>
+struct DimGetter<0> {
+  // Return a copy if Dim is const
+  template <typename D>
+  HOSTDEVICE static int impl(const D& d) {
+    return d.head;
+  }
+  // Return a reference if Dim is mutable
+  template <typename D>
+  HOSTDEVICE static int& impl(D& d) {
+    return d.head;
+  }
+};
+
+template <int D>
+HOSTDEVICE int& indexer(Dim<D>& dim, int idx) {
+#ifndef __CUDA_ARCH__
+  if (idx < 0) {  // not compiling device code: report via exception
+    throw std::invalid_argument("Tried to access a negative dimension");
+  }
+#else
+  MAJEL_ASSERT(idx >= 0);  // device code path: assert instead of throwing
+#endif
+  if (idx == 0) {
+    return dim.head;
+  }
+  return indexer(dim.tail, idx - 1);  // step down the head/tail chain
+}
+
+template <>
+HOSTDEVICE int& indexer<1>(Dim<1>& dim, int idx) {
+#ifndef __CUDA_ARCH__
+  if (idx != 0) {
+    throw std::invalid_argument("Invalid index");
+  }
+#else
+  MAJEL_ASSERT(idx == 0);
+#endif
+  return dim.head;
+}
+
+template <int D>
+HOSTDEVICE int indexer(const Dim<D>& dim, int idx) {
+#ifndef __CUDA_ARCH__
+  if (idx < 0) {
+    throw std::invalid_argument("Tried to access a negative dimension");
+  }
+#else
+  MAJEL_ASSERT(idx >= 0);
+#endif
+  if (idx == 0) {
+    return dim.head;
+  }
+  return indexer(dim.tail, idx - 1);
+}
+
+template <>
+HOSTDEVICE int indexer<1>(const Dim<1>& dim, int idx) {
+#ifndef __CUDA_ARCH__
+  if (idx != 0) {
+    throw std::invalid_argument("Invalid index");
+  }
+#else
+  MAJEL_ASSERT(idx == 0);
+#endif
+  return dim.head;
+}
+
+}  // namespace
+// Static access to constant Dim
+template <int i, int l>
+HOSTDEVICE int get(const Dim<l>& d) {
+  return DimGetter<i>::impl(d);
+}
+
+// Static access to mutable Dim
+template <int i, int l>
+HOSTDEVICE int& get(Dim<l>& d) {
+  return DimGetter<i>::impl(d);
+}
+
+// Dynamic access to constant Dim
+template <int l>
+HOSTDEVICE int Dim<l>::operator[](int i) const {
+  return indexer(*this, i);
+}
+
+// Dynamic access to mutable Dim
+template <int l>
+HOSTDEVICE int& Dim<l>::operator[](int i) {
+  return indexer(*this, i);
+}
+
+// Dynamic access to constant Dim
+inline HOSTDEVICE int Dim<1>::operator[](int i) const {
+  return indexer(*this, i);
+}
+
+// Dynamic access to mutable Dim
+inline HOSTDEVICE int& Dim<1>::operator[](int i) { return indexer(*this, i); }
+
+// Dynamic access to constant Dim
+// without std::enable_if will try to instantiate this on get<0>(d)
+template <int l>
+HOSTDEVICE typename std::enable_if<(l > 0), int>::type get(const Dim<l>& d,
+                                                           int i) {
+  return d[i];
+}
+
+// Dynamic access to mutable Dim
+template <int l>
+HOSTDEVICE typename std::enable_if<(l > 0), int&>::type get(Dim<l>& d, int i) {
+  return d[i];
+}
+
+// Dot product of two dims
+template <int i>
+HOSTDEVICE int linearize(const Dim<i>& a, const Dim<i>& b) {
+  return a.head * b.head + linearize(a.tail, b.tail);
+}
+
+// Base case of the dot product: a single pair of elements remains.
+// It is declared inline because a full specialization is not a template.
+template <>
+HOSTDEVICE inline int linearize(const Dim<1>& lhs, const Dim<1>& rhs) {
+  return lhs.head * rhs.head;
+}
+
+// Product of all elements of a Dim, multiplied by prod
+template <int i>
+HOSTDEVICE int product(const Dim<i>& dim, int prod = 1) {
+  return prod * dim.head * product(dim.tail);
+}
+
+// Base case of product: the single remaining element times prod.
+// It is declared inline because a full specialization is not a template.
+template <>
+HOSTDEVICE inline int product(const Dim<1>& dim, int prod) {
+  return dim.head * prod;
+}
+
+// True when every index component satisfies 0 <= idx_k < size_k.
+template <int i>
+HOSTDEVICE bool contained(const Dim<i>& idx, const Dim<i>& size) {
+  const bool head_ok = idx.head >= 0 && idx.head < size.head;
+  return head_ok && contained(idx.tail, size.tail);
+}
+
+// Base case of the bounds check: only one component is left to test.
+// It is declared inline because a full specialization is not a template.
+template <>
+HOSTDEVICE inline bool contained(const Dim<1>& idx, const Dim<1>& size) {
+  return idx.head >= 0 && idx.head < size.head;
+}
+
+/**
+ * \brief Check if a size and a stride create a Fortran order contiguous
+ * block of memory. An array with no elements always counts as contiguous.
+ */
+template <int i>
+HOST bool contiguous(const Dim<i>& size, const Dim<i>& stride, int mul = 1) {
+  if (product(size) == 0) return true;
+  const int expected = (get<0>(size) == 1) ? 0 : mul;  // extent 1 => stride 0
+  if (get<0>(stride) != expected) return false;
+  return contiguous(size.tail, stride.tail, mul * get<0>(size));
+}
+
+///\cond HIDDEN
+// Base case of contiguous: the last stride must equal mul (the product of
+// the preceding extents), or 0 when the last extent is 1.
+template <>
+HOST inline bool contiguous(const Dim<1>& size, const Dim<1>& stride, int mul) {
+  if (get<0>(size) == 0) return true;
+  int contiguous_stride = get<0>(size) == 1 ? 0 : mul;
+  return get<0>(stride) == contiguous_stride;
+}
+///\endcond
+
+/**
+ * \brief Exclusive prefix product: element k is mul * src[0] * ... * src[k-1].
+ */
+template <int i>
+HOSTDEVICE Dim<i> ex_prefix_mul(const Dim<i>& src, int mul = 1) {
+  return Dim<i>(mul, ex_prefix_mul(src.tail, mul * src.head));
+}
+
+///\cond HIDDEN
+// Base case of ex_prefix_mul: the last element is the accumulated mul.
+// It is declared inline because a full specialization is not a template.
+template <>
+HOSTDEVICE inline Dim<1> ex_prefix_mul(const Dim<1>& src, int mul) {
+  return Dim<1>(mul);
+}
+///\endcond
+
+/**
+ * \brief Compute the strides of a contiguous array with the given extents
+ *
+ * Any dimension whose extent is 1 gets a stride of 0.
+ * \param size Dim object containing the size of the array.
+ * \param base The stride to use for the first dimension.
+ * \return Dim object the same size as \p size with the strides.
+ */
+template <int i>
+HOSTDEVICE Dim<i> contiguous_strides(const Dim<i>& size, int base = 1) {
+  const int head_stride = (size.head != 1) ? base : 0;
+  return Dim<i>(head_stride, contiguous_strides(size.tail, base * size.head));
+}
+
+///\cond HIDDEN
+
+// Base case of contiguous_strides: base, or 0 when the extent is 1
+template <>
+HOSTDEVICE inline Dim<1> contiguous_strides(const Dim<1>& size, int base) {
+  const int last_stride = (size.head != 1) ? base : 0;
+  return Dim<1>(last_stride);
+}
+
+///\endcond
+
+/**
+ * Element-wise sum of two Dims of the same rank
+ */
+template <int i>
+HOSTDEVICE Dim<i> dim_plus(const Dim<i>& lhs, const Dim<i>& rhs) {
+  return Dim<i>(lhs.head + rhs.head, dim_plus(lhs.tail, rhs.tail));
+}
+
+// Base case of dim_plus: add the single remaining pair of elements
+template <>
+HOSTDEVICE inline Dim<1> dim_plus(const Dim<1>& lhs, const Dim<1>& rhs) {
+  return Dim<1>(lhs.head + rhs.head);
+}
+
+template <int i>
+HOSTDEVICE Dim<i> operator+(const Dim<i>& a, const Dim<i>& b) {
+  return dim_plus(a, b);  // element-wise sum, see dim_plus
+}
+
+/**
+ * Element-wise product of two Dims of the same rank
+ */
+template <int i>
+HOSTDEVICE Dim<i> dim_mult(const Dim<i>& lhs, const Dim<i>& rhs) {
+  return Dim<i>(lhs.head * rhs.head, dim_mult(lhs.tail, rhs.tail));
+}
+
+// Base case of dim_mult: multiply the single remaining pair of elements
+template <>
+HOSTDEVICE inline Dim<1> dim_mult(const Dim<1>& lhs, const Dim<1>& rhs) {
+  return Dim<1>(lhs.head * rhs.head);
+}
+
+template <int i>
+HOSTDEVICE Dim<i> operator*(const Dim<i>& a, const Dim<i>& b) {
+  return dim_mult(a, b);  // element-wise product, see dim_mult
+}
+
+/**
+ * \brief Force the stride of every extent-1 dimension to 0.
+ *
+ * Strides of all other dimensions are copied through unchanged.
+ *
+ * \param size Dim object containing the size of an array
+ * \param stride Dim object containing stride of an array
+ * \return Dim object the same size as \p size with normalized strides
+ */
+
+template <int i>
+HOSTDEVICE Dim<i> normalize_strides(const Dim<i>& size, const Dim<i>& stride) {
+  const int head_stride = (size.head != 1) ? stride.head : 0;
+  return Dim<i>(head_stride, normalize_strides(size.tail, stride.tail));
+}
+
+///\cond HIDDEN
+
+template <>
+HOSTDEVICE inline Dim<1> normalize_strides(const Dim<1>& size,
+                                           const Dim<1>& stride) {
+  // Base case: an extent of 1 forces the stride to 0.
+  return Dim<1>(size.head == 1 ? 0 : stride.head);
+}
+
+///\endcond
+
+/**
+ * Helper function to create a Dim from a list of values
+ *
+ * \param idxes Element values; their count selects the rank of the Dim
+ * \return Dim<sizeof...(Args)> constructed from \p idxes
+ */
+template <typename... Args>
+HOSTDEVICE Dim<sizeof...(Args)> make_dim(Args... idxes) {
+  typedef Dim<sizeof...(Args)> Result;
+  return Result(idxes...);
+}
+
+// Streams a Dim as comma-separated elements, e.g. "2, 3".
+// XXX For some reason, overloading fails to resolve this correctly
+// Recursive case (more than one element): print the head, then the tail.
+template <int i>
+typename std::enable_if<(i > 1), std::ostream&>::type operator<<(
+    std::ostream& os, const majel::Dim<i>& dim) {
+  return os << dim.head << ", " << dim.tail;
+}
+
+// Base case for streaming a Dim: a single element prints without separator.
+// XXX I wish this could be an overload instead of a template
+// (std::enable_if restricts this template to i == 1.)
+template <int i>
+typename std::enable_if<(i == 1), std::ostream&>::type operator<<(
+    std::ostream& os, const majel::Dim<i>& dim) {
+  return os << dim.head;
+}
+
+// Renders this Dim through operator<<, producing text such as "2, 3".
+template <int i>
+HOST std::string Dim<i>::to_string() const {
+  // Format into a string buffer and hand back its contents.
+  std::stringstream buffer;
+  buffer << *this;
+  return buffer.str();
+}
+
+// Converts a linear offset into a D-dimensional index for the given extents;
+// dimension 0 varies fastest and the last dimension takes the remainder.
+template <int D>
+HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) {
+  Dim<D> result;
+  for (int axis = 0; axis + 1 < D; ++axis) {
+    const int extent = extents[axis];
+    result[axis] = linear_index % extent;
+    linear_index /= extent;
+  }
+  result[D - 1] = linear_index;
+  return result;
+}
+
+}  // namespace majel
diff --git a/paddle/majel/dim_test.cu b/paddle/majel/dim_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a7d81e595bea7fa6326ea350e2702e1ef8f5caa4
--- /dev/null
+++ b/paddle/majel/dim_test.cu
@@ -0,0 +1,128 @@
+#include <thrust/device_vector.h>
+#include <sstream>
+
+#include "paddle/majel/dim.h"
+#include "gtest/gtest.h"
+
+__global__ void test(majel::Dim<2>* out) {
+    *out = majel::make_dim(5, 6);  // build a Dim<2> inside a kernel
+}
+
+__global__ void dyn_idx_gpu(int* out) {
+    auto dim = majel::make_dim(5, 6);
+    *out = dim[1];  // run-time indexing inside a kernel
+}
+
+TEST(Dim, Equality) {
+    // construct a Dim on the CPU
+    auto a = majel::make_dim(3, 4);
+    EXPECT_EQ(majel::get<0>(a), 3);
+    EXPECT_EQ(majel::get<1>(a), 4);
+
+    // construct a Dim on the GPU and read element 0 of the vector back into a
+    thrust::device_vector<majel::Dim<2>> t(2);
+    test<<<1,1>>>(thrust::raw_pointer_cast(t.data()));
+    a = t[0];
+    EXPECT_EQ(majel::get<0>(a), 5);
+    EXPECT_EQ(majel::get<1>(a), 6);
+
+    // linearization (dot product): a = (5, 6), b = (7, 8)
+    auto b = majel::make_dim(7, 8);
+    EXPECT_EQ(majel::linearize(a, b), 83);  // 5 * 7 + 6 * 8
+
+    // product
+    EXPECT_EQ(majel::product(a), 30);  // 5 * 6
+
+    // mutate a Dim through the static accessor
+    majel::get<1>(b) = 10;
+    EXPECT_EQ(majel::get<0>(b), 7);
+    EXPECT_EQ(majel::get<1>(b), 10);
+
+    // dynamic access, both through get(dim, idx) and operator[]
+    majel::get(b, 0) = 8;
+    b[1] = 11;
+    EXPECT_EQ(majel::get<0>(b), 8);
+    EXPECT_EQ(majel::get<1>(b), 11);
+    EXPECT_EQ(majel::get(b, 0), 8);
+    EXPECT_EQ(b[1], 11);
+
+    // dynamic access on GPU: the kernel stores make_dim(5, 6)[1]
+    thrust::device_vector<int> r(1);
+    dyn_idx_gpu<<<1,1>>>(thrust::raw_pointer_cast(r.data()));
+    int res = r[0];
+    EXPECT_EQ(res, 6);
+
+    // ex_prefix_mul: (3, 4, 5) -> (1, 3, 3 * 4)
+    majel::Dim<3> c = majel::ex_prefix_mul(majel::Dim<3>(3, 4, 5));
+    EXPECT_EQ(majel::get<0>(c), 1);
+    EXPECT_EQ(majel::get<1>(c), 3);
+    EXPECT_EQ(majel::get<2>(c), 12);
+
+    // contiguous_strides: dimensions with extent 1 get stride 0
+    c = majel::contiguous_strides(majel::Dim<3>(10, 1, 10));
+    EXPECT_EQ(majel::get<0>(c), 1);
+    EXPECT_EQ(majel::get<1>(c), 0);
+    EXPECT_EQ(majel::get<2>(c), 10);
+    c = majel::contiguous_strides(majel::Dim<3>(10, 10, 1));
+    EXPECT_EQ(majel::get<0>(c), 1);
+    EXPECT_EQ(majel::get<1>(c), 10);
+    EXPECT_EQ(majel::get<2>(c), 0);
+    c = majel::contiguous_strides(majel::Dim<3>(1, 10, 10));
+    EXPECT_EQ(majel::get<0>(c), 0);
+    EXPECT_EQ(majel::get<1>(c), 1);
+    EXPECT_EQ(majel::get<2>(c), 10);
+    c = majel::contiguous_strides(majel::Dim<3>(2, 3, 4));
+    EXPECT_EQ(majel::get<0>(c), 1);
+    EXPECT_EQ(majel::get<1>(c), 2);
+    EXPECT_EQ(majel::get<2>(c), 6);
+
+    // generate from a linear index; size (4, 5, 2) has strides (1, 4, 20)
+    auto size = majel::make_dim(4, 5, 2);
+    c = majel::Dim<3>(14, size);  // 14 = 2 * 1 + 3 * 4 + 0 * 20
+    EXPECT_EQ(majel::get<0>(c), 2);
+    EXPECT_EQ(majel::get<1>(c), 3);
+    EXPECT_EQ(majel::get<2>(c), 0);
+    c = majel::Dim<3>(25, size);  // 25 = 1 * 1 + 1 * 4 + 1 * 20
+    EXPECT_EQ(majel::get<0>(c), 1);
+    EXPECT_EQ(majel::get<1>(c), 1);
+    EXPECT_EQ(majel::get<2>(c), 1);
+}
+
+TEST(Dim, Bool) {
+    auto a = majel::make_dim(3, 4);
+    auto b = majel::make_dim(5, 6);
+    auto c = majel::make_dim(3, 4);
+
+    // in_bounds check: (3, 4) lies inside (5, 6) but not the other way round
+    EXPECT_TRUE(majel::contained(a, b));
+    EXPECT_FALSE(majel::contained(b, a));
+
+    // comparison
+    EXPECT_TRUE(a == a);
+    EXPECT_FALSE(a == b);
+    EXPECT_TRUE(a == c);
+
+    // contiguous check
+    int x = 4, y = 5, z = 2;
+    majel::Dim<3> sizef(x, y, z);
+    majel::Dim<3> stridea(1, x, x*y);  // prefix products of the extents
+    majel::Dim<3> strideb(2, 2*x, 2*x*y);  // every stride doubled
+    majel::Dim<3> stridec(1, x, 2*x*y);  // only the last stride doubled
+    EXPECT_TRUE(majel::contiguous(sizef, stridea));
+    EXPECT_FALSE(majel::contiguous(sizef, strideb));
+    EXPECT_FALSE(majel::contiguous(sizef, stridec));
+}
+
+TEST(Dim, Print) {
+    {
+        std::stringstream ss;
+        auto a = majel::make_dim(2, 3);
+        ss << a;  // elements are separated by ", "
+        EXPECT_EQ(ss.str(), "2, 3");
+    }
+    {
+        std::stringstream ss;
+        ss << majel::make_dim(8);  // a single element prints on its own
+        EXPECT_EQ(ss.str(), "8");
+    }
+}
diff --git a/paddle/majel/place.cpp b/paddle/majel/place.cc
similarity index 95%
rename from paddle/majel/place.cpp
rename to paddle/majel/place.cc
index eecd8e5b730704258d2bd7d98a75a0a80e13a797..ca50b37843e0ba047f8f8b8d24a3d3c131587382 100644
--- a/paddle/majel/place.cpp
+++ b/paddle/majel/place.cc
@@ -1,4 +1,4 @@
-#include <majel/place.h>
+#include "paddle/majel/place.h"
 
 namespace majel {
 
@@ -16,7 +16,7 @@ public:
   void operator()(const GpuPlace& p) { os_ << "GpuPlace(" << p.device << ")"; }
 };
 
-}  // namespace majel
+}  // namespace detail
 
 static Place the_default_place;
 
diff --git a/paddle/majel/test/place_test.cpp b/paddle/majel/place_test.cc
similarity index 96%
rename from paddle/majel/test/place_test.cpp
rename to paddle/majel/place_test.cc
index c9a53802b23ef8b225b9e8ef0acfe1b0c5562289..6a099ae6b6e4f63a6ce845ab17eaab6e12c2c0b0 100644
--- a/paddle/majel/test/place_test.cpp
+++ b/paddle/majel/place_test.cc
@@ -1,4 +1,4 @@
-#include "majel/place.h"
+#include "paddle/majel/place.h"
 #include <sstream>
 #include "gtest/gtest.h"
 
diff --git a/paddle/majel/test/CMakeLists.txt b/paddle/majel/test/CMakeLists.txt
deleted file mode 100644
index 0cc7103b0391184951c3d5c0511ef790e1bcbd2d..0000000000000000000000000000000000000000
--- a/paddle/majel/test/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-file(GLOB_RECURSE ALL_TEST_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cpp" "*.cc")
-
-add_executable(majel_tests ${ALL_TEST_FILES})
-add_dependencies(majel_tests majel)
-target_link_libraries(majel_tests     
-                      ${Boost_LIBRARIES}
-                      ${GTEST_LIBRARIES}
-                      majel
-                     )
-add_test(majel_tests majel_tests)
diff --git a/paddle/majel/test/test_framework.cpp b/paddle/majel/test/test_framework.cpp
deleted file mode 100644
index 443e2dbb3f2b7064f52ccfa017111b7e781f0e97..0000000000000000000000000000000000000000
--- a/paddle/majel/test/test_framework.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-#include "gtest/gtest.h"
-
-int main(int argc, char** argv) {
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h
index 6ed48c8d88ee698689de6f7a7f470b97a094ea5b..120d69f718b954925438fbd2119d69f0be13b3e9 100644
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -758,7 +758,7 @@ public:
                  T p3);           // decayRate
 
   /// apply L1/L2 to *this*
-  void applyL1(T learningRate, T decayRate);
+  virtual void applyL1(T learningRate, T decayRate);
   void applyL1(BaseMatrixT& lr, T learningRate, T decayRate);
   void applyL2(T learningRate, T decayRate);
   void applyL2(BaseMatrixT& lr, T learningRate, T decayRate);
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index 1a3bb432bfb743fe814fa94c0c104bb6bc598cb8..7045562dd44f8f3e0be9181b32954c04f0865fa4 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -180,7 +180,6 @@ int getri<double>(const CBLAS_ORDER order,
                   const int lda,
                   const int* ipiv) {
   return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv);
-  return 0;
 }
 
 template <>
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 6ac61be0bf1b7a4e308705617faf5af2886a4082..c910146164ebfb0737583c72c48ce6dbc5b49939 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -3606,7 +3606,7 @@ void CpuMatrix::sumOfSquaresBp(Matrix& output, Matrix& label) {
   }
 }
 
-void CpuMatrix::smoothL1(Matrix& output, Matrix& label) {
+void CpuMatrix::smoothL1(Matrix& output, Matrix& label, real destScale) {
   CHECK(output.useGpu_ == false && label.useGpu_ == false)
       << "Matrix type are not equal";
 
@@ -3624,6 +3624,7 @@ void CpuMatrix::smoothL1(Matrix& output, Matrix& label) {
   for (size_t i = 0; i < numSamples; ++i, out += dim, lbl += dim) {
     for (size_t j = 0; j < dim; ++j) {
       real absVal = std::fabs(out[j] - lbl[j]);
+      cost[i] *= destScale;
       if (absVal < 1.0)
         cost[i] += 0.5 * absVal * absVal;
       else
@@ -3632,7 +3633,7 @@ void CpuMatrix::smoothL1(Matrix& output, Matrix& label) {
   }
 }
 
-void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label) {
+void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label, real destScale) {
   CHECK(output.useGpu_ == false && label.useGpu_ == false)
       << "Matrix type are not equal";
 
@@ -3650,6 +3651,7 @@ void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label) {
   for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim, lbl += dim) {
     for (size_t j = 0; j < dim; ++j) {
       real val = out[j] - lbl[j];
+      grad[j] *= destScale;
       if (std::fabs(val) < 1) {
         grad[j] += val;
       } else {
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 3252adb19e4c2e48f86c3c811bfc7d75fd06a8f7..748be850b4c902d1b48c1dafbb0d5ea2bf197e6e 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -789,11 +789,11 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
-  virtual void smoothL1(Matrix& output, Matrix& label) {
+  virtual void smoothL1(Matrix& output, Matrix& label, real destScale) {
     LOG(FATAL) << "Not implemented";
   }
 
-  virtual void smoothL1Bp(Matrix& outputV, Matrix& label) {
+  virtual void smoothL1Bp(Matrix& outputV, Matrix& label, real destScale) {
     LOG(FATAL) << "Not implemented";
   }
 
@@ -1736,8 +1736,8 @@ public:
   /// gradient of sumOfSquares.
   void sumOfSquaresBp(Matrix& outputV, Matrix& label);
 
-  void smoothL1(Matrix& output, Matrix& label);
-  void smoothL1Bp(Matrix& output, Matrix& label);
+  void smoothL1(Matrix& output, Matrix& label, real destScale);
+  void smoothL1Bp(Matrix& output, Matrix& label, real destScale);
 
   void tanh(Matrix& output);
   void tanhDerivative(Matrix& output);
diff --git a/paddle/math/SparseRowMatrix.cpp b/paddle/math/SparseRowMatrix.cpp
index b8c781ca1fd46c9840817abe26a20eec005c37e9..b086433fe535225ad05453b7d13c3846f5ce3c2b 100644
--- a/paddle/math/SparseRowMatrix.cpp
+++ b/paddle/math/SparseRowMatrix.cpp
@@ -54,7 +54,7 @@ void SparseRowCpuMatrix::zeroMem() {
   clearRows();
 }
 
-void SparseRowCpuMatrix::applyL1Decay(real learningRate, real decayRate) {
+void SparseRowCpuMatrix::applyL1(real learningRate, real decayRate) {
   apply([=](real* buf, size_t len) {
     CpuVector value(0, nullptr);
     value.subVecFrom(buf, 0, len);
diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h
index 1ccbf97b25922ae52377d7048da3a07012d21003..8704eb038d5d42ca834d232c0a651e9ffb2b40f3 100644
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -94,7 +94,7 @@ public:
   /**
    * apply L1 to all sparse rows, should be apply after indices ready.
    */
-  void applyL1Decay(real learningRate, real decayRate);
+  virtual void applyL1(real learningRate, real decayRate);
 
   void clearIndices() { clearRows(); }
   void zeroMemThread(size_t tid, size_t numThreads);
diff --git a/paddle/math/TensorEvaluate.h b/paddle/math/TensorEvaluate.h
index 9de2099b850d1723fe085eeed97c5b141629eec1..687bad37113b7426ff04ed5b2ad9449da4b88bb9 100644
--- a/paddle/math/TensorEvaluate.h
+++ b/paddle/math/TensorEvaluate.h
@@ -103,7 +103,10 @@ inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) {
 }
 #else
 template <class T, typename LeftType, typename RightType>
-inline void TensorGpuApply(LeftType& lhs, RightType& rhs) {}
+inline void TensorGpuApply(LeftType& lhs, RightType& rhs) {
+  LOG(FATAL) << "Since it is gcc compiled, "
+                "this calculation does not support GPU implementation.";
+}
 #endif
 
 }  // namespace paddle
diff --git a/paddle/parameter/FirstOrderOptimizer.cpp b/paddle/parameter/FirstOrderOptimizer.cpp
index dbb738e98b5874f5bb33026ad585a6c3ef327d1d..5938b2210c7174c9a0ce659220825b74af007db5 100644
--- a/paddle/parameter/FirstOrderOptimizer.cpp
+++ b/paddle/parameter/FirstOrderOptimizer.cpp
@@ -161,6 +161,7 @@ void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[],
                                         const ParameterConfig& config,
                                         size_t sparseId) const {
   CHECK(sparseId == -1LU) << "Sparse update is not supported";
+
   BaseMatrix& value = *vecs[PARAMETER_VALUE];
   BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
   BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
@@ -265,6 +266,7 @@ void AdamParameterOptimizer::update(const VectorPtr vecs[],
                                     const ParameterConfig& config,
                                     size_t sparseId) const {
   CHECK(sparseId == -1UL) << "Sparse update is not supported";
+
   real beta1_power = std::pow(beta1_, step_);
   real beta2_power = std::pow(beta2_, step_);
   real learningRate = config.learning_rate() * learningRate_;
@@ -303,18 +305,25 @@ void AdamaxParameterOptimizer::update(const VectorPtr vecs[],
 void OptimizerWithGradientClipping::update(const VectorPtr vecs[],
                                            const ParameterConfig& config,
                                            size_t sparseId) const {
+  real globalThreshold = optConfig_.gradient_clipping_threshold();
+  real localThreshold = config.gradient_clipping_threshold();
+
+  // Use the local (per-parameter) gradient clipping threshold if it is
+  // enabled (> 0); otherwise fall back to the global one.
+  real threshold = localThreshold > 0.0f ? localThreshold : globalThreshold;
+  const char* field = localThreshold > 0.0f ? "local" : "global";
+
   real maxAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsMax();
-  if (maxAbsGrad > config.gradient_clipping_threshold()) {
+  if (maxAbsGrad > threshold) {
     if (FLAGS_log_clipping) {
       real avgAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsSum() /
                         vecs[PARAMETER_GRADIENT]->getSize();
-      LOG(INFO) << "parameter=" << config.name() << " need clipping,"
-                << " max grad=" << maxAbsGrad << " avg grad=" << avgAbsGrad;
+      LOG(INFO) << "parameter=" << config.name() << " need clipping by "
+                << field << " threshold=" << threshold
+                << ", max grad=" << maxAbsGrad << ", avg grad=" << avgAbsGrad;
     }
-    vecs[PARAMETER_GRADIENT]->clip(-config.gradient_clipping_threshold(),
-                                   config.gradient_clipping_threshold());
+    vecs[PARAMETER_GRADIENT]->clip(-threshold, threshold);
   }
-
   optimizer_->update(vecs, config, sparseId);
 }
 
diff --git a/paddle/parameter/OptimizerWithRegularizer.cpp b/paddle/parameter/OptimizerWithRegularizer.cpp
index 85f13c8bc08c534224a1a8365d541737980b439f..7910b12444938a0555c211bb3dfd0f4209e480ec 100644
--- a/paddle/parameter/OptimizerWithRegularizer.cpp
+++ b/paddle/parameter/OptimizerWithRegularizer.cpp
@@ -131,7 +131,8 @@ ParameterOptimizer* OptimizerWithRegularizer::create(
     bool inPserver) {
   ParameterOptimizer* optimizer =
       ParameterOptimizer::create(optConfig, inPserver);
-  if (paraConfig.gradient_clipping_threshold() > 0.0f &&
+  if ((optConfig.gradient_clipping_threshold() > 0.0f ||
+       paraConfig.gradient_clipping_threshold() > 0.0f) &&
       !dynamic_cast<AddOptimizer*>(optimizer)) {
     optimizer = new OptimizerWithGradientClipping(optConfig, optimizer);
   }
diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp
index b8efabbe2a0b54edec64f6cee62b44c76ca7bf10..ebe36d49376882fe4c1013e19dcf71f452b3e501 100644
--- a/paddle/parameter/Parameter.cpp
+++ b/paddle/parameter/Parameter.cpp
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "OptimizerFunctions.h"
 #include "OptimizerWithRegularizer.h"
 #include "ParameterUpdateFunctions.h"
+#include "ThreadLocalBuffer.h"
 #include "hl_gpu.h"
 #include "paddle/math/CpuSparseMatrix.h"
 #include "paddle/math/MathUtils.h"
@@ -262,15 +263,6 @@ void Parameter::setMat(ParameterType pType, int matType) {
   }
 }
 
-SparsePrefetchRowCpuMatrix* Parameter::getPrefetchMatrix() {
-  MatrixPtr mat = mats_[PARAMETER_VALUE];
-  if (mat) {
-    return dynamic_cast<SparsePrefetchRowCpuMatrix*>(mat.get());
-  }
-
-  return nullptr;
-}
-
 void Parameter::incUpdate(const UpdateCallback& callback) {
   // Static parameter is fixed, and does not need to be updated
   if (isStatic()) {
@@ -422,37 +414,4 @@ bool Parameter::load(std::istream& s) {
   return true;
 }
 
-ThreadLocal<std::vector<VectorPtr>> Parameter::tlsTempBufs_;
-
-VectorPtr* Parameter::getTlsTempBufs() {
-  std::vector<VectorPtr>& bufs = *tlsTempBufs_;
-  if (bufs.empty()) {
-    bufs.resize(NUM_PARAMETER_TYPES);
-    for (auto& vec : bufs) {
-      vec.reset(new CpuVector(0, nullptr));
-    }
-  }
-  return bufs.data();
-}
-
-void Parameter::exec(ExecFunc func) {
-  auto execFunc = [this, func](int tid, size_t numThreads) {
-    if (numThreads == 1) {  // single thread
-      func(this->getBufs());
-    } else {  // multi thread
-      VectorPtr* vecs = Parameter::getTlsTempBufs();
-      auto interval = calcSplitArrayInterval(
-          this->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/);
-      for (size_t i = 0; i < (size_t)NUM_PARAMETER_TYPES; ++i) {
-        if (bufs_[i]) {
-          vecs[i]->subVecFrom(*bufs_[i], interval);
-        }
-      }
-      func(vecs);
-    }
-  };
-
-  getBuf(PARAMETER_VALUE)->exec(execFunc);
-}
-
 }  // namespace paddle
diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h
index 36d2b65f3bd1056a4ac6a1029000fe4cce6420ce..0bac76f068ec22bec52766b43e331fe109a34188 100644
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -40,17 +40,6 @@ class Parameter;
 typedef std::function<void(Parameter* param)> UpdateCallback;
 typedef std::function<void(int paramId, Parameter* param)> ParamInitCallback;
 
-struct Segment {
-  int64_t beginDim;
-  int64_t endDim;
-
-  // We allow the possibility that the parameters are not stored at contiguous
-  // memory locations for speed reason (i.e. data alignemnt)
-  // This means that the dimenstion is not same as the position in the memroy
-  // buffer.
-  int64_t beginPos;  // beginning position in the local value or grad buffer
-};
-
 class Parameter;
 typedef std::shared_ptr<Parameter> ParameterPtr;
 
@@ -167,13 +156,6 @@ public:
     }
   }
 
-  void enableSharedType(ParameterType type, VectorPtr vec, MatType matType) {
-    if (!bufs_[type]) {
-      bufs_[type] = vec;
-      setMat(type, matType);
-    }
-  }
-
   /// for batchGradientMachine: blockNum is number of partitions of the matrix.
   bool isGradShared(size_t* blockNum = NULL);
 
@@ -203,20 +185,6 @@ public:
 
   const MatrixPtr& getMat(ParameterType pType) const { return mats_[pType]; }
 
-  const IVectorPtr& getIntBuf(ParameterType pType) { return intBufs_[pType]; }
-
-  void setIntBuf(ParameterType pType, const IVectorPtr& iVec) {
-    intBufs_[pType] = iVec;
-  }
-
-  SparsePrefetchRowCpuMatrix* getPrefetchMatrix();
-
-  float getLearnRate() const { return config_.learning_rate(); }
-
-  float getInitMean() const { return config_.initial_mean(); }
-
-  float getInitStandardDeviation() const { return config_.initial_std(); }
-
   void setValueUpdated() { updated_ = true; }
 
   void clearValueUpdated() { updated_ = false; }
@@ -243,8 +211,6 @@ public:
    */
   bool load(std::istream& is);
 
-  std::vector<Segment>& getGradientSegments() { return gradSegments_; }
-
   void incShared() { sharedCount_++; }
 
   /**
@@ -351,35 +317,22 @@ protected:
 
   int sharedCount_;
   int updateCounter_;
-  std::vector<Segment> gradSegments_;  // segments of non-zero gradient
 
   bool updated_;
   SparseFormat format_;
 
-  static ThreadLocal<std::vector<VectorPtr>> tlsTempBufs_;
-
   std::vector<std::shared_ptr<IParameterUpdaterHook>> updaterHooks_;
 
 public:
   void setSharedCount(int cnt) { sharedCount_ = cnt; }
   int getSharedCount() { return sharedCount_; }
 
-  void singleUpdate(void* data);
   bool isSparse() { return config_.is_sparse(); }
   SparseFormat getFormat() { return format_; }
 
   static const std::string kMissParameterFail;
   static const std::string kMissParameterRand;
   static const std::string kMissParameterZero;
-
-  static VectorPtr* getTlsTempBufs();
-
-  /**
-   * exec a func in single/multi thread.
-   * vecs is bufs_ of Parameter, as input of ExecFunc.
-   */
-  typedef std::function<void(const VectorPtr vecs[])> ExecFunc;
-  void exec(ExecFunc func);
 };
 
 typedef std::map<std::string, ParameterPtr> ParameterMap;
diff --git a/paddle/parameter/ParameterOptimizer.h b/paddle/parameter/ParameterOptimizer.h
index 2bdc793d605e01f8e055087bb3e0973168cb0213..f98ba569b569379b30d034739a7f84aaf97108db 100644
--- a/paddle/parameter/ParameterOptimizer.h
+++ b/paddle/parameter/ParameterOptimizer.h
@@ -167,6 +167,7 @@ public:
     }
     parameterTypes_.push_back(type);
   }
+
   real getLearningRate() const { return learningRate_; }
 
   virtual void setNoDecay() { applyDecay_ = false; }
@@ -201,6 +202,7 @@ protected:
    * so, if lr change in StartBatch, please assign to learningRate_
    */
   real learningRate_;
+
   std::unique_ptr<LearningRateScheduler> learningRateScheduler_;
   int64_t pass_;  // current training pass (starting from 0)
   bool firstTime_;
diff --git a/paddle/parameter/ThreadLocalBuffer.cpp b/paddle/parameter/ThreadLocalBuffer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b21dd15245cf7c3d0418d37e6e8925c9e906f482
--- /dev/null
+++ b/paddle/parameter/ThreadLocalBuffer.cpp
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ThreadLocalBuffer.h"
+#include "Parameter.h"
+
+namespace paddle {
+namespace parameter {
+
+static ThreadLocal<std::vector<VectorPtr>> tlsTempBufs_;
+
+VectorPtr* getThreadLocalBuffer() {
+  std::vector<VectorPtr>& buffers = *tlsTempBufs_;
+  // Already populated by an earlier call: hand the existing slots back.
+  if (!buffers.empty()) return buffers.data();
+  buffers.resize(NUM_PARAMETER_TYPES);
+  for (auto& slot : buffers) {
+    slot.reset(new CpuVector(0, nullptr));
+  }
+  return buffers.data();
+}
+
+}  // namespace parameter
+}  // namespace paddle
diff --git a/paddle/parameter/ThreadLocalBuffer.h b/paddle/parameter/ThreadLocalBuffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..c916519c974a5bdeea407dcc1bc6d196756874ee
--- /dev/null
+++ b/paddle/parameter/ThreadLocalBuffer.h
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+namespace parameter {
+extern VectorPtr* getThreadLocalBuffer();
+}  // namespace parameter
+}  // namespace paddle
diff --git a/paddle/pserver/ParameterClient2.cpp b/paddle/pserver/ParameterClient2.cpp
index a97859f83fe6495b298e920346c964ef2a9b146c..f7e391f76324a09c203dfbbb449feb050caa8fb4 100644
--- a/paddle/pserver/ParameterClient2.cpp
+++ b/paddle/pserver/ParameterClient2.cpp
@@ -243,7 +243,8 @@ void ParameterClient2::prepareSendData(
     CHECK_GE(blockSize, 1LU) << "blockSize should > 0 " << blockSize;
     const auto paraSize = parameter->getSize();
     if (sparseUpdate) {
-      const auto prefetchMat = parameter->getPrefetchMatrix();
+      auto prefetchMat = std::dynamic_pointer_cast<SparsePrefetchRowCpuMatrix>(
+          parameter->getMat(PARAMETER_VALUE));
       CHECK(prefetchMat != nullptr) << "prefetchMat is nullptr";
       auto sendMat = dynamic_cast<SparseRowCpuMatrix*>(
           parameter->getMat(parameterType).get());
diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp
index 19ff40ba7e9584f772043f939bcb31caf666163d..41ac15336d3150417da1cf1631319604584991ec 100644
--- a/paddle/pserver/ParameterServer2.cpp
+++ b/paddle/pserver/ParameterServer2.cpp
@@ -18,7 +18,6 @@ limitations under the License. */
 #include <fstream>
 
 #include "paddle/math/SIMDFunctions.h"
-
 #include "paddle/parameter/AverageOptimizer.h"
 #include "paddle/parameter/FirstOrderOptimizer.h"
 #include "paddle/parameter/OptimizerFunctions.h"
@@ -26,6 +25,7 @@ limitations under the License. */
 #include "paddle/parameter/ParameterOptimizer.h"
 #include "paddle/parameter/ParameterUpdateFunctions.h"
 #include "paddle/parameter/Regularizer.h"
+#include "paddle/parameter/ThreadLocalBuffer.h"
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/Stat.h"
@@ -618,7 +618,7 @@ void ParameterServer2::asyncSGD(const SendParameterRequest& request,
 
   bool commitGradient = asyncGrdientCommitCheckAndStat(request);
 
-  VectorPtr* vecs = Parameter::getTlsTempBufs();
+  VectorPtr* vecs = parameter::getThreadLocalBuffer();
   size_t bufferIndex = 0;
   for (const auto& block : request.blocks()) {
     int64_t offset = getBlockOffset(block);
@@ -1051,15 +1051,15 @@ void ParameterServer2::clearUnusedSegments(CpuVector* vec) {
 }
 
 void ParameterServer2::parallelExecForEachBlock(ExecFunc func) {
-  SyncThreadPool::execHelper(syncThreadPool_.get(),
-                             [&](int tid, size_t numThreads) {
-                               int64_t numBlocks = blockIdMap_.size();
-                               VectorPtr* vecs = Parameter::getTlsTempBufs();
-                               for (int64_t blockId = tid; blockId < numBlocks;
-                                    blockId += numThreads) {
-                                 func(blockId, vecs);
-                               }
-                             });
+  SyncThreadPool::execHelper(
+      syncThreadPool_.get(), [&](int tid, size_t numThreads) {
+        int64_t numBlocks = blockIdMap_.size();
+        VectorPtr* vecs = parameter::getThreadLocalBuffer();
+        for (int64_t blockId = tid; blockId < numBlocks;
+             blockId += numThreads) {
+          func(blockId, vecs);
+        }
+      });
 }
 
 void ParameterServer2::blockTraverse(
diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py
index 7c6b83541002071d6e9d00c17be97b6ce4bf8528..edc2e0292378fea0cd904d7f017762c1dade6caf 100644
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -17,6 +17,7 @@ import collections
 import swig_paddle
 import numpy
 import itertools
+from functools import reduce
 
 __all__ = ['DataProviderConverter']
 
@@ -65,6 +66,8 @@ class IScanner(object):
 
         :param argument: Output arguments object.
         :type argument: swig_paddle.Arguments
+        :param dat: The input data.
+        :type dat: The Python object, numpy.array or List.
         :return:
         """
         pass
@@ -95,17 +98,35 @@ class DenseScanner(IScanner):
     def __init__(self, input_type, pos):
         IScanner.__init__(self, input_type, pos)
         self.__mat__ = None
+        self.__shape__ = None
         self.__height__ = 0
+        self.__dim__ = 0
 
     def pre_scan(self, dat):
         self.__height__ += 1
+        if self.__shape__ is None:
+            self.__shape__ = numpy.array(dat).shape
+            if len(self.__shape__) > 3:
+                raise ValueError(
+                    "The dimension of input cannot be greater than 3.")
+            # Initial value 1 keeps a scalar sample (shape ()) from raising.
+            self.__dim__ = reduce(lambda x, y: x * y, self.__shape__, 1)
+            if len(self.__shape__) == 1 and self.__dim__ != self.input_type.dim:
+                raise ValueError(
+                    "The data size must be equal to it in data layer.")
+        elif self.__shape__ != numpy.array(dat).shape:
+            raise ValueError(
+                "The data shape must be same in one mini-batch.")
 
     def finish_pre_scan(self, argument):
         self.__mat__ = numpy.ndarray(
-            shape=(self.__height__, self.input_type.dim), dtype=numpy.float32)
+            shape=(self.__height__, self.__dim__), dtype=numpy.float32)
         self.__height__ = 0
 
     def scan(self, dat):
+        # It's better to use NumPy array for speed.
+        dat = numpy.array(dat)
+        dat = dat.flatten()
         self.__mat__[self.__height__] = dat
         self.__height__ += 1
 
@@ -116,6 +137,14 @@ class DenseScanner(IScanner):
         m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True,
                                                     self.data_in_gpu)
         argument.setSlotValue(self.pos, m)
+        if len(self.__shape__) > 1:
+            # The last two dimensions are the frame height and width.
+            # For example, the layout is CHW for 3-D feature of image.
+            # The H and W are the frame height and width.
+            h, w = self.__shape__[-2:]
+            argument.setSlotFrameHeight(self.pos, h)
+            argument.setSlotFrameWidth(self.pos, w)
+        self.__shape__ = None
 
 
 class SparseBinaryScanner(IScanner):
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 101b44e6c62ecf0b84d65ee7b6e90e64bd7b3272..9f0f9f2d74db8e0b538adb8263e2844c2cf4b74f 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -21,6 +21,8 @@ cd /paddle/build
 
 # build script will not fail if *.deb does not exist
 rm *.deb 2>/dev/null || true
+# delete previous built whl packages
+rm -rf /paddle/paddle/dist 2>/dev/null || true
 
 cat <<EOF
 ========================================
@@ -131,8 +133,6 @@ cat > /paddle/build/Dockerfile <<EOF
 FROM ${BASE_IMAGE}
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ENV HOME /root
-ENV LANG en_US.UTF-8
-# Use Fix locales to en_US.UTF-8
 EOF
 
 if [[ -n ${APT_MIRROR} ]]; then
@@ -153,6 +153,7 @@ RUN apt-get update &&\
     paddle version
 ${DOCKERFILE_CUDNN_DSO}
 ${DOCKERFILE_GPU_ENV}
+
 # default command shows the paddle version and exit
 CMD ["paddle", "version"]
 EOF
diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh
new file mode 100644
index 0000000000000000000000000000000000000000..bfa10c91553563bddac8c1b41bf21490fb89d3cf
--- /dev/null
+++ b/paddle/scripts/docker/build_android.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+#
+# Cross-compile PaddlePaddle for Android (armeabi-v7a with NEON) and install
+# the result under /paddle/install, then print the installed paddle version.
+# Reads ANDROID_STANDALONE_TOOLCHAIN from the environment.
+
+set -xe
+
+mkdir -p /paddle/build
+cd /paddle/build
+# /paddle/install is a directory, so -r is needed to clear a previous install.
+rm -rf /paddle/install 2>/dev/null || true
+cmake -DCMAKE_SYSTEM_NAME=Android \
+      -DANDROID_STANDALONE_TOOLCHAIN="$ANDROID_STANDALONE_TOOLCHAIN" \
+      -DANDROID_ABI=armeabi-v7a \
+      -DANDROID_ARM_NEON=ON \
+      -DANDROID_ARM_MODE=ON \
+      -DHOST_C_COMPILER=/usr/bin/gcc \
+      -DHOST_CXX_COMPILER=/usr/bin/g++ \
+      -DCMAKE_INSTALL_PREFIX=/paddle/install \
+      -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+      -DCMAKE_C_FLAGS_RELWITHDEBINFO="-O3" \
+      -DCMAKE_CXX_FLAGS_RELWITHDEBINFO="-O3" \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      ..
+make -j "$(nproc)"
+make install
+
+export PATH=/paddle/install/bin:/paddle/install/opt/paddle/bin:$PATH
+paddle version
diff --git a/paddle/scripts/run_python_tests.sh b/paddle/scripts/run_python_tests.sh
index 02d2cdb977473c1032b06ffca59544b3ba98d1fa..1ed497aaeccdb629181809a0cbc48abb57ae4c44 100755
--- a/paddle/scripts/run_python_tests.sh
+++ b/paddle/scripts/run_python_tests.sh
@@ -24,12 +24,21 @@ PYTHON=$1; shift
 if [ $USE_VIRTUALENV_FOR_TEST -ne 0 ]; then
    rm -rf .test_env
    virtualenv .test_env
+   unset PYTHONHOME
+   unset PYTHONPATH
    source .test_env/bin/activate
    PYTHON=python
 fi
 
-export PYTHONPATH=$SCRIPTPATH/../../python/
-$PYTHON -m pip install $SCRIPTPATH/../dist/*.whl requests matplotlib opencv-python ipython==5.3
+$PYTHON -m pip install $SCRIPTPATH/../dist/*.whl
+
+if [ "X${PADDLE_PACKAGE_DIR}" != "X" ]; then
+   $PYTHON -m pip install ${PADDLE_PACKAGE_DIR}/*.whl
+else
+   export PYTHONPATH=$SCRIPTPATH/../../python/
+fi
+
+$PYTHON -m pip install ipython==5.3
 
 for fn in "$@"
 do
diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/docs.sh
index 67b89adb4ddb7bb93cb776d64711078cb11a2784..c784293695bf134b5e990639778b6e84ba45d00d 100755
--- a/paddle/scripts/travis/docs.sh
+++ b/paddle/scripts/travis/docs.sh
@@ -60,7 +60,6 @@ function deploy_docs() {
 
 deploy_docs "master" "." 
 deploy_docs "develop" "./develop/"
-deploy_docs "release/0.10.0" "./release/0.10.0/"
 
 # Check is there anything changed.
 set +e
diff --git a/paddle/trainer/RemoteParameterUpdater.cpp b/paddle/trainer/RemoteParameterUpdater.cpp
index 6939738203f41e0c1f7204d54834e34b2cd90682..7314266cb24da9b9e9f0f1cbe61ed363247f51fe 100644
--- a/paddle/trainer/RemoteParameterUpdater.cpp
+++ b/paddle/trainer/RemoteParameterUpdater.cpp
@@ -747,28 +747,32 @@ void SparseRemoteParameterUpdater::getParametersRemote(bool fullSize,
                                                        bool apply) {
   ParameterType sendBackParameterType =
       (useApplyInPserver_ && apply) ? PARAMETER_APPLY : PARAMETER_VALUE;
+  std::function<void()> getParams;
+  std::function<void(Parameter&, real)> applyL1;
   if (fullSize) {
-    parameterClient_->getParameter(
-        /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
-    if (config_.shrink_parameter_value() > 0) {
-      for (auto& para : parameters_) {
-        if (para->getConfig().decay_rate_l1() > 0) {
-          para->getBuf(PARAMETER_VALUE)
-              ->applyL1(1.0f,                               // learningRate
-                        config_.shrink_parameter_value());  // decayRate
-        }
-      }
-    }
+    getParams = [&] {
+      parameterClient_->getParameter(
+          /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
+    };
+    applyL1 = [](Parameter& para, real decayRate) {
+      para.getBuf(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate);
+    };
   } else {
-    REGISTER_TIMER("getParamSparse");
-    parameterClient_->getParameterSparse(
-        /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
+    getParams = [&] {
+      parameterClient_->getParameterSparse(
+          /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
+    };
+    applyL1 = [](Parameter& para, real decayRate) {
+      para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate);
+    };
+  }
+  {
+    REGISTER_TIMER("getParamDenseAndSparse");
+    getParams();
     if (config_.shrink_parameter_value() > 0) {
       for (auto& para : parameters_) {
         if (para->getConfig().decay_rate_l1() > 0) {
-          para->getPrefetchMatrix()->applyL1Decay(
-              1.0f,                               // learningRate
-              config_.shrink_parameter_value());  // decayRate
+          applyL1(*para, config_.shrink_parameter_value());
         }
       }
     }
diff --git a/paddle/trainer/ThreadParameterUpdater.cpp b/paddle/trainer/ThreadParameterUpdater.cpp
index 870d4a4b0246fe244bbd3796ec14449eb181aad2..3c85c3aaac68fc29da90c24d1208887a17009d5f 100644
--- a/paddle/trainer/ThreadParameterUpdater.cpp
+++ b/paddle/trainer/ThreadParameterUpdater.cpp
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/utils/Logging.h"
 
 #include "paddle/math/SparseRowMatrix.h"
+#include "paddle/parameter/ThreadLocalBuffer.h"
 #include "paddle/utils/Thread.h"
 
 DECLARE_int32(trainer_count);
@@ -98,7 +99,7 @@ void SgdThreadUpdater::threadTraverse(
     int tid,
     size_t numThreads,
     Parameter* para) {
-  VectorPtr* vecs = Parameter::getTlsTempBufs();
+  VectorPtr* vecs = parameter::getThreadLocalBuffer();
   if (para->isGradSparseUpdate()) {
     size_t height = para->getConfig().dims(0);
     size_t width = para->getConfig().dims(1);
@@ -214,7 +215,7 @@ void SgdThreadUpdater::threadUpdateSparse(int tid,
                                           Parameter* para) {
   int pid = para->getID();
   ParameterOptimizer* optimizer = optimizers_[pid].get();
-  VectorPtr* vecs = Parameter::getTlsTempBufs();
+  VectorPtr* vecs = parameter::getThreadLocalBuffer();
 
   size_t height = para->getConfig().dims(0);
   size_t width = para->getConfig().dims(1);
@@ -286,7 +287,7 @@ void SgdThreadUpdater::threadUpdateDense(int tid,
                                          Parameter* para) {
   int pid = para->getID();
   ParameterOptimizer* optimizer = optimizers_[pid].get();
-  VectorPtr* vecs = Parameter::getTlsTempBufs();
+  VectorPtr* vecs = parameter::getThreadLocalBuffer();
 
   auto interval = calcSplitArrayInterval(
       para->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/);
diff --git a/paddle/utils/CpuId.cpp b/paddle/utils/CpuId.cpp
index edd33c454122d95078e0fde2a2e9d68903951ee8..7186feef041eb3b1be459a506294f83f9a00ad94 100644
--- a/paddle/utils/CpuId.cpp
+++ b/paddle/utils/CpuId.cpp
@@ -19,19 +19,22 @@ limitations under the License. */
 /// for MSVC
 #define CPUID(info, x) __cpuidex(info, x, 0)
 
-#elif !defined(__ANDROID__)
+#else
 
+#if !defined(__arm__) && !defined(__aarch64__)
 #include <cpuid.h>
-
 /// for GCC/Clang
 #define CPUID(info, x) __cpuid_count(x, 0, info[0], info[1], info[2], info[3])
+#endif
 
 #endif
 
 namespace paddle {
 
 SIMDFlags::SIMDFlags() {
-#if !defined(__ANDROID__)
+#if defined(__arm__) || defined(__aarch64__)
+  simd_flags_ = SIMD_NEON;
+#else
   unsigned int cpuInfo[4];
   // CPUID: https://en.wikipedia.org/wiki/CPUID
   // clang-format off
@@ -52,8 +55,6 @@ SIMDFlags::SIMDFlags() {
   CPUID(cpuInfo, 0x80000001);
   simd_flags_ |= cpuInfo[2] & (1 << 16) ? SIMD_FMA4  : SIMD_NONE;
   // clang-fotmat on
-#else
-  simd_flags_ = SIMD_NEON;
 #endif
 }
 
diff --git a/paddle/utils/arch/linux/Locks.cpp b/paddle/utils/arch/linux/Locks.cpp
index 310c9a6542563891d4ba5888e58406ea28d6a2ce..3a0903d1f268cf0132da3de43396391219edf004 100644
--- a/paddle/utils/arch/linux/Locks.cpp
+++ b/paddle/utils/arch/linux/Locks.cpp
@@ -55,8 +55,11 @@ public:
 };
 
 #else
-
+// clang-format off
+#include <cstddef>
 #include <atomic>
+// clang-format on
+
 class SpinLockPrivate {
 public:
   inline void lock() {
diff --git a/paddle/utils/tests/test_SIMDFlags.cpp b/paddle/utils/tests/test_SIMDFlags.cpp
index 185789c927be19385d6ddc7a1889b6cc56109d38..a808d456a69866f72502bcf1ae244cec14738e22 100644
--- a/paddle/utils/tests/test_SIMDFlags.cpp
+++ b/paddle/utils/tests/test_SIMDFlags.cpp
@@ -19,7 +19,7 @@ using namespace paddle;  // NOLINT
 
 TEST(SIMDFlags, gccTest) {
 #if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) && \
-    !defined(__arm__)
+    !defined(__arm__) && !defined(__aarch64__)
   // clang-format off
   CHECK(!__builtin_cpu_supports("sse")    != HAS_SSE);
   CHECK(!__builtin_cpu_supports("sse2")   != HAS_SSE2);
diff --git a/proto/TrainerConfig.proto b/proto/TrainerConfig.proto
index a334e07b6282a6ff9867482e0c3a299df2a78d1d..a819d20d11ff3932d331801007b8cfb9c77a3f2b 100644
--- a/proto/TrainerConfig.proto
+++ b/proto/TrainerConfig.proto
@@ -128,6 +128,9 @@ message OptimizationConfig {
   // when async_lagged_grad_discard_ratio * num_gradient_servers commit passed,
   // current async gradient will be discard silently.
   optional double async_lagged_grad_discard_ratio = 37 [default = 1.5];
+
+  // global threshold for gradient clipping 
+  optional double gradient_clipping_threshold = 38 [default = 0.0];
 };
 
 message TrainerConfig {
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index bfa19d5ecc84a08614852c4c93de5b5793c1be9c..3640dd3a75ea212a84255ea7f6369b63606482ab 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -23,14 +23,17 @@ add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
 add_custom_target(paddle_python ALL DEPENDS
     ${OUTPUT_DIR}/.timestamp)
 
-add_subdirectory(paddle/trainer_config_helpers/tests)
-if (WITH_SWIG_PY)
-  # enable v2 API unittest only when paddle swig api is compiled
-  add_subdirectory(paddle/v2/tests)
-  add_subdirectory(paddle/v2/reader/tests)
-  add_subdirectory(paddle/v2/plot/tests)
-endif()
+set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
 
-install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/
+if (WITH_TESTING)
+  add_subdirectory(paddle/trainer_config_helpers/tests)
+  if (WITH_SWIG_PY)
+    # enable v2 API unittest only when paddle swig api is compiled
+    add_subdirectory(paddle/v2/tests)
+    add_subdirectory(paddle/v2/reader/tests)
+    add_subdirectory(paddle/v2/plot/tests)
+  endif()
+endif()
+install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
     DESTINATION opt/paddle/share/wheels
 )
diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py
index a36f0ebfdcb9f90f54ba2d688f9f4bcee2939ef3..7e305e2cd9fbe306368a44d08f7f66b4185ae2d2 100644
--- a/python/paddle/trainer/PyDataProvider2.py
+++ b/python/paddle/trainer/PyDataProvider2.py
@@ -72,9 +72,16 @@ class InputType(object):
 
 def dense_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
     """
-    Dense Vector. It means the input feature is dense float vector. For example,
-    if the input is an image with 28*28 pixels, the input of Paddle neural
-    network should be a dense vector with dimension 784.
+    Dense Array. It means the input feature is dense array with float type.
+    For example, if the input is an image with 28*28 pixels, the input of
+    Paddle neural network could be a dense vector with dimension 784 or a
+    numpy array with shape (28, 28).
+
+    For the 2-D convolution operation, each sample in one mini-batch must have
+    the same size in PaddlePaddle now. But, it supports variable-dimension
+    feature across mini-batch. For the variable-dimension, the param dim is not
+    used. While the data reader must yield numpy array and the data feeder will
+    set the data shape correctly.
 
     :param dim: dimension of this vector.
     :type dim: int
@@ -135,6 +142,10 @@ sparse_binary_vector = sparse_non_value_slot
 sparse_vector = sparse_value_slot
 integer_value = index_slot
 
+# dense_array can be used for variable-length input feature.
+# Each feature is not a vector, but a multi-dimensional array.
+dense_array = dense_slot
+
 
 def dense_vector_sequence(dim):
     """
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 57d30b088b873a94a11483aea536a9e4f6493129..5d540664a7f56b4fc27ecd5dc46bf36b0268eb98 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2320,6 +2320,9 @@ def Memory(name,
         memory_name = name + "+delay1"
     agent_name = memory_name
     if is_sequence:
+        config_assert(
+            boot_layer is not None,
+            "there must be boot_layer in network when is_sequence = True")
         agent_layer = SequenceAgentLayer(agent_name, size)
     else:
         agent_layer = AgentLayer(agent_name, size)
@@ -3368,12 +3371,13 @@ def make_importer(config_dir, config_args):
     return Import
 
 
-settings = dict(
+DEFAULT_SETTING = dict(
     batch_size=None,
     mini_batch_size=None,
     algorithm='async_sgd',
     async_lagged_grad_discard_ratio=1.5,
     learning_method='momentum',
+    gradient_clipping_threshold=None,
     num_batches_per_send_parameter=None,
     num_batches_per_get_parameter=None,
     center_parameter_update_method=None,
@@ -3400,6 +3404,8 @@ settings = dict(
     adam_beta2=0.999,
     adam_epsilon=1e-8, )
 
+settings = copy.deepcopy(DEFAULT_SETTING)
+
 settings_deprecated = dict(usage_ratio=1., )
 
 trainer_settings = dict(
@@ -3540,10 +3546,8 @@ def update_g_config():
     return g_config
 
 
-def parse_config(trainer_config, config_arg_str):
+def begin_parse(config_arg_str=''):
     '''
-    @param trainer_config: can be a string of config file name or a function name
-    with config logic
     @param config_arg_str: a string of the form var1=val1,var2=val2. It will be
     passed to config script as a dictionary CONFIG_ARGS
     '''
@@ -3551,12 +3555,23 @@ def parse_config(trainer_config, config_arg_str):
     for hook in _parse_config_hooks:
         hook()
 
-    config_args = {}
-
     logger.findCaller = find_caller
     logger.fatal = my_fatal
 
     g_config.model_config.type = "nn"
+
+    global g_current_submodel, g_root_submodel
+    g_root_submodel = g_config.model_config.sub_models.add()
+    g_root_submodel.name = 'root'
+    g_root_submodel.is_recurrent_layer_group = False
+    g_current_submodel = g_root_submodel
+
+
+def parse_config(trainer_config, config_arg_str):
+    begin_parse(config_arg_str)
+
+    config_args = {}
+
     if config_arg_str:
         config_args = dict([f.split('=') for f in config_arg_str.split(',')])
 
@@ -3569,14 +3584,6 @@ def parse_config(trainer_config, config_arg_str):
         extension_module = importlib(extension_module_name)
         g_extended_config_funcs = extension_module.get_config_funcs(g_config)
 
-    g_config.model_config.type = 'nn'
-
-    global g_current_submodel, g_root_submodel
-    g_root_submodel = g_config.model_config.sub_models.add()
-    g_root_submodel.name = 'root'
-    g_root_submodel.is_recurrent_layer_group = False
-    g_current_submodel = g_root_submodel
-
     if hasattr(trainer_config, '__call__'):
         trainer_config.func_globals.update(
             make_config_environment("", config_args))
diff --git a/python/paddle/trainer_config_helpers/activations.py b/python/paddle/trainer_config_helpers/activations.py
index 06be3e45993bedc2ccf9874e1ab503a9fdbba623..c749fa827fea4a808ab715dcb3442aa24d06a4d2 100644
--- a/python/paddle/trainer_config_helpers/activations.py
+++ b/python/paddle/trainer_config_helpers/activations.py
@@ -17,7 +17,7 @@ __all__ = [
     "IdentityActivation", "LinearActivation", 'SequenceSoftmaxActivation',
     'ExpActivation', "ReluActivation", "BReluActivation", "SoftReluActivation",
     "STanhActivation", "AbsActivation", "SquareActivation", "BaseActivation",
-    "LogActivation"
+    "LogActivation", "SqrtActivation", "ReciprocalActivation"
 ]
 
 
@@ -224,3 +224,27 @@ class LogActivation(BaseActivation):
 
     def __init__(self):
         BaseActivation.__init__(self, 'log', False)
+
+
+class SqrtActivation(BaseActivation):
+    """
+    Square Root Activation.
+
+    .. math::
+       f(z) = sqrt(z)
+    """
+
+    def __init__(self):
+        BaseActivation.__init__(self, 'sqrt', False)
+
+
+class ReciprocalActivation(BaseActivation):
+    """
+    Reciprocal Activation.
+
+    .. math::
+       f(z) = 1/z
+    """
+
+    def __init__(self):
+        BaseActivation.__init__(self, 'reciprocal', False)
diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py
index 7ae9e5cb3050fa6f70fa84785a1ddbdc68c70235..d1167a234caed3753c6beedfc89b01054e3688e1 100644
--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -110,15 +110,16 @@ class ParameterAttribute(object):
                  momentum=None,
                  gradient_clipping_threshold=None,
                  sparse_update=False):
-        # initialize strategy.
+        self.attr = {}
+
         if is_static:
-            self.attr = {'is_static': True}
-        elif initial_std is None and initial_mean is None and initial_max \
+            self.attr['is_static'] = True
+
+        if initial_std is None and initial_mean is None and initial_max \
                 is None and initial_min is None:
-            self.attr = {'initial_smart': True}
+            self.attr['initial_smart'] = True
         elif is_compatible_with(initial_std, float) or \
              is_compatible_with(initial_mean, float):
-            self.attr = dict()
             if initial_std is not None:
                 self.attr['initial_std'] = initial_std
             if initial_mean is not None:
@@ -131,7 +132,6 @@ class ParameterAttribute(object):
             assert initial_min < initial_max
             initial_mean = (initial_max + initial_min) / 2
             initial_std = initial_mean - initial_min
-            self.attr = dict()
             self.attr['initial_mean'] = initial_mean
             self.attr['initial_std'] = initial_std
             self.attr['initial_strategy'] = 1  # Uniform Random
diff --git a/python/paddle/trainer_config_helpers/config_parser.py b/python/paddle/trainer_config_helpers/config_parser.py
deleted file mode 100644
index 4b91b8d2824cd89ac0d6da696492bd9289b6e5f4..0000000000000000000000000000000000000000
--- a/python/paddle/trainer_config_helpers/config_parser.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.trainer.config_parser as config_parser
-'''
-This file is a wrapper of formal config_parser. The main idea of this file is to 
-separete different config logic into different function, such as network configuration
- and optimizer configuration.
-'''
-
-__all__ = [
-    "parse_trainer_config", "parse_network_config", "parse_optimizer_config"
-]
-
-
-def parse_trainer_config(trainer_conf, config_arg_str):
-    return config_parser.parse_config(trainer_conf, config_arg_str)
-
-
-def parse_network_config(network_conf):
-    config = config_parser.parse_config(network_conf, '')
-    return config.model_config
-
-
-def parse_optimizer_config(optimizer_conf):
-    config = config_parser.parse_config(optimizer_conf, '')
-    return config.opt_config
diff --git a/python/paddle/trainer_config_helpers/config_parser_utils.py b/python/paddle/trainer_config_helpers/config_parser_utils.py
index 681b177a55f48d02a8ff792945dd7cc3b05cd976..ee5bbbfb2de7640ebef04edce34332ce4f44c67e 100644
--- a/python/paddle/trainer_config_helpers/config_parser_utils.py
+++ b/python/paddle/trainer_config_helpers/config_parser_utils.py
@@ -12,15 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
 import paddle.trainer.config_parser as config_parser
+from paddle.proto.TrainerConfig_pb2 import OptimizationConfig
 '''
-This file is a wrapper of formal config_parser. The main idea of this file is to 
+This file is a wrapper of formal config_parser. The main idea of this file is to
 separete different config logic into different function, such as network configuration
  and optimizer configuration.
 '''
 
 __all__ = [
-    "parse_trainer_config", "parse_network_config", "parse_optimizer_config"
+    "parse_trainer_config", "parse_network_config", "parse_optimizer_config",
+    "reset_parser"
 ]
 
 
@@ -34,5 +37,15 @@ def parse_network_config(network_conf, config_arg_str=''):
 
 
 def parse_optimizer_config(optimizer_conf, config_arg_str=''):
-    config = config_parser.parse_config(optimizer_conf, config_arg_str)
-    return config.opt_config
+    config_parser.settings = copy.deepcopy(config_parser.DEFAULT_SETTING)
+    optimizer_conf()
+    opt_config = OptimizationConfig()
+    for k, v in config_parser.settings.iteritems():
+        if v is None:
+            continue
+        opt_config.__setattr__(k, v)
+    return opt_config
+
+
+def reset_parser():
+    config_parser.begin_parse()
diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py
index 567521ee9dbadb7a2502cfb9972ef0940e1e410a..a5234f3e47f6caa4b365de593648e0ee5ad6e4a2 100644
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
@@ -347,32 +347,71 @@ def chunk_evaluator(
         excluded_chunk_types=None, ):
     """
     Chunk evaluator is used to evaluate segment labelling accuracy for a
-    sequence. It calculates the chunk detection F1 score.
+    sequence. It calculates precision, recall and F1 scores for the chunk detection.
 
-    A chunk is correctly detected if its beginning, end and type are correct.
-    Other chunk type is ignored.
+    To use chunk evaluator, several concepts need to be clarified first.
 
-    For each label in the label sequence, we have:
+    * **Chunk type** is the type of the whole chunk and a chunk consists of one or several words.  (For example in NER, ORG for organization name, PER for person name etc.)
 
-    .. code-block:: python
+    * **Tag type** indicates the position of a word in a chunk. (B for begin, I for inside, E for end, S for single)
+    We can name a label by combining tag type and chunk type. (i.e. B-ORG for beginning of an organization name)
 
-       tagType = label % numTagType
-       chunkType = label / numTagType
-       otherChunkType = numChunkTypes
+    The construction of label dictionary should obey the following rules:
 
-    The total number of different labels is numTagType*numChunkTypes+1.
-    We support 4 labelling scheme.
-    The tag type for each of the scheme is shown as follows:
+    - Use one of the listed labelling schemes. These schemes differ in ways indicating chunk boundary.
 
-    .. code-block:: python
+    .. code-block:: text
+
+        Scheme    Description                                                                                  
+        plain    Use the same label for the whole chunk.
+        IOB      Two labels for chunk type X, B-X for chunk beginning and I-X for chunk inside.
+        IOE      Two labels for chunk type X, E-X for chunk ending and I-X for chunk inside.
+        IOBES    Four labels for chunk type X, B-X for chunk beginning, I-X for chunk inside, E-X for chunk end and S-X for single word chunk.
+   
+    To make it clear, let's illustrate by an NER example.
+    Assuming that there are three named entity types including ORG, PER and LOC which are called 'chunk type' here,
+    if 'IOB' scheme were used, the label set will be extended to a set including B-ORG, I-ORG, B-PER, I-PER, B-LOC, I-LOC and O,
+    in which B-ORG stands for the beginning of ORG and I-ORG for the inside of ORG.
+    Prefixes which are called 'tag type' here are added to chunk types and there are two tag types including B and I.
+    Of course, the training data should be labeled accordingly.
+
+    - Mapping is done correctly by the listed equations and assigning protocol.
+
+    The following equations extract tag type and chunk type from a label.
+
+    .. code-block:: text
+
+        tagType = label % numTagType
+        chunkType = label / numTagType
+        otherChunkType = numChunkTypes
+    
+    The following table shows the mapping rule between tagType and tag type in each scheme.
+
+    .. code-block:: text
+
+        Scheme Begin Inside End   Single
+        plain  0     -      -     -
+        IOB    0     1      -     -
+        IOE    -     0      1     -
+        IOBES  0     1      2     3
+
+    Continuing the NER example, the label dict should look like this to satisfy the above equations:
+
+    .. code-block:: text
 
-       Scheme Begin Inside End   Single
-       plain  0     -      -     -
-       IOB    0     1      -     -
-       IOE    -     0      1     -
-       IOBES  0     1      2     3
+        B-ORG  0
+        I-ORG  1
+        B-PER  2
+        I-PER  3
+        B-LOC  4
+        I-LOC  5
+        O      6
 
-    'plain' means the whole chunk must contain exactly the same chunk label.
+    In this example, chunkType has three values: 0 for ORG, 1 for PER, 2 for LOC. Because the scheme is
+    "IOB", tagType has two values: 0 for B and 1 for I.
+    Here we will use I-LOC to explain the above mapping rules in detail.
+    For I-LOC, the label id is 5, so we can get tagType=1 and chunkType=2, which means I-LOC is a part of NER chunk LOC
+    and the tag is I.
 
     The simple usage is:
 
@@ -380,6 +419,7 @@ def chunk_evaluator(
 
        eval = chunk_evaluator(input, label, chunk_scheme, num_chunk_types)
 
+    
     :param input: The input layers.
     :type input: LayerOutput
     :param label: An input layer containing the ground truth label.
diff --git a/python/paddle/trainer_config_helpers/layer_math.py b/python/paddle/trainer_config_helpers/layer_math.py
index 544b443825393c9a31c0375724d4ca63dac5c5eb..e1c8f0c3500413b364546bc3352cf3a64d3581de 100644
--- a/python/paddle/trainer_config_helpers/layer_math.py
+++ b/python/paddle/trainer_config_helpers/layer_math.py
@@ -40,6 +40,8 @@ register_unary_math_op('sigmoid', act.SigmoidActivation())
 register_unary_math_op('tanh', act.TanhActivation())
 register_unary_math_op('square', act.SquareActivation())
 register_unary_math_op('relu', act.ReluActivation())
+register_unary_math_op('sqrt', act.SqrtActivation())
+register_unary_math_op('reciprocal', act.ReciprocalActivation())
 
 
 def add(layeroutput, other):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 31652613fb3a55636b32babbc4bde60d65776c61..5667e5ff2bccd38f2da00a3b17ea8bc8e3a6fb8e 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -111,6 +111,7 @@ __all__ = [
     'block_expand_layer',
     'maxout_layer',
     'out_prod_layer',
+    'printer_layer',
     'print_layer',
     'priorbox_layer',
     'cross_channel_norm_layer',
@@ -119,6 +120,7 @@ __all__ = [
     'eos_layer',
     'smooth_l1_cost',
     'layer_support',
+    'multiplex_layer',
 ]
 
 
@@ -185,6 +187,7 @@ class LayerType(object):
     MAXOUT = "maxout"
     SPP_LAYER = "spp"
     PAD_LAYER = "pad"
+    MULTIPLEX_LAYER = "multiplex"
 
     PRINT_LAYER = "print"
     PRIORBOX_LAYER = "priorbox"
@@ -225,8 +228,29 @@ class LayerType(object):
 
 
 class AggregateLevel(object):
-    EACH_TIMESTEP = 'non-seq'
-    EACH_SEQUENCE = 'seq'
+    """
+    PaddlePaddle supports three sequence types:
+
+    - :code:`SequenceType.NO_SEQUENCE` means the sample is not a sequence.
+    - :code:`SequenceType.SEQUENCE` means the sample is a sequence.
+    - :code:`SequenceType.SUB_SEQUENCE` means the sample is a nested sequence,
+      each timestep of which is also a sequence.
+
+    Accordingly, AggregateLevel supports two modes:
+
+    - :code:`AggregateLevel.TO_NO_SEQUENCE` means the aggregation acts on each
+      timestep of a sequence, both :code:`SUB_SEQUENCE` and :code:`SEQUENCE` will
+      be aggregated to :code:`NO_SEQUENCE`.
+
+    - :code:`AggregateLevel.TO_SEQUENCE` means the aggregation acts on each
+      sequence of a nested sequence, :code:`SUB_SEQUENCE` will be aggregated to
+      :code:`SEQUENCE`.
+    """
+    TO_NO_SEQUENCE = 'non-seq'
+    TO_SEQUENCE = 'seq'
+    # compatible with previous configuration
+    EACH_TIMESTEP = TO_NO_SEQUENCE
+    EACH_SEQUENCE = TO_SEQUENCE
 
 
 class LayerOutput(object):
@@ -267,6 +291,7 @@ class LayerOutput(object):
         assert size is not None
         assert LayerType.is_layer_type(layer_type)
         self.name = name
+        self.full_name = MakeLayerNameInSubmodel(name)
         self.layer_type = layer_type
         if parents is not None and type(parents) != list:
             parents = [parents]
@@ -461,7 +486,7 @@ def table_projection(input, size=0, param_attr=None):
     return proj
 
 
-def identity_projection(input, offset=None):
+def identity_projection(input, offset=None, size=None):
     """
     1. IdentityProjection if offset=None. It performs:
 
@@ -502,8 +527,10 @@ def identity_projection(input, offset=None):
         proj = IdentityProjection(input_layer_name=input.name)
         proj.origin = input
     else:
+        if size is None:
+            size = input.size - offset
         proj = IdentityOffsetProjection(
-            input_layer_name=input.name, offset=offset)
+            input_layer_name=input.name, offset=offset, size=size)
         proj.origin = input
     return proj
 
@@ -943,7 +970,7 @@ def fc_layer(input,
 
 
 @wrap_name_default("print")
-def print_layer(input, name=None):
+def printer_layer(input, name=None):
     """
     Print the output value of input layers. This layer is useful for debugging.
 
@@ -965,6 +992,13 @@ def print_layer(input, name=None):
         inputs=[l.name for l in input], )
     # this layer don't return anything, can not be input of other layer.
 
+# Keep print_layer for compatibility with the V1 API.
+# 'print_layer' does not work for the V2 API because it would be renamed to
+# 'print' there, and 'print' is a reserved keyword in Python.
+
+
+print_layer = printer_layer
+
 
 @wrap_name_default("priorbox")
 def priorbox_layer(input,
@@ -1060,7 +1094,7 @@ def pooling_layer(input,
                   pooling_type=None,
                   name=None,
                   bias_attr=None,
-                  agg_level=AggregateLevel.EACH_TIMESTEP,
+                  agg_level=AggregateLevel.TO_NO_SEQUENCE,
                   layer_attr=None):
     """
     Pooling layer for sequence inputs, not used for Image.
@@ -1071,10 +1105,10 @@ def pooling_layer(input,
 
        seq_pool = pooling_layer(input=layer,
                                 pooling_type=AvgPooling(),
-                                agg_level=AggregateLevel.EACH_SEQUENCE)
+                                agg_level=AggregateLevel.TO_NO_SEQUENCE)
 
-    :param agg_level: AggregateLevel.EACH_TIMESTEP or
-                      AggregateLevel.EACH_SEQUENCE
+    :param agg_level: AggregateLevel.TO_NO_SEQUENCE or
+                      AggregateLevel.TO_SEQUENCE
     :type agg_level: AggregateLevel
     :param name: layer name.
     :type name: basestring
@@ -1344,7 +1378,7 @@ def grumemory(input,
 @layer_support()
 def last_seq(input,
              name=None,
-             agg_level=AggregateLevel.EACH_TIMESTEP,
+             agg_level=AggregateLevel.TO_NO_SEQUENCE,
              stride=-1,
              layer_attr=None):
     """
@@ -1379,7 +1413,7 @@ def last_seq(input,
                        " series information at all. Maybe you want to use"
                        " first_seq instead.")
 
-    if agg_level == AggregateLevel.EACH_SEQUENCE:
+    if agg_level == AggregateLevel.TO_SEQUENCE:
         assert stride == -1
 
     Layer(
@@ -1400,7 +1434,7 @@ def last_seq(input,
 @layer_support()
 def first_seq(input,
               name=None,
-              agg_level=AggregateLevel.EACH_TIMESTEP,
+              agg_level=AggregateLevel.TO_NO_SEQUENCE,
               stride=-1,
               layer_attr=None):
     """
@@ -1436,7 +1470,7 @@ def first_seq(input,
                        ' time series information at all. Maybe you want to use'
                        ' last_seq instead.')
 
-    if agg_level == AggregateLevel.EACH_SEQUENCE:
+    if agg_level == AggregateLevel.TO_SEQUENCE:
         assert stride == -1
 
     Layer(
@@ -1454,8 +1488,23 @@ def first_seq(input,
 
 
 class ExpandLevel(object):
-    FROM_TIMESTEP = AggregateLevel.EACH_TIMESTEP
-    FROM_SEQUENCE = AggregateLevel.EACH_SEQUENCE
+    """
+    Please refer to AggregateLevel first.
+
+    ExpandLevel supports two modes:
+
+    - :code:`ExpandLevel.FROM_NO_SEQUENCE` means the expansion acts on
+      :code:`NO_SEQUENCE`, which will be expanded to
+      :code:`SEQUENCE` or :code:`SUB_SEQUENCE`.
+
+    - :code:`ExpandLevel.FROM_SEQUENCE` means the expansion acts on
+      :code:`SEQUENCE`, which will be expanded to
+      :code:`SUB_SEQUENCE`.
+    """
+    FROM_NO_SEQUENCE = AggregateLevel.TO_NO_SEQUENCE
+    FROM_SEQUENCE = AggregateLevel.TO_SEQUENCE
+    # compatible with previous configuration
+    FROM_TIMESTEP = FROM_NO_SEQUENCE
 
 
 @wrap_name_default()
@@ -1464,7 +1513,7 @@ def expand_layer(input,
                  expand_as,
                  name=None,
                  bias_attr=False,
-                 expand_level=ExpandLevel.FROM_TIMESTEP,
+                 expand_level=ExpandLevel.FROM_NO_SEQUENCE,
                  layer_attr=None):
     """
     A layer for "Expand Dense data or (sequence data where the length of each
@@ -1476,7 +1525,7 @@ def expand_layer(input,
 
        expand = expand_layer(input=layer1,
                              expand_as=layer2,
-                             expand_level=ExpandLevel.FROM_TIMESTEP)
+                             expand_level=ExpandLevel.FROM_NO_SEQUENCE)
 
     :param input: Input layer
     :type input: LayerOutput
@@ -2763,7 +2812,7 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
     if layer_type == LayerType.CONCAT_LAYER:
         assert not bias_attr
 
-    Layer(
+    layer = Layer(
         name=name,
         type=layer_type,
         inputs=[x.name for x in input] if is_concat_layer else input,
@@ -2771,13 +2820,7 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
         bias=ParamAttr.to_bias(bias_attr),
         **ExtraLayerAttribute.to_kwargs(layer_attr))
 
-    sz = 0
-    for each_input in input:
-        if each_input.size is not None:
-            sz += each_input.size
-        else:
-            sz = None
-            break
+    sz = layer.config.size
 
     return LayerOutput(
         name,
@@ -2881,11 +2924,11 @@ def memory(name,
     to specify the layer needs to be remembered as the following:
 
     .. code-block:: python
+
        mem = memory(size=256)
        state = fc_layer(input=mem, size=256)
        mem.set_input(mem)
 
-
     :param name: the name of the layer which this memory remembers.
                  If name is None, user should call set_input() to specify the
                  name of the layer which this memory remembers.
@@ -2945,7 +2988,7 @@ def memory(name,
 @layer_support()
 def lstm_step_layer(input,
                     state,
-                    size,
+                    size=None,
                     act=None,
                     name=None,
                     gate_act=None,
@@ -3011,6 +3054,9 @@ def lstm_step_layer(input,
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
+
+    assert size is None or state.size == size
+    size = state.size
     Layer(
         name=name,
         type=LayerType.LSTM_STEP_LAYER,
@@ -3018,7 +3064,7 @@ def lstm_step_layer(input,
         active_gate_type=gate_act.name,
         active_state_type=state_act.name,
         bias=ParamAttr.to_bias(bias_attr),
-        size=size,
+        size=state.size,
         inputs=[input.name, state.name],
         **ExtraLayerAttribute.to_kwargs(layer_attr))
 
@@ -3369,7 +3415,7 @@ def recurrent_group(step,
                           else, for training or testing, one of the input type must
                           be LayerOutput.
 
-    : type is_generating: bool
+    :type is_generating: bool
 
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -3458,6 +3504,11 @@ def recurrent_group(step,
 
     RecurrentLayerGroupEnd(name=name)
 
+    for layer_out in layer_outs:
+        # The previous full_name is the name in the rnn group
+        # We need a full_name outside the rnn group
+        layer_out.full_name = MakeLayerNameInSubmodel(layer_out.name)
+
     if len(layer_outs) == 1:
         return layer_outs[0]
     else:
@@ -3765,13 +3816,13 @@ def __cost_input__(input, label, weight=None):
 
 @wrap_name_default()
 @layer_support()
-def mse_cost(input, label, weight=None, name=None, layer_attr=None):
+def mse_cost(input, label, weight=None, name=None, coeff=1.0, layer_attr=None):
     """
     mean squared error cost:
 
     ..  math::
 
-        \frac{1}{N}\sum_{i=1}^N(t_i-y_i)^2
+        \\frac{1}{N}\sum_{i=1}^N(t_i-y_i)^2
 
     :param name: layer name.
     :type name: basestring
@@ -3782,6 +3833,8 @@ def mse_cost(input, label, weight=None, name=None, layer_attr=None):
     :param weight: The weight affects the cost, namely the scale of cost.
                    It is an optional argument.
     :type weight: LayerOutput
+    :param coeff: The coefficient affects the gradient in the backward.
+    :type coeff: float
     :param layer_attr: layer's extra attribute.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
@@ -3793,6 +3846,7 @@ def mse_cost(input, label, weight=None, name=None, layer_attr=None):
         inputs=ipts,
         type="square_error",
         name=name,
+        coeff=coeff,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(name, LayerType.COST, parents=parents, size=1)
 
@@ -4723,21 +4777,36 @@ def warp_ctc_layer(input,
                    layer_attr=None):
     """
     A layer intergrating the open-source `warp-ctc
-    <https://github.com/baidu-research/warp-ctc>` library, which is used in
+    <https://github.com/baidu-research/warp-ctc>`_ library, which is used in
     `Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin
-    <https://arxiv.org/pdf/1512.02595v1.pdf>`, to compute Connectionist Temporal
-    Classification (CTC) loss.
+    <https://arxiv.org/pdf/1512.02595v1.pdf>`_, to compute Connectionist Temporal
+    Classification (CTC) loss. Besides, another `warp-ctc
+    <https://github.com/gangliao/warp-ctc>`_ repository, which is forked from
+    the official one, is maintained to enable more compiling options. During the
+    building process, PaddlePaddle will clone the source codes, build and
+    install it to :code:`third_party/install/warpctc` directory.
+
+    To use warp_ctc layer, you need to specify the path of :code:`libwarpctc.so`,
+    using following methods:
+
+    1. Set it in :code:`paddle.init` (python api) or :code:`paddle_init` (c api),
+    such as :code:`paddle.init(use_gpu=True,
+    warpctc_dir=your_paddle_source_dir/third_party/install/warpctc/lib)`.
+
+    2. Set environment variable LD_LIBRARY_PATH on Linux or DYLD_LIBRARY_PATH
+    on Mac OS. For instance, :code:`export
+    LD_LIBRARY_PATH=your_paddle_source_dir/third_party/install/warpctc/lib:$LD_LIBRARY_PATH`.
 
     More details of CTC can be found by referring to `Connectionist Temporal
     Classification: Labelling Unsegmented Sequence Data with Recurrent
     Neural Networks <http://machinelearning.wustl.edu/mlpapers/paper_files/
-    icml2006_GravesFGS06.pdf>`_
+    icml2006_GravesFGS06.pdf>`_.
 
     Note:
         - Let num_classes represent the category number. Considering the 'blank'
-          label needed by CTC, you need to use (num_classes + 1) as the input
-          size. Thus, the size of both warp_ctc_layer and 'input' layer should
-          be set to num_classes + 1.
+          label needed by CTC, you need to use (num_classes + 1) as the input size.
+          Thus, the size of both warp_ctc layer and 'input' layer should be set to
+          num_classes + 1.
         - You can set 'blank' to any value ranged in [0, num_classes], which
           should be consistent as that used in your labels.
         - As a native 'softmax' activation is interated to the warp-ctc library,
@@ -4798,6 +4867,7 @@ def crf_layer(input,
               weight=None,
               param_attr=None,
               name=None,
+              coeff=1.0,
               layer_attr=None):
     """
     A layer for calculating the cost of sequential conditional random
@@ -4824,6 +4894,8 @@ def crf_layer(input,
     :type param_attr: ParameterAttribute
     :param name: The name of this layers. It is not necessary.
     :type name: None|basestring
+    :param coeff: The coefficient affects the gradient in the backward.
+    :type coeff: float
     :param layer_attr: Extra Layer config.
     :type layer_attr: ExtraLayerAttribute|None
     :return: LayerOutput object.
@@ -4848,6 +4920,7 @@ def crf_layer(input,
         type=LayerType.CRF_LAYER,
         size=size,
         inputs=ipts,
+        coeff=coeff,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     parents = [input, label]
     if weight is not None:
@@ -4921,12 +4994,14 @@ def crf_decoding_layer(input,
 
 @wrap_act_default(act=SigmoidActivation())
 @wrap_bias_attr_default(has_bias=True)
+@wrap_param_attr_default()
 @wrap_name_default()
 @layer_support()
 def nce_layer(input,
               label,
-              num_classes,
+              num_classes=None,
               act=None,
+              param_attr=None,
               weight=None,
               num_neg_samples=10,
               neg_distribution=None,
@@ -4942,7 +5017,8 @@ def nce_layer(input,
 
     .. code-block:: python
 
-       cost = nce_layer(input=layer1, label=layer2, weight=layer3,
+       cost = nce_layer(input=[layer1, layer2], label=layer2,
+                        param_attr=[attr1, attr2], weight=layer3,
                         num_classes=3, neg_distribution=[0.1,0.3,0.6])
 
     :param name: layer name
@@ -4957,6 +5033,8 @@ def nce_layer(input,
     :type num_classes: int
     :param act: Activation, default is Sigmoid.
     :type act: BaseActivation
+    :param param_attr: The parameter attribute, or a list with one attribute per input.
+    :type param_attr: ParameterAttribute|list
     :param num_neg_samples: number of negative samples. Default is 10.
     :type num_neg_samples: int
     :param neg_distribution: The distribution for generating the random negative labels.
@@ -4972,9 +5050,20 @@ def nce_layer(input,
     """
     if isinstance(input, LayerOutput):
         input = [input]
+        assert not isinstance(param_attr, collections.Sequence)
+        param_attr = [param_attr]
+    else:
+        if isinstance(param_attr, collections.Sequence):
+            assert len(input) == len(param_attr)
+        else:
+            param_attr = [copy.deepcopy(param_attr) for _ in range(len(input))]
+
     assert isinstance(input, collections.Sequence)
+
     assert isinstance(label, LayerOutput)
     assert label.layer_type == LayerType.DATA
+    if num_classes is None:
+        num_classes = label.size
     if neg_distribution is not None:
         assert isinstance(neg_distribution, collections.Sequence)
         assert len(neg_distribution) == num_classes
@@ -4984,9 +5073,9 @@ def nce_layer(input,
 
     ipts_for_layer = []
     parents = []
-    for each_input in input:
+    for each_input, attr in zip(input, param_attr):
         assert isinstance(each_input, LayerOutput)
-        ipts_for_layer.append(each_input.name)
+        ipts_for_layer.append(Input(each_input.name, **attr.attr))
         parents.append(each_input)
     ipts_for_layer.append(label.name)
     parents.append(label)
@@ -5363,7 +5452,7 @@ def multi_binary_label_cross_entropy(input,
 
 @wrap_name_default()
 @layer_support()
-def smooth_l1_cost(input, label, name=None, layer_attr=None):
+def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
     """
     This is a L1 loss but more smooth. It requires that the
     size of input and label are equal. The formula is as follows,
@@ -5392,6 +5481,8 @@ def smooth_l1_cost(input, label, name=None, layer_attr=None):
     :type input: LayerOutput
     :param name: The name of this layers. It is not necessary.
     :type name: None|basestring
+    :param coeff: The coefficient affects the gradient in the backward.
+    :type coeff: float
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
@@ -5405,6 +5496,58 @@ def smooth_l1_cost(input, label, name=None, layer_attr=None):
         name=name,
         type=LayerType.SMOOTH_L1,
         inputs=[input.name, label.name],
+        coeff=coeff,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(
         name, LayerType.SMOOTH_L1, parents=[input, label], size=1)
+
+
+@wrap_name_default()
+def multiplex_layer(input, name=None, layer_attr=None):
+    """
+    This layer multiplexes multiple layers according to the index,
+    which is provided by the first input layer.
+    inputs[0]: the index of the layer to output of size batchSize.
+    inputs[1:N]: the candidate output data.
+    For each index i from 0 to batchSize - 1, the i-th row of the output is
+    the i-th row of the (index[i] + 1)-th layer:
+
+    .. math::
+
+        y[i][j] = x_{x_{0}[i] + 1}[i][j], j = 0,1, ... , (x_{1}.width - 1)
+
+    where, y is output. :math:`x_{k}` is the k-th input layer and
+    :math:`k = x_{0}[i] + 1`.
+
+    .. code-block:: python
+
+       multiplex = multiplex_layer(input=layers)
+
+    :param input: Input layers.
+    :type input: list of LayerOutput
+    :param name: Layer name.
+    :type name: basestring
+    :param layer_attr: extra layer attributes.
+    :type layer_attr: ExtraLayerAttribute
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(input, collections.Sequence)
+    assert len(input) > 2, 'multiplex_layer should have more than 2 inputs'
+    for i in range(1, len(input)):
+        assert isinstance(input[i], LayerOutput)
+        assert input[i].size == input[1].size, \
+            "All the input layers except the first one should have the same size"
+
+    l = Layer(
+        name=name,
+        type='multiplex',
+        inputs=[x.name for x in input],
+        size=input[1].size,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.MULTIPLEX_LAYER,
+        parents=input,
+        size=l.config.size)
diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py
index a53ebe160be3b5d6d115e3e15d059d3d87e80942..c3495ee110bfaf91a47637a52e88b3bb56dce7a9 100644
--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ b/python/paddle/trainer_config_helpers/optimizers.py
@@ -408,7 +408,8 @@ def settings(batch_size,
 
     args = [
         'batch_size', 'learning_rate', 'learning_rate_decay_a',
-        'learning_rate_decay_b', 'learning_rate_schedule', 'learning_rate_args'
+        'learning_rate_decay_b', 'learning_rate_schedule', 'learning_rate_args',
+        'gradient_clipping_threshold'
     ]
     kwargs = dict()
     kwargs['algorithm'] = algorithm
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index c5dc8e1aab08d38936d8636c219571d0cf6f4906..981ccbf248391b5db4339570d918404df6033f3d 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -5,6 +5,6 @@ last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
 img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
 test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
-test_seq_concat_reshape test_pad test_smooth_l1)
+test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py b/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py
index 3c6dbc95e54898ca1e44c3dc010c9fb73a3bee30..f87237f9b59a833825841bcdd605c2332c2d5941 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py
@@ -6,7 +6,7 @@ din = data_layer(name='data', size=30)
 
 seq_op = [first_seq, last_seq]
 
-agg_level = [AggregateLevel.EACH_SEQUENCE, AggregateLevel.EACH_TIMESTEP]
+agg_level = [AggregateLevel.TO_SEQUENCE, AggregateLevel.TO_NO_SEQUENCE]
 
 opts = []
 
@@ -15,6 +15,7 @@ for op in seq_op:
         opts.append(op(input=din, agg_level=al))
 
 for op in seq_op:
-    opts.append(op(input=din, agg_level=AggregateLevel.EACH_TIMESTEP, stride=5))
+    opts.append(
+        op(input=din, agg_level=AggregateLevel.TO_NO_SEQUENCE, stride=5))
 
 outputs(opts)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
index 24c901c8ee3ab1c90fc14fbff761db06345a6313..a607a62c99f69ac4921a465a20f00b6413b31c8e 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
@@ -4,6 +4,8 @@ settings(batch_size=1000, learning_rate=1e-5)
 
 x = data_layer(name='data', size=100)
 x = layer_math.exp(x)
+x = layer_math.sqrt(x)
+x = layer_math.reciprocal(x)
 x = layer_math.log(x)
 x = layer_math.abs(x)
 x = layer_math.sigmoid(x)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
index 9b8a2ad9687d313e6c5017c2d7331eddf539af92..eaaf7fd6f5b4cec1a2f95622831cf95436a1514a 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
@@ -20,13 +20,43 @@ layers {
     }
   }
 }
+layers {
+  name: "__sqrt_0__"
+  type: "mixed"
+  size: 100
+  active_type: "sqrt"
+  inputs {
+    input_layer_name: "__exp_0__"
+    proj_conf {
+      type: "identity"
+      name: "___sqrt_0__.w0"
+      input_size: 100
+      output_size: 100
+    }
+  }
+}
+layers {
+  name: "__reciprocal_0__"
+  type: "mixed"
+  size: 100
+  active_type: "reciprocal"
+  inputs {
+    input_layer_name: "__sqrt_0__"
+    proj_conf {
+      type: "identity"
+      name: "___reciprocal_0__.w0"
+      input_size: 100
+      output_size: 100
+    }
+  }
+}
 layers {
   name: "__log_0__"
   type: "mixed"
   size: 100
   active_type: "log"
   inputs {
-    input_layer_name: "__exp_0__"
+    input_layer_name: "__reciprocal_0__"
     proj_conf {
       type: "identity"
       name: "___log_0__.w0"
@@ -351,6 +381,8 @@ sub_models {
   name: "root"
   layer_names: "data"
   layer_names: "__exp_0__"
+  layer_names: "__sqrt_0__"
+  layer_names: "__reciprocal_0__"
   layer_names: "__log_0__"
   layer_names: "__abs_0__"
   layer_names: "__sigmoid_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
index 05fd1c99d2db6e9faa3b3884ec9baf051791f9fe..05847344be60b4de42a7dd709914fd3da524d1ae 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
@@ -215,6 +215,22 @@ layers {
   }
   coeff: 1.0
 }
+layers {
+  name: "__nce_layer_0__"
+  type: "nce"
+  size: 1
+  active_type: "sigmoid"
+  inputs {
+    input_layer_name: "__fc_layer_0__"
+    input_parameter_name: "___nce_layer_0__.w0"
+  }
+  inputs {
+    input_layer_name: "labels"
+  }
+  bias_parameter_name: "___nce_layer_0__.wbias"
+  num_classes: 5000
+  num_neg_samples: 10
+}
 parameters {
   name: "___fc_layer_0__.w0"
   size: 800
@@ -245,6 +261,26 @@ parameters {
   initial_strategy: 0
   initial_smart: true
 }
+parameters {
+  name: "___nce_layer_0__.w0"
+  size: 20000
+  initial_mean: 0.0
+  initial_std: 0.0141421356237
+  dims: 5000
+  dims: 4
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___nce_layer_0__.wbias"
+  size: 5000
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 5000
+  initial_strategy: 0
+  initial_smart: false
+}
 input_layer_names: "input"
 input_layer_names: "labels"
 input_layer_names: "crf_label"
@@ -267,6 +303,7 @@ output_layer_names: "__cross_entropy_with_selfnorm_0__"
 output_layer_names: "__huber_cost_0__"
 output_layer_names: "__multi_binary_label_cross_entropy_0__"
 output_layer_names: "__sum_cost_0__"
+output_layer_names: "__nce_layer_0__"
 sub_models {
   name: "root"
   layer_names: "input"
@@ -292,6 +329,7 @@ sub_models {
   layer_names: "__huber_cost_0__"
   layer_names: "__multi_binary_label_cross_entropy_0__"
   layer_names: "__sum_cost_0__"
+  layer_names: "__nce_layer_0__"
   input_layer_names: "input"
   input_layer_names: "labels"
   input_layer_names: "crf_label"
@@ -314,6 +352,7 @@ sub_models {
   output_layer_names: "__huber_cost_0__"
   output_layer_names: "__multi_binary_label_cross_entropy_0__"
   output_layer_names: "__sum_cost_0__"
+  output_layer_names: "__nce_layer_0__"
   is_recurrent_layer_group: false
 }
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
index 3244181a63109335c4fba6ca4dd04ac8f0446313..b7d74f85ab4ca3f434dfa45516dfee7227b6ceee 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
@@ -60,6 +60,31 @@ layers {
   }
   coeff: 1.0
 }
+layers {
+  name: "multi_class_label"
+  type: "data"
+  size: 500
+  active_type: ""
+}
+layers {
+  name: "__nce_layer_0__"
+  type: "nce"
+  size: 1
+  active_type: "sigmoid"
+  inputs {
+    input_layer_name: "__fc_layer_0__"
+    input_parameter_name: "___nce_layer_0__.w0"
+  }
+  inputs {
+    input_layer_name: "multi_class_label"
+  }
+  inputs {
+    input_layer_name: "weight"
+  }
+  bias_parameter_name: "___nce_layer_0__.wbias"
+  num_classes: 500
+  num_neg_samples: 10
+}
 parameters {
   name: "___fc_layer_0__.w0"
   size: 3000
@@ -80,9 +105,30 @@ parameters {
   initial_strategy: 0
   initial_smart: false
 }
+parameters {
+  name: "___nce_layer_0__.w0"
+  size: 5000
+  initial_mean: 0.0
+  initial_std: 0.04472135955
+  dims: 500
+  dims: 10
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___nce_layer_0__.wbias"
+  size: 500
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 500
+  initial_strategy: 0
+  initial_smart: false
+}
 input_layer_names: "input"
 input_layer_names: "label"
 input_layer_names: "weight"
+input_layer_names: "multi_class_label"
 output_layer_names: "__cost_0__"
 output_layer_names: "__mse_cost_0__"
 evaluators {
@@ -100,9 +146,12 @@ sub_models {
   layer_names: "__fc_layer_0__"
   layer_names: "__cost_0__"
   layer_names: "__mse_cost_0__"
+  layer_names: "multi_class_label"
+  layer_names: "__nce_layer_0__"
   input_layer_names: "input"
   input_layer_names: "label"
   input_layer_names: "weight"
+  input_layer_names: "multi_class_label"
   output_layer_names: "__cost_0__"
   output_layer_names: "__mse_cost_0__"
   evaluator_names: "classification_error_evaluator"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multiplex_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multiplex_layer.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..379842ba8d32fa7cdad448dd86559c7d02f58e0a
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multiplex_layer.protostr
@@ -0,0 +1,63 @@
+type: "nn"
+layers {
+  name: "index"
+  type: "data"
+  size: 1
+  active_type: ""
+}
+layers {
+  name: "data1"
+  type: "data"
+  size: 30
+  active_type: ""
+}
+layers {
+  name: "data2"
+  type: "data"
+  size: 30
+  active_type: ""
+}
+layers {
+  name: "data3"
+  type: "data"
+  size: 30
+  active_type: ""
+}
+layers {
+  name: "__multiplex_layer_0__"
+  type: "multiplex"
+  size: 30
+  active_type: ""
+  inputs {
+    input_layer_name: "index"
+  }
+  inputs {
+    input_layer_name: "data1"
+  }
+  inputs {
+    input_layer_name: "data2"
+  }
+  inputs {
+    input_layer_name: "data3"
+  }
+}
+input_layer_names: "index"
+input_layer_names: "data1"
+input_layer_names: "data2"
+input_layer_names: "data3"
+output_layer_names: "__multiplex_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "index"
+  layer_names: "data1"
+  layer_names: "data2"
+  layer_names: "data3"
+  layer_names: "__multiplex_layer_0__"
+  input_layer_names: "index"
+  input_layer_names: "data1"
+  input_layer_names: "data2"
+  input_layer_names: "data3"
+  output_layer_names: "__multiplex_layer_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
index 18ff6b48c495b7a9d61595916ade1a54b1fa6a10..d2a3b702a1d7b650947b344e4719098f68d4dd73 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
@@ -40,4 +40,6 @@ outputs(
             name='huber_label', size=1)),
     multi_binary_label_cross_entropy(
         input=probs, label=xe_label),
-    sum_cost(input=hidden))
+    sum_cost(input=hidden),
+    nce_layer(
+        input=hidden, label=labels))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
index 1c0aa7f9b9ee45b9eaf82dc46a2648d834dcd4ad..c369062930e2b067ceab0dc3b25ba6c1eabe2450 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
@@ -11,4 +11,9 @@ outputs(
     classification_cost(
         input=fc, label=lbl, weight=wt),
     mse_cost(
-        input=fc, label=lbl, weight=wt))
+        input=fc, label=lbl, weight=wt),
+    nce_layer(
+        input=fc,
+        label=data_layer(
+            name='multi_class_label', size=500),
+        weight=wt))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py
index 81e5161ebc165f13ebb919fd3c0fe617167be048..c53f10e0a410b27d86b2415d98178c4790e0b0ba 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py
@@ -9,4 +9,6 @@ outputs(
     expand_layer(
         input=din, expand_as=data_seq, expand_level=ExpandLevel.FROM_SEQUENCE),
     expand_layer(
-        input=din, expand_as=data_seq, expand_level=ExpandLevel.FROM_TIMESTEP))
+        input=din,
+        expand_as=data_seq,
+        expand_level=ExpandLevel.FROM_NO_SEQUENCE))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d250001932547d63a70de05940957f90cc014dfb
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py
@@ -0,0 +1,12 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+index = data_layer(name='index', size=1)
+din1 = data_layer(name='data1', size=30)
+din2 = data_layer(name='data2', size=30)
+din3 = data_layer(name='data3', size=30)
+
+dout = multiplex_layer([index, din1, din2, din3])
+
+outputs(dout)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
index f67b6364d88560dfedb07428869482795be6af0c..3c49eb56c1363a6a3f365fe56e16a8b484c8a004 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
@@ -6,7 +6,7 @@ din = data_layer(name='dat_in', size=100)
 
 POOL_TYPE = [MaxPooling, AvgPooling, SumPooling]
 
-AGG_LEVEL = [AggregateLevel.EACH_SEQUENCE, AggregateLevel.EACH_TIMESTEP]
+AGG_LEVEL = [AggregateLevel.TO_SEQUENCE, AggregateLevel.TO_NO_SEQUENCE]
 
 opts = []
 
diff --git a/python/paddle/utils/make_model_diagram.py b/python/paddle/utils/make_model_diagram.py
index 1370ea83a49955d3152a1147f8e8108371a8ae12..40f99075de7fb2401b3b704afe1eb44dbe6072dd 100644
--- a/python/paddle/utils/make_model_diagram.py
+++ b/python/paddle/utils/make_model_diagram.py
@@ -39,6 +39,10 @@ def make_layer_label(layer_config):
 
 def make_diagram(config_file, dot_file, config_arg_str):
     config = parse_config(config_file, config_arg_str)
+    make_diagram_from_proto(config.model_config, dot_file)
+
+
+def make_diagram_from_proto(model_config, dot_file):
     # print >> sys.stderr, config
     name2id = {}
     f = open(dot_file, 'w')
@@ -59,12 +63,12 @@ def make_diagram(config_file, dot_file, config_arg_str):
 
     print >> f, 'digraph graphname {'
     print >> f, 'node [width=0.375,height=0.25];'
-    for i in xrange(len(config.model_config.layers)):
-        l = config.model_config.layers[i]
+    for i in xrange(len(model_config.layers)):
+        l = model_config.layers[i]
         name2id[l.name] = i
 
     i = 0
-    for sub_model in config.model_config.sub_models:
+    for sub_model in model_config.sub_models:
         if sub_model.name == 'root':
             continue
         print >> f, 'subgraph cluster_%s {' % i
@@ -78,18 +82,18 @@ def make_diagram(config_file, dot_file, config_arg_str):
         for layer_name in sub_model.layer_names:
             submodel_layers.add(layer_name)
             lid = name2id[layer_name]
-            layer_config = config.model_config.layers[lid]
+            layer_config = model_config.layers[lid]
             label = make_layer_label(layer_config)
             print >> f, 'l%s [label="%s", shape=box];' % (lid, label)
         print >> f, '}'
 
-    for i in xrange(len(config.model_config.layers)):
-        l = config.model_config.layers[i]
+    for i in xrange(len(model_config.layers)):
+        l = model_config.layers[i]
         if l.name not in submodel_layers:
             label = make_layer_label(l)
             print >> f, 'l%s [label="%s", shape=box];' % (i, label)
 
-    for sub_model in config.model_config.sub_models:
+    for sub_model in model_config.sub_models:
         if sub_model.name == 'root':
             continue
         for link in sub_model.in_links:
@@ -99,8 +103,8 @@ def make_diagram(config_file, dot_file, config_arg_str):
         for mem in sub_model.memories:
             print >> f, make_mem(mem)
 
-    for i in xrange(len(config.model_config.layers)):
-        for l in config.model_config.layers[i].inputs:
+    for i in xrange(len(model_config.layers)):
+        for l in model_config.layers[i].inputs:
             print >> f, 'l%s -> l%s [label="%s"];' % (
                 name2id[l.input_layer_name], i, l.input_parameter_name)
 
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
index 851fe7060fd52120603ebabb4069d67471aa05d0..b9d0a7f29138cae281236b26509a56738f3801f4 100644
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -27,6 +27,7 @@ from . import dataset
 from . import reader
 from . import plot
 import attr
+import op
 import pooling
 import inference
 import networks
diff --git a/python/paddle/v2/config_base.py b/python/paddle/v2/config_base.py
index acda778e0aee1a8339ad6bd0d719868151d4fabe..d9613e001ac784c9fbee3cd182bdd78354c540a7 100644
--- a/python/paddle/v2/config_base.py
+++ b/python/paddle/v2/config_base.py
@@ -14,206 +14,55 @@
 
 import collections
 import re
-from paddle.trainer_config_helpers.default_decorators import wrap_name_default
 import paddle.trainer_config_helpers as conf_helps
-from topology import Topology
-
-
-class LayerType(type):
-    def __new__(cls, name, bases, attrs):
-        method_name = attrs.get('METHOD_NAME', None)
-        if method_name is not None:
-            method = getattr(conf_helps, method_name)
-            if method.__doc__ is not None:
-                mapper = attrs.get("__map_docstr__", None)
-                if mapper is not None:
-                    attrs['__doc__'] = LayerType.__map_docstr__(
-                        mapper(method.__doc__),
-                        method_name=method_name,
-                        name=name)
-                else:
-                    attrs['__doc__'] = LayerType.__map_docstr__(
-                        method.__doc__, method_name=method_name, name=name)
-        return super(LayerType, cls).__new__(cls, name, bases, attrs)
-
-    @staticmethod
-    def __map_docstr__(doc, name, method_name):
-        assert isinstance(doc, basestring)
-
-        # replace LayerOutput to paddle.v2.config_base.Layer
-        doc = doc.replace("LayerOutput", "paddle.v2.config_base.Layer")
-
-        doc = doc.replace('ParameterAttribute',
-                          'paddle.v2.attr.ParameterAttribute')
-
-        doc = re.sub(r'ExtraLayerAttribute[^\s]?',
-                     'paddle.v2.attr.ExtraAttribute', doc)
-
-        # xxx_layer to xxx
-        doc = re.sub(r"(?P<name>[a-z]+)_layer", r"\g<name>", doc)
-
-        # XxxxActivation to paddle.v2.Activation.Xxxx
-        doc = re.sub(r"(?P<name>[A-Z][a-zA-Z]+)Activation",
-                     r"paddle.v2.Activation.\g<name>", doc)
-
-        # TODO(yuyang18): Add more rules if needed.
+
+__layer_map__ = {}
+
+
+def __map_docstr__(doc, name):
+    if doc is None:
         return doc
 
+    assert isinstance(doc, basestring)
+
+    # replace LayerOutput to paddle.v2.config_base.Layer
+    doc = doc.replace("LayerOutput", "paddle.v2.config_base.Layer")
+
+    doc = doc.replace('ParameterAttribute', 'paddle.v2.attr.ParameterAttribute')
+
+    doc = re.sub(r'ExtraLayerAttribute[^\s]?', 'paddle.v2.attr.ExtraAttribute',
+                 doc)
+
+    # xxx_layer to xxx
+    doc = re.sub(r"(?P<name>[a-z]+)_layer", r"\g<name>", doc)
+
+    # XxxxActivation to paddle.v2.activation.Xxxx
+    doc = re.sub(r"(?P<name>[A-Z][a-zA-Z]+)Activation",
+                 r"paddle.v2.activation.\g<name>", doc)
+
+    # xxx_evaluator to evaluator.xxx (NOTE(review): no paddle.v2 prefix - confirm)
+    doc = re.sub(r"(?P<name>[a-z]+)_evaluator", r"evaluator.\g<name>", doc)
+
+    # TODO(yuyang18): Add more rules if needed.
+    return doc
+
+
+def __convert_to_v2__(f, name, module):
+    # Wrap f so that every LayerOutput it returns is recorded in
+    # __layer_map__ under its full_name. The wrapper takes the given name and
+    # module, and f's docstring rewritten by __map_docstr__.
+    def wrapped(*args, **xargs):
+        result = f(*args, **xargs)
+        is_seq = isinstance(result, collections.Sequence)
+        for item in (result if is_seq else [result]):
+            if isinstance(item, conf_helps.LayerOutput):
+                __layer_map__[item.full_name] = item
+        return result
+
+    wrapped.__module__ = module
+    wrapped.__name__ = name
+    wrapped.__doc__ = __map_docstr__(f.__doc__, name)
+    return wrapped
+
 
-class Layer(object):
-    __metaclass__ = LayerType
-
-    def __init__(self, name=None, parent_layers=None):
-        assert isinstance(parent_layers, dict)
-        self.name = name
-        self.__context__ = {}
-        self.__parent_layers__ = parent_layers
-        # some layer may have some extra parent layer
-        self.__extra_parent__ = []
-        # used for evaluator.
-        self.__children_layers__ = []
-
-    def extra_parent(self):
-        return self.__extra_parent__
-
-    def append_extra_parent(self, parent):
-        self.__extra_parent__.append(parent)
-
-    def append_child(self, layer, parent_names):
-        self.__children_layers__.append((layer, parent_names))
-
-    def to_proto(self, context):
-        """
-        function to set proto attribute
-        """
-        self.__context__ = context
-
-        # STEP: short cut if this layer is parsed before.
-        if self.context_name() in context:
-            if self.use_context_name():
-                return context[self.context_name()]
-            else:
-                return context[self.name]
-
-        # STEP: parse extra_parent that is not used by this layer but must
-        # be parsed before this layer.
-        for p in self.__extra_parent__:
-            p.to_proto(context=context)
-
-        # STEP: parse parent that is used by this layer, get the result and
-        # insert into kwargs of the next layer's to_proto_impl method.
-        kwargs = dict()
-        for layer_name in self.__parent_layers__:
-            if not isinstance(self.__parent_layers__[layer_name],
-                              collections.Sequence):
-                v1_layer = self.__parent_layers__[layer_name].to_proto(
-                    context=context)
-            else:
-                v1_layer = map(lambda x: x.to_proto(context=context),
-                               self.__parent_layers__[layer_name])
-            kwargs[layer_name] = v1_layer
-
-        # STEP: parse myself and add myself into context.
-        ret_val = self.to_proto_impl(**kwargs)
-        if self.context_name() is not None \
-                and self.context_name() not in context:
-            context[self.context_name()] = ret_val
-
-        # STEP: parse children that should be pased after this layer.
-        for layer, pnames in self.__children_layers__:
-            drop = False
-
-            # child will only be parsed if all parents are in context.
-            for pname in pnames:
-                if pname not in context:
-                    drop = True
-                    break
-            if drop:
-                continue
-            layer.to_proto(context=context)
-
-        # STEP: return v1 layer result
-        if self.context_name() is None:
-            return ret_val
-        elif self.use_context_name():
-            return context[self.context_name()]
-        else:
-            return context[self.name]
-
-    def to_proto_impl(self, **kwargs):
-        raise NotImplementedError()
-
-    def context_name(self):
-        """
-        Context name means the context which stores `to_proto_impl` result.
-        If multiple layer share same context_name, the `to_proto_impl` of them
-        will be invoked only once.
-        """
-        return self.name
-
-    def use_context_name(self):
-        return False
-
-    def calculate_size(self):
-        """
-        lazy calculate size of the layer, should be called when to_proto_impl of
-        this layer is called.
-        :return:
-        """
-        return self.__context__[self.context_name()].size
-
-    def attr(self):
-        topo = Topology(self)
-        return topo.get_layer_proto(self.name)
-
-
-def __convert_to_v2__(method_name,
-                      parent_names,
-                      is_default_name=True,
-                      attach_parent=False):
-    if is_default_name:
-        wrapper = wrap_name_default(name_prefix=method_name)
-    else:
-        wrapper = None
-
-    class V2LayerImpl(Layer):
-        METHOD_NAME = method_name
-
-        def __init__(self, **kwargs):
-            parent_layers = dict()
-            other_kwargs = dict()
-            for pname in parent_names:
-                if pname in kwargs:
-                    parent_layers[pname] = kwargs[pname]
-
-            if attach_parent:
-                pnames = [x.context_name() for x in parent_layers.values()]
-
-                for pname in parent_layers:
-                    layers = kwargs[pname]
-                    if not isinstance(layers, collections.Sequence):
-                        layers = [layers]
-
-                    for layer in layers:
-                        layer.append_child(self, pnames)
-
-            for key in kwargs.keys():
-                if key not in parent_names:
-                    other_kwargs[key] = kwargs[key]
-
-            name = kwargs.get('name', None)
-            super(V2LayerImpl, self).__init__(name, parent_layers)
-            self.__other_kwargs__ = other_kwargs
-
-        if wrapper is not None:
-            __init__ = wrapper(__init__)
-
-        def to_proto_impl(self, **kwargs):
-            args = dict()
-            for each in kwargs:
-                args[each] = kwargs[each]
-            for each in self.__other_kwargs__:
-                args[each] = self.__other_kwargs__[each]
-            return getattr(conf_helps, method_name)(**args)
-
-    return V2LayerImpl
+Layer = conf_helps.LayerOutput
diff --git a/python/paddle/v2/data_type.py b/python/paddle/v2/data_type.py
index d582f76ddf01ed3430a1d075624bbb8e0bf3f2a9..226997465f2ec97c6224b248427739592e9694df 100644
--- a/python/paddle/v2/data_type.py
+++ b/python/paddle/v2/data_type.py
@@ -16,7 +16,8 @@ import paddle.trainer.PyDataProvider2 as pydp2
 
 import_list = [
     nm for nm in dir(pydp2)
-    if '_' in nm and nm[0] != '_' and ('value' in nm or 'vector' in nm)
+    if '_' in nm and nm[0] != '_' and ('value' in nm or 'vector' in nm or
+                                       'array' in nm)
 ]
 import_list.extend(['InputType'])
 
diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py
index 80ff6295c34e853d8f69b9e78719af23a56d1fbb..26252d5bbd77ddb70b4f03843679e4737e2f96d3 100644
--- a/python/paddle/v2/dataset/__init__.py
+++ b/python/paddle/v2/dataset/__init__.py
@@ -24,8 +24,9 @@ import conll05
 import uci_housing
 import sentiment
 import wmt14
+import mq2007
 
 __all__ = [
     'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'sentiment'
-    'uci_housing', 'wmt14'
+    'uci_housing', 'wmt14', 'mq2007'
 ]
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
index 2eb018b8d60e9a8bd0091836ab56c35b05786fca..418b592a5ac638cc61b86a9b3fbdcee1e3a0bcaf 100644
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -19,8 +19,10 @@ import shutil
 import sys
 import importlib
 import paddle.v2.dataset
+import cPickle
+import glob
 
-__all__ = ['DATA_HOME', 'download', 'md5file']
+__all__ = ['DATA_HOME', 'download', 'md5file', 'split', 'cluster_files_reader']
 
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
 
@@ -74,3 +76,76 @@ def fetch_all():
             getattr(
                 importlib.import_module("paddle.v2.dataset.%s" % module_name),
                 "fetch")()
+
+
+def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
+    """
+    Dump the items of a reader into files holding line_count items each.
+
+    you can call the function as:
+
+    split(paddle.v2.dataset.cifar.train10(), line_count=1000,
+        suffix="cifar-train-%05d.pickle")
+
+    the output files are cifar-train-00000.pickle, cifar-train-00001.pickle, ...
+
+    :param reader: is a reader creator
+    :param line_count: item count for each file; only the last file may
+                hold fewer items
+    :param suffix: the name pattern of the output files, should contain "%d"
+                which is replaced by the id of each file. Default is "%05d.pickle"
+    :param dumper: is a callable function that dumps an object to a file; it
+                is called as dumper(obj, f), where obj is the list of items
+                to dump and f is a file object. Default is cPickle.dump.
+    """
+    if not callable(dumper):
+        raise TypeError("dumper should be callable.")
+    lines = []
+    indx_f = 0
+    for d in reader():
+        lines.append(d)
+        # flush whenever the buffer is full, so that the first file is not
+        # one item longer than the others
+        if len(lines) >= line_count:
+            with open(suffix % indx_f, "w") as f:
+                dumper(lines, f)
+            lines = []
+            indx_f += 1
+    if lines:
+        with open(suffix % indx_f, "w") as f:
+            dumper(lines, f)
+
+
+def cluster_files_reader(files_pattern,
+                         trainer_count,
+                         trainer_id,
+                         loader=cPickle.load):
+    """
+    Create a reader that yields the elements of the files selected for this
+    trainer: the sorted files whose index % trainer_count == trainer_id.
+
+    :param files_pattern: glob pattern of the files, e.g. generated by split(...)
+    :param trainer_count: total trainer count
+    :param trainer_id: the trainer rank id
+    :param loader: a callable that loads an object from a file; it is called
+                as loader(f), f being the opened file object, and its result
+                is iterated. Default is cPickle.load
+    """
+
+    def reader():
+        if not callable(loader):
+            raise TypeError("loader should be callable.")
+        file_list = glob.glob(files_pattern)
+        file_list.sort()
+        my_file_list = []
+        for idx, fn in enumerate(file_list):
+            if idx % trainer_count == trainer_id:
+                print "append file: %s" % fn
+                my_file_list.append(fn)
+        for fn in my_file_list:
+            with open(fn, "r") as f:
+                lines = loader(f)
+                for line in lines:
+                    yield line
+
+    return reader
diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/v2/dataset/mq2007.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd71b341662ca6f540ce44a86348e782561a97d7
--- /dev/null
+++ b/python/paddle/v2/dataset/mq2007.py
@@ -0,0 +1,337 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+MQ2007 dataset
+
+MQ2007 is a query set from Million Query track of TREC 2007. There are about 1700 queries in it with labeled documents. In MQ2007, the 5-fold cross
+validation strategy is adopted and the 5-fold partitions are included in the package. In each fold, there are three subsets for learning: training set,
+validation set and testing set.
+
+This module fetches the MQ2007 dataset (see
+http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar) and parses its training set and test set into paddle reader creators.
+
+"""
+
+import os
+import random
+import functools
+import rarfile
+from common import download
+import numpy as np
+
+# URL = "http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar"
+URL = "http://www.bigdatalab.ac.cn/benchmark/upload/download_source/7b6dbbe2-842c-11e4-a536-bcaec51b9163_MQ2007.rar"
+MD5 = "7be1640ae95c6408dab0ae7207bdc706"
+
+
+def __initialize_meta_info__():
+    """
+    fetch the MQ2007 rar archive, extract it next to the archive file and return that directory
+    """
+    fn = fetch()
+    rar = rarfile.RarFile(fn)
+    dirpath = os.path.dirname(fn)
+    rar.extractall(path=dirpath)
+    return dirpath
+
+
+class Query(object):
+    """
+    one query-document pair for learning to rank: a relevance score and a feature vector
+
+    Parameters:
+    ----------
+    query_id : int
+      query_id in dataset, mapping from query to relevance documents
+    relevance_score : int
+      relevance score of query and document pair
+    feature_vector : array, dense feature
+      feature in vector format
+    description : string
+      comment section in query doc pair data
+    """
+
+    def __init__(self,
+                 query_id=-1,
+                 relevance_score=-1,
+                 feature_vector=None,
+                 description=""):
+        self.query_id = query_id
+        self.relevance_score = relevance_score
+        if feature_vector is None:
+            self.feature_vector = []
+        else:
+            self.feature_vector = feature_vector
+        self.description = description
+
+    def __str__(self):
+        string = "%s %s %s" % (str(self.relevance_score), str(self.query_id),
+                               " ".join(str(f) for f in self.feature_vector))
+        return string
+
+    def _parse_(self, text):
+        """
+        parse line into self; return self, or None unless 48 fields are found
+        """
+        # format : 0 qid:10 1:0.000272 2:0.000000 .... # comment
+        # partition() also copes with lines that carry no '#' comment.
+        line, _, comment = text.partition('#')
+        self.description = comment.strip()
+        parts = line.split()
+        if len(parts) != 48:
+            import sys  # local import: sys is not imported at module level
+            sys.stdout.write("expect 48 space split parts, get %d" %
+                             (len(parts)))
+            return None
+        self.relevance_score = int(parts[0])
+        self.query_id = int(parts[1].split(':')[1])
+        for p in parts[2:]:
+            pair = p.split(':')
+            self.feature_vector.append(float(pair[1]))
+        return self
+
+
+class QueryList(object):
+    """
+    list of Query items that must all share one query_id (else ValueError is raised)
+    """
+
+    def __init__(self, querylist=None):
+        self.query_id = -1
+        if querylist is None:
+            self.querylist = []
+        else:
+            self.querylist = querylist
+            for query in self.querylist:
+                if self.query_id == -1:
+                    self.query_id = query.query_id
+                else:
+                    if self.query_id != query.query_id:
+                        raise ValueError("query in list must be same query_id")
+
+    def __iter__(self):
+        for query in self.querylist:
+            yield query
+
+    def __len__(self):
+        return len(self.querylist)
+
+    def __getitem__(self, i):
+        return self.querylist[i]
+
+    def _correct_ranking_(self):
+        if self.querylist is None:
+            return
+        self.querylist.sort(key=lambda x: x.relevance_score, reverse=True)
+
+    def _add_query(self, query):
+        if self.query_id == -1:
+            self.query_id = query.query_id
+        else:
+            if self.query_id != query.query_id:
+                raise ValueError("query in list must be same query_id")
+        self.querylist.append(query)
+
+
+def gen_plain_txt(querylist):
+    """
+    Yield one (query_id, relevance_score, feature array) tuple per document,
+    documents being sorted by descending relevance score first.
+
+    Parameters:
+    --------
+    querylist : QueryList, or a list of Query sharing one query_id
+
+    yield :
+    ------
+    query_id, relevance_score, np.array(feature_vector)
+    """
+    if not isinstance(querylist, QueryList):
+        querylist = QueryList(querylist)
+    querylist._correct_ranking_()
+    for query in querylist:
+        yield querylist.query_id, query.relevance_score, np.array(
+            query.feature_vector)
+
+
+def gen_point(querylist):
+    """
+    Yield (relevance_score, feature array) per document, highest score first.
+
+    Parameters:
+    --------
+    querylist : QueryList, or a list of Query sharing one query_id
+
+    yield :
+    ------
+    relevance_score, np.array(feature_vector)
+    """
+    if not isinstance(querylist, QueryList):
+        querylist = QueryList(querylist)
+    querylist._correct_ranking_()
+    for query in querylist:
+        yield query.relevance_score, np.array(query.feature_vector)
+
+
+def gen_pair(querylist, partial_order="full"):
+    """
+    Yield (label, better, worse) for pair-wise learning to rank: one tuple per
+    pair of documents whose relevance scores differ; label is always 1.
+
+    Parameters:
+    --------
+    querylist : QueryList, or a list of Query sharing one query_id
+    partial_order : "full" or "neighbour"
+      NOTE(review): currently ignored, all differing pairs are generated
+
+    yield :
+    ------
+    label : int, always 1
+    better, worse : np.array feature vectors, higher relevance score first
+    """
+    if not isinstance(querylist, QueryList):
+        querylist = QueryList(querylist)
+    querylist._correct_ranking_()
+    labels = []
+    docpairs = []
+
+    # C(n,2)
+    for i in range(len(querylist)):
+        query_left = querylist[i]
+        for j in range(i + 1, len(querylist)):
+            query_right = querylist[j]
+            if query_left.relevance_score > query_right.relevance_score:
+                labels.append(1)
+                docpairs.append([
+                    np.array(query_left.feature_vector),
+                    np.array(query_right.feature_vector)
+                ])
+            elif query_left.relevance_score < query_right.relevance_score:
+                labels.append(1)
+                docpairs.append([
+                    np.array(query_right.feature_vector),
+                    np.array(query_left.feature_vector)
+                ])
+    for label, pair in zip(labels, docpairs):
+        yield label, pair[0], pair[1]
+
+
+def gen_list(querylist):
+    """
+    Yield one (labels, features) tuple for list-wise learning to rank.
+
+    Parameters:
+    --------
+    querylist : QueryList, or a list of Query sharing one query_id
+
+    yield :
+    ------
+    np.array of relevance scores, np.array of feature vectors, best first
+    """
+    if not isinstance(querylist, QueryList):
+        querylist = QueryList(querylist)
+    querylist._correct_ranking_()
+    relevance_score_list = [query.relevance_score for query in querylist]
+    feature_vector_list = [query.feature_vector for query in querylist]
+    yield np.array(relevance_score_list).T, np.array(feature_vector_list)
+
+
+def query_filter(querylists):
+    """
+    drop every query list whose relevance scores sum to 0.
+    label 0, 1, 2 means the relevance score document with query
+    parameters :
+      querylists : list of QueryList
+
+    return :
+      list of the QueryList items that were kept
+    """
+    filter_query = []
+    for querylist in querylists:
+        relevance_score_list = [query.relevance_score for query in querylist]
+        if sum(relevance_score_list) != .0:
+            filter_query.append(querylist)
+    return filter_query
+
+
+def load_from_text(filepath, shuffle=True, fill_missing=-1):
+    """
+    parse a data file into a list of QueryList, grouping consecutive lines
+    that share a query_id; fill_missing is accepted but not used here.
+    """
+    prev_query_id = -1
+    querylists = []
+    querylist = None
+    fn = __initialize_meta_info__()
+    with open(os.path.join(fn, filepath)) as f:
+        for line in f:
+            query = Query()._parse_(line)
+            if query is None:
+                continue
+            if query.query_id != prev_query_id:
+                if querylist is not None:
+                    querylists.append(querylist)
+                querylist = QueryList()
+                prev_query_id = query.query_id
+            querylist._add_query(query)
+    if querylist is not None:
+        querylists.append(querylist)
+    if shuffle:
+        random.shuffle(querylists)
+    return querylists
+
+
+def __reader__(filepath, format="pairwise", shuffle=True, fill_missing=-1):
+    """
+    Parameters
+    --------
+    filepath : string, data file path inside the extracted dataset directory
+    shuffle : shuffle the order of the query lists
+    fill_missing : passed on to load_from_text. default in MQ2007 is -1
+
+    Returns
+    ------
+    yield every item generated for each query list, according to format:
+      "plain_txt", "pointwise", "pairwise" or "listwise" (else yield nothing)
+    """
+    querylists = query_filter(
+        load_from_text(
+            filepath, shuffle=shuffle, fill_missing=fill_missing))
+    gen = {
+        "plain_txt": gen_plain_txt,
+        "pointwise": gen_point,
+        "pairwise": gen_pair,
+        "listwise": gen_list,
+    }.get(format)
+    for querylist in querylists:
+        # exhaust the generator of each query list; a single next() would
+        # only return its first (highest ranked) document
+        for item in (gen(querylist) if gen is not None else ()):
+            yield item
+
+
+train = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/train.txt")
+test = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/test.txt")
+
+
+def fetch():
+    return download(URL, "MQ2007", MD5)
+
+
+if __name__ == "__main__":
+    fetch()
+    mytest = functools.partial(
+        __reader__, filepath="MQ2007/MQ2007/Fold1/sample", format="listwise")
+    for label, query in mytest():
+        print label, query
diff --git a/python/paddle/v2/dataset/tests/common_test.py b/python/paddle/v2/dataset/tests/common_test.py
index 5babcef0eb4345d243904877d323c37d4889a643..f9815d4f9e1ee3bbe9ccf2dae588c51c262468c1 100644
--- a/python/paddle/v2/dataset/tests/common_test.py
+++ b/python/paddle/v2/dataset/tests/common_test.py
@@ -15,6 +15,7 @@
 import paddle.v2.dataset.common
 import unittest
 import tempfile
+import glob
 
 
 class TestCommon(unittest.TestCase):
@@ -32,6 +33,30 @@ class TestCommon(unittest.TestCase):
             paddle.v2.dataset.common.download(
                 yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d'))
 
+    def test_split(self):
+        def test_reader():
+            def reader():
+                for x in xrange(10):
+                    yield x
+
+            return reader
+
+        temp_path = tempfile.mkdtemp()  # a directory, not a file
+        paddle.v2.dataset.common.split(
+            test_reader(), 4, suffix=temp_path + '/test-%05d.pickle')
+        files = glob.glob(temp_path + '/test-*.pickle')
+        self.assertEqual(len(files), 3)
+
+    def test_cluster_file_reader(self):
+        temp_path = tempfile.mkdtemp()  # a directory, not a file
+        for x in xrange(5):
+            with open(temp_path + '/%05d.test' % x, 'w') as f:
+                f.write('%d\n' % x)
+        reader = paddle.v2.dataset.common.cluster_files_reader(
+            temp_path + '/*.test', 5, 0, loader=lambda f: f.read().split())
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, str("0"))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/demo/introduction/evaluate_model.py b/python/paddle/v2/dataset/tests/mq2007_test.py
old mode 100755
new mode 100644
similarity index 51%
rename from demo/introduction/evaluate_model.py
rename to python/paddle/v2/dataset/tests/mq2007_test.py
index eeda43c5c86f3e49f758bf55b16a68387e64238c..59847b6c18eadb12123cae824e8bce1051a69d4c
--- a/demo/introduction/evaluate_model.py
+++ b/python/paddle/v2/dataset/tests/mq2007_test.py
@@ -1,6 +1,3 @@
-#!/usr/bin/env python
-# -*- coding: UTF-8 -*-
-
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,26 +11,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
-Print model parameters in last model
-
-Usage:
-    python evaluate_model.py
-"""
-import numpy as np
-import os
 
+import paddle.v2.dataset.mq2007
+import unittest
 
-def load(file_name):
-    with open(file_name, 'rb') as f:
-        f.read(16)  # skip header for float type.
-        return np.fromfile(f, dtype=np.float32)
 
+class TestMQ2007(unittest.TestCase):
+    def test_pairwise(self):
+        for label, query_left, query_right in paddle.v2.dataset.mq2007.test(
+                format="pairwise"):
+            self.assertEqual(query_left.shape, (46, ))  # shape is a tuple
+            self.assertEqual(query_right.shape, (46, ))
 
-def main():
-    print 'w=%.6f, b=%.6f from pass 29' % (load('output/pass-00029/w'),
-                                           load('output/pass-00029/b'))
+    def test_listwise(self):
+        for label_array, query_array in paddle.v2.dataset.mq2007.test(
+                format="listwise"):
+            self.assertEqual(len(label_array), len(query_array))
 
 
-if __name__ == '__main__':
-    main()
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/evaluator.py b/python/paddle/v2/evaluator.py
index 588eefa3912799aa55f970c6d7e013ed7779ec9a..eaaadbe53bc776ffde800edb9bd6b313ad026627 100644
--- a/python/paddle/v2/evaluator.py
+++ b/python/paddle/v2/evaluator.py
@@ -13,8 +13,8 @@
 # limitations under the License.
 
 import paddle.trainer_config_helpers.evaluators as evs
-import inspect
 from config_base import __convert_to_v2__
+import inspect
 
 __all__ = []
 
@@ -25,21 +25,10 @@ def initialize():
 
     for __ev_name__ in filter(lambda x: x.endswith('_evaluator'), evs.__all__):
         __ev__ = getattr(evs, __ev_name__)
-        if hasattr(__ev__, 'argspec'):
-            argspec = __ev__.argspec
-        else:
-            argspec = inspect.getargspec(__ev__)
-        parent_names = filter(lambda x: x in ['input', 'label', 'weight'],
-                              argspec.args)
-        v2_ev = __convert_to_v2__(
-            __ev_name__,
-            parent_names=parent_names,
-            is_default_name='name' in argspec.args,
-            attach_parent=True)
-
         __new_name__ = convert_to_new_name(__ev_name__)
 
-        globals()[__new_name__] = v2_ev
+        globals()[__new_name__] = __convert_to_v2__(__ev__, __new_name__,
+                                                    __name__)
         globals()[__new_name__].__name__ = __new_name__
         __all__.append(__new_name__)
 
diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py
index 13f53919aa49694f722d4bf20a7d01af3e3e6533..85ad6984ba08440d8f8c24a6ca5842024dbafe4b 100644
--- a/python/paddle/v2/image.py
+++ b/python/paddle/v2/image.py
@@ -1,10 +1,10 @@
 import numpy as np
 try:
     import cv2
-except ImportError:
-    cv2 = None
-
-from cv2 import resize
+except Exception:  # keep best-effort, but let Ctrl-C / SystemExit through
+    print(
+        "import cv2 error, please install opencv-python: pip install opencv-python"
+    )
 
 __all__ = [
     "load_image", "resize_short", "to_chw", "center_crop", "random_crop",
@@ -76,7 +76,7 @@ def resize_short(im, size):
         h_new = size * h / w
     else:
         w_new = size * w / h
-    im = resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC)
+    im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC)
     return im
 
 
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
index b4bb38496937bb6fb520334331c619f9b6f64b51..34b7308601390a4ccb0c19ef10d2c7a60b3fa576 100644
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
@@ -12,9 +12,9 @@ class Inference(object):
     """
     Inference combines neural network output and parameters together
     to do inference.
-    
+
     ..  code-block:: python
-    
+
         inferer = Inference(output_layer=prediction, parameters=parameters)
         for data_batch in batches:
             print inferer.infer(data_batch)
@@ -88,17 +88,28 @@ def infer(output_layer, parameters, input, feeding=None, field='value'):
     Infer a neural network by given neural network output and parameters.  The
     user should pass either a batch of input data or reader method.
 
-    Example usages:
+    Example usage for single output_layer:
 
     ..  code-block:: python
 
-        result = paddle.infer(output_layer=prediction, 
-                              parameters=parameters, 
+        result = paddle.infer(output_layer=prediction,
+                              parameters=parameters,
                               input=SomeData)
         print result
 
+    Example usage for multiple output_layers and fields:
+
+    ..  code-block:: python
+
+        result = paddle.infer(output_layer=[prediction1, prediction2],
+                              parameters=parameters,
+                              input=SomeData,
+                              field=['id', 'value'])
+        print result
+
     :param output_layer: output of the neural network that would be inferred
-    :type output_layer: paddle.v2.config_base.Layer
+    :type output_layer: paddle.v2.config_base.Layer or a list of
+                        paddle.v2.config_base.Layer
     :param parameters: parameters of the neural network.
     :type parameters: paddle.v2.parameters.Parameters
     :param input: input data batch. Should be a python iterable object, and each
@@ -106,13 +117,15 @@ def infer(output_layer, parameters, input, feeding=None, field='value'):
     :type input: collections.Iterable
     :param feeding: Reader dictionary. Default could generate from input
                         value.
-    :param field: The prediction field. It should in [`value`, `id`, `prob`]. 
-                  `value` and `prob` mean return the prediction probabilities, 
+    :param field: The prediction field. It should in [`value`, `id`, `prob`].
+                  `value` and `prob` mean return the prediction probabilities,
                   `id` means return the prediction labels. Default is `value`.
-                  Note that `prob` only used when output_layer is beam_search 
+                  Note that `prob` only used when output_layer is beam_search
                   or max_id.
     :type field: str
-    :return: a numpy array
+    :return: The prediction result. If there are multiple output_layers and fields,
+             the return order is output_layer1.field1, output_layer2.field1, ...,
+             output_layer1.field2, output_layer2.field2 ...
     :rtype: numpy.ndarray
     """
 
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
index 89cca7acd34b8dea0572169338649b5e9ff6536a..815635f5dd4654fe3a31a9244e6e4473c397dd2f 100644
--- a/python/paddle/v2/layer.py
+++ b/python/paddle/v2/layer.py
@@ -32,390 +32,29 @@ The primary usage shows below.
 """
 
 import collections
-import inspect
+import copy
 import re
+import paddle.trainer_config_helpers.layers as v1_layers
+import paddle.trainer.config_parser as cp
+from paddle.proto.ModelConfig_pb2 import ModelConfig, SubModelConfig
+from config_base import __convert_to_v2__
+import config_base
 
-import paddle.trainer_config_helpers as conf_helps
-from paddle.trainer.config_parser import \
-    RecurrentLayerGroupWithoutOutLinksBegin, RecurrentLayerGroupSetOutLink, \
-    RecurrentLayerGroupEnd, model_type
-from paddle.trainer_config_helpers.config_parser_utils import \
-    parse_network_config as __parse__
-from paddle.trainer_config_helpers.default_decorators import wrap_act_default
-from paddle.trainer_config_helpers.default_decorators import \
-    wrap_bias_attr_default
-from paddle.trainer_config_helpers.default_decorators import wrap_name_default
-from paddle.trainer_config_helpers.layers import RecurrentLayerGroupSetGenerator, Generator
-from paddle.trainer_config_helpers.layers import layer_support
+__all__ = ['data', 'parse_network']
 
-import activation
-import attr
-import data_type
-from config_base import Layer, __convert_to_v2__
 
-__all__ = ['parse_network', 'data']
+def __need_to_keep__(name):
+    if name in ['StaticInput', 'LayerType', 'layer_support']:
+        return False
+    return True
 
 
-def parse_network(output_layers, extra_layers=None):
-    """
-    Parse all layers in the neural network graph and
-    then generate a ModelConfig object.
-
-    ..  note::
-
-        This function is used internally in paddle.v2 module. User should never
-        invoke this method.
-
-    :param output_layers: Output layers.
-    :type output_layers: Layer
-    :param extra_layers: Some layers in the neural network graph are not in the
-                         path of output_layers.
-    :type extra_layers: Layer
-    :return: A ModelConfig object instance.
-    :rtype: ModelConfig
-    """
-    if not isinstance(output_layers, collections.Sequence):
-        output_layers = [output_layers]
-    if extra_layers is not None and not isinstance(extra_layers,
-                                                   collections.Sequence):
-        extra_layers = [extra_layers]
-
-    def __real_func__():
-        """
-        __real_func__ is the function that config_parser.parse invoked. It is
-        the plain old paddle configuration function.
-        """
-        context = dict()
-        real_output = [each.to_proto(context=context) for each in output_layers]
-        if extra_layers is not None:
-            extra_output = [
-                each.to_proto(context=context) for each in extra_layers
-            ]
-        conf_helps.outputs(real_output)
-
-    return __parse__(__real_func__)
-
-
-"""
-Some layer may need some special config, and can not use __convert_to_v2__ to convert.
-So we also need to implement some special LayerV2.
-"""
+def __need_to_wrap__(name):
+    return name not in ['AggregateLevel', 'ExpandLevel']
 
 
-class DataLayerV2(Layer):
-    METHOD_NAME = 'data_layer'
-
-    def __init__(self, name, type, **kwargs):
-        assert isinstance(type, data_type.InputType)
-
-        self.type = type
-        self.__method_name__ = 'data_layer'
-        self.__kwargs__ = kwargs
-
-        super(DataLayerV2, self).__init__(name=name, parent_layers=dict())
-
-    def to_proto_impl(self, **kwargs):
-        args = dict()
-        args['size'] = self.type.dim
-        for each in kwargs:
-            args[each] = kwargs[each]
-        for each in self.__kwargs__:
-            args[each] = self.__kwargs__[each]
-        return getattr(conf_helps, self.__method_name__)(name=self.name, **args)
-
-    def __map_docstr__(doc):
-        doc = re.sub(r'(data = [^\)]+)\).*',
-                     "data = paddle.layer.data(name=\"input\", "
-                     "type=paddle.data_type.dense_vector(1000))", doc)
-
-        doc = re.sub(r':param size:.*',
-                     ':param type: Data type of this data layer', doc)
-        doc = re.sub(r':type size:.*',
-                     ":type size: paddle.v2.data_type.InputType", doc)
-        return doc
-
-
-class MemoryV2(Layer):
-    def __init__(self, name, extra_input=None, **kwargs):
-        """
-        Init memory object, if memory is inited inside recurrent_group step
-        function, it may depend on a boot_layer that should be initialized
-        outside recurrent_group, so we:
-            1. add RecurrentLayerInput to extra_parent of self.
-            2. add boot_layer to the extra_parent of RecurrentLayerInput.
-
-        :param extra_input: list of RecurrentLayerInput
-        :type extra_input: [RecurrentLayerInput]
-        """
-        self.name = name
-        super(MemoryV2, self).__init__(name=name, parent_layers=dict())
-        self.__kwargs__ = kwargs
-        self.__boot_layer_name__ = None
-
-        if 'boot_layer' in kwargs:
-            begin_of_current_rnn = []
-            # TODO(yuyang18): Fix inspect, it could be wrong when user invoke a
-            # function inside step.
-            st = inspect.stack()
-            for i in xrange(len(st)):
-                locs = inspect.stack()[i][0].f_locals
-                keys = locs.keys()
-                for key in keys:
-                    val = locs[key]
-                    if isinstance(val, RecurrentLayerInput):
-                        begin_of_current_rnn.append(val)
-                    elif isinstance(val, collections.Sequence):
-                        for v in val:
-                            if isinstance(v, RecurrentLayerInput):
-                                begin_of_current_rnn.append(v)
-
-                if begin_of_current_rnn:
-                    break
-            assert begin_of_current_rnn is not None
-            for extra in begin_of_current_rnn:
-                self.append_extra_parent(extra)
-                extra.append_extra_parent(kwargs['boot_layer'])
-                self.__boot_layer_name__ = kwargs['boot_layer'].name
-
-    def to_proto_impl(self, **kwargs):
-        args = dict()
-        for each in kwargs:
-            args[each] = kwargs[each]
-        for each in self.__kwargs__:
-            args[each] = self.__kwargs__[each]
-
-        if self.__boot_layer_name__ is not None:
-            args['boot_layer'] = self.__context__[self.__boot_layer_name__]
-
-        size = args.get('size', None)
-        if size is not None:
-            if callable(size):
-                real_size = size()
-            else:
-                real_size = size
-            args['size'] = real_size
-        return conf_helps.memory(name=self.name, **args)
-
-    def context_name(self):
-        return self.name + "#memory"
-
-    def use_context_name(self):
-        """
-        memory layer will have the same name with some layer
-        :return:
-        """
-        return True
-
-
-class StaticInputV2(object):
-    def __init__(self, input, is_seq=False, size=None):
-        assert isinstance(input, LayerV2)
-        self.name = input.name
-        self.input = input
-        self.is_seq = is_seq
-        self.size = size
-        # TODO(add size check)
-        # assert input.size is not None or size is not None
-
-
-class BaseGeneratedInputV2(object):
-    def __init__(self):
-        self.bos_id = None
-        self.eos_id = None
-
-    def before_real_step(self):
-        raise NotImplementedError()
-
-    def after_real_step(self, *args):
-        raise NotImplementedError()
-
-
-class GeneratedInputV2(BaseGeneratedInputV2):
-    def __init__(self, size, embedding_name, embedding_size):
-        super(GeneratedInputV2, self).__init__()
-        self.size = size
-        self.embedding_name = embedding_name
-        self.embedding_size = embedding_size
-
-    def after_real_step(self, input):
-        return max_id(input=input, name='__beam_search_predict__')
-
-    def before_real_step(self):
-        predict_id = memory(
-            name='__beam_search_predict__',
-            size=self.size,
-            boot_with_const_id=self.bos_id)
-
-        trg_emb = embedding(
-            input=predict_id,
-            size=self.embedding_size,
-            param_attr=attr.ParamAttr(name=self.embedding_name))
-        return trg_emb
-
-
-class RecurrentLayerGroupSetGeneratorV2(Layer):
-    def __init__(self, eos_name, max_length, beam_size, num_results_per_sample):
-        self.eos_name = eos_name
-        self.max_length = max_length
-        self.beam_size = beam_size
-        self.num_results_per_sample = num_results_per_sample
-        super(RecurrentLayerGroupSetGeneratorV2, self).__init__(
-            name=eos_name, parent_layers={})
-
-    def to_proto_impl(self, **kwargs):
-        RecurrentLayerGroupSetGenerator(
-            Generator(
-                eos_layer_name=self.eos_name,
-                max_num_frames=self.max_length,
-                beam_size=self.beam_size,
-                num_results_per_sample=self.num_results_per_sample))
-        return self
-
-    def context_name(self):
-        return self.eos_name + ".fake"
-
-    def use_context_name(self):
-        return True
-
-
-class MixedLayerV2(Layer):
-    """
-    This class is use to support `with` grammar. If not, the following code
-    could convert mixed_layer simply.
-
-        mixed = __convert_to_v2__(
-            'mixed_layer', name_prefix='mixed', parent_names=['input'])
-    """
-
-    class AddToSealedMixedLayerExceptionV2(Exception):
-        pass
-
-    def __init__(self,
-                 size=0,
-                 input=None,
-                 name=None,
-                 act=None,
-                 bias_attr=None,
-                 layer_attr=None):
-        self.__method_name__ = 'mixed_layer'
-        self.finalized = False
-        self.__inputs__ = []
-        if input is not None:
-            self.__inputs__ = input
-
-        other_kwargs = dict()
-        other_kwargs['name'] = name
-        other_kwargs['size'] = size
-        other_kwargs['act'] = act
-        other_kwargs['bias_attr'] = bias_attr
-        other_kwargs['layer_attr'] = layer_attr
-        parent_layers = {"input": self.__inputs__}
-        super(MixedLayerV2, self).__init__(name, parent_layers)
-        self.__other_kwargs__ = other_kwargs
-
-    def __iadd__(self, other):
-        if not self.finalized:
-            self.__inputs__.append(other)
-            return self
-        else:
-            raise MixedLayerV2.AddToSealedMixedLayerExceptionV2()
-
-    def __enter__(self):
-        assert len(self.__inputs__) == 0
-        return self
-
-    def __exit__(self, *args, **kwargs):
-        self.finalized = True
-
-    def to_proto_impl(self, **kwargs):
-        args = dict()
-        for each in kwargs:
-            args[each] = kwargs[each]
-        for each in self.__other_kwargs__:
-            args[each] = self.__other_kwargs__[each]
-        size = args.get('size', None)
-        if size is not None:
-            if callable(size):
-                real_size = size()
-            else:
-                real_size = size
-            args['size'] = real_size
-        return getattr(conf_helps, self.__method_name__)(**args)
-
-
-@wrap_name_default("mixed")
-@wrap_act_default(act=activation.Linear())
-@wrap_bias_attr_default(has_bias=False)
-@layer_support(conf_helps.layers.ERROR_CLIPPING, conf_helps.layers.DROPOUT)
-def mixed(size=0,
-          name=None,
-          input=None,
-          act=None,
-          bias_attr=False,
-          layer_attr=None):
-    return MixedLayerV2(size, input, name, act, bias_attr, layer_attr)
-
-
-mixed.__doc__ = conf_helps.mixed_layer.__doc__
-
-
-class RecurrentLayerInput(Layer):
-    def __init__(self, recurrent_name, index, parent_layers):
-        parents_len = len(parent_layers)
-        assert parents_len <= 1
-        if parents_len == 0:
-            self.__parents__ = []
-        else:
-            self.__parents__ = parent_layers.values()[0]
-        self.__recurrent_name__ = recurrent_name
-        name = self.__parents__[
-            index].name if index >= 0 else self.context_name()
-        super(RecurrentLayerInput, self).__init__(
-            name=name, parent_layers=parent_layers)
-
-    def context_name(self):
-        return self.__recurrent_name__ + ".begin"
-
-    def to_proto_impl(self, **kwargs):
-        model_type('recurrent_nn')
-        RecurrentLayerGroupWithoutOutLinksBegin(
-            name=self.__recurrent_name__,
-            in_links=map(lambda x: x.name, self.__parents__))
-        return self
-
-
-class RecurrentLayerOutput(Layer):
-    def __init__(self, recurrent_name, index, parent_layers):
-        assert len(parent_layers) == 1
-        self.__parents__ = parent_layers.values()[0]
-        super(RecurrentLayerOutput, self).__init__(
-            name=self.__parents__[index].name, parent_layers=parent_layers)
-        self.__recurrent_name__ = recurrent_name
-
-    def context_name(self):
-        return self.__recurrent_name__ + ".end"
-
-    def to_proto_impl(self, **kwargs):
-        for l in self.__parents__:
-            RecurrentLayerGroupSetOutLink(l.name)
-        RecurrentLayerGroupEnd(name=self.__recurrent_name__)
-
-
-LayerV2 = Layer
-data = DataLayerV2
-data.__name__ = 'data'
-AggregateLevel = conf_helps.layers.AggregateLevel
-ExpandLevel = conf_helps.layers.ExpandLevel
-memory = MemoryV2
-memory.__name__ = 'memory'
-memory.__doc__ = conf_helps.memory.__doc__
-
-
-def __layer_name_mapping__(inname):
-    if inname in ['data_layer', 'memory', 'mixed_layer', 'recurrent_group']:
-        # Do Not handle these layers
-        return
-    elif inname == 'maxid_layer':
+def __convert_name__(inname):
+    if inname == 'maxid_layer':
         return 'max_id'
     elif inname.endswith('memory') or inname.endswith(
             '_seq') or inname.endswith('_sim') or inname == 'hsigmoid':
@@ -429,187 +68,234 @@ def __layer_name_mapping__(inname):
         return inname
     elif inname.endswith("_layer"):
         return inname[:-len("_layer")]
+    else:
+        return inname
 
 
-def __layer_name_mapping_parent_names__(inname):
-    all_args = getattr(conf_helps, inname).argspec.args
-    return filter(
-        lambda x: x in ['input1', 'input2', 'label', 'input', 'a', 'b',
-                        'expand_as',
-                        'weights', 'vectors', 'weight', 'score', 'left',
-                        'right', 'output_mem'],
-        all_args)
-
-
-def __convert_layer__(_new_name_, _old_name_, _parent_names_):
-    global __all__
-    __all__.append(_new_name_)
-    globals()[new_name] = __convert_to_v2__(_old_name_, _parent_names_)
-    globals()[new_name].__name__ = new_name
-
-
-for each_layer_name in dir(conf_helps):
-    new_name = __layer_name_mapping__(each_layer_name)
-    if new_name is not None:
-        parent_names = __layer_name_mapping_parent_names__(each_layer_name)
-        assert len(parent_names) != 0, each_layer_name
-        __convert_layer__(new_name, each_layer_name, parent_names)
-
-del parent_names
-del new_name
-del each_layer_name
-
-
-@wrap_name_default()
-def recurrent_group(step, input, name=None):
-    if not isinstance(input, collections.Sequence):
-        input = [input]
-
-    non_static_inputs = filter(lambda x: not isinstance(x, StaticInputV2),
-                               input)
-    actual_input = [
-        RecurrentLayerInput(
-            recurrent_name=name,
-            index=i,
-            parent_layers={'recurrent_inputs': non_static_inputs})
-        for i in xrange(len(non_static_inputs))
-    ]
-
-    extra_input = None
-    if len(non_static_inputs) == 0:
-        extra_input = RecurrentLayerInput(
-            recurrent_name=name, index=-1, parent_layers={})
-
-    def __real_step__(*args):
-        rnn_input = list(args)
-        static_inputs = filter(lambda x: isinstance(x, StaticInputV2), input)
-        for static_input in static_inputs:
-            mem_name = "__%s_memory__" % static_input.input.name
-            mem = memory(
-                name=mem_name,
-                extra_input=extra_input,
-                is_seq=static_input.is_seq,
-                size=static_input.input.calculate_size,
-                boot_layer=static_input.input)
-            with mixed(
-                    name=mem_name,
-                    size=static_input.input.calculate_size,
-                    act=activation.Identity()) as mix:
-                mix += identity_projection(input=mem)
-            rnn_input.insert(input.index(static_input), mix)
-        return step(*rnn_input)
-
-    actual_output = __real_step__(*actual_input)
-
-    if not isinstance(actual_output, collections.Sequence):
-        actual_output = [actual_output]
-
-    retv = [
-        RecurrentLayerOutput(
-            recurrent_name=name,
-            index=i,
-            parent_layers={'recurrent_outputs': actual_output})
-        for i in xrange(len(actual_output))
-    ]
-    if len(retv) == 1:
-        return retv[0]
+for name in v1_layers.__all__:
+    obj = getattr(v1_layers, name)
+    if not __need_to_keep__(name):
+        continue
+    new_name = __convert_name__(name)
+    if callable(obj) and __need_to_wrap__(name):
+        globals()[new_name] = __convert_to_v2__(obj, new_name, __name__)
     else:
-        return retv
-
-
-recurrent_group.__doc__ = conf_helps.recurrent_group.__doc__
-
-
-@wrap_name_default()
-def beam_search(step,
-                input,
-                bos_id,
-                eos_id,
-                beam_size,
-                max_length=500,
-                name=None,
-                num_results_per_sample=None):
-    if num_results_per_sample is None:
-        num_results_per_sample = beam_size
-    assert num_results_per_sample <= beam_size
-    # logger.warning("num_results_per_sample should be less than beam_size")
-
-    if isinstance(input, StaticInputV2) or isinstance(input,
-                                                      BaseGeneratedInputV2):
-        input = [input]
-
-    generated_input_index = -1
-
-    real_input = []
-    for i, each_input in enumerate(input):
-        assert isinstance(each_input, StaticInputV2) or isinstance(
-            each_input, BaseGeneratedInputV2)
-        if isinstance(each_input, BaseGeneratedInputV2):
-            assert generated_input_index == -1
-            generated_input_index = i
-        else:
-            real_input.append(each_input)
+        globals()[new_name] = obj
+    __all__.append(new_name)
 
-    assert generated_input_index != -1
 
-    gipt = input[generated_input_index]
-    assert isinstance(gipt, BaseGeneratedInputV2)
+def __data_layer__(name, type, **kwargs):
+    l = v1_layers.data_layer(name, type.dim, **kwargs)
+    l.data_type = type
+    return l
 
-    gipt.bos_id = bos_id
-    gipt.eos_id = eos_id
 
-    def __real_step__(*args):
-        eos_name = "__%s_eos_layer__" % name
-        generator = RecurrentLayerGroupSetGeneratorV2(
-            eos_name, max_length, beam_size, num_results_per_sample)
+def __map_data_docstr__(doc):
+    doc = re.sub(r'(data = [^\)]+)\).*',
+                 "data = paddle.layer.data(name=\"input\", "
+                 "type=paddle.data_type.dense_vector(1000))", doc)
 
-        args = list(args)
-        before_step_layer = gipt.before_real_step()
-        before_step_layer.append_child(
-            layer=generator, parent_names=[before_step_layer.name])
-        args.insert(generated_input_index, before_step_layer)
+    doc = re.sub(r':param size:.*', ':param type: Data type of this data layer',
+                 doc)
+    doc = re.sub(r':type size:.*', ":type size: paddle.v2.data_type.InputType",
+                 doc)
+    return doc
+
+
+__data_layer__.__doc__ = __map_data_docstr__(v1_layers.data_layer.__doc__)
+
+data = __convert_to_v2__(__data_layer__, 'data', __name__)
+
+
+def __get_used_layers__(output_layers, extra_layers=None):
+    layer_names = set()
+    parents = {}
+
+    def add_parent(child, parent):
+        if child in parents:
+            parents[child].append(parent)
+        else:
+            parents[child] = [parent]
+
+    def add_additional_parents():
+        for sub_model in cp.g_config.model_config.sub_models:
+            if sub_model.name == 'root':
+                continue
+            for link in sub_model.in_links:
+                add_parent(link.link_name, link.layer_name)
+                add_parent(sub_model.name, link.layer_name)
+            for link in sub_model.out_links:
+                add_parent(link.link_name, link.layer_name)
+                add_parent(link.link_name, sub_model.name)
+            for mem in sub_model.memories:
+                if mem.boot_layer_name:
+                    add_parent(mem.layer_name, mem.boot_layer_name)
+                add_parent(mem.link_name, mem.layer_name)
+
+    def dfs_travel(layer_name):
+        if layer_name in layer_names:
+            return
+        layer_names.add(layer_name)
+        layer = cp.g_layer_map[layer_name]
+
+        for inp in layer.inputs:
+            dfs_travel(inp.input_layer_name)
+        if layer.name in parents:
+            for p in parents[layer.name]:
+                dfs_travel(p)
+
+    add_additional_parents()
+
+    for layer in output_layers:
+        dfs_travel(layer.full_name)
+
+    # print layer needs to be specially handled because no other
+    # layer depends on it. It is used to print the result of some
+    # layers when running the model for debug purpose. So we explicitly
+    # add a print layer to the topology if its input is in the topology.
+    for layer in cp.g_config.model_config.layers:
+        if layer.type == 'print':
+            used = True
+            for inp in layer.inputs:
+                if inp.input_layer_name not in layer_names:
+                    used = False
+                    break
+            if used:
+                layer_names.add(layer.name)
+
+    return layer_names
+
+
+def __get_used_parameters__(layer_names, sub_models):
+    parameter_names = set()
+    for name in layer_names:
+        l = cp.g_layer_map[name]
+        for inp in l.inputs:
+            if inp.input_parameter_name:
+                parameter_names.add(inp.input_parameter_name)
+        if l.bias_parameter_name:
+            parameter_names.add(l.bias_parameter_name)
+
+    for sub_model in sub_models:
+        for mem in sub_model.memories:
+            if mem.HasField("boot_bias_parameter_name"):
+                parameter_names.add(mem.boot_bias_parameter_name)
+
+    return parameter_names
+
+
+def __get_used_submodels__(layer_names):
+    submodel_names = set()
+    for submodel in cp.g_config.model_config.sub_models:
+        if submodel.name in layer_names:
+            submodel_names.add(submodel.name)
+    return submodel_names
+
+
+def __get_used_evaluators__(layer_names):
+    evaluator_names = set()
+    for e in cp.g_config.model_config.evaluators:
+        used = True
+        for name in e.input_layers:
+            if name not in layer_names:
+                used = False
+                break
+        if used:
+            evaluator_names.add(e.name)
+    return evaluator_names
+
+
+def __trim_submodel__(old_submodel, layer_names, input_layer_names,
+                      output_layer_names, evaluator_names):
+
+    submodel = SubModelConfig()
+    submodel.name = old_submodel.name
+    submodel.layer_names.extend(
+        filter(lambda x: x in layer_names, old_submodel.layer_names))
+    submodel.input_layer_names.extend(
+        filter(lambda x: x in input_layer_names, submodel.layer_names))
+    submodel.output_layer_names.extend(
+        filter(lambda x: x in output_layer_names, submodel.layer_names))
+    submodel.evaluator_names.extend(
+        filter(lambda x: x in evaluator_names, old_submodel.evaluator_names))
+
+    submodel.is_recurrent_layer_group = old_submodel.is_recurrent_layer_group
+    submodel.reversed = old_submodel.reversed
+
+    submodel.memories.extend(
+        filter(lambda x: x.link_name in layer_names, old_submodel.memories))
+    target_inlinkid = (old_submodel.target_inlinkid
+                       if old_submodel.HasField('target_inlinkid') else -1)
+    in_links = []
+    for i, link in enumerate(old_submodel.in_links):
+        if link.link_name in layer_names or i == target_inlinkid:
+            in_links.append(link)
+            if i == target_inlinkid:
+                target_inlinkid = len(in_links) - 1
+    submodel.in_links.extend(in_links)
+
+    submodel.out_links.extend(
+        filter(lambda x: x.link_name in layer_names, old_submodel.out_links))
+    if old_submodel.HasField('generator'):
+        submodel.generator.CopyFrom(old_submodel.generator)
+
+    if old_submodel.HasField('target_inlinkid'):
+        submodel.target_inlinkid = target_inlinkid
+    return submodel
+
+
+def parse_network(output_layers, extra_layers=None):
+    if not isinstance(output_layers, collections.Sequence):
+        output_layers = [output_layers]
+    if extra_layers is None:
+        extra_layers = []
+    elif not isinstance(extra_layers, collections.Sequence):
+        extra_layers = [extra_layers]
+    # A sequence passed by the caller is kept as-is (it used to be dropped).
 
-        predict = gipt.after_real_step(step(*args))
+    layer_names = __get_used_layers__(output_layers + extra_layers)
+    submodel_names = __get_used_submodels__(layer_names)
+    submodel_names.add('root')
+    evaluator_names = __get_used_evaluators__(layer_names)
+    input_layer_names = set()
+    output_layer_names = set()
 
-        eos_layer = eos(input=predict, eos_id=eos_id, name=eos_name)
-        predict.append_child(layer=eos_layer, parent_names=[predict.name])
+    model_config = ModelConfig()
+    model_config.type = cp.g_config.model_config.type
+    for l in cp.g_config.model_config.layers:
+        if l.name not in layer_names:
+            continue
+        model_config.layers.extend([l])
+        if l.type == 'data':
+            model_config.input_layer_names.append(l.name)
+            input_layer_names.add(l.name)
 
-        return predict
+    for layer in output_layers:
+        model_config.output_layer_names.append(layer.full_name)
+        output_layer_names.add(layer.full_name)
 
-    # tmp = paddle.layer.recurrent_group(
-    #     step=__real_step__,
-    #     input=real_input,
-    #     reverse=False,
-    #     name=name,
-    #     is_generating=True)
-    tmp = recurrent_group(step=__real_step__, input=real_input, name=name)
+    for e in cp.g_config.model_config.evaluators:
+        if e.name in evaluator_names:
+            model_config.evaluators.extend([e])
 
-    return tmp
+    for s in cp.g_config.model_config.sub_models:
+        if s.name in submodel_names:
+            s = __trim_submodel__(s, layer_names, input_layer_names,
+                                  output_layer_names, evaluator_names)
+            model_config.sub_models.extend([s])
 
+    parameter_names = __get_used_parameters__(layer_names,
+                                              model_config.sub_models)
 
-beam_search.__doc__ = conf_helps.beam_search.__doc__
+    for p in cp.g_config.model_config.parameters:
+        if p.name in parameter_names:
+            model_config.parameters.extend([p])
 
-__projection_names__ = filter(lambda x: x.endswith('_projection'),
-                              dir(conf_helps))
+    return model_config
 
-__all__ += __projection_names__
 
-__operator_names__ = filter(lambda x: x.endswith('_operator'), dir(conf_helps))
-__all__ += __operator_names__
+def get_layer(name):
+    return config_base.__layer_map__.get(name)
 
-# convert projection
-for prj in __projection_names__:
-    globals()[prj] = __convert_to_v2__(
-        prj, parent_names=['input'], is_default_name=False)
-    globals()[prj].__name__ = prj
 
-# convert operator
-operator_list = [
-    # [V1_method_name, parent_names],
-    ['dotmul_operator', ['a', 'b']],
-    ['conv_operator', ['img', 'filter']]
-]
-for op in operator_list:
-    globals()[op[0]] = __convert_to_v2__(
-        op[0], parent_names=op[1], is_default_name=False)
-    globals()[op[0]].__name__ = op[0]
+cp.begin_parse()
diff --git a/python/paddle/v2/networks.py b/python/paddle/v2/networks.py
index 9e6644196c8242cc3fed7a4fb1503697e5b59ffb..8ae9f3b202d8c101b051c38d5850b03f54217a95 100644
--- a/python/paddle/v2/networks.py
+++ b/python/paddle/v2/networks.py
@@ -24,20 +24,7 @@ def __initialize__():
         if each_subnetwork in ['inputs', 'outputs']:
             continue
         func = getattr(conf_nw, each_subnetwork)
-        if hasattr(func, 'argspec'):
-            argspec = func.argspec
-        else:
-            argspec = inspect.getargspec(func)
-        if each_subnetwork == 'simple_attention':
-            parents = ['encoded_sequence', 'encoded_proj', 'decoder_state']
-        else:
-            parents = filter(lambda x: x.startswith('input'), argspec.args)
-        assert len(parents) != 0, each_subnetwork
-        v2_subnet = __convert_to_v2__(
-            each_subnetwork,
-            parent_names=parents,
-            is_default_name='name' in argspec.args)
-        globals()[each_subnetwork] = v2_subnet
+        globals()[each_subnetwork] = func
         globals()[each_subnetwork].__name__ = each_subnetwork
         global __all__
         __all__.append(each_subnetwork)
diff --git a/python/paddle/v2/op.py b/python/paddle/v2/op.py
new file mode 100644
index 0000000000000000000000000000000000000000..03f3b9b9ef273613cb60c0530005e0984f904ded
--- /dev/null
+++ b/python/paddle/v2/op.py
@@ -0,0 +1,120 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import layer
+import activation as act
+from config_base import Layer
+from paddle.trainer_config_helpers.attrs import is_compatible_with
+from paddle.trainer_config_helpers.default_decorators import wrap_name_default
+
+__all__ = []
+
+
+def __register_unary_math_op__(op_name, act):
+    def op(input, name=None):
+        return layer.mixed(
+            input=[layer.identity_projection(input=input)], name=name, act=act)
+
+    op = wrap_name_default(op_name)(op)
+    op.__doc__ = type(act).__doc__
+    globals()[op_name] = op
+    __all__.append(op_name)
+
+
+__register_unary_math_op__('exp', act.Exp())
+__register_unary_math_op__('log', act.Log())
+__register_unary_math_op__('abs', act.Abs())
+__register_unary_math_op__('sigmoid', act.Sigmoid())
+__register_unary_math_op__('tanh', act.Tanh())
+__register_unary_math_op__('square', act.Square())
+__register_unary_math_op__('relu', act.Relu())
+__register_unary_math_op__('sqrt', act.Sqrt())
+__register_unary_math_op__('reciprocal', act.Reciprocal())
+__register_unary_math_op__('softmax', act.Softmax())
+
+
+def __add__(layeroutput, other):
+    if is_compatible_with(other, float):
+        return layer.slope_intercept(input=layeroutput, intercept=other)
+    if not isinstance(other, Layer):
+        raise TypeError("Layer can only be added with"
+                        " another Layer or a number")
+    if layeroutput.size == other.size:
+        return layer.mixed(input=[
+            layer.identity_projection(input=layeroutput),
+            layer.identity_projection(input=other)
+        ])
+    if other.size != 1 and layeroutput.size != 1:
+        raise TypeError("Two Layer can be added only if they have equal size"
+                        " or one of their sizes is 1. sizes are %s and %s" %
+                        (layeroutput.size, other.size))
+    elif layeroutput.size == 1:
+        tmp = layeroutput
+        layeroutput = other
+        other = tmp
+    other = layer.repeat(other, layeroutput.size)
+    return layer.mixed(input=[
+        layer.identity_projection(input=layeroutput),
+        layer.identity_projection(input=other)
+    ])
+
+
+Layer.__radd__ = __add__
+Layer.__add__ = __add__
+
+
+def __neg__(layeroutput):
+    return layer.slope_intercept(input=layeroutput, slope=-1.0)
+
+
+Layer.__neg__ = __neg__
+
+
+def __sub__(layeroutput, other):
+    if is_compatible_with(other, float):  # layer - c == layer + (-c)
+        return layer.slope_intercept(input=layeroutput, intercept=-other)
+    if not isinstance(other, Layer):
+        raise TypeError("Layer can only be subtracted with"
+                        " another Layeroutput or a number")
+    return __add__(layeroutput, -other)
+
+
+Layer.__sub__ = __sub__
+
+
+def __rsub__(layeroutput, other):
+    neg = layer.slope_intercept(input=layeroutput, slope=-1.0)
+    return __add__(neg, other)
+
+
+Layer.__rsub__ = __rsub__
+
+
+def __mul__(layeroutput, other):
+    if is_compatible_with(other, float):
+        return layer.slope_intercept(input=layeroutput, slope=other)
+    if not isinstance(other, Layer):
+        raise TypeError("Layer can only be multiplied with"
+                        " another Layer or a number")
+    elif layeroutput.size == 1:
+        return layer.scaling(input=other, weight=layeroutput)
+    elif other.size == 1:
+        return layer.scaling(input=layeroutput, weight=other)
+    else:
+        raise TypeError("At least one of the operand of '*' must be a number"
+                        " or a Layer with size=1")
+
+
+Layer.__mul__ = __mul__
+Layer.__rmul__ = __mul__
diff --git a/python/paddle/v2/plot/tests/CMakeLists.txt b/python/paddle/v2/plot/tests/CMakeLists.txt
index b1132f131737e26bfeeb31f6b3f062710bdf6f75..da5cd764889b48a3af8461a2793d948aa609d6c1 100644
--- a/python/paddle/v2/plot/tests/CMakeLists.txt
+++ b/python/paddle/v2/plot/tests/CMakeLists.txt
@@ -1 +1,5 @@
-add_python_test(test_ploter test_ploter.py)
+if (NOT APPLE)
+  # The Mac OS X backend will not be able to function correctly if Python is
+  # not installed as a framework.
+  add_python_test(test_ploter test_ploter.py)
+endif()
diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt
index eb02e53706b4834eb9dc75d0e3a809772b124725..058f22befd0657d06ff130ace55fe7322148213d 100644
--- a/python/paddle/v2/tests/CMakeLists.txt
+++ b/python/paddle/v2/tests/CMakeLists.txt
@@ -1,2 +1,2 @@
-add_python_test(test_v2_api test_data_feeder.py test_parameters.py
+add_python_test(test_v2_api test_data_feeder.py test_op.py test_parameters.py
 test_layer.py test_rnn_layer.py test_topology.py test_image.py)
diff --git a/python/paddle/v2/tests/test_data_feeder.py b/python/paddle/v2/tests/test_data_feeder.py
index 71eb3bf31425c22b47accc11c9550042e077ef12..83da678da387ed1c86868847f140c6c09fbec3b5 100644
--- a/python/paddle/v2/tests/test_data_feeder.py
+++ b/python/paddle/v2/tests/test_data_feeder.py
@@ -233,6 +233,30 @@ class DataFeederTest(unittest.TestCase):
             self.assertEqual(out_sparse.getSparseRowCols(i), data[i][1])
             self.assertEqual(out_index[i], data[i][0])
 
+    def test_dense_set_shape(self):
+        # test 2-D data
+        def gen_data(batch_size, shape):
+            data = []
+            for i in xrange(batch_size):
+                each_sample = []
+                each_sample.append(np.random.random(shape))
+                data.append(each_sample)
+            return data
+
+        feeder = DataFeeder([('image', data_type.dense_array(2352))],
+                            {'image': 0})
+        arg = feeder(gen_data(32, (3, 28, 28)))
+        h = arg.getSlotFrameHeight(0)
+        w = arg.getSlotFrameWidth(0)
+        self.assertEqual(h, 28)
+        self.assertEqual(w, 28)
+
+        arg = feeder(gen_data(32, (3, 30, 32)))
+        h = arg.getSlotFrameHeight(0)
+        w = arg.getSlotFrameWidth(0)
+        self.assertEqual(h, 30)
+        self.assertEqual(w, 32)
+
 
 if __name__ == '__main__':
     api.initPaddle("--use_gpu=0")
diff --git a/python/paddle/v2/tests/test_layer.py b/python/paddle/v2/tests/test_layer.py
index c67f3b84d96eb92d94ad80cc54c5e056103c1a1a..f2097e195f41637977e71f65f36dad005d3e7941 100644
--- a/python/paddle/v2/tests/test_layer.py
+++ b/python/paddle/v2/tests/test_layer.py
@@ -73,7 +73,7 @@ class AggregateLayerTest(unittest.TestCase):
         pool = layer.pooling(
             input=pixel,
             pooling_type=pooling.Avg(),
-            agg_level=layer.AggregateLevel.EACH_SEQUENCE)
+            agg_level=layer.AggregateLevel.TO_SEQUENCE)
         last_seq = layer.last_seq(input=pixel)
         first_seq = layer.first_seq(input=pixel)
         concat = layer.concat(input=[last_seq, first_seq])
@@ -109,7 +109,7 @@ class ReshapeLayerTest(unittest.TestCase):
         expand = layer.expand(
             input=weight,
             expand_as=pixel,
-            expand_level=layer.ExpandLevel.FROM_TIMESTEP)
+            expand_level=layer.ExpandLevel.FROM_NO_SEQUENCE)
         repeat = layer.repeat(input=pixel, num_repeats=4)
         reshape = layer.seq_reshape(input=pixel, reshape_size=4)
         rotate = layer.rotate(input=pixel, height=16, width=49)
@@ -164,6 +164,7 @@ class OtherLayerTest(unittest.TestCase):
         maxid = layer.max_id(input=inference)
         sampling_id = layer.sampling_id(input=inference)
         eos = layer.eos(input=maxid, eos_id=5)
+        layer.printer(maxid)
         print layer.parse_network([maxid, sampling_id, eos])
 
     def test_slicing_joining_layer(self):
@@ -173,9 +174,9 @@ class OtherLayerTest(unittest.TestCase):
 
 class ProjOpTest(unittest.TestCase):
     def test_projection(self):
-        input = layer.data(name='data', type=data_type.dense_vector(784))
+        input = layer.data(name='data2', type=data_type.dense_vector(784))
         word = layer.data(
-            name='word', type=data_type.integer_value_sequence(10000))
+            name='word2', type=data_type.integer_value_sequence(10000))
         fc0 = layer.fc(input=input, size=100, act=activation.Sigmoid())
         fc1 = layer.fc(input=input, size=200, act=activation.Sigmoid())
         mixed0 = layer.mixed(
@@ -204,8 +205,8 @@ class ProjOpTest(unittest.TestCase):
             dotmul1 += dotmul
 
         context = layer.context_projection(input=fc0, context_len=5)
-        context0 = layer.mixed(size=100, input=context)
-        with layer.mixed(size=100) as context1:
+        context0 = layer.mixed(size=500, input=context)
+        with layer.mixed(size=500) as context1:
             context1 += context
 
         conv = layer.conv_projection(
@@ -231,8 +232,8 @@ class ProjOpTest(unittest.TestCase):
         print layer.parse_network(conv1)
 
     def test_operator(self):
-        ipt0 = layer.data(name='data', type=data_type.dense_vector(784))
-        ipt1 = layer.data(name='word', type=data_type.dense_vector(128))
+        ipt0 = layer.data(name='data1', type=data_type.dense_vector(784))
+        ipt1 = layer.data(name='word1', type=data_type.dense_vector(128))
         fc0 = layer.fc(input=ipt0, size=100, act=activation.Sigmoid())
         fc1 = layer.fc(input=ipt0, size=100, act=activation.Sigmoid())
 
@@ -261,7 +262,7 @@ class ProjOpTest(unittest.TestCase):
 
 class NetworkTests(unittest.TestCase):
     def test_vgg(self):
-        img = layer.data(name='pixel', type=data_type.dense_vector(784))
+        img = layer.data(name='pixel1', type=data_type.dense_vector(784))
         vgg_out = networks.small_vgg(
             input_image=img, num_channels=1, num_classes=2)
         print layer.parse_network(vgg_out)
@@ -269,12 +270,12 @@ class NetworkTests(unittest.TestCase):
 
 class EvaluatorTest(unittest.TestCase):
     def test_evaluator(self):
-        img = layer.data(name='pixel', type=data_type.dense_vector(784))
+        img = layer.data(name='pixel2', type=data_type.dense_vector(784))
         output = layer.fc(input=img,
                           size=10,
                           act=activation.Softmax(),
                           name='fc_here')
-        lbl = layer.data(name='label', type=data_type.integer_value(10))
+        lbl = layer.data(name='label2', type=data_type.integer_value(10))
         cost = layer.cross_entropy_cost(input=output, label=lbl)
 
         evaluator.classification_error(input=output, label=lbl)
diff --git a/python/paddle/v2/tests/test_op.py b/python/paddle/v2/tests/test_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..69acccddf42bb22ab54e0cf9e2a5eaef34e47b50
--- /dev/null
+++ b/python/paddle/v2/tests/test_op.py
@@ -0,0 +1,50 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import paddle.v2.data_type as data_type
+import paddle.v2.layer as layer
+import paddle.v2.op as op
+
+
+class OpTest(unittest.TestCase):
+    def test_op(self):
+        x = layer.data(name='data', type=data_type.dense_vector(128))
+        x = op.exp(x)
+        x = op.sqrt(x)
+        x = op.reciprocal(x)
+        x = op.log(x)
+        x = op.abs(x)
+        x = op.sigmoid(x)
+        x = op.tanh(x)
+        x = op.square(x)
+        x = op.relu(x)
+        y = 1 + x
+        y = y + 1
+        y = x + y
+        y = y - x
+        y = y - 2
+        y = 2 - y
+        y = 2 * y
+        y = y * 3
+        z = layer.data(name='data_2', type=data_type.dense_vector(1))
+        y = y * z
+        y = z * y
+        y = y + z
+        y = z + y
+        print layer.parse_network(y)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/tests/test_rnn_layer.py b/python/paddle/v2/tests/test_rnn_layer.py
index 5fbbd20eb76bb9daab2bcf98c4adad989106a377..192b0ee678bcee752327b8c4d41fba29ea361bb6 100644
--- a/python/paddle/v2/tests/test_rnn_layer.py
+++ b/python/paddle/v2/tests/test_rnn_layer.py
@@ -20,6 +20,8 @@ import paddle.v2.data_type as data_type
 import paddle.v2.layer as layer
 from paddle.trainer_config_helpers.config_parser_utils import \
     parse_network_config as parse_network
+from paddle.trainer_config_helpers.config_parser_utils import \
+    reset_parser
 
 
 class RNNTest(unittest.TestCase):
@@ -29,6 +31,8 @@ class RNNTest(unittest.TestCase):
         hidden_dim = 8
 
         def parse_old_rnn():
+            reset_parser()
+
             def step(y):
                 mem = conf_helps.memory(name="rnn_state", size=hidden_dim)
                 out = conf_helps.fc_layer(
@@ -42,11 +46,14 @@ class RNNTest(unittest.TestCase):
             def test():
                 data = conf_helps.data_layer(name="word", size=dict_dim)
                 embd = conf_helps.embedding_layer(input=data, size=word_dim)
-                conf_helps.recurrent_group(name="rnn", step=step, input=embd)
+                conf_helps.recurrent_group(
+                    name="rnn", step=step, input=embd, reverse=True)
 
             return str(parse_network(test))
 
         def parse_new_rnn():
+            reset_parser()
+
             def new_step(y):
                 mem = layer.memory(name="rnn_state", size=hidden_dim)
                 out = layer.fc(input=[y, mem],
@@ -60,7 +67,7 @@ class RNNTest(unittest.TestCase):
                 name="word", type=data_type.integer_value(dict_dim))
             embd = layer.embedding(input=data, size=word_dim)
             rnn_layer = layer.recurrent_group(
-                name="rnn", step=new_step, input=embd)
+                name="rnn", step=new_step, input=embd, reverse=True)
             return str(layer.parse_network(rnn_layer))
 
         diff = difflib.unified_diff(parse_old_rnn().splitlines(1),
@@ -74,6 +81,8 @@ class RNNTest(unittest.TestCase):
         label_dim = 3
 
         def parse_old_rnn():
+            reset_parser()
+
             def test():
                 data = conf_helps.data_layer(name="word", size=dict_dim)
                 label = conf_helps.data_layer(name="label", size=label_dim)
@@ -113,6 +122,7 @@ class RNNTest(unittest.TestCase):
             return str(parse_network(test))
 
         def parse_new_rnn():
+            reset_parser()
             data = layer.data(
                 name="word", type=data_type.dense_vector(dict_dim))
             label = layer.data(
diff --git a/python/paddle/v2/tests/test_topology.py b/python/paddle/v2/tests/test_topology.py
index 5c6dbcdb4f49b960fb8b71aecbad4f013d2cd283..7fd2ee82fde21d90be541a28f23742e51a9a1665 100644
--- a/python/paddle/v2/tests/test_topology.py
+++ b/python/paddle/v2/tests/test_topology.py
@@ -46,8 +46,8 @@ class TestTopology(unittest.TestCase):
         self.assertEqual(label_data_type[1].dim, 10)
 
     def test_get_layer(self):
-        pixel = layer.data(name='pixel', type=data_type.dense_vector(784))
-        label = layer.data(name='label', type=data_type.integer_value(10))
+        pixel = layer.data(name='pixel2', type=data_type.dense_vector(784))
+        label = layer.data(name='label2', type=data_type.integer_value(10))
         hidden = layer.fc(input=pixel,
                           size=100,
                           act=conf_helps.SigmoidActivation())
@@ -56,14 +56,14 @@ class TestTopology(unittest.TestCase):
                              act=conf_helps.SoftmaxActivation())
         cost = layer.classification_cost(input=inference, label=label)
         topo = topology.Topology(cost)
-        pixel_layer = topo.get_layer("pixel")
-        label_layer = topo.get_layer("label")
+        pixel_layer = topo.get_layer("pixel2")
+        label_layer = topo.get_layer("label2")
         self.assertEqual(pixel_layer, pixel)
         self.assertEqual(label_layer, label)
 
     def test_parse(self):
-        pixel = layer.data(name='pixel', type=data_type.dense_vector(784))
-        label = layer.data(name='label', type=data_type.integer_value(10))
+        pixel = layer.data(name='pixel3', type=data_type.dense_vector(784))
+        label = layer.data(name='label3', type=data_type.integer_value(10))
         hidden = layer.fc(input=pixel,
                           size=100,
                           act=conf_helps.SigmoidActivation())
diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py
index 1e46e4973f467a017de3d2b45186690af16dd123..f3bb4d5f10dd6c5b220161e32dfc3a94642ac7a2 100644
--- a/python/paddle/v2/topology.py
+++ b/python/paddle/v2/topology.py
@@ -15,36 +15,13 @@
 import collections
 
 from paddle.proto.ModelConfig_pb2 import ModelConfig
-
+import paddle.trainer_config_helpers as conf_helps
 import layer as v2_layer
+import config_base
 
 __all__ = ['Topology']
 
 
-def __flatten__(lis):
-    """
-    Given a list, possibly nested to any level, return it flattened.
-    """
-    new_lis = []
-    for item in lis:
-        if isinstance(item, collections.Sequence):
-            new_lis.extend(__flatten__(item))
-        else:
-            new_lis.append(item)
-    return new_lis
-
-
-def __bfs_travel__(callback, *layers):
-    layers = __flatten__(layers)
-    for each_layer in layers:
-        __break__ = callback(each_layer)
-        if __break__:
-            return
-        __layers__ = each_layer.__parent_layers__.values() + \
-                     each_layer.extra_parent()
-        __bfs_travel__(callback, *__layers__)
-
-
 class Topology(object):
     """
     Topology is used to store the information about all layers
@@ -94,31 +71,18 @@ class Topology(object):
         :param name:
         :return:
         """
-        result_layer = [None]
-
-        def __impl__(l):
-            if l.name == name:
-                result_layer[0] = l
-                return True  # break
-            return False
-
-        __bfs_travel__(__impl__, *self.layers)
-        if result_layer[0] is None:
-            raise ValueError("No such layer %s" % name)
-        return result_layer[0]
+        return v2_layer.get_layer(name)
 
     def data_layers(self):
         """
         get all data layer
         :return:
         """
-        data_layers = dict()
-
-        def __impl__(l):
-            if isinstance(l, v2_layer.DataLayerV2):
-                data_layers[l.name] = l
-
-        __bfs_travel__(__impl__, *self.layers)
+        data_layers = {}
+        for layer in self.proto().layers:
+            l = v2_layer.get_layer(layer.name)
+            if l and l.layer_type == conf_helps.LayerType.DATA:
+                data_layers[layer.name] = l
         return data_layers
 
     def data_type(self):
@@ -127,7 +91,7 @@ class Topology(object):
         [('image', dense_vector(768)), ('label', integer_value(10))]
         """
         data_layers = self.data_layers()
-        return [(nm, data_layers[nm].type)
+        return [(nm, data_layers[nm].data_type)
                 for nm in self.proto().input_layer_names]
 
     def get_layer_proto(self, name):
@@ -138,5 +102,5 @@ class Topology(object):
 
 
 def __check_layer_type__(layer):
-    if not isinstance(layer, v2_layer.LayerV2):
-        raise ValueError('layer should have type paddle.layer.Layer')
+    if not isinstance(layer, config_base.Layer):
+        raise ValueError('layer should have type paddle.v2.config_base.Layer')
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index ec9fcfb749f1a858713d3d6672118e521fbdcb32..8fdb67cc2688a67ed815af396b214e339195c73f 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -177,7 +177,7 @@ class SGD(object):
         Testing method. Will test input data.
 
         :param reader: A reader that reads and yeilds data items.
-        :type reader: collections.Iterable  
+        :type reader: collections.Iterable
         :param feeding: Feeding is a map of neural network input name and array
                         index that reader returns.
         :type feeding: dict
diff --git a/python/setup.py.in b/python/setup.py.in
index 7d9438e3f8132c2a7fa4774750f5fd15f3beed14..d1c38823080fb3a5c879d8b59cb5371c07902e57 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -1,5 +1,6 @@
 from setuptools import setup
 
+
 packages=['paddle',
           'paddle.proto',
           'paddle.trainer',
@@ -19,6 +20,7 @@ setup(name='paddle',
           "protobuf==${PROTOBUF_VERSION}",
           "matplotlib",
           "opencv-python",
+          "rarfile"
       ],
       packages=packages,
       package_dir={
diff --git a/v1_api_demo/README.md b/v1_api_demo/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9442f76941287a710220f07cf7dbb29ebcadabdc
--- /dev/null
+++ b/v1_api_demo/README.md
@@ -0,0 +1,5 @@
+The examples in v1_api_demo currently use v1_api and will be upgraded to v2_api later.
+Thus, v1_api_demo is a temporary directory. We have decided not to maintain it and will delete it in the future.
+
+Please go to [PaddlePaddle/book](https://github.com/PaddlePaddle/book) and 
+[PaddlePaddle/models](https://github.com/PaddlePaddle/models) to learn PaddlePaddle.
diff --git a/demo/gan/.gitignore b/v1_api_demo/gan/.gitignore
similarity index 100%
rename from demo/gan/.gitignore
rename to v1_api_demo/gan/.gitignore
diff --git a/demo/gan/README.md b/v1_api_demo/gan/README.md
similarity index 100%
rename from demo/gan/README.md
rename to v1_api_demo/gan/README.md
diff --git a/demo/gan/data/download_cifar.sh b/v1_api_demo/gan/data/download_cifar.sh
similarity index 100%
rename from demo/gan/data/download_cifar.sh
rename to v1_api_demo/gan/data/download_cifar.sh
diff --git a/demo/gan/data/get_mnist_data.sh b/v1_api_demo/gan/data/get_mnist_data.sh
similarity index 100%
rename from demo/gan/data/get_mnist_data.sh
rename to v1_api_demo/gan/data/get_mnist_data.sh
diff --git a/demo/gan/gan_conf.py b/v1_api_demo/gan/gan_conf.py
similarity index 100%
rename from demo/gan/gan_conf.py
rename to v1_api_demo/gan/gan_conf.py
diff --git a/demo/gan/gan_conf_image.py b/v1_api_demo/gan/gan_conf_image.py
similarity index 100%
rename from demo/gan/gan_conf_image.py
rename to v1_api_demo/gan/gan_conf_image.py
diff --git a/demo/gan/gan_trainer.py b/v1_api_demo/gan/gan_trainer.py
similarity index 100%
rename from demo/gan/gan_trainer.py
rename to v1_api_demo/gan/gan_trainer.py
diff --git a/demo/mnist/.gitignore b/v1_api_demo/mnist/.gitignore
similarity index 100%
rename from demo/mnist/.gitignore
rename to v1_api_demo/mnist/.gitignore
diff --git a/demo/mnist/api_train.py b/v1_api_demo/mnist/api_train.py
similarity index 100%
rename from demo/mnist/api_train.py
rename to v1_api_demo/mnist/api_train.py
diff --git a/demo/mnist/data/generate_list.py b/v1_api_demo/mnist/data/generate_list.py
similarity index 100%
rename from demo/mnist/data/generate_list.py
rename to v1_api_demo/mnist/data/generate_list.py
diff --git a/demo/mnist/data/get_mnist_data.sh b/v1_api_demo/mnist/data/get_mnist_data.sh
similarity index 100%
rename from demo/mnist/data/get_mnist_data.sh
rename to v1_api_demo/mnist/data/get_mnist_data.sh
diff --git a/v1_api_demo/mnist/light_mnist.py b/v1_api_demo/mnist/light_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..33409054357d2f0c6a765b3ab3164eb2e584467e
--- /dev/null
+++ b/v1_api_demo/mnist/light_mnist.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+is_predict = get_config_arg("is_predict", bool, False)
+
+####################Data Configuration ##################
+
+if not is_predict:
+    data_dir = './data/'
+    define_py_data_sources2(
+        train_list=data_dir + 'train.list',
+        test_list=data_dir + 'test.list',
+        module='mnist_provider',
+        obj='process')
+
+######################Algorithm Configuration #############
+settings(batch_size=50, learning_rate=0.001, learning_method=AdamOptimizer())
+
+#######################Network Configuration #############
+
+data_size = 1 * 28 * 28
+label_size = 10
+img = data_layer(name='pixel', size=data_size)
+
+
+# light cnn
+# A shallower cnn model: [CNN, BN, ReLU, Max-Pooling] x4 + FC x1
+# Easier to train for mnist dataset and quite efficient
+# Final performance is close to deeper ones on tasks such as digit and character classification
+def light_cnn(input_image, num_channels, num_classes):
+    def __light__(ipt,
+                  num_filter=128,
+                  times=1,
+                  conv_filter_size=3,
+                  dropouts=0,
+                  num_channels_=None):
+        return img_conv_group(
+            input=ipt,
+            num_channels=num_channels_,
+            pool_size=2,
+            pool_stride=2,
+            conv_padding=0,
+            conv_num_filter=[num_filter] * times,
+            conv_filter_size=conv_filter_size,
+            conv_act=ReluActivation(),
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type=MaxPooling())
+
+    tmp = __light__(input_image, num_filter=128, num_channels_=num_channels)
+    tmp = __light__(tmp, num_filter=128)
+    tmp = __light__(tmp, num_filter=128)
+    tmp = __light__(tmp, num_filter=128, conv_filter_size=1)
+
+    tmp = fc_layer(input=tmp, size=num_classes, act=SoftmaxActivation())
+    return tmp
+
+
+predict = light_cnn(input_image=img, num_channels=1, num_classes=label_size)
+
+if not is_predict:
+    lbl = data_layer(name="label", size=label_size)
+    inputs(img, lbl)
+    outputs(classification_cost(input=predict, label=lbl))
+else:
+    outputs(predict)
diff --git a/demo/mnist/mnist_provider.py b/v1_api_demo/mnist/mnist_provider.py
similarity index 100%
rename from demo/mnist/mnist_provider.py
rename to v1_api_demo/mnist/mnist_provider.py
diff --git a/demo/mnist/mnist_util.py b/v1_api_demo/mnist/mnist_util.py
similarity index 100%
rename from demo/mnist/mnist_util.py
rename to v1_api_demo/mnist/mnist_util.py
diff --git a/demo/mnist/train.sh b/v1_api_demo/mnist/train.sh
similarity index 100%
rename from demo/mnist/train.sh
rename to v1_api_demo/mnist/train.sh
diff --git a/demo/mnist/vgg_16_mnist.py b/v1_api_demo/mnist/vgg_16_mnist.py
similarity index 100%
rename from demo/mnist/vgg_16_mnist.py
rename to v1_api_demo/mnist/vgg_16_mnist.py
diff --git a/demo/model_zoo/embedding/.gitignore b/v1_api_demo/model_zoo/embedding/.gitignore
similarity index 100%
rename from demo/model_zoo/embedding/.gitignore
rename to v1_api_demo/model_zoo/embedding/.gitignore
diff --git a/demo/model_zoo/embedding/extract_para.py b/v1_api_demo/model_zoo/embedding/extract_para.py
similarity index 100%
rename from demo/model_zoo/embedding/extract_para.py
rename to v1_api_demo/model_zoo/embedding/extract_para.py
diff --git a/demo/model_zoo/embedding/paraconvert.py b/v1_api_demo/model_zoo/embedding/paraconvert.py
similarity index 100%
rename from demo/model_zoo/embedding/paraconvert.py
rename to v1_api_demo/model_zoo/embedding/paraconvert.py
diff --git a/demo/model_zoo/embedding/pre_DictAndModel.sh b/v1_api_demo/model_zoo/embedding/pre_DictAndModel.sh
similarity index 100%
rename from demo/model_zoo/embedding/pre_DictAndModel.sh
rename to v1_api_demo/model_zoo/embedding/pre_DictAndModel.sh
diff --git a/demo/model_zoo/resnet/.gitignore b/v1_api_demo/model_zoo/resnet/.gitignore
similarity index 100%
rename from demo/model_zoo/resnet/.gitignore
rename to v1_api_demo/model_zoo/resnet/.gitignore
diff --git a/demo/model_zoo/resnet/classify.py b/v1_api_demo/model_zoo/resnet/classify.py
similarity index 100%
rename from demo/model_zoo/resnet/classify.py
rename to v1_api_demo/model_zoo/resnet/classify.py
diff --git a/demo/model_zoo/resnet/example/.gitignore b/v1_api_demo/model_zoo/resnet/example/.gitignore
similarity index 100%
rename from demo/model_zoo/resnet/example/.gitignore
rename to v1_api_demo/model_zoo/resnet/example/.gitignore
diff --git a/demo/model_zoo/resnet/example/__init__.py b/v1_api_demo/model_zoo/resnet/example/__init__.py
similarity index 100%
rename from demo/model_zoo/resnet/example/__init__.py
rename to v1_api_demo/model_zoo/resnet/example/__init__.py
diff --git a/demo/model_zoo/resnet/example/cat.jpg b/v1_api_demo/model_zoo/resnet/example/cat.jpg
similarity index 100%
rename from demo/model_zoo/resnet/example/cat.jpg
rename to v1_api_demo/model_zoo/resnet/example/cat.jpg
diff --git a/demo/model_zoo/resnet/example/dog.jpg b/v1_api_demo/model_zoo/resnet/example/dog.jpg
similarity index 100%
rename from demo/model_zoo/resnet/example/dog.jpg
rename to v1_api_demo/model_zoo/resnet/example/dog.jpg
diff --git a/demo/model_zoo/resnet/example/image_list_provider.py b/v1_api_demo/model_zoo/resnet/example/image_list_provider.py
similarity index 100%
rename from demo/model_zoo/resnet/example/image_list_provider.py
rename to v1_api_demo/model_zoo/resnet/example/image_list_provider.py
diff --git a/demo/model_zoo/resnet/example/test.list b/v1_api_demo/model_zoo/resnet/example/test.list
similarity index 100%
rename from demo/model_zoo/resnet/example/test.list
rename to v1_api_demo/model_zoo/resnet/example/test.list
diff --git a/demo/model_zoo/resnet/extract_fea_c++.sh b/v1_api_demo/model_zoo/resnet/extract_fea_c++.sh
similarity index 100%
rename from demo/model_zoo/resnet/extract_fea_c++.sh
rename to v1_api_demo/model_zoo/resnet/extract_fea_c++.sh
diff --git a/demo/model_zoo/resnet/extract_fea_py.sh b/v1_api_demo/model_zoo/resnet/extract_fea_py.sh
similarity index 100%
rename from demo/model_zoo/resnet/extract_fea_py.sh
rename to v1_api_demo/model_zoo/resnet/extract_fea_py.sh
diff --git a/demo/model_zoo/resnet/get_model.sh b/v1_api_demo/model_zoo/resnet/get_model.sh
similarity index 100%
rename from demo/model_zoo/resnet/get_model.sh
rename to v1_api_demo/model_zoo/resnet/get_model.sh
diff --git a/demo/model_zoo/resnet/load_feature.py b/v1_api_demo/model_zoo/resnet/load_feature.py
similarity index 100%
rename from demo/model_zoo/resnet/load_feature.py
rename to v1_api_demo/model_zoo/resnet/load_feature.py
diff --git a/demo/model_zoo/resnet/net_diagram.sh b/v1_api_demo/model_zoo/resnet/net_diagram.sh
similarity index 100%
rename from demo/model_zoo/resnet/net_diagram.sh
rename to v1_api_demo/model_zoo/resnet/net_diagram.sh
diff --git a/demo/model_zoo/resnet/predict.sh b/v1_api_demo/model_zoo/resnet/predict.sh
similarity index 100%
rename from demo/model_zoo/resnet/predict.sh
rename to v1_api_demo/model_zoo/resnet/predict.sh
diff --git a/demo/model_zoo/resnet/resnet.py b/v1_api_demo/model_zoo/resnet/resnet.py
similarity index 100%
rename from demo/model_zoo/resnet/resnet.py
rename to v1_api_demo/model_zoo/resnet/resnet.py
diff --git a/demo/quick_start/.gitignore b/v1_api_demo/quick_start/.gitignore
similarity index 100%
rename from demo/quick_start/.gitignore
rename to v1_api_demo/quick_start/.gitignore
diff --git a/demo/quick_start/api_predict.py b/v1_api_demo/quick_start/api_predict.py
similarity index 100%
rename from demo/quick_start/api_predict.py
rename to v1_api_demo/quick_start/api_predict.py
diff --git a/demo/quick_start/api_predict.sh b/v1_api_demo/quick_start/api_predict.sh
similarity index 100%
rename from demo/quick_start/api_predict.sh
rename to v1_api_demo/quick_start/api_predict.sh
diff --git a/demo/quick_start/api_train.py b/v1_api_demo/quick_start/api_train.py
similarity index 100%
rename from demo/quick_start/api_train.py
rename to v1_api_demo/quick_start/api_train.py
diff --git a/demo/quick_start/api_train.sh b/v1_api_demo/quick_start/api_train.sh
similarity index 100%
rename from demo/quick_start/api_train.sh
rename to v1_api_demo/quick_start/api_train.sh
diff --git a/demo/quick_start/cluster/cluster_train.sh b/v1_api_demo/quick_start/cluster/cluster_train.sh
similarity index 100%
rename from demo/quick_start/cluster/cluster_train.sh
rename to v1_api_demo/quick_start/cluster/cluster_train.sh
diff --git a/demo/quick_start/cluster/env.sh b/v1_api_demo/quick_start/cluster/env.sh
similarity index 100%
rename from demo/quick_start/cluster/env.sh
rename to v1_api_demo/quick_start/cluster/env.sh
diff --git a/demo/quick_start/cluster/pserver.sh b/v1_api_demo/quick_start/cluster/pserver.sh
similarity index 100%
rename from demo/quick_start/cluster/pserver.sh
rename to v1_api_demo/quick_start/cluster/pserver.sh
diff --git a/demo/quick_start/data/README.md b/v1_api_demo/quick_start/data/README.md
similarity index 100%
rename from demo/quick_start/data/README.md
rename to v1_api_demo/quick_start/data/README.md
diff --git a/demo/quick_start/data/get_data.sh b/v1_api_demo/quick_start/data/get_data.sh
similarity index 100%
rename from demo/quick_start/data/get_data.sh
rename to v1_api_demo/quick_start/data/get_data.sh
diff --git a/demo/quick_start/data/proc_from_raw_data/get_data.sh b/v1_api_demo/quick_start/data/proc_from_raw_data/get_data.sh
similarity index 100%
rename from demo/quick_start/data/proc_from_raw_data/get_data.sh
rename to v1_api_demo/quick_start/data/proc_from_raw_data/get_data.sh
diff --git a/demo/quick_start/data/proc_from_raw_data/preprocess.py b/v1_api_demo/quick_start/data/proc_from_raw_data/preprocess.py
similarity index 100%
rename from demo/quick_start/data/proc_from_raw_data/preprocess.py
rename to v1_api_demo/quick_start/data/proc_from_raw_data/preprocess.py
diff --git a/demo/quick_start/dataprovider_bow.py b/v1_api_demo/quick_start/dataprovider_bow.py
similarity index 100%
rename from demo/quick_start/dataprovider_bow.py
rename to v1_api_demo/quick_start/dataprovider_bow.py
diff --git a/demo/quick_start/dataprovider_emb.py b/v1_api_demo/quick_start/dataprovider_emb.py
similarity index 100%
rename from demo/quick_start/dataprovider_emb.py
rename to v1_api_demo/quick_start/dataprovider_emb.py
diff --git a/demo/quick_start/predict.sh b/v1_api_demo/quick_start/predict.sh
similarity index 100%
rename from demo/quick_start/predict.sh
rename to v1_api_demo/quick_start/predict.sh
diff --git a/demo/quick_start/train.sh b/v1_api_demo/quick_start/train.sh
similarity index 100%
rename from demo/quick_start/train.sh
rename to v1_api_demo/quick_start/train.sh
diff --git a/demo/quick_start/trainer_config.bidi-lstm.py b/v1_api_demo/quick_start/trainer_config.bidi-lstm.py
similarity index 100%
rename from demo/quick_start/trainer_config.bidi-lstm.py
rename to v1_api_demo/quick_start/trainer_config.bidi-lstm.py
diff --git a/demo/quick_start/trainer_config.cnn.py b/v1_api_demo/quick_start/trainer_config.cnn.py
similarity index 100%
rename from demo/quick_start/trainer_config.cnn.py
rename to v1_api_demo/quick_start/trainer_config.cnn.py
diff --git a/demo/quick_start/trainer_config.db-lstm.py b/v1_api_demo/quick_start/trainer_config.db-lstm.py
similarity index 100%
rename from demo/quick_start/trainer_config.db-lstm.py
rename to v1_api_demo/quick_start/trainer_config.db-lstm.py
diff --git a/demo/quick_start/trainer_config.emb.py b/v1_api_demo/quick_start/trainer_config.emb.py
similarity index 100%
rename from demo/quick_start/trainer_config.emb.py
rename to v1_api_demo/quick_start/trainer_config.emb.py
diff --git a/demo/quick_start/trainer_config.lr.py b/v1_api_demo/quick_start/trainer_config.lr.py
similarity index 100%
rename from demo/quick_start/trainer_config.lr.py
rename to v1_api_demo/quick_start/trainer_config.lr.py
diff --git a/demo/quick_start/trainer_config.lstm.py b/v1_api_demo/quick_start/trainer_config.lstm.py
similarity index 100%
rename from demo/quick_start/trainer_config.lstm.py
rename to v1_api_demo/quick_start/trainer_config.lstm.py
diff --git a/demo/quick_start/trainer_config.resnet-lstm.py b/v1_api_demo/quick_start/trainer_config.resnet-lstm.py
similarity index 100%
rename from demo/quick_start/trainer_config.resnet-lstm.py
rename to v1_api_demo/quick_start/trainer_config.resnet-lstm.py
diff --git a/demo/sequence_tagging/data/get_data.sh b/v1_api_demo/sequence_tagging/data/get_data.sh
similarity index 100%
rename from demo/sequence_tagging/data/get_data.sh
rename to v1_api_demo/sequence_tagging/data/get_data.sh
diff --git a/demo/sequence_tagging/data/test.list b/v1_api_demo/sequence_tagging/data/test.list
similarity index 100%
rename from demo/sequence_tagging/data/test.list
rename to v1_api_demo/sequence_tagging/data/test.list
diff --git a/demo/sequence_tagging/data/train.list b/v1_api_demo/sequence_tagging/data/train.list
similarity index 100%
rename from demo/sequence_tagging/data/train.list
rename to v1_api_demo/sequence_tagging/data/train.list
diff --git a/demo/sequence_tagging/dataprovider.py b/v1_api_demo/sequence_tagging/dataprovider.py
similarity index 100%
rename from demo/sequence_tagging/dataprovider.py
rename to v1_api_demo/sequence_tagging/dataprovider.py
diff --git a/demo/sequence_tagging/linear_crf.py b/v1_api_demo/sequence_tagging/linear_crf.py
similarity index 100%
rename from demo/sequence_tagging/linear_crf.py
rename to v1_api_demo/sequence_tagging/linear_crf.py
diff --git a/demo/sequence_tagging/readme.md b/v1_api_demo/sequence_tagging/readme.md
similarity index 100%
rename from demo/sequence_tagging/readme.md
rename to v1_api_demo/sequence_tagging/readme.md
diff --git a/demo/sequence_tagging/rnn_crf.py b/v1_api_demo/sequence_tagging/rnn_crf.py
similarity index 100%
rename from demo/sequence_tagging/rnn_crf.py
rename to v1_api_demo/sequence_tagging/rnn_crf.py
diff --git a/demo/sequence_tagging/train.sh b/v1_api_demo/sequence_tagging/train.sh
similarity index 100%
rename from demo/sequence_tagging/train.sh
rename to v1_api_demo/sequence_tagging/train.sh
diff --git a/demo/sequence_tagging/train_linear.sh b/v1_api_demo/sequence_tagging/train_linear.sh
similarity index 100%
rename from demo/sequence_tagging/train_linear.sh
rename to v1_api_demo/sequence_tagging/train_linear.sh
diff --git a/demo/traffic_prediction/README b/v1_api_demo/traffic_prediction/README
similarity index 100%
rename from demo/traffic_prediction/README
rename to v1_api_demo/traffic_prediction/README
diff --git a/demo/traffic_prediction/data/get_data.sh b/v1_api_demo/traffic_prediction/data/get_data.sh
similarity index 100%
rename from demo/traffic_prediction/data/get_data.sh
rename to v1_api_demo/traffic_prediction/data/get_data.sh
diff --git a/demo/traffic_prediction/dataprovider.py b/v1_api_demo/traffic_prediction/dataprovider.py
similarity index 100%
rename from demo/traffic_prediction/dataprovider.py
rename to v1_api_demo/traffic_prediction/dataprovider.py
diff --git a/demo/traffic_prediction/gen_result.py b/v1_api_demo/traffic_prediction/gen_result.py
similarity index 100%
rename from demo/traffic_prediction/gen_result.py
rename to v1_api_demo/traffic_prediction/gen_result.py
diff --git a/demo/traffic_prediction/predict.sh b/v1_api_demo/traffic_prediction/predict.sh
similarity index 100%
rename from demo/traffic_prediction/predict.sh
rename to v1_api_demo/traffic_prediction/predict.sh
diff --git a/demo/traffic_prediction/train.sh b/v1_api_demo/traffic_prediction/train.sh
similarity index 100%
rename from demo/traffic_prediction/train.sh
rename to v1_api_demo/traffic_prediction/train.sh
diff --git a/demo/traffic_prediction/trainer_config.py b/v1_api_demo/traffic_prediction/trainer_config.py
similarity index 100%
rename from demo/traffic_prediction/trainer_config.py
rename to v1_api_demo/traffic_prediction/trainer_config.py
diff --git a/v1_api_demo/vae/README.md b/v1_api_demo/vae/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e55d483b023773900729622a6cac44116fc79c76
--- /dev/null
+++ b/v1_api_demo/vae/README.md
@@ -0,0 +1,13 @@
+# Variational Autoencoder (VAE)
+
+This demo implements VAE training described in the original paper (https://arxiv.org/abs/1312.6114).
+
+
+In order to run the model, first download the MNIST dataset by running the shell script in ./data.
+
+Then you can run the command below. The flag --use_gpu specifies whether to use gpu for training (0 is cpu, 1 is gpu).  
+
+$python vae_train.py [--use_gpu 1]
+
+The generated images will be stored in ./samples/
+The corresponding models will be stored in ./params/
diff --git a/v1_api_demo/vae/data/get_mnist_data.sh b/v1_api_demo/vae/data/get_mnist_data.sh
new file mode 100755
index 0000000000000000000000000000000000000000..a77c81bf5af9ddb6634ff89460797ca543c5e517
--- /dev/null
+++ b/v1_api_demo/vae/data/get_mnist_data.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env sh
+# This script downloads the mnist data and unzips it.
+set -e
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+rm -rf "$DIR/mnist_data"
+mkdir "$DIR/mnist_data"
+cd "$DIR/mnist_data"
+
+echo "Downloading..."
+
+for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
+do
+    if [ ! -e $fname ]; then
+        wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
+        gunzip ${fname}.gz
+    fi
+done
diff --git a/v1_api_demo/vae/dataloader.py b/v1_api_demo/vae/dataloader.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9ff95d44f825cd941b5687f754618e66d491e7f
--- /dev/null
+++ b/v1_api_demo/vae/dataloader.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+
+class MNISTloader():
+    def __init__(self,
+                 data_path="./data/mnist_data/",
+                 batch_size=60,
+                 process='train'):
+        self.batch_size = batch_size
+        self.data_path = data_path
+        self._pointer = 0
+        self.image_batches = np.array([])
+        self.process = process
+
+    def _extract_images(self, filename, n):
+        f = open(filename, 'rb')
+        f.read(16)
+        data = np.fromfile(f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28))
+        #Mapping data into [-1, 1]
+        data = data / 255. * 2. - 1
+        data_batches = np.split(data, 60000 / self.batch_size, 0)
+
+        f.close()
+
+        return data_batches
+
+    @property
+    def pointer(self):
+        return self._pointer
+
+    def load_data(self):
+        TRAIN_IMAGES = '%s/train-images-idx3-ubyte' % self.data_path
+        TEST_IMAGES = '%s/t10k-images-idx3-ubyte' % self.data_path
+
+        if self.process == 'train':
+            self.image_batches = self._extract_images(TRAIN_IMAGES, 60000)
+        else:
+            self.image_batches = self._extract_images(TEST_IMAGES, 10000)
+
+    def next_batch(self):
+        batch = self.image_batches[self._pointer]
+        self._pointer = (self._pointer + 1) % (60000 / self.batch_size)
+        return np.array(batch)
+
+    def reset_pointer(self):
+        self._pointer = 0
diff --git a/v1_api_demo/vae/vae_conf.py b/v1_api_demo/vae/vae_conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..301dd23793d19ec5946cc7bb07e32c53c04a972b
--- /dev/null
+++ b/v1_api_demo/vae/vae_conf.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+import numpy as np
+
+is_generating = get_config_arg("is_generating", bool, False)
+
+settings(batch_size=32, learning_rate=1e-3, learning_method=AdamOptimizer())
+
+X_dim = 28 * 28
+h_dim = 128
+z_dim = 100
+
+
+def reparameterization(mu, logvar):
+    eps = ParamAttr(initial_mean=0., initial_std=1)
+    with mixed_layer() as sigma:
+        sigma += dotmul_projection(layer_math.exp(logvar) * 0.5, param_attr=eps)
+    return mu + sigma
+
+
+def q_func(X):
+    """
+    xavier initialization
+    """
+    param_attr = ParamAttr(
+        name='share.w', initial_mean=0., initial_std=1. / np.sqrt(X_dim / 2.))
+    mu_param = ParamAttr(
+        name='mu.w', initial_mean=0., initial_std=1. / np.sqrt(h_dim / 2.))
+    logvar_param = ParamAttr(
+        name='logvar.w', initial_mean=0., initial_std=1. / np.sqrt(h_dim / 2.))
+
+    bias_attr = ParamAttr(name='share.bias', initial_mean=0., initial_std=0.)
+    mu_bias = ParamAttr(name='mu.bias', initial_mean=0., initial_std=0.)
+    logvar_bias = ParamAttr(name='logvar.bias', initial_mean=0., initial_std=0.)
+
+    share_layer = fc_layer(
+        X,
+        size=h_dim,
+        param_attr=param_attr,
+        bias_attr=bias_attr,
+        act=ReluActivation())
+
+    return (fc_layer(
+        share_layer,
+        size=z_dim,
+        param_attr=mu_param,
+        bias_attr=mu_bias,
+        act=LinearActivation()), fc_layer(
+            share_layer,
+            size=z_dim,
+            param_attr=logvar_param,
+            bias_attr=logvar_bias,
+            act=LinearActivation()))
+
+
+def generator(z):
+
+    hidden_param = ParamAttr(
+        name='hidden.w', initial_mean=0., initial_std=1. / np.sqrt(z_dim / 2.))
+    hidden_bias = ParamAttr(name='hidden.bias', initial_mean=0., initial_std=0.)
+    prob_param = ParamAttr(
+        name='prob.w', initial_mean=0., initial_std=1. / np.sqrt(h_dim / 2.))
+    prob_bias = ParamAttr(name='prob.bias', initial_mean=0., initial_std=0.)
+
+    hidden_layer = fc_layer(
+        z,
+        size=h_dim,
+        act=ReluActivation(),
+        param_attr=hidden_param,
+        bias_attr=hidden_bias)
+    prob = fc_layer(
+        hidden_layer,
+        size=X_dim,
+        act=SigmoidActivation(),
+        param_attr=prob_param,
+        bias_attr=prob_bias)
+
+    return prob
+
+
+def reconstruct_error(prob, X):
+    cost = multi_binary_label_cross_entropy(input=prob, label=X)
+    return cost
+
+
+def KL_loss(mu, logvar):
+    with mixed_layer() as mu_square:
+        mu_square += dotmul_operator(mu, mu, scale=1.)
+
+    cost = 0.5 * sum_cost(layer_math.exp(logvar) + mu_square - 1. - logvar)
+
+    return cost
+
+
+if not is_generating:
+    x_batch = data_layer(name='x_batch', size=X_dim)
+    mu, logvar = q_func(x_batch)
+    z_samples = reparameterization(mu, logvar)
+    prob = generator(z_samples)
+    outputs(reconstruct_error(prob, x_batch) + KL_loss(mu, logvar))
+else:
+    z_samples = data_layer(name='noise', size=z_dim)
+    outputs(generator(z_samples))
diff --git a/v1_api_demo/vae/vae_train.py b/v1_api_demo/vae/vae_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..1babb011c77b92861cc680a2e1aaa8c9ae5d97b5
--- /dev/null
+++ b/v1_api_demo/vae/vae_train.py
@@ -0,0 +1,175 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import random
+import numpy as np
+import cPickle
+import sys, os
+from PIL import Image
+
+from paddle.trainer.config_parser import parse_config
+from paddle.trainer.config_parser import logger
+import py_paddle.swig_paddle as api
+import dataloader
+import matplotlib.pyplot as plt
+
+
+def plot_samples(samples):
+    fig = plt.figure(figsize=(4, 4))
+    gs = gridspec.GridSpec(4, 4)
+    gs.update(wspace=0.05, hspace=0.05)
+    for i, sample in enumerate(samples):
+        plt.subplot(gs[i])
+        plt.axis('off')
+        plt.imshow(sample.reshape(28, 28), cmap='Greys_r')
+
+    return fig
+
+
+def CHECK_EQ(a, b):
+    assert a == b, "a=%s, b=%s" % (a, b)
+
+
+def get_fake_samples(generator_machine, batch_size, noise):
+    gen_inputs = api.Arguments.createArguments(1)
+    gen_inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise))
+    gen_outputs = api.Arguments.createArguments(0)
+    generator_machine.forward(gen_inputs, gen_outputs, api.PASS_TEST)
+    fake_samples = gen_outputs.getSlotValue(0).copyToNumpyMat()
+    return fake_samples
+
+
+def copy_shared_parameters(src, dst):
+    '''
+    copy the parameters from src to dst
+    :param src: the source of the parameters
+    :type src: GradientMachine
+    :param dst: the destination of the parameters
+    :type dst: GradientMachine
+    '''
+    src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())]
+    src_params = dict([(p.getName(), p) for p in src_params])
+
+    for i in xrange(dst.getParameterSize()):
+        dst_param = dst.getParameter(i)
+        src_param = src_params.get(dst_param.getName(), None)
+        if src_param is None:
+            continue
+        src_value = src_param.getBuf(api.PARAMETER_VALUE)
+        dst_value = dst_param.getBuf(api.PARAMETER_VALUE)
+        CHECK_EQ(len(src_value), len(dst_value))
+        dst_value.copyFrom(src_value)
+        dst_param.setValueUpdated()
+
+
+def find(iterable, cond):
+    for item in iterable:
+        if cond(item):
+            return item
+    return None
+
+
+def get_layer_size(model_conf, layer_name):
+    layer_conf = find(model_conf.layers, lambda x: x.name == layer_name)
+    assert layer_conf is not None, "Cannot find '%s' layer" % layer_name
+    return layer_conf.size
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--use_gpu", default="1", help="1 means use gpu for training")
+    parser.add_argument("--gpu_id", default="0", help="the gpu_id parameter")
+    args = parser.parse_args()
+    use_gpu = args.use_gpu
+    assert use_gpu in ["0", "1"]
+
+    if not os.path.exists("./samples/"):
+        os.makedirs("./samples/")
+
+    if not os.path.exists("./params/"):
+        os.makedirs("./params/")
+
+    api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10',
+                   '--log_period=1000', '--gpu_id=' + args.gpu_id,
+                   '--save_dir=' + "./params/")
+
+    conf = "vae_conf.py"
+
+    trainer_conf = parse_config(conf, "is_generating=False")
+    gener_conf = parse_config(conf, "is_generating=True")
+
+    batch_size = trainer_conf.opt_config.batch_size
+
+    noise_dim = get_layer_size(gener_conf.model_config, "noise")
+
+    mnist = dataloader.MNISTloader(batch_size=batch_size)
+    mnist.load_data()
+
+    training_machine = api.GradientMachine.createFromConfigProto(
+        trainer_conf.model_config)
+
+    generator_machine = api.GradientMachine.createFromConfigProto(
+        gener_conf.model_config)
+
+    trainer = api.Trainer.create(trainer_conf, training_machine)
+
+    trainer.startTrain()
+
+    for train_pass in xrange(100):
+        trainer.startTrainPass()
+        mnist.reset_pointer()
+        i = 0
+        it = 0
+        while mnist.pointer != 0 or i == 0:
+            X = mnist.next_batch().astype('float32')
+
+            inputs = api.Arguments.createArguments(1)
+            inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(X))
+
+            trainer.trainOneDataBatch(batch_size, inputs)
+
+            if it % 1000 == 0:
+
+                outputs = api.Arguments.createArguments(0)
+                training_machine.forward(inputs, outputs, api.PASS_TEST)
+                loss = np.mean(outputs.getSlotValue(0).copyToNumpyMat())
+                print "\niter: {}".format(str(it).zfill(3))
+                print "VAE loss: {}".format(str(loss).zfill(3))
+
+                #Sync parameters between networks (GradientMachine) at the beginning
+                copy_shared_parameters(training_machine, generator_machine)
+
+                z_samples = np.random.randn(batch_size,
+                                            noise_dim).astype('float32')
+                samples = get_fake_samples(generator_machine, batch_size,
+                                           z_samples)
+
+                #Generating the first 16 images for a picture. 
+                figure = plot_samples(samples[:16])
+                plt.savefig(
+                    "./samples/{}_{}.png".format(
+                        str(train_pass).zfill(3), str(i).zfill(3)),
+                    bbox_inches='tight')
+                plt.close(figure)
+                i += 1
+            it += 1
+
+        trainer.finishTrainPass()
+    trainer.finishTrain()
+
+
+if __name__ == '__main__':
+    main()