diff --git a/CMakeLists.txt b/CMakeLists.txt
index aa4f1eaff9125f2ff11a6ef83e503acd56b79e21..fc85f83b94f22459002b17d66cb6ac98cbff9bd0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,17 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License

+cmake_minimum_required(VERSION 3.0)
+
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR})

 include(system)

-if(ANDROID)
-    cmake_minimum_required(VERSION 3.7)
-else()
-    cmake_minimum_required(VERSION 3.0)
-endif()
-
 project(paddle CXX C)

 find_package(Sphinx)
diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..9724c16122ab2e6be55864c8716698c9b9d7c3f0
--- /dev/null
+++ b/cmake/cross_compiling/android.cmake
@@ -0,0 +1,191 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This is a toolchain file for cross-compiling for Android, and the
+# configuration refers to the open-source repository:
+# https://github.com/taka-no-me/android-cmake
+# Most of the variables are compatible with those used in
+# https://developer.android.com/ndk/guides/cmake.html
+# The supported variables are listed below:
+#
+# ANDROID_STANDALONE_TOOLCHAIN
+# ANDROID_ABI
+# ANDROID_NATIVE_API_LEVEL
+# ANDROID_ARM_MODE
+# ANDROID_ARM_NEON
+#
+# For CMake >= 3.7.0, all the settings will be passed to CMake system
+# variables to let CMake do the cross-compiling configuration itself.
+# More details on the cross-compiling settings:
+# https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html
+
+IF(NOT ANDROID)
+    return()
+ENDIF()
+
+# check the existence of the Android standalone toolchain
+IF(NOT DEFINED ANDROID_STANDALONE_TOOLCHAIN)
+    SET(ANDROID_STANDALONE_TOOLCHAIN $ENV{ANDROID_STANDALONE_TOOLCHAIN}
+        CACHE PATH "Folder that holds the standalone toolchain of the Android NDK")
+ENDIF()
+IF(NOT ANDROID_STANDALONE_TOOLCHAIN)
+    MESSAGE(WARNING "It is recommended to set ANDROID_STANDALONE_TOOLCHAIN to "
+                    "use a standalone toolchain.\n"
+                    "To cross-compile for Android, you need to:\n"
+                    "1. Download an Android NDK from"
+                    " https://developer.android.com/ndk/downloads/index.html\n"
+                    "2. Setup a standalone toolchain from"
+                    " https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn\n")
+ENDIF()
+
+IF(NOT DEFINED CMAKE_SYSTEM_VERSION AND ANDROID_NATIVE_API_LEVEL)
+    IF(ANDROID_NATIVE_API_LEVEL MATCHES "^android-[0-9]+$")
+        STRING(REPLACE "android-" "" CMAKE_SYSTEM_VERSION "${CMAKE_MATCH_0}")
+    ELSEIF(ANDROID_NATIVE_API_LEVEL MATCHES "^[0-9]+$")
+        SET(CMAKE_SYSTEM_VERSION ${ANDROID_NATIVE_API_LEVEL})
+    ENDIF()
+ENDIF()
+
+IF(NOT DEFINED ANDROID_ABI)
+    SET(ANDROID_ABI "armeabi-v7a")
+ENDIF()
+
+IF(NOT DEFINED ANDROID_ARM_MODE)
+    SET(ANDROID_ARM_MODE ON)
+ENDIF()
+IF(ANDROID_ARM_MODE)
+    SET(ANDROID_ARM_MODE_NAME "arm")
+ELSE(ANDROID_ARM_MODE)
+    SET(ANDROID_ARM_MODE_NAME "thumb")
+ENDIF(ANDROID_ARM_MODE)
+
+IF(NOT DEFINED ANDROID_ARM_NEON)
+    SET(ANDROID_ARM_NEON ON)
+ENDIF()
+
+IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
+    IF("${CMAKE_VERSION}" VERSION_LESS "3.1.0")
+        SET(CMAKE_SYSTEM_NAME "Linux")
+    ENDIF()
+    MESSAGE(WARNING "It is recommended to use CMake >= 3.7.0 (current version: "
+                    "${CMAKE_VERSION}) when cross-compiling for Android.")
+
+    IF(ANDROID_STANDALONE_TOOLCHAIN)
+        SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot")
+
+        IF(NOT CMAKE_SYSTEM_VERSION)
+            SET(ANDROID_STANDALONE_TOOLCHAIN_API "")
+            SET(ANDROID_API_LEVEL_H_REGEX "^[\t ]*#[\t ]*define[\t ]+__ANDROID_API__[\t ]+([0-9]+)")
+            FILE(STRINGS "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot/usr/include/android/api-level.h"
+                 ANDROID_API_LEVEL_H_CONTENT REGEX "${ANDROID_API_LEVEL_H_REGEX}")
+            IF(ANDROID_API_LEVEL_H_CONTENT MATCHES "${ANDROID_API_LEVEL_H_REGEX}")
+                SET(ANDROID_STANDALONE_TOOLCHAIN_API "${CMAKE_MATCH_1}")
+            ENDIF()
+            SET(CMAKE_SYSTEM_VERSION ${ANDROID_STANDALONE_TOOLCHAIN_API})
+        ENDIF()
+
+        # Toolchain
+        SET(ANDROID_TOOLCHAIN "gcc")
+        SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN})
+        IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+            SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi)
+            IF(ANDROID_ABI STREQUAL "armeabi")
+                SET(CMAKE_SYSTEM_PROCESSOR armv5te)
+            ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
+                SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
+            ENDIF()
+        ENDIF()
+        SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
+    ENDIF()
+
+    # C compiler
+    IF(NOT CMAKE_C_COMPILER)
+        SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}gcc")
+    ELSE()
+        GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
+    ENDIF()
+    IF(NOT EXISTS ${ANDROID_C_COMPILER})
+        MESSAGE(FATAL_ERROR "Cannot find C compiler: ${ANDROID_C_COMPILER}")
+    ENDIF()
+
+    # CXX compiler
+    IF(NOT CMAKE_CXX_COMPILER)
+        SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}g++")
+    ELSE()
+        GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
+    ENDIF()
+    IF(NOT EXISTS ${ANDROID_CXX_COMPILER})
+        MESSAGE(FATAL_ERROR "Cannot find CXX compiler: ${ANDROID_CXX_COMPILER}")
+    ENDIF()
+
+    SET(CMAKE_C_COMPILER ${ANDROID_C_COMPILER} CACHE PATH "C compiler" FORCE)
+    SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
+
+    # Toolchain and ABI specific flags.
+    SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections -finline-limit=64")
+    SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections")
+
+    IF(ANDROID_ABI STREQUAL "armeabi")
+        LIST(APPEND ANDROID_COMPILER_FLAGS
+             -march=armv5te
+             -mtune=xscale
+             -msoft-float)
+    ENDIF()
+    IF(ANDROID_ABI STREQUAL "armeabi-v7a")
+        LIST(APPEND ANDROID_COMPILER_FLAGS
+             -march=armv7-a
+             -mfloat-abi=softfp)
+        IF(ANDROID_ARM_NEON)
+            LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=neon)
+        ELSE()
+            LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16)
+        ENDIF()
+        LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8)
+    ENDIF()
+
+    IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+        IF(ANDROID_ARM_MODE)
+            LIST(APPEND ANDROID_COMPILER_FLAGS -marm)
+        ELSE()
+            LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb)
+        ENDIF()
+    ENDIF()
+
+    STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}")
+    STRING(REPLACE ";" " " ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}")
+
+    SET(CMAKE_C_FLAGS "${ANDROID_COMPILER_FLAGS} ${CMAKE_C_FLAGS}"
+        CACHE STRING "C flags")
+    SET(CMAKE_CXX_FLAGS "${ANDROID_COMPILER_FLAGS} ${CMAKE_CXX_FLAGS}"
+        CACHE STRING "CXX flags")
+    SET(CMAKE_SHARED_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}"
+        CACHE STRING "shared linker flags")
+
+    SET(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
+    SET(CMAKE_EXE_LINKER_FLAGS "-pie -fPIE ${ANDROID_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}"
+        CACHE STRING "executable linker flags")
+
+    MESSAGE(STATUS "Android: Targeting API '${CMAKE_SYSTEM_VERSION}' "
+                   "with architecture '${ANDROID_ARM_MODE_NAME}', "
+                   "ABI '${ANDROID_ABI}', and processor '${CMAKE_SYSTEM_PROCESSOR}'")
+    MESSAGE(STATUS "System CMAKE_C_FLAGS: " ${CMAKE_C_FLAGS})
+    MESSAGE(STATUS "System CMAKE_CXX_FLAGS: " ${CMAKE_CXX_FLAGS})
+ELSE()
+    IF(ANDROID_STANDALONE_TOOLCHAIN)
+        SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN})
+    ENDIF()
+    SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI})
+    SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE})
+    SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON})
+ENDIF()
diff --git a/cmake/cross_compiling/host.cmake b/cmake/cross_compiling/host.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..14c35266ec60b439aaef30e5e4e0540c534160ae
--- /dev/null
+++ b/cmake/cross_compiling/host.cmake
@@ -0,0 +1,49 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# find host C compiler
+IF(HOST_C_COMPILER)
+    SET(HOST_C_COMPILER_NAME ${HOST_C_COMPILER})
+ELSEIF(NOT $ENV{CC} STREQUAL "")
+    SET(HOST_C_COMPILER_NAME $ENV{CC})
+ELSE()
+    SET(HOST_C_COMPILER_NAME cc)
+ENDIF()
+
+GET_FILENAME_COMPONENT(HOST_C_COMPILER_PATH ${HOST_C_COMPILER_NAME} PROGRAM)
+IF(NOT HOST_C_COMPILER_PATH OR NOT EXISTS ${HOST_C_COMPILER_PATH})
+    MESSAGE(FATAL_ERROR "Cannot find host C compiler, set host C compiler:\n"
+                        "\tcmake .. -DHOST_C_COMPILER=...")
+ENDIF()
+
+# find host CXX compiler
+IF(HOST_CXX_COMPILER)
+    SET(HOST_CXX_COMPILER_NAME ${HOST_CXX_COMPILER})
+ELSEIF(NOT $ENV{CXX} STREQUAL "")
+    SET(HOST_CXX_COMPILER_NAME $ENV{CXX})
+ELSE()
+    SET(HOST_CXX_COMPILER_NAME c++)
+ENDIF()
+
+GET_FILENAME_COMPONENT(HOST_CXX_COMPILER_PATH ${HOST_CXX_COMPILER_NAME} PROGRAM)
+IF(NOT HOST_CXX_COMPILER_PATH OR NOT EXISTS ${HOST_CXX_COMPILER_PATH})
+    MESSAGE(FATAL_ERROR "Cannot find host CXX compiler, set host CXX compiler:\n"
+                        "\tcmake .. -DHOST_CXX_COMPILER=...")
+ENDIF()
+
+SET(HOST_C_COMPILER ${HOST_C_COMPILER_PATH} CACHE PATH "Host C compiler")
+SET(HOST_CXX_COMPILER ${HOST_CXX_COMPILER_PATH} CACHE PATH "Host CXX compiler")
+
+MESSAGE(STATUS "Found host C compiler: " ${HOST_C_COMPILER})
+MESSAGE(STATUS "Found host CXX compiler: " ${HOST_CXX_COMPILER})
diff --git a/cmake/cross_compiling/raspberry_pi.cmake b/cmake/cross_compiling/raspberry_pi.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..817b39f6833e37c340d4ee465048480cfc3db151
--- /dev/null
+++ b/cmake/cross_compiling/raspberry_pi.cmake
@@ -0,0 +1,84 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This is a toolchain file for cross-compiling for Raspberry Pi.
+# The supported variables are listed below:
+#
+# RPI_TOOLCHAIN
+# RPI_ARM_NEON
+#
+# You can also set CMAKE_C/CXX_COMPILER yourself, through CMake arguments.
+
+IF(NOT RPI)
+    return()
+ENDIF()
+
+SET(CMAKE_SYSTEM_NAME Linux)
+SET(CMAKE_SYSTEM_VERSION 1)
+SET(CMAKE_SYSTEM_PROCESSOR arm)
+
+# check the existence of the Raspberry Pi toolchain
+IF(NOT DEFINED RPI_TOOLCHAIN)
+    SET(RPI_TOOLCHAIN $ENV{RPI_TOOLCHAIN}
+        CACHE PATH "Folder that holds the toolchain of Raspberry Pi")
+ENDIF()
+IF(NOT RPI_TOOLCHAIN)
+    MESSAGE(WARNING "It is recommended to set RPI_TOOLCHAIN to use a specific toolchain.\n"
+                    "To cross-compile for Raspberry Pi, you need to download the tools using:\n"
+                    "  git clone https://github.com/raspberrypi/tools\n")
+ENDIF()
+
+IF(NOT DEFINED RPI_ARM_NEON)
+    SET(RPI_ARM_NEON ON)
+ENDIF()
+
+IF(RPI_TOOLCHAIN)
+    SET(RPI_TOOLCHAIN_ROOT ${RPI_TOOLCHAIN})
+    IF(RPI_TOOLCHAIN_ROOT MATCHES "gcc-linaro-arm-linux-gnueabihf-raspbian(-x64)?$")
+        # gcc-linaro-arm-linux-gnueabihf-raspbian
+        # gcc-linaro-arm-linux-gnueabihf-raspbian-x64
+        SET(RPI_TOOLCHAIN_NAME arm-linux-gnueabihf)
+    ENDIF()
+    SET(RPI_TOOLCHAIN_PREFIX "${RPI_TOOLCHAIN_ROOT}/bin/${RPI_TOOLCHAIN_NAME}-")
+ENDIF()
+
+# C compiler
+IF(NOT CMAKE_C_COMPILER)
+    SET(RPI_C_COMPILER "${RPI_TOOLCHAIN_PREFIX}gcc")
+ELSE()
+    GET_FILENAME_COMPONENT(RPI_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
+ENDIF()
+IF(NOT EXISTS ${RPI_C_COMPILER})
+    MESSAGE(FATAL_ERROR "Cannot find C compiler: ${RPI_C_COMPILER}")
+ENDIF()
+
+# CXX compiler
+IF(NOT CMAKE_CXX_COMPILER)
+    SET(RPI_CXX_COMPILER "${RPI_TOOLCHAIN_PREFIX}g++")
+ELSE()
+    GET_FILENAME_COMPONENT(RPI_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
+ENDIF()
+IF(NOT EXISTS ${RPI_CXX_COMPILER})
+    MESSAGE(FATAL_ERROR "Cannot find CXX compiler: ${RPI_CXX_COMPILER}")
+ENDIF()
+
+SET(CMAKE_C_COMPILER ${RPI_C_COMPILER} CACHE PATH "C compiler" FORCE)
+SET(CMAKE_CXX_COMPILER ${RPI_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
+
+IF(RPI_ARM_NEON)
+    SET(RPI_C_FLAGS "${RPI_C_FLAGS} -mfpu=neon")
+ENDIF()
+
+SET(CMAKE_C_FLAGS "${RPI_C_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
+SET(CMAKE_CXX_FLAGS "${RPI_C_FLAGS} ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 18ac74aa6f7531c4001fe91960f8332619c99342..b6bd24fe8ae28b290f93d74dc5ca2b98302bf2a5 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -21,21 +21,34 @@ IF(NOT ${CBLAS_FOUND})
     SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
     SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)

-    IF(WIN32)
-        SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/openblas.lib" CACHE FILEPATH "openblas library." FORCE)
-    ELSE(WIN32)
-        SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/libopenblas.a" CACHE FILEPATH "openblas library" FORCE)
-    ENDIF(WIN32)
+    SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/${LIBRARY_PREFIX}openblas${STATIC_LIBRARY_SUFFIX}"
+        CACHE FILEPATH "openblas library." FORCE)
+
+    SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1)
+
+    IF(ANDROID)
+        # arm_soft_fp_abi branch of OpenBLAS to support softfp
+        # https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi
+        SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
+        SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0 libs)
+    ELSEIF(RPI)
+        # use hardfp
+        SET(OPENBLAS_COMMIT "v0.2.19")
+        SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0 libs)
+    ELSE()
+        SET(OPENBLAS_COMMIT "v0.2.19")
+        SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 libs)
+    ENDIF()

     ExternalProject_Add(
         openblas
         ${EXTERNAL_PROJECT_LOG_ARGS}
         GIT_REPOSITORY      https://github.com/xianyi/OpenBLAS.git
-        GIT_TAG             v0.2.19
+        GIT_TAG             ${OPENBLAS_COMMIT}
         PREFIX              ${CBLAS_SOURCES_DIR}
         INSTALL_DIR         ${CBLAS_INSTALL_DIR}
         BUILD_IN_SOURCE     1
-        BUILD_COMMAND       ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} NO_LAPACK=1 DYNAMIC_ARCH=1 NO_SHARED=1 libs netlib
+        BUILD_COMMAND       ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS}
         INSTALL_COMMAND     ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR>
         UPDATE_COMMAND      ""
         CONFIGURE_COMMAND   ""
@@ -43,4 +56,5 @@ IF(NOT ${CBLAS_FOUND})
     LIST(APPEND external_project_dependencies openblas)
 ENDIF(NOT ${CBLAS_FOUND})

+MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
 INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index a9db4e8ba410c718f1ee4d69f4551e2773c60125..b35e6839cdc2ee062a9066585f0c83948d87e385 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -14,42 +14,42 @@

 INCLUDE(ExternalProject)

-set(PROTOBUF_VERSION 3.1)
-FIND_PACKAGE(Protobuf ${PROTOBUF_VERSION})
+FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
+    SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_NAME})
+    SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_NAME})

-IF(PROTOBUF_FOUND)
-    EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION)
-    STRING(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}")
-    IF ("${PROTOBUF_VERSION}" VERSION_LESS "3.1.0")
-        SET(PROTOBUF_FOUND OFF)
-    ENDIF()
-ENDIF(PROTOBUF_FOUND)
-
-IF(NOT PROTOBUF_FOUND)
-    SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf)
-    SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/protobuf)
-    SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." FORCE)
+    SET(${TARGET_NAME}_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE)
+    SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE)
+    SET(${TARGET_NAME}_LITE_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${STATIC_LIBRARY_SUFFIX}"
+        PARENT_SCOPE)
+    SET(${TARGET_NAME}_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${STATIC_LIBRARY_SUFFIX}"
+        PARENT_SCOPE)
+    SET(${TARGET_NAME}_PROTOC_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${STATIC_LIBRARY_SUFFIX}"
+        PARENT_SCOPE)
+    SET(${TARGET_NAME}_PROTOC_EXECUTABLE
+        "${PROTOBUF_INSTALL_DIR}/bin/protoc${EXECUTABLE_SUFFIX}"
+        PARENT_SCOPE)

-    IF(WIN32)
-        SET(PROTOBUF_LITE_LIBRARY
-            "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.lib" CACHE FILEPATH "protobuf lite library." FORCE)
-        SET(PROTOBUF_LIBRARY
-            "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.lib" CACHE FILEPATH "protobuf library." FORCE)
-        SET(PROTOBUF_PROTOC_LIBRARY
-            "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE)
-        SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE)
-    ELSE(WIN32)
-        SET(PROTOBUF_LITE_LIBRARY
-            "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE)
-        SET(PROTOBUF_LIBRARY
-            "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE)
-        SET(PROTOBUF_PROTOC_LIBRARY
-            "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.a" CACHE FILEPATH "protoc library." FORCE)
-        SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE)
-    ENDIF(WIN32)
+    SET(OPTIONAL_CACHE_ARGS "")
+    SET(OPTIONAL_ARGS "")
+    IF(BUILD_FOR_HOST)
+        SET(OPTIONAL_ARGS "-Dprotobuf_WITH_ZLIB=OFF")
+    ELSE()
+        SET(OPTIONAL_ARGS
+            "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
+            "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
+            "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
+            "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
+            "-Dprotobuf_WITH_ZLIB=ON"
+            "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}")
+        SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}")
+    ENDIF()

     ExternalProject_Add(
-        protobuf
+        ${TARGET_NAME}
         ${EXTERNAL_PROJECT_LOG_ARGS}
         PREFIX          ${PROTOBUF_SOURCES_DIR}
         UPDATE_COMMAND  ""
@@ -57,11 +57,9 @@ IF(NOT PROTOBUF_FOUND)
         GIT_REPOSITORY  "https://github.com/google/protobuf.git"
         GIT_TAG         "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
         CONFIGURE_COMMAND
-        ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake
+        ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
+        ${OPTIONAL_ARGS}
         -Dprotobuf_BUILD_TESTS=OFF
-        -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
-        -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
         -DCMAKE_POSITION_INDEPENDENT_CODE=ON
         -DCMAKE_BUILD_TYPE=Release
         -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
@@ -71,10 +69,44 @@ IF(NOT PROTOBUF_FOUND)
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
         -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-        -DZLIB_ROOT:STRING=${ZLIB_ROOT}
+        ${OPTIONAL_CACHE_ARGS}
     )
+ENDFUNCTION()
+
+SET(PROTOBUF_VERSION 3.1)
+IF(NOT CMAKE_CROSSCOMPILING)
+    FIND_PACKAGE(Protobuf ${PROTOBUF_VERSION})
+    IF(PROTOBUF_FOUND)
+        EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION)
+        STRING(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}")
+        IF("${PROTOBUF_VERSION}" VERSION_LESS "3.1.0")
+            SET(PROTOBUF_FOUND OFF)
+        ENDIF()
+    ENDIF(PROTOBUF_FOUND)
+ELSE()
+    build_protobuf(protobuf_host TRUE)
+    LIST(APPEND external_project_dependencies protobuf_host)
+
+    SET(PROTOBUF_PROTOC_EXECUTABLE ${protobuf_host_PROTOC_EXECUTABLE}
+        CACHE FILEPATH "protobuf executable." FORCE)
+ENDIF()
+
+IF(NOT PROTOBUF_FOUND)
+    build_protobuf(protobuf FALSE)
     LIST(APPEND external_project_dependencies protobuf)
+
+    SET(PROTOBUF_INCLUDE_DIR ${protobuf_INCLUDE_DIR}
+        CACHE PATH "protobuf include directory." FORCE)
+    IF(NOT CMAKE_CROSSCOMPILING)
+        SET(PROTOBUF_PROTOC_EXECUTABLE ${protobuf_PROTOC_EXECUTABLE}
+            CACHE FILEPATH "protobuf executable." FORCE)
+    ENDIF()
+    SET(PROTOBUF_LITE_LIBRARY ${protobuf_LITE_LIBRARY} CACHE FILEPATH "protobuf lite library." FORCE)
+    SET(PROTOBUF_LIBRARY ${protobuf_LIBRARY} CACHE FILEPATH "protobuf library." FORCE)
+    SET(PROTOBUF_PROTOC_LIBRARY ${protobuf_PROTOC_LIBRARY} CACHE FILEPATH "protoc library." FORCE)
 ENDIF(NOT PROTOBUF_FOUND)

+MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}")
+MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}")
 INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 75a9d8fc25674e1dd0f5b73cd0ccde48204f63aa..904652413e026e3a7f3f2a19f48f4e906ce6babb 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -13,9 +13,9 @@
 # limitations under the License.

 # Detects the OS and sets appropriate variables.
-# CMAKE_SYSTEM_NAME only give us a coarse-grained name,
-# but the name like centos is necessary in some scenes
-# to distinguish system for customization.
+# CMAKE_SYSTEM_NAME only gives us a coarse-grained name of the OS CMake is
+# building for, but a finer-grained host system name like centos is necessary
+# in some cases to distinguish systems for customization.
 #
 # for instance, protobuf libs path is /lib64
 # on CentOS, but /lib on other systems.
@@ -72,12 +72,36 @@ MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
 MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}")
 MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")

+# configuration for cross-compiling
 IF(DEFINED CMAKE_SYSTEM_NAME)
+    INCLUDE(cross_compiling/host)
     IF(${CMAKE_SYSTEM_NAME} STREQUAL "Android")
         SET(ANDROID TRUE)
+        INCLUDE(cross_compiling/android)
+    ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "RPi")
+        SET(RPI TRUE)
+        INCLUDE(cross_compiling/raspberry_pi)
     ENDIF()
 ENDIF()

+# prefixes and suffixes of libraries and executables on different OSes
+IF(WIN32)
+    SET(LIBRARY_PREFIX "")
+    SET(SHARED_LIBRARY_SUFFIX ".dll")
+    SET(STATIC_LIBRARY_SUFFIX ".lib")
+    SET(EXECUTABLE_SUFFIX ".exe")
+ELSE(WIN32)
+    SET(LIBRARY_PREFIX "lib")
+    IF(APPLE)
+        SET(SHARED_LIBRARY_SUFFIX ".dylib")
+    ELSE(APPLE)
+        SET(SHARED_LIBRARY_SUFFIX ".so")
+    ENDIF(APPLE)
+
+    SET(STATIC_LIBRARY_SUFFIX ".a")
+    SET(EXECUTABLE_SUFFIX "")
+ENDIF(WIN32)
+
 # external dependencies log output
 SET(EXTERNAL_PROJECT_LOG_ARGS
     LOG_DOWNLOAD    0     # Wrap download in script to log output
diff --git a/demo/mnist/light_mnist.py b/demo/mnist/light_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..33409054357d2f0c6a765b3ab3164eb2e584467e
--- /dev/null
+++ b/demo/mnist/light_mnist.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+is_predict = get_config_arg("is_predict", bool, False)
+
+####################Data Configuration ##################
+
+if not is_predict:
+    data_dir = './data/'
+    define_py_data_sources2(
+        train_list=data_dir + 'train.list',
+        test_list=data_dir + 'test.list',
+        module='mnist_provider',
+        obj='process')
+
+######################Algorithm Configuration #############
+settings(batch_size=50, learning_rate=0.001, learning_method=AdamOptimizer())
+
+#######################Network Configuration #############
+
+data_size = 1 * 28 * 28
+label_size = 10
+img = data_layer(name='pixel', size=data_size)
+
+
+# light cnn
+# A shallower CNN model: [CNN, BN, ReLU, Max-Pooling] x4 + FC x1
+# It is easier to train on the MNIST dataset and quite efficient.
+# Its final performance is close to that of deeper models on tasks such as
+# digit and character classification.
+def light_cnn(input_image, num_channels, num_classes):
+    def __light__(ipt,
+                  num_filter=128,
+                  times=1,
+                  conv_filter_size=3,
+                  dropouts=0,
+                  num_channels_=None):
+        return img_conv_group(
+            input=ipt,
+            num_channels=num_channels_,
+            pool_size=2,
+            pool_stride=2,
+            conv_padding=0,
+            conv_num_filter=[num_filter] * times,
+            conv_filter_size=conv_filter_size,
+            conv_act=ReluActivation(),
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type=MaxPooling())
+
+    tmp = __light__(input_image, num_filter=128, num_channels_=num_channels)
+    tmp = __light__(tmp, num_filter=128)
+    tmp = __light__(tmp, num_filter=128)
+    tmp = __light__(tmp, num_filter=128, conv_filter_size=1)
+
+    tmp = fc_layer(input=tmp, size=num_classes, act=SoftmaxActivation())
+    return tmp
+
+
+predict = light_cnn(input_image=img, num_channels=1, num_classes=label_size)
+
+if not is_predict:
+    lbl = data_layer(name="label", size=label_size)
+    inputs(img, lbl)
+    outputs(classification_cost(input=predict, label=lbl))
+else:
+    outputs(predict)
diff --git a/doc/design/file_manager/README.md b/doc/design/file_manager/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3df10d801e568834729f902aace483d033340e2d
--- /dev/null
+++ b/doc/design/file_manager/README.md
@@ -0,0 +1,87 @@
+# FileManager Design Document
+## Goals
+This document describes the design of a system named FileManager, which makes it convenient for users to upload their own training data for distributed training.
+
+The main features include:
+
+- Common command-line commands for managing files and directories
+- Resumable upload and download of large files
+
+## Terminology
+- PFS: short for `Paddlepaddle cloud File System`, an abstraction of the user's file storage space, as opposed to the local filesystem. We currently build it on CephFS.
+- [CephFS](http://docs.ceph.com/docs/master/cephfs/): a POSIX-compatible file system.
+- Chunk: the logical unit into which a file is split for transfer.
+
+## Modules
+### Architecture Diagram
+![Architecture](./src/filemanager.png)
+
+### PFSClient
+- Features: detailed design [link](./pfs/pfsclient.md)
+  - Provides commands for users to manage their files
+  - Needs to run cross-platform
+
+- Mutual authentication
+  PFSClient and the Ingress authenticate each other via [tls](#tls), so a user first needs to register at `cloud.paddlepaddle.org` to apply for user space, and download the system-generated CA (certificate authority), Key, and CRT (CA-signed certificate) to the local machine before PFSClient can be used.
+
+### [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/)
+- Features:
+  Provides layer-7 reverse proxying and sticky-session-based load balancing.
+
+- Passing through the user's identity
+  The Ingress needs to pass PFSClient's identity information on to PFSServer; see [link](http://www.integralist.co.uk/posts/clientcertauth.html#3) for how to configure this.
+
+### PFSServer
+PFSServer provides a RESTful API, receives and handles file-management requests from the PFSClient side, and returns the results to the PFSClient side.
+
+RESTful API
+
+- /api/v1/files
+  - `GET /api/v1/files`: Get metadata of files or directories.
+  - `POST /api/v1/files`: Create files or directories.
+  - `PATCH /api/v1/files`: Update files or directories.
+  - `DELETE /api/v1/files`: Delete files or directories.
+
+- /api/v1/file/chunks
+  - `GET /api/v1/file/chunks`: Get the chunk metadata of a file.
+
+- /api/v1/storage/files
+  - `GET /api/v1/storage/files`: Download files or directories.
+  - `POST /api/v1/storage/files`: Upload files or directories.
+
+- /api/v1/storage/file/chunks
+  - `GET /api/v1/storage/file/chunks`: Download chunk data.
+  - `POST /api/v1/storage/file/chunks`: Upload chunk data.
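To make the API list above concrete, here is a minimal Python sketch of two PFSClient-style requests. Only the endpoint paths come from the list above; the host name, certificate file names, and payload fields are illustrative assumptions, not part of the design.

```python
# Illustrative client calls against the PFSServer REST API sketched above.
# Endpoint paths come from the design; everything else is assumed.
import requests

BASE = 'https://cloud.paddlepaddle.org/api/v1'
# PFSClient authenticates with the user's downloaded CRT/Key pair
# (hypothetical file names).
CLIENT_CERT = ('user.crt', 'user.key')

# GET /api/v1/files: fetch metadata of a file or directory.
resp = requests.get(BASE + '/files',
                    params={'path': '/pfs/datacenter1/home/user/folder'},
                    cert=CLIENT_CERT)
print(resp.json())

# POST /api/v1/files: create a directory.
resp = requests.post(BASE + '/files',
                     json={'path': '/pfs/datacenter1/home/user/new_folder',
                           'type': 'directory'},
                     cert=CLIENT_CERT)
print(resp.status_code)
```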
+
+## File Transfer Optimization
+
+### Chunked File Transfer
+User files can be large, so uploading them to the cloud or downloading them to the local machine can take a long time, and the network may be unstable during the transfer. To deal with these problems we introduce the concept of a Chunk: a Chunk consists of a file offset, the data, the data length, and a checksum. File uploads and downloads are both implemented as operations on Chunks. Because a Chunk is small (256KB by default), a single transfer completes quickly and is less error-prone. After the last Chunk has been transferred, PFSClient checks whether the MD5 of the destination file matches that of the source file.
+
+A typical Chunk looks like this:
+
+```
+type Chunk struct {
+    fileOffset int64
+    checksum   uint32
+    len        uint32
+    data       []byte
+}
+```
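A minimal Python sketch of the chunk-diff idea follows. The design's Chunk carries a uint32 checksum; CRC32 is assumed here as the concrete choice, and the 256KB default comes from the paragraph above.

```python
import hashlib
import zlib
from itertools import zip_longest

CHUNK_SIZE = 256 * 1024  # default chunk size from the design above


def chunk_checksums(path, chunk_size=CHUNK_SIZE):
    """Yield (offset, length, crc32) for every chunk of a file."""
    with open(path, 'rb') as f:
        offset = 0
        for data in iter(lambda: f.read(chunk_size), b''):
            yield (offset, len(data), zlib.crc32(data) & 0xffffffff)
            offset += len(data)


def chunks_to_transfer(source, destination):
    """Return the source chunks whose checksum differs from the destination's.

    Chunks that already match are skipped, which is what makes the
    transfer resumable."""
    return [src for src, dst in zip_longest(chunk_checksums(source),
                                            chunk_checksums(destination))
            if src is not None and src != dst]


def md5sum(path):
    """Whole-file MD5, used to verify the destination after the last chunk."""
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(CHUNK_SIZE), b''):
            digest.update(block)
    return digest.hexdigest()
```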
+
+### Generating Sparse Files
+When the destination file does not exist, or its size differs from that of the source file, we can use [Fallocate](https://golang.org/pkg/syscall/#Fallocate) to generate a sparse file first, and then write multiple Chunks into it concurrently.
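A minimal Python sketch of the same idea; plain truncate() is used here as a portable stand-in for the Fallocate call named above, and on most POSIX filesystems it produces a sparse file.

```python
import os


def preallocate(path, size):
    """Create the destination at its final size without writing data.

    On most POSIX filesystems, truncating an empty file to its final
    size yields a sparse file (a stand-in for fallocate)."""
    with open(path, 'wb') as f:
        f.truncate(size)


def write_chunk(path, offset, data):
    """Write one chunk at its offset; each chunk touches a distinct range,
    so chunks can be written concurrently."""
    with open(path, 'r+b') as f:
        f.seek(offset)
        f.write(data)


# Example: preallocate a 1 MB destination, then fill one 256 KB chunk.
preallocate('/tmp/dest.bin', 1 << 20)
write_chunk('/tmp/dest.bin', 256 * 1024, b'\x00' * (256 * 1024))
```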
+
+### Overwriting the Inconsistent Parts
+The key to the file transfer is that PFSClient compares the checksums of the source and destination file Chunks; the inconsistent Chunks are then downloaded or uploaded by PFSClient. This way, the parts that have already been transferred successfully do not need to be transferred again.
+
+## User Workflow
+See [link](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md)
+
+## Scaffolding
+We use [swagger](https://github.com/swagger-api/swagger-codegen) to generate the scaffolding of PFSClient and PFSServer, so that we can focus more of our energy on the logic itself.
+
+## References
+- [TLS complete guide](https://github.com/k8sp/tls/blob/master/tls.md)
+- [aws.s3](http://docs.aws.amazon.com/cli/latest/reference/s3/)
+- [linux man document](https://linux.die.net/man/)
diff --git a/doc/design/file_manager/pfs/pfsclient.md b/doc/design/file_manager/pfs/pfsclient.md
new file mode 100644
index 0000000000000000000000000000000000000000..56bc70c54bbc92b78d66e04fb495b1300cf8ebe0
--- /dev/null
+++ b/doc/design/file_manager/pfs/pfsclient.md
@@ -0,0 +1,129 @@
+# PFSClient
+
+## Description
+The `pfs` command is a Command Line Interface to manage your files on PaddlePaddle Cloud.
+
+## Synopsis
+```
+paddle [options] pfs <subcommand> [parameters]
+```
+
+## Options
+```
+--profile (string)
+  Use a specific profile from your credential file.
+
+--help (string)
+  Display more information about a command.
+
+--version
+  Output version information and exit.
+
+--debug
+  Show detailed debugging log.
+
+--only-show-errors (boolean)
+  Only errors and warnings are displayed. All other output is suppressed.
+```
+
+## Path Arguments
+When using a command, we need to specify path arguments. There are two path argument types: `localpath` and `pfspath`.
+
+A `pfspath` begins with `/pfs`, e.g.: `/pfs/$DATACENTER/home/$USER/folder`.
+
+[Here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md#上传训练文件) is how to configure datacenters.
+
+## Order of Path Arguments
+Commonly, if there are two path arguments, the first is the source, and the second is the destination.
+
+## Subcommands
+- rm - remove files or directories
+
+```
+Synopsis:
+  rm [-r] [-v] <PFSPath> ...
+
+Options:
+  -r
+    Remove directories and their contents recursively.
+  -v
+    Cause rm to be verbose, showing files after they are removed.
+
+Examples:
+  paddle pfs rm /pfs/$DATACENTER/home/$USER/file
+  paddle pfs rm -r /pfs/$DATACENTER/home/$USER/folder
+```
+- mv - move (rename) files
+
+```
+Synopsis:
+  mv [-f | -n] [-v] <LocalPath> <PFSPath>
+  mv [-f | -n] [-v] <LocalPath> ... <PFSPath>
+  mv [-f | -n] [-v] <PFSPath> <LocalPath>
+  mv [-f | -n] [-v] <PFSPath> ... <LocalPath>
+  mv [-f | -n] [-v] <PFSPath> <PFSPath>
+  mv [-f | -n] [-v] <PFSPath> ... <PFSPath>
+
+Options:
+  -f
+    Do not prompt for confirmation before overwriting the destination path. (The -f option overrides previous -n options.)
+  -n
+    Do not overwrite an existing file. (The -n option overrides previous -f options.)
+  -v
+    Cause mv to be verbose, showing files after they are moved.
+
+Examples:
+  paddle pfs mv ./text1.txt /pfs/$DATACENTER/home/$USER/text1.txt
+```
+- cp - copy files or directories
+
+```
+Synopsis:
+  cp [-r] [-f | -n] [-v] [--preserve-links] <LocalPath> <PFSPath>
+  cp [-r] [-f | -n] [-v] [--preserve-links] <LocalPath> ... <PFSPath>
+  cp [-r] [-f | -n] [-v] [--preserve-links] <PFSPath> <LocalPath>
+  cp [-r] [-f | -n] [-v] [--preserve-links] <PFSPath> ... <LocalPath>
+  cp [-r] [-f | -n] [-v] [--preserve-links] <PFSPath> <PFSPath>
+  cp [-r] [-f | -n] [-v] [--preserve-links] <PFSPath> ... <PFSPath>
+
+Options:
+  -r
+    Copy directories recursively.
+  -f
+    Do not prompt for confirmation before overwriting the destination path. (The -f option overrides previous -n options.)
+  -n
+    Do not overwrite an existing file. (The -n option overrides previous -f options.)
+  -v
+    Cause cp to be verbose, showing files after they are copied.
+  --preserve-links
+    Preserve links when copying them.
+
+Examples:
+  paddle pfs cp ./file /pfs/$DATACENTER/home/$USER/file
+  paddle pfs cp /pfs/$DATACENTER/home/$USER/file ./file
+```
+- ls - list files
+
+```
+Synopsis:
+  ls [-R] <PFSPath> ...
+
+Options:
+  -R
+    List directories recursively.
+
+Examples:
+  paddle pfs ls /pfs/$DATACENTER/home/$USER/file
+  paddle pfs ls /pfs/$DATACENTER/home/$USER/folder
+```
+
+- mkdir - make directories
+Create intermediate directories as required.
+
+```
+Synopsis:
+  mkdir <PFSPath> ...
+
+Examples:
+  paddle pfs mkdir /pfs/$DATACENTER/home/$USER/folder
+```
diff --git a/doc/design/file_manager/src/filemanager.graffle b/doc/design/file_manager/src/filemanager.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..7861a33072bc1908f69d12b37c20491dd8663103
Binary files /dev/null and b/doc/design/file_manager/src/filemanager.graffle differ
diff --git a/doc/design/file_manager/src/filemanager.png b/doc/design/file_manager/src/filemanager.png
new file mode 100644
index 0000000000000000000000000000000000000000..8139a19f5722f56d3c211f3ab0d3982f751134b9
Binary files /dev/null and b/doc/design/file_manager/src/filemanager.png differ
diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md
index 775938612e8d213b92e2eb69dae805838dc5ae96..a48b143c760c6fc6fc08e793e4cf2f82f6713dc0 100644
--- a/doc/howto/dev/contribute_to_paddle_cn.md
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
@@ -7,6 +7,7 @@
 - Make sure the compiler option `WITH_STYLE_CHECK` is on, and that the build passes the code style check.
 - All code must have unit tests.
 - All unit tests must pass.
+- Please follow the [conventions for submitting code](#提交代码的一些约定).

 The following tutorial will guide you through submitting code.
 ## [Fork](https://help.github.com/articles/fork-a-repo/)
@@ -217,3 +218,22 @@ upstream
 ```

 At this point, we have completed one full code-contribution cycle.
+
+## Conventions for Submitting Code
+
+To help reviewers focus on the code itself, please follow these conventions every time you submit code:
+1. Make sure the unit tests in Travis-CI pass. If they do not, the submitted code has problems, and reviewers generally will not review it.
+2. Before submitting a Pull Request:
+   - Mind the number of commits:
+     - Reason: if only one file is modified but a dozen or so commits are submitted, each making only a small change, this puts a heavy burden on reviewers. They have to look through every commit to find out what was changed, not to mention that changes across commits may overwrite each other.
+     - Suggestion: keep the number of commits as small as possible; you can use `git commit --amend` to amend the previous commit. For multiple commits that have already been pushed to the remote repository, see [squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed).
+   - Mind the message of each commit: it should reflect the content of the commit and not be arbitrary.
+3. If the Pull Request fixes an Issue, add `fix #issue_number` to the **first** comment box of the Pull Request, so that the corresponding Issue is closed automatically once the Pull Request is merged. Keywords include: close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved; please choose the appropriate one. See [Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages) for details.
+
+In addition, when replying to reviewers' comments, please follow these conventions:
+1. Every comment from a reviewer must be replied to (this is basic courtesy in the open-source community; when someone helps, you should say thank you):
+   - If you agree with a comment and have addressed it, a simple `Done` is enough;
+   - If you disagree with a comment, please state your reasons.
+2. If there are many review comments:
+   - Please give an overall summary of the changes you made.
+   - Please reply using [start a review](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/) rather than replying to each comment directly; each direct reply sends an email, causing an email flood.
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index c6fd9cc54ae3a671c5bdcf54cbaa873c59280694..769955490976401ea93ed61987064026829a9f41 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -9,7 +9,7 @@ add_subdirectory(pserver)
 add_subdirectory(trainer)
 add_subdirectory(scripts)

-find_package(boost QUIET)
+find_package(Boost QUIET)

 if(Boost_FOUND)
   include_directories(${Boost_INCLUDE_DIRS})
diff --git a/paddle/majel/CMakeLists.txt b/paddle/majel/CMakeLists.txt
index baa3bb9e914b3053a18dc638146325ffe3d28a12..4b1438d570ae6dda95c72d2582df833d2d4c4d93 100644
--- a/paddle/majel/CMakeLists.txt
+++ b/paddle/majel/CMakeLists.txt
@@ -1,8 +1,6 @@
 cmake_minimum_required(VERSION 3.0)

-if(GTEST_INCLUDE_DIR AND GTEST_LIBRARIES)
-  message("-- Found gtest (include: ${GTEST_INCLUDE_DIR}, library: ${GTEST_LIBRARIES})")
-else()
+if(${CMAKE_CURRENT_SOURCE_DIR} STREQUAL ${CMAKE_SOURCE_DIR})
   # find #include <majel/*.h>
   get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
   include_directories(${PARENT_DIR})
@@ -18,6 +16,8 @@ else()
   set(THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/third_party)
   set(WITH_TESTING ON)
   include(external/gtest)
+else()
+  message("-- Found gtest (include: ${GTEST_INCLUDE_DIR}, library: ${GTEST_LIBRARIES})")
 endif()

 ########################### Build Majel #############################
@@ -29,6 +29,9 @@ if(CUDA_FOUND)
 else()
   add_library(majel ${MAJEL_CXX_FILES})
 endif()
+add_dependencies(majel ${external_project_dependencies})

 #####################################################################
-add_subdirectory(test)
+if(WITH_TESTING)
+  add_subdirectory(test)
+endif()
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 101b44e6c62ecf0b84d65ee7b6e90e64bd7b3272..2dfa712427d81d2be502f1dbbe880c81b6d9a3f4 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -131,8 +131,6 @@ cat > /paddle/build/Dockerfile <<EOF

 ENV HOME /root
-ENV LANG en_US.UTF-8
-# Use Fix locales to en_US.UTF-8
 EOF

 if [[ -n ${APT_MIRROR} ]]; then
@@ -153,6 +151,7 @@ RUN apt-get update &&\
         paddle version
 ${DOCKERFILE_CUDNN_DSO}
 ${DOCKERFILE_GPU_ENV}
+
 # default command shows the paddle version and exit
 CMD ["paddle", "version"]
 EOF
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 31652613fb3a55636b32babbc4bde60d65776c61..3b6f0270de16627821624dd1266a0a1c089323b0 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -3765,7 +3765,7 @@ def __cost_input__(input, label, weight=None):

 @wrap_name_default()
 @layer_support()
-def mse_cost(input, label, weight=None, name=None, layer_attr=None):
+def mse_cost(input, label, weight=None, name=None, coeff=1.0, layer_attr=None):
     """
     mean squared error cost:

@@ -3782,6 +3782,8 @@ def mse_cost(input, label, weight=None, name=None, layer_attr=None):
     :param weight: The weight affects the cost, namely the scale of cost.
                    It is an optional argument.
     :type weight: LayerOutput
+    :param coeff: The coefficient that scales the gradient in the backward pass.
+    :type coeff: float
     :param layer_attr: layer's extra attribute.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
@@ -3793,6 +3795,7 @@ def mse_cost(input, label, weight=None, name=None, layer_attr=None):
         inputs=ipts,
         type="square_error",
         name=name,
+        coeff=coeff,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(name, LayerType.COST, parents=parents, size=1)

@@ -4798,6 +4801,7 @@ def crf_layer(input,
               weight=None,
               param_attr=None,
               name=None,
+              coeff=1.0,
               layer_attr=None):
     """
     A layer for calculating the cost of sequential conditional random
@@ -4824,6 +4828,8 @@ def crf_layer(input,
     :type param_attr: ParameterAttribute
     :param name: The name of this layers. It is not necessary.
     :type name: None|basestring
+    :param coeff: The coefficient that scales the gradient in the backward pass.
+    :type coeff: float
     :param layer_attr: Extra Layer config.
     :type layer_attr: ExtraLayerAttribute|None
     :return: LayerOutput object.
@@ -4848,6 +4854,7 @@ def crf_layer(input,
         type=LayerType.CRF_LAYER,
         size=size,
         inputs=ipts,
+        coeff=coeff,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     parents = [input, label]
     if weight is not None:
@@ -4921,12 +4928,14 @@ def crf_decoding_layer(input,

 @wrap_act_default(act=SigmoidActivation())
 @wrap_bias_attr_default(has_bias=True)
+@wrap_param_attr_default()
 @wrap_name_default()
 @layer_support()
 def nce_layer(input,
               label,
-              num_classes,
+              num_classes=None,
               act=None,
+              param_attr=None,
               weight=None,
               num_neg_samples=10,
               neg_distribution=None,
@@ -4942,7 +4951,8 @@ def nce_layer(input,

     .. code-block:: python

-       cost = nce_layer(input=layer1, label=layer2, weight=layer3,
+       cost = nce_layer(input=[layer1, layer2], label=layer2,
+                        param_attr=[attr1, attr2], weight=layer3,
                         num_classes=3, neg_distribution=[0.1,0.3,0.6])

     :param name: layer name
@@ -4957,6 +4967,8 @@ def nce_layer(input,
     :type num_classes: int
     :param act: Activation, default is Sigmoid.
     :type act: BaseActivation
+    :param param_attr: The parameter attribute of each input: a ParameterAttribute, or a list of them.
+    :type param_attr: ParameterAttribute
     :param num_neg_samples: number of negative samples. Default is 10.
     :type num_neg_samples: int
     :param neg_distribution: The distribution for generating the random negative labels.
@@ -4972,9 +4984,20 @@ def nce_layer(input,
     """
     if isinstance(input, LayerOutput):
         input = [input]
+        assert not isinstance(param_attr, collections.Sequence)
+        param_attr = [param_attr]
+    else:
+        if isinstance(param_attr, collections.Sequence):
+            assert len(input) == len(param_attr)
+        else:
+            param_attr = [copy.deepcopy(param_attr) for _ in range(len(input))]
+
     assert isinstance(input, collections.Sequence)
+
     assert isinstance(label, LayerOutput)
     assert label.layer_type == LayerType.DATA
+    if num_classes is None:
+        num_classes = label.size
     if neg_distribution is not None:
         assert isinstance(neg_distribution, collections.Sequence)
         assert len(neg_distribution) == num_classes
@@ -4984,9 +5007,9 @@ def nce_layer(input,

     ipts_for_layer = []
     parents = []
-    for each_input in input:
+    for each_input, attr in zip(input, param_attr):
         assert isinstance(each_input, LayerOutput)
-        ipts_for_layer.append(each_input.name)
+        ipts_for_layer.append(Input(each_input.name, **attr.attr))
         parents.append(each_input)
     ipts_for_layer.append(label.name)
     parents.append(label)
@@ -5363,7 +5386,7 @@ def multi_binary_label_cross_entropy(input,

 @wrap_name_default()
 @layer_support()
-def smooth_l1_cost(input, label, name=None, layer_attr=None):
+def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
     """
     This is a L1 loss but more smooth. It requires that the size of input and label are equal.
     The formula is as follows,

@@ -5392,6 +5415,8 @@ def smooth_l1_cost(input, label, name=None, layer_attr=None):
     :type input: LayerOutput
     :param name: The name of this layers. It is not necessary.
     :type name: None|basestring
+    :param coeff: The coefficient that scales the gradient in the backward pass.
+    :type coeff: float
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
@@ -5405,6 +5430,7 @@ def smooth_l1_cost(input, label, name=None, layer_attr=None):
         name=name,
         type=LayerType.SMOOTH_L1,
         inputs=[input.name, label.name],
+        coeff=coeff,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(
         name, LayerType.SMOOTH_L1, parents=[input, label], size=1)
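To illustrate the new nce_layer interface introduced above, here is a minimal trainer-config sketch; the layer names and sizes are invented for the example. It passes a list of inputs with one ParamAttr each, and omits num_classes, which now defaults to label.size.

```python
from paddle.trainer_config_helpers import *

settings(batch_size=32, learning_rate=1e-3)

# Two hypothetical feature layers and a word label layer.
feat_a = data_layer(name='feat_a', size=64)
feat_b = data_layer(name='feat_b', size=64)
word = data_layer(name='word', size=5000)  # num_classes defaults to this size

cost = nce_layer(
    input=[feat_a, feat_b],
    label=word,
    # one parameter attribute per input layer
    param_attr=[ParamAttr(name='nce_w_a'), ParamAttr(name='nce_w_b')],
    num_neg_samples=10)

outputs(cost)
```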
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
index 05fd1c99d2db6e9faa3b3884ec9baf051791f9fe..05847344be60b4de42a7dd709914fd3da524d1ae 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
@@ -215,6 +215,22 @@ layers {
   }
   coeff: 1.0
 }
+layers {
+  name: "__nce_layer_0__"
+  type: "nce"
+  size: 1
+  active_type: "sigmoid"
+  inputs {
+    input_layer_name: "__fc_layer_0__"
+    input_parameter_name: "___nce_layer_0__.w0"
+  }
+  inputs {
+    input_layer_name: "labels"
+  }
+  bias_parameter_name: "___nce_layer_0__.wbias"
+  num_classes: 5000
+  num_neg_samples: 10
+}
 parameters {
   name: "___fc_layer_0__.w0"
   size: 800
@@ -245,6 +261,26 @@ parameters {
   initial_strategy: 0
   initial_smart: true
 }
+parameters {
+  name: "___nce_layer_0__.w0"
+  size: 20000
+  initial_mean: 0.0
+  initial_std: 0.0141421356237
+  dims: 5000
+  dims: 4
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___nce_layer_0__.wbias"
+  size: 5000
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 5000
+  initial_strategy: 0
+  initial_smart: false
+}
 input_layer_names: "input"
 input_layer_names: "labels"
 input_layer_names: "crf_label"
@@ -267,6 +303,7 @@ output_layer_names: "__cross_entropy_with_selfnorm_0__"
 output_layer_names: "__huber_cost_0__"
 output_layer_names: "__multi_binary_label_cross_entropy_0__"
 output_layer_names: "__sum_cost_0__"
+output_layer_names: "__nce_layer_0__"
 sub_models {
   name: "root"
   layer_names: "input"
@@ -292,6 +329,7 @@ sub_models {
   layer_names: "__huber_cost_0__"
   layer_names: "__multi_binary_label_cross_entropy_0__"
   layer_names: "__sum_cost_0__"
+  layer_names: "__nce_layer_0__"
   input_layer_names: "input"
   input_layer_names: "labels"
   input_layer_names: "crf_label"
@@ -314,6 +352,7 @@ sub_models {
   output_layer_names: "__huber_cost_0__"
   output_layer_names: "__multi_binary_label_cross_entropy_0__"
   output_layer_names: "__sum_cost_0__"
+  output_layer_names: "__nce_layer_0__"
   is_recurrent_layer_group: false
 }
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
index 3244181a63109335c4fba6ca4dd04ac8f0446313..b7d74f85ab4ca3f434dfa45516dfee7227b6ceee 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
@@ -60,6 +60,31 @@ layers {
   }
   coeff: 1.0
 }
+layers {
+  name: "multi_class_label"
+  type: "data"
+  size: 500
+  active_type: ""
+}
+layers {
+  name: "__nce_layer_0__"
+  type: "nce"
+  size: 1
+  active_type: "sigmoid"
+  inputs {
+    input_layer_name: "__fc_layer_0__"
+    input_parameter_name: "___nce_layer_0__.w0"
+  }
+  inputs {
+    input_layer_name: "multi_class_label"
+  }
+  inputs {
+    input_layer_name: "weight"
+  }
+  bias_parameter_name: "___nce_layer_0__.wbias"
+  num_classes: 500
+  num_neg_samples: 10
+}
 parameters {
   name: "___fc_layer_0__.w0"
   size: 3000
@@ -80,9 +105,30 @@ parameters {
   initial_strategy: 0
   initial_smart: false
 }
+parameters {
+  name: "___nce_layer_0__.w0"
+  size: 5000
+  initial_mean: 0.0
+  initial_std: 0.04472135955
+  dims: 500
+  dims: 10
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___nce_layer_0__.wbias"
+  size: 500
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 500
+  initial_strategy: 0
+  initial_smart: false
+}
 input_layer_names: "input"
 input_layer_names: "label"
 input_layer_names: "weight"
+input_layer_names: "multi_class_label"
 output_layer_names: "__cost_0__"
 output_layer_names: "__mse_cost_0__"
 evaluators {
@@ -100,9 +146,12 @@ sub_models {
   layer_names: "__fc_layer_0__"
   layer_names: "__cost_0__"
   layer_names: "__mse_cost_0__"
+  layer_names: "multi_class_label"
+  layer_names: "__nce_layer_0__"
   input_layer_names: "input"
   input_layer_names: "label"
   input_layer_names: "weight"
+  input_layer_names: "multi_class_label"
   output_layer_names: "__cost_0__"
   output_layer_names: "__mse_cost_0__"
   evaluator_names: "classification_error_evaluator"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
index 18ff6b48c495b7a9d61595916ade1a54b1fa6a10..d2a3b702a1d7b650947b344e4719098f68d4dd73 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
@@ -40,4 +40,6 @@ outputs(
             name='huber_label', size=1)),
     multi_binary_label_cross_entropy(
         input=probs, label=xe_label),
-    sum_cost(input=hidden))
+    sum_cost(input=hidden),
+    nce_layer(
+        input=hidden, label=labels))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
index 1c0aa7f9b9ee45b9eaf82dc46a2648d834dcd4ad..c369062930e2b067ceab0dc3b25ba6c1eabe2450 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
@@ -11,4 +11,9 @@ outputs(
     classification_cost(
         input=fc, label=lbl, weight=wt),
     mse_cost(
-        input=fc, label=lbl, weight=wt))
+        input=fc, label=lbl, weight=wt),
+    nce_layer(
+        input=fc,
+        label=data_layer(
+            name='multi_class_label', size=500),
+        weight=wt))
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
index b4bb38496937bb6fb520334331c619f9b6f64b51..139339902e9e2228f72068bed4c4ebe58ebc4428 100644
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
@@ -88,7 +88,7 @@ def infer(output_layer, parameters, input, feeding=None, field='value'):
     Infer a neural network by given neural network output and parameters.
     The user should pass either a batch of input data or reader method.

-    Example usages:
+    Example usage for single output_layer:

    .. code-block:: python
@@ -97,8 +97,19 @@
        result = paddle.infer(output_layer=prediction,
                              parameters=parameters,
                              input=SomeData)
        print result

+    Example usage for multiple output_layers and fields:
+
+    .. code-block:: python
+
+       result = paddle.infer(output_layer=[prediction1, prediction2],
+                             parameters=parameters,
+                             input=SomeData,
+                             field=['id', 'value'])
+       print result
+
     :param output_layer: output of the neural network that would be inferred
-    :type output_layer: paddle.v2.config_base.Layer
+    :type output_layer: paddle.v2.config_base.Layer or a list of
+                        paddle.v2.config_base.Layer
     :param parameters: parameters of the neural network.
     :type parameters: paddle.v2.parameters.Parameters
     :param input: input data batch. Should be a python iterable object, and each
@@ -112,7 +123,9 @@
                   Note that `prob` only used when output_layer is beam_search
                   or max_id.
     :type field: str
-    :return: a numpy array
+    :return: The prediction result. If there are multiple output_layers and fields,
+             the return order is output_layer1.field1, output_layer2.field1, ...,
+             output_layer1.field2, output_layer2.field2 ...
     :rtype: numpy.ndarray
     """
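As a closing illustration of the documented result ordering, here is a hedged end-to-end sketch. The v2 API names (paddle.init, paddle.layer.fc, paddle.parameters.create) are assumed from the API of this era, and the tiny two-head network is invented for the example.

```python
import paddle.v2 as paddle

paddle.init(use_gpu=False, trainer_count=1)

# Two softmax heads over the same input, invented for illustration.
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(4))
head1 = paddle.layer.fc(input=x, size=3, act=paddle.activation.Softmax())
head2 = paddle.layer.fc(input=x, size=3, act=paddle.activation.Softmax())

parameters = paddle.parameters.create([head1, head2])

# One sample with a single data layer; the input format follows the
# tuple-per-data-layer convention assumed here.
result = paddle.infer(
    output_layer=[head1, head2],
    parameters=parameters,
    input=[([0.1, 0.2, 0.3, 0.4], )],
    field=['value'])

# Per the documented ordering with two output layers and one field,
# result holds head1's values first, then head2's values.
print(result)
```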