diff --git a/CMakeLists.txt b/CMakeLists.txt index 861bb50a2de0249e4e5ac2e2fa1d7a8a7c61bca0..1a59db8c71bf3b1ea472c1ee56a1cd97de42dad8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,19 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License -cmake_minimum_required(VERSION 3.0) - -project(paddle CXX C) - set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake") set(PROJ_ROOT ${CMAKE_SOURCE_DIR}) +include(system) + +if(ANDROID) + cmake_minimum_required(VERSION 3.7) +else() + cmake_minimum_required(VERSION 3.0) +endif() + +project(paddle CXX C) + find_package(Sphinx) -find_package(CUDA QUIET) +if(NOT CMAKE_CROSSCOMPILING) + find_package(CUDA QUIET) +endif(NOT CMAKE_CROSSCOMPILING) find_package(Git REQUIRED) find_package(Threads REQUIRED) -include(system) include(simd) ################################ Configurations ####################################### @@ -51,6 +58,21 @@ if(NOT CMAKE_BUILD_TYPE) FORCE) endif() +if(ANDROID) + if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") + message(FATAL_ERROR "Unsupported standalone toolchains with Android API level lower than 21") + endif() + + set(WITH_GPU OFF CACHE STRING + "Disable GPU when cross-compiling for Android" FORCE) + set(WITH_AVX OFF CACHE STRING + "Disable AVX when cross-compiling for Android" FORCE) + set(WITH_PYTHON OFF CACHE STRING + "Disable PYTHON when cross-compiling for Android" FORCE) + set(WITH_RDMA OFF CACHE STRING + "Disable RDMA when cross-compiling for Android" FORCE) +endif(ANDROID) + set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING "A path setting third party libraries download & build directories.") ######################################################################################## @@ -75,7 +97,6 @@ include(flags) # set paddle compile flags include(cudnn) # set cudnn libraries include(version) # set PADDLE_VERSION include(coveralls) # set code coverage - include(configure) # add paddle env configuration 
include_directories("${PROJ_ROOT}") diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 235c95f017f2b6ef24195a0210ccafff36b6ed61..b8bf1bb07a1f779354b2c10071264bf41d279f6c 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -19,9 +19,9 @@ set(CBLAS_FOUND OFF) set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs") set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL") -find_path(MKL_INCLUDE_DIR mkl.h PATHS +find_path(MKL_INC_DIR mkl.h PATHS ${MKL_ROOT}/include) -find_path(MKL_INCLUDE_DIR mkl_lapacke.h PATHS +find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS ${MKL_ROOT}/include) find_library(MKL_CORE_LIB NAMES mkl_core PATHS ${MKL_ROOT}/lib @@ -34,15 +34,19 @@ find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS ${MKL_ROOT}/lib/intel64) -if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64) +if(MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64) set(CBLAS_PROVIDER MKL) - set(CBLAS_INC_DIR ${MKL_INCLUDE_DIR}) + set(CBLAS_INC_DIR ${MKL_INC_DIR}) set(CBLAS_LIBRARIES ${MKL_INTEL_LP64} ${MKL_SEQUENTIAL_LIB} ${MKL_CORE_LIB}) add_definitions(-DPADDLE_USE_MKL) message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") set(CBLAS_FOUND ON) + if(MKL_LAPACK_INC_DIR) + add_definitions(-DPADDLE_USE_LAPACK) + message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})") + endif() return() # return file. 
endif() @@ -68,13 +72,17 @@ find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3 find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3 PATHS ${ATLAS_LIB_SEARCH_PATHS}) -if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB) +if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB AND NOT CBLAS_FOUND) set(CBLAS_PROVIDER ATLAS) - set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR}) + set(CBLAS_INC_DIR ${ATLAS_INC_DIR}) set(CBLAS_LIBRARIES ${ATLAS_LIB} ${ATLAS_CBLAS_LIB}) add_definitions(-DPADDLE_USE_ATLAS) - message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + message(STATUS "Found ATLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") set(CBLAS_FOUND ON) + if(ATLAS_CLAPACK_INC_DIR) + add_definitions(-DPADDLE_USE_LAPACK) + message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})") + endif() return() endif() @@ -103,8 +111,12 @@ if(OPENBLAS_INC_DIR AND OPENBLAS_LIB) set(CBLAS_PROVIDER OPENBLAS) set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR}) set(CBLAS_LIBRARIES ${OPENBLAS_LIB}) - message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + message(STATUS "Found OpenBLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") set(CBLAS_FOUND ON) + if(OPENBLAS_LAPACKE_INC_DIR) + add_definitions(-DPADDLE_USE_LAPACK) + message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})") + endif() return() endif() diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 0bb016201dd8ae912ac8ec9f925bc5277fad7aed..5e507e78f74eee885922f502f35e3c15fafb622d 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -32,6 +32,14 @@ if(NOT WITH_PROFILER) add_definitions(-DPADDLE_DISABLE_PROFILER) endif(NOT WITH_PROFILER) +if(NOT CMAKE_CROSSCOMPILING) + if(WITH_AVX AND AVX_FOUND) + set(SIMD_FLAG ${AVX_FLAG}) + elseif(SSE3_FOUND) + set(SIMD_FLAG ${SSE3_FLAG}) + endif() +endif() + if(NOT WITH_GPU) add_definitions(-DPADDLE_ONLY_CPU) 
add_definitions(-DHPPL_STUB_FUNC) @@ -48,21 +56,12 @@ else() message(FATAL_ERROR "Paddle need cudnn to compile") endif() - if(WITH_AVX) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}") - else(WITH_AVX) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}") - endif(WITH_AVX) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}") # Include cuda and cudnn include_directories(${CUDNN_INCLUDE_DIR}) include_directories(${CUDA_TOOLKIT_INCLUDE}) endif(NOT WITH_GPU) -if(WITH_AVX) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}") -else(WITH_AVX) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}") -endif(WITH_AVX) +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}") diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 5754407d66c18872bf0cf314ee6e0a32e0d4329d..af9be86961833dcd62371227165d411a3b61d79e 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -1,3 +1,7 @@ +if(NOT WITH_GPU) + return() +endif() + set(CUDNN_ROOT "" CACHE PATH "CUDNN ROOT") find_path(CUDNN_INCLUDE_DIR cudnn.h PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 8acb4065e35ab4981c389b2ccb22e9e2cddabf88..0afb3ab9af48046af01f03838eefa0bd2fcb2821 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -33,6 +33,8 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON CMAKE_ARGS -DBUILD_TESTING=OFF diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 
cdef63188db001ffe9f06691cb3abffad9cdde5f..4a9e2ecc6bbe74c5856a55fb0c982777d7ac25b7 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -35,6 +35,8 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON CMAKE_ARGS -DWITH_GFLAGS=ON diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 555469e5c11dd3ee7458888e72fd3a5fec08ec0e..49c7d71443cda700a14af6be65ff6658eec7229f 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -43,6 +43,8 @@ IF(WITH_TESTING) UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON CMAKE_ARGS -DBUILD_GMOCK=ON diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index a11d18617bf3a81065dce9a99d948561c181b9d4..92ea23c7633e974fd09251f967965364b1928307 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -54,6 +54,8 @@ IF(NOT ${CBLAS_FOUND}) "you need to set gfortran compiler: cmake .. 
-DCMAKE_Fortran_COMPILER=...") ENDIF(NOT CMAKE_Fortran_COMPILER) + ADD_DEFINITIONS(-DPADDLE_USE_LAPACK) + ExternalProject_Add( openblas ${EXTERNAL_PROJECT_LOG_ARGS} diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index 93d7275df05d723d7dd66ef0c5ac15672c051c34..9fd3afd0998b38c18b4490e6fb1c6fe0222ed142 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -219,9 +219,9 @@ ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) ENDIF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) -INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) -INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) - -IF(NOT WITH_PYTHON) +IF(WITH_PYTHON) + INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) + INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) +ELSE() SET(PYTHON_LIBRARIES "") ENDIF() diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index c1c7bd4a432a0fb36580276b7997b91e370a997e..293070c3cfcc1196001f64469f3254289b0de792 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -50,6 +50,8 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} CMAKE_ARGS -DWITH_GPU=${WITH_GPU} CMAKE_ARGS -DWITH_OMP=${USE_OMP} diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index 519d9e0f21692fc2b69f8c57b87ffb1ef56b1239..45ca5542b7dc30216b45487782f849b93c5f8fca 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -22,7 +22,7 @@ SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include dire IF(WIN32) SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE) ELSE(WIN32) - set(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." 
FORCE) + SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE) ENDIF(WIN32) INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) @@ -36,6 +36,8 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR} CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON diff --git a/cmake/flags.cmake b/cmake/flags.cmake index b76852fc6c50e80633c8294fb2724b83f15293a7..7eb92efcb00fa18461e61e0508b485c13ef23a1f 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -2,6 +2,7 @@ include(CheckCXXCompilerFlag) include(CheckCCompilerFlag) include(CheckCXXSymbolExists) +include(CheckTypeSize) function(CheckCompilerCXX11Flag) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") @@ -25,7 +26,7 @@ function(CheckCompilerCXX11Flag) endfunction() CheckCompilerCXX11Flag() -LIST(APPEND CMAKE_CXX_FLAGS -std=c++11) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") # safe_set_flag # @@ -83,6 +84,17 @@ if(NOT UINT64_MAX_EXISTS) endif() endif() +SET(CMAKE_EXTRA_INCLUDE_FILES "pthread.h") +CHECK_TYPE_SIZE(pthread_spinlock_t SPINLOCK_FOUND) +CHECK_TYPE_SIZE(pthread_barrier_t BARRIER_FOUND) +if(SPINLOCK_FOUND) + add_definitions(-DPADDLE_USE_PTHREAD_SPINLOCK) +endif(SPINLOCK_FOUND) +if(BARRIER_FOUND) + add_definitions(-DPADDLE_USE_PTHREAD_BARRIER) +endif(BARRIER_FOUND) +SET(CMAKE_EXTRA_INCLUDE_FILES "") + # Common flags. the compiler flag used for C/C++ sources whenever release or debug # Do not care if this flag is support for gcc. set(COMMON_FLAGS diff --git a/cmake/simd.cmake b/cmake/simd.cmake index d380c996dfa95f0caa2b9cd9daa0ac9141e51fe0..46035a908b588861607a25d3a21cf34b7b6fd4b8 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -2,6 +2,7 @@ # so that PaddlePaddle can unleash the vectorization power of muticore. 
INCLUDE(CheckCXXSourceRuns) +INCLUDE(CheckCXXSourceCompiles) IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") set(MMX_FLAG "-mmmx") @@ -17,6 +18,8 @@ ELSEIF(MSVC) SET(AVX2_FLAG "/arch:AVX2") ENDIF() +set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS}) + # Check MMX set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG}) CHECK_CXX_SOURCE_RUNS(" @@ -73,4 +76,5 @@ int main() return 0; }" AVX2_FOUND) +set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND) diff --git a/cmake/system.cmake b/cmake/system.cmake index 3e472da7e0bd9c433f92f3e8b52970cd2cc6dcba..3ca06665ab2385e34302a6bcce7ada549ea1e247 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -67,6 +67,12 @@ MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES) MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}") MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores") +IF(DEFINED CMAKE_SYSTEM_NAME) + IF(${CMAKE_SYSTEM_NAME} STREQUAL "Android") + SET(ANDROID TRUE) + ENDIF() +ENDIF() + # external dependencies log output SET(EXTERNAL_PROJECT_LOG_ARGS LOG_DOWNLOAD 0 # Wrap download in script to log output diff --git a/cmake/util.cmake b/cmake/util.cmake index bacb64eb9ee65fffc824e4587a22fc432c092b19..099a85809d93e01772bf8c5c329fd9055ee4f054 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -90,6 +90,10 @@ function(link_paddle_exe TARGET_NAME) ${RDMA_LD_FLAGS} ${RDMA_LIBS}) + if(ANDROID) + target_link_libraries(${TARGET_NAME} log) + endif(ANDROID) + add_dependencies(${TARGET_NAME} ${external_project_dependencies}) endfunction() diff --git a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh b/paddle/cuda/include/hl_cpu_matrix_kernel.cuh index f35bfbc5c8253d632f8089f5037421f527633aad..9c49a4bd2083794e98b099b25944bedec3d5a2ff 100644 --- a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh +++ b/paddle/cuda/include/hl_cpu_matrix_kernel.cuh @@ -17,7 +17,11 @@ limitations under the License. 
*/ #include #include "hl_base.h" +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#include "hl_neon_matrix_kernel.cuh" +#else #include "hl_sse_matrix_kernel.cuh" +#endif /** * @brief cpu element wise unary operator. diff --git a/paddle/cuda/include/hl_matrix_base.cuh b/paddle/cuda/include/hl_matrix_base.cuh index db35ee2037433163ebb3673edb350e3fab71fba9..8b755c1095c2c4fdb7e74d8cddc948e6a6af380b 100644 --- a/paddle/cuda/include/hl_matrix_base.cuh +++ b/paddle/cuda/include/hl_matrix_base.cuh @@ -66,6 +66,8 @@ typedef BaseOp SSESquaredDiff; typedef BaseOp SSEFirst; typedef BaseOp SSESecond; typedef BaseOp SSEClassificationError; +#elif defined(__ARM_NEON__) || defined(__ARM_NEON) +#include "hl_matrix_base_neon.cuh" #else #include "hl_matrix_base_sse.cuh" #endif diff --git a/paddle/cuda/include/hl_matrix_base_neon.cuh b/paddle/cuda/include/hl_matrix_base_neon.cuh new file mode 100644 index 0000000000000000000000000000000000000000..e13019f5ee24ad600005c99678426ee3808b0e54 --- /dev/null +++ b/paddle/cuda/include/hl_matrix_base_neon.cuh @@ -0,0 +1,161 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + + +#ifndef HL_MATRIX_BASE_NEON_CUH_ +#define HL_MATRIX_BASE_NEON_CUH_ + +namespace aggregate { +class SSESum { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vaddq_f32(a, b); + } +}; + +class SSEMax { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vmaxq_f32(a, b); + } +}; + +class SSEMin { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vminq_f32(a, b); + } +}; +} // namespace aggregate + +namespace base { +namespace unary { +class SSEIdentity { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a) const { + return a; + } +}; +} // namespace unary + +namespace binary { +class SSEAdd { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vaddq_f32(a, b); + } +}; + +class SSEAdd2 { +public: + static const bool sse = true; + const real p1; + const real p2; + float32x4_t mp1; + float32x4_t mp2; + +public: + SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) { + mp1 = vdupq_n_f32(p1); + mp2 = vdupq_n_f32(p2); + } + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + float32x4_t tmp1, tmp2; + tmp1 = vmulq_f32(mp1, a); + tmp2 = vmulq_f32(mp2, b); + return vaddq_f32(tmp1, tmp2); + } +}; + +class SSESub { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vsubq_f32(a, b); + } +}; + +class SSEMul { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vmulq_f32(a, b); + } +}; + +class SSEDiv { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + float32x4_t tmp; + tmp = vrecpeq_f32(b); + 
return vmulq_f32(a, tmp); + } +}; + +class SSESquaredDiff { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + float32x4_t tmp; + tmp = vsubq_f32(a, b); + return vmulq_f32(tmp, tmp); + } +}; + +class SSEFirst { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return a; + } +}; + +class SSESecond { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return b; + } +}; + +class SSEClassificationError { +public: + static const bool sse = true; + const real p; + float32x4_t mp; + uint32x4_t result; + +public: + explicit SSEClassificationError(const real s) : p(s) { + mp = vdupq_n_f32(p); + result = vdupq_n_u32(1); + } + // TODO: to be check + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + uint32x4_t tmp1 = vcgtq_f32(a, mp); + uint32x4_t tmp2 = vcgtq_f32(b, mp); + uint32x4_t tmp3 = veorq_u32(tmp1, tmp2); + return vcvtq_f32_u32(vandq_u32(tmp3, result)); + } +}; +} // namespace binary +} // namespace base + +#endif /* HL_MATRIX_BASE_NEON_CUH_ */ diff --git a/paddle/cuda/include/hl_matrix_type.cuh b/paddle/cuda/include/hl_matrix_type.cuh index 59213eee75f50d3c054ed8684a9a0e1053342a0a..f965ba966793f6f6eea0ad3606f60553fe904dda 100644 --- a/paddle/cuda/include/hl_matrix_type.cuh +++ b/paddle/cuda/include/hl_matrix_type.cuh @@ -17,13 +17,20 @@ limitations under the License. 
*/ #include "hl_base.h" -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) #include #ifndef PADDLE_TYPE_DOUBLE typedef float4 vecType; #else typedef double2 vecType; #endif +#elif (defined __ARM_NEON) || (defined __ARM_NEON__) +#include +#ifndef PADDLE_TYPE_DOUBLE +typedef float32x4_t vecType; +#else +#error NEON instructions does not support double precision +#endif #else #include #include diff --git a/paddle/cuda/include/hl_neon_matrix_kernel.cuh b/paddle/cuda/include/hl_neon_matrix_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..7b4e5b00079b66d0a46a1344a43f41962cf50f10 --- /dev/null +++ b/paddle/cuda/include/hl_neon_matrix_kernel.cuh @@ -0,0 +1,299 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + + +#ifndef HL_NEON_MATRIX_KERNEL_CUH_ +#define HL_NEON_MATRIX_KERNEL_CUH_ + +#include "hl_matrix_type.cuh" + +#define VECTOR_SIZE 16 + +/* number of float in vector */ +#define VECTOR_LEN 4 +#define VECTOR_SET vdupq_n_f32 + +inline bool hl_check_align(size_t size) { + return !(size & (VECTOR_SIZE - 1)); +} + +inline bool hl_check_align(void *ptr) { + return hl_check_align(reinterpret_cast(ptr)); +} + +template +inline real hl_agg_op(Agg agg, vecType mm) { + float32x4_t rev = vrev64q_f32(mm); + float32x4_t tmp1 = agg.vecOp(rev, rev); + float32x2_t lo = vget_high_f32(rev); + float32x2_t hi = vget_low_f32(rev); + float32x4_t tmp2 = vcombine_f32(hi, lo); + float32x4_t ret = agg.vecOp(tmp1, tmp2); + + return vgetq_lane_f32(ret, 0); +} + +template +void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, int ld, + real *A, int lda) { + for (int i = 0; i < dimM; i++, A += lda) { + vecType mm = VECTOR_SET(agg.init()); + vecType *a = (vecType*)(A); + for (int j = 0; j < dimN / VECTOR_LEN; j++, a++) { + mm = agg.vecOp(mm, op.vecOp(*a)); + } + + int rem = dimN % VECTOR_LEN; + if (rem) { + real tmp = hl_agg_op(agg, mm); + real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN; + for (int j = 0; j < rem; j++) { + tmp = agg(tmp, op(a[j])); + } + dst[i*ld] = sv(dst[i*ld], tmp); + } else { + dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm)); + } + } +} + +template +void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, int ld, + real *A, int lda, + real *B, int ldb) { + for (int i = 0; i < dimM; i++, A += lda, B += ldb) { + vecType mm = VECTOR_SET(agg.init()); + vecType *a = (vecType*)(A); + vecType *b = (vecType*)(B); + for (int j = 0; j < dimN / VECTOR_LEN; j++, a++, b++) { + mm = agg.vecOp(mm, op.vecOp(*a, *b)); + } + + int rem = dimN % VECTOR_LEN; + if (rem) { + real tmp = hl_agg_op(agg, mm); + real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN; + real *b = B + (dimN / VECTOR_LEN) * VECTOR_LEN; + for (int j = 0; j < rem; j++) 
{ + tmp = agg(tmp, op(a[j], b[j])); + } + dst[i*ld] = sv(dst[i*ld], tmp); + } else { + dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm)); + } + } +} + +template +void hl_matrix_column_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda) { + for (int j = 0; j < dimN; j++) { + real tmp = agg.init(); + for (int i = 0; i < dimM; i++) { + tmp = agg(tmp, op(A[i * lda + j])); + } + dst[j] = sv(dst[j], tmp); + } +} + +template +void hl_matrix_column_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda, + real *B, int ldb) { + for (int j = 0; j < dimN; j++) { + real tmp = agg.init(); + for (int i = 0; i < dimM; i++) { + tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j])); + } + dst[j] = sv(dst[j], tmp); + } +} + +/* + * MaxRow greater than or equal dimN + * dimN is multiples of VECTOR_LEN + * so rem <= MaxRow / VECTOR_LEN + */ +template +void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda) { + vecType mm[MaxRow / VECTOR_LEN]; + for (int n = 0; n < MaxRow / VECTOR_LEN; n++) { + mm[n] = VECTOR_SET(agg.init()); + } + + for (int i = 0; i < dimM; i++) { + vecType *a = (vecType*)(A + i * lda); + for (int n = 0; n < dimN / VECTOR_LEN; n++) { + mm[n] = agg.vecOp(mm[n], op.vecOp(a[n])); + } + } + + vecType *result = (vecType*)(dst); + for (int n = 0; n < dimN / VECTOR_LEN; n++) { + result[n] = sv.vecOp(result[n], mm[n]); + } + + int rem = dimN % VECTOR_LEN; + if (rem) { + A += (dimN / VECTOR_LEN) * VECTOR_LEN; + dst += (dimN / VECTOR_LEN) * VECTOR_LEN; + hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda); + } +} + +/* + * dimN is multiples of VECTOR_LEN + * dimN greater than Step + */ +template +void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda) { + for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step) { + vecType mm[Step / VECTOR_LEN]; + for (int n = 0; n < Step / VECTOR_LEN; n++) { + mm[n] = 
VECTOR_SET(agg.init()); + } + + for (int i = 0; i < dimM; i++) { + vecType *a = (vecType*)(A + i * lda); + for (int n = 0; n < Step / VECTOR_LEN; n++) { + mm[n] = agg.vecOp(mm[n], op.vecOp(a[n])); + } + } + + vecType *result = (vecType*)(dst); + for (int n = 0; n < Step / VECTOR_LEN; n++) { + result[n] = sv.vecOp(result[n], mm[n]); + } + } + + int remRow = dimN % Step; + if (remRow) { + hl_sse_column_op_with_rem(agg, op, sv, dimM, remRow, dst, A, lda); + } +} + +template +void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda) { + if (dimN <= 16) { + hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda); + } else if (dimN <= 32) { + hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda); + } else if (dimN <= 1024 || dimM <= 512) { + hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda); + } else { + hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda); + } +} + +template +void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda, + real *B, int ldb) { + vecType mm[MaxRow / VECTOR_LEN]; + for (int n = 0; n < MaxRow / VECTOR_LEN; n++) { + mm[n] = VECTOR_SET(agg.init()); + } + + for (int i = 0; i < dimM; i++) { + vecType *a = (vecType*)(A + i * lda); + vecType *b = (vecType*)(B + i * ldb); + for (int n = 0; n < dimN / VECTOR_LEN; n++) { + mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n])); + } + } + + vecType *result = (vecType*)(dst); + for (int n = 0; n < dimN / VECTOR_LEN; n++) { + result[n] = sv.vecOp(result[n], mm[n]); + } + + int rem = dimN % VECTOR_LEN; + if (rem) { + A += (dimN / VECTOR_LEN) * VECTOR_LEN; + B += (dimN / VECTOR_LEN) * VECTOR_LEN; + dst += (dimN / VECTOR_LEN) * VECTOR_LEN; + hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda, B, ldb); + } +} + +template +void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda, + real *B, int ldb) { + for 
(int j = 0; j < dimN / Step; j++, dst += Step, A += Step, B += Step) { + vecType mm[Step / VECTOR_LEN]; + for (int n = 0; n < Step / VECTOR_LEN; n++) { + mm[n] = VECTOR_SET(agg.init()); + } + + for (int i = 0; i < dimM; i++) { + vecType *a = (vecType*)(A + i * lda); + vecType *b = (vecType*)(B + i * ldb); + for (int n = 0; n < Step / VECTOR_LEN; n++) { + mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n])); + } + } + + vecType *result = (vecType*)(dst); + for (int n = 0; n < Step / VECTOR_LEN; n++) { + result[n] = sv.vecOp(result[n], mm[n]); + } + } + + int remRow = dimN % Step; + if (remRow) { + hl_sse_column_op_with_rem( + agg, op, sv, dimM, remRow, dst, A, lda, B, ldb); + } +} + +template +void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda, + real *B, int ldb) { + if (dimN <= 16) { + hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb); + } else if (dimN <= 32) { + hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb); + } else if (dimN <= 1024 || dimM <= 512) { + hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb); + } else { + hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb); + } +} + +#endif /* HL_NEON_MATRIX_KERNEL_CUH_ */ diff --git a/paddle/function/Function.h b/paddle/function/Function.h index 9ad00c6f370cf64e9cc26f16e62c4d2ddb284003..15eb35b7f7dac1b98f2d8694707d83b84bda0f2e 100644 --- a/paddle/function/Function.h +++ b/paddle/function/Function.h @@ -38,7 +38,7 @@ public: if (err) { *err = Error(e.what()); } else { - LOG(FATAL) << "Cannot get key " << key << "with error " << e.what(); + LOG(FATAL) << "Cannot get key " << key << " with error " << e.what(); } return T(); } diff --git a/paddle/function/PadOpGpu.cu b/paddle/function/PadOpGpu.cu index 9104b1aca507c526858c2117e0a5db59f535091e..9094f1528433fdcaad3397a991aa8ac6fa04bc01 100644 --- a/paddle/function/PadOpGpu.cu +++ b/paddle/function/PadOpGpu.cu @@ -44,9 +44,9 
@@ void Pad(real* outputs, size_t nth = num * inC * inH * inW; int blockSize = 1024; int gridSize = (nth + 1024 - 1) / 1024; - int cstart = pad.channelStart, cend = pad.channelEnd; - int hstart = pad.heightStart, hend = pad.heightEnd; - int wstart = pad.widthStart, wend = pad.widthEnd; + int cstart = pad.channel[0], cend = pad.channel[1]; + int hstart = pad.height[0], hend = pad.height[1]; + int wstart = pad.width[0], wend = pad.width[1]; int outC = inC + cstart + cend; int outH = inH + hstart + hend; int outW = inW + wstart + wend; @@ -83,9 +83,9 @@ void PadGrad(real* inGrad, int nth = num * inC * inH * inW; int blockSize = 1024; int gridSize = (nth + 1024 - 1) / 1024; - int cstart = pad.channelStart, cend = pad.channelEnd; - int hstart = pad.heightStart, hend = pad.heightEnd; - int wstart = pad.widthStart, wend = pad.widthEnd; + int cstart = pad.channel[0], cend = pad.channel[1]; + int hstart = pad.height[0], hend = pad.height[1]; + int wstart = pad.width[0], wend = pad.width[1]; int outC = inC + cstart + cend; int outH = inH + hstart + hend; int outW = inW + wstart + wend; diff --git a/paddle/function/PadOpTest.cpp b/paddle/function/PadOpTest.cpp index cd22d9113567912f7694e05e5d631e49d940e3ac..f77ac2a8c49c83f2d6c64c2a30b6a2f2eb09ac10 100644 --- a/paddle/function/PadOpTest.cpp +++ b/paddle/function/PadOpTest.cpp @@ -24,48 +24,22 @@ TEST(Pad, real) { for (size_t imgSizeW : {5, 32, 96}) { VLOG(3) << " numSamples=" << numSamples << " channels=" << channels << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW; - - FunctionCompare compare("Pad", - FuncConfig() - .set("cstart", 2) - .set("cend", 3) - .set("hstart", 1) - .set("hend", 2) - .set("wstart", 3) - .set("wend", 2)); - TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW}; - TensorShape outDims{ - numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5}; - compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, inDims)); - compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT, outDims, ASSIGN_TO)); - compare.run(); - } - 
} - } - } -} - -TEST(PadGrad, real) { - for (size_t numSamples : {5, 32}) { - for (size_t channels : {1, 5, 32}) { - for (size_t imgSizeH : {5, 33, 100}) { - for (size_t imgSizeW : {5, 32, 96}) { - VLOG(3) << " numSamples=" << numSamples << " channels=" << channels - << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW; - FunctionCompare compare("PadGrad", - FuncConfig() - .set("cstart", 2) - .set("cend", 3) - .set("hstart", 1) - .set("hend", 2) - .set("wstart", 3) - .set("wend", 2)); - TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW}; - TensorShape outDims{ - numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5}; - compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, outDims)); - compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT, inDims, ASSIGN_TO)); - compare.run(); + for (bool test_grad : {false, true}) { + FunctionCompare compare( + test_grad ? "PadGrad" : "Pad", + FuncConfig() + .set>("channel", {2, 3}) + .set>("height", {1, 2}) + .set>("width", {3, 2})); + TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW}; + TensorShape outDims{ + numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5}; + compare.addInputs( + BufferArg(VALUE_TYPE_FLOAT, test_grad ? outDims : inDims)); + compare.addOutputs(BufferArg( + VALUE_TYPE_FLOAT, test_grad ? inDims : outDims, ASSIGN_TO)); + compare.run(); + } } } } diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index 7617af10ba719490d1b33dd297b070cd8c7c292c..a0b1cd471dd02fd20bb2247395bdb74651610bbf 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -778,8 +778,10 @@ void testProjectionGrad(ProjectionConfig conf, config.biasSize = biasSize == 0 ? 
config.layerConfig.size() : biasSize; config.layerConfig.set_bias_size(config.biasSize); config.layerConfig.set_shared_biases(sharedBias); - config.inputDefs.push_back( - {inputType, "layer_0", conf.input_size(), parameterSize}); + config.inputDefs.push_back({inputType, + "layer_0", + static_cast(conf.input_size()), + parameterSize}); *config.layerConfig.add_inputs()->mutable_proj_conf() = conf; config.testState = testState; testLayerGrad(config, "mixed", batchSize, false, useGpu); diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp index d7aa1184872d5a6129becca1f6e282776c9dbe15..6203cd3b9ab9f95853cd3c46750fd55d6dfbba4a 100644 --- a/paddle/math/MathFunctions.cpp +++ b/paddle/math/MathFunctions.cpp @@ -85,11 +85,16 @@ int getrf(const CBLAS_ORDER order, float* A, const int lda, int* ipiv) { +#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS return clapack_sgetrf(order, M, N, A, lda, ipiv); #else return LAPACKE_sgetrf(order, M, N, A, lda, ipiv); #endif +#else + LOG(FATAL) << "Not implemented"; +#endif + return 0; } template <> @@ -99,11 +104,16 @@ int getrf(const CBLAS_ORDER order, double* A, const int lda, int* ipiv) { +#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS return clapack_dgetrf(order, M, N, A, lda, ipiv); #else return LAPACKE_dgetrf(order, M, N, A, lda, ipiv); #endif +#else + LOG(FATAL) << "Not implemented"; +#endif + return 0; } template <> @@ -112,11 +122,16 @@ int getri(const CBLAS_ORDER order, float* A, const int lda, const int* ipiv) { +#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS return clapack_sgetri(order, N, A, lda, ipiv); #else return LAPACKE_sgetri(order, N, A, lda, ipiv); #endif +#else + LOG(FATAL) << "Not implemented"; +#endif + return 0; } template <> @@ -125,11 +140,16 @@ int getri(const CBLAS_ORDER order, double* A, const int lda, const int* ipiv) { +#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS return clapack_dgetri(order, N, A, lda, ipiv); #else return LAPACKE_dgetri(order, N, A, lda, ipiv); #endif +#else + 
LOG(FATAL) << "Not implemented"; +#endif + return 0; } template <> diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index c8559eefd8378450fc18c2ba821c65b39c8cc046..9f8f84a87c5e60b2a6573844f251c42152d8156b 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -17,11 +17,14 @@ limitations under the License. */ #ifdef PADDLE_USE_MKL #include +#ifdef PADDLE_USE_LAPACK #include +#endif #else extern "C" { #include } +#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS extern "C" { #include @@ -30,6 +33,7 @@ extern "C" { #include #endif #endif +#endif #include diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 5f30a15f2eb913d57d01479cf132e188b9e7c813..55a7344495f8e57dc95095ab1b81b45008fa9acc 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -2426,41 +2426,8 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { int lda = a->getStride(); int ldb = b->getStride(); int ldc = getStride(); -#ifndef PADDLE_TYPE_DOUBLE - cblas_sgemm(CblasRowMajor, - a_trans, - b_trans, - M, - N, - K, - scaleAB, - A, - lda, - B, - ldb, - scaleT, - C, - ldc); -#else - cblas_dgemm(CblasRowMajor, - a_trans, - b_trans, - M, - N, - K, - scaleAB, - A, - lda, - B, - ldb, - scaleT, - C, - ldc); -// TODO(yuyang18): Is gemm defined other place? -#endif - - VLOG(2) << " A[0]=" << A[0] << " A[1]=" << A[1] << " B[0]=" << B[0] - << " B[1]=" << B[1] << " C[0]=" << C[0] << " C[1]=" << C[1]; + gemm( + a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc); } void CpuMatrix::mul( diff --git a/paddle/math/SIMDFunctions.cpp b/paddle/math/SIMDFunctions.cpp index 95219debf50e57407b668d315b91141d259fc779..d66d543a61450b47b7758b50eaecc107c6fe3576 100644 --- a/paddle/math/SIMDFunctions.cpp +++ b/paddle/math/SIMDFunctions.cpp @@ -13,119 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "SIMDFunctions.h" +#ifdef __SSE3__ #include +#endif #include -#ifndef __AVX__ -static void addto_sse(float* a, const float* b, size_t len) { - int offset = len % 16; - __m128 ma0, ma1, ma2, ma3; - __m128 mb0, mb1, mb2, mb3; - - for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) { - ma0 = _mm_load_ps(a); - ma1 = _mm_load_ps(a + 4); - ma2 = _mm_load_ps(a + 8); - ma3 = _mm_load_ps(a + 12); - - mb0 = _mm_load_ps(b); - mb1 = _mm_load_ps(b + 4); - mb2 = _mm_load_ps(b + 8); - mb3 = _mm_load_ps(b + 12); - - ma0 = _mm_add_ps(ma0, mb0); - ma1 = _mm_add_ps(ma1, mb1); - ma2 = _mm_add_ps(ma2, mb2); - ma3 = _mm_add_ps(ma3, mb3); - - _mm_store_ps(a, ma0); - _mm_store_ps(a + 4, ma1); - _mm_store_ps(a + 8, ma2); - _mm_store_ps(a + 12, ma3); - } - - for (int i = 0; i < offset; i++) a[i] += b[i]; -} - -static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) { - int offset = len % 16; - - __m128 ma0, ma1, ma2, ma3; - __m128 mb0, mb1, mb2, mb3; - - for (unsigned int k = 0; k < len / 16; k++, a += 16) { - ma0 = _mm_load_ps(a); - ma1 = _mm_load_ps(a + 4); - ma2 = _mm_load_ps(a + 8); - ma3 = _mm_load_ps(a + 12); - - for (int i = 0; i < batch; i++) { - mb0 = _mm_load_ps(b[i]); - mb1 = _mm_load_ps(b[i] + 4); - mb2 = _mm_load_ps(b[i] + 8); - mb3 = _mm_load_ps(b[i] + 12); - ma0 = _mm_add_ps(ma0, mb0); - ma1 = _mm_add_ps(ma1, mb1); - ma2 = _mm_add_ps(ma2, mb2); - ma3 = _mm_add_ps(ma3, mb3); - b[i] += 16; - } - - _mm_store_ps(a, ma0); - _mm_store_ps(a + 4, ma1); - _mm_store_ps(a + 8, ma2); - _mm_store_ps(a + 12, ma3); - } - - for (int i = 0; i < offset; i++) { - for (int k = 0; k < batch; k++) a[i] += b[k][i]; - } - return; -} - -static void col_max_sse(float* result, - const float* data, - int dim, - int numSamples) { - // first sample, direct copy - for (int d = 0; d < dim; ++d) { - result[d] = data[d]; - } - int offset = dim % 16; - __m128 ma0, ma1, ma2, ma3; - __m128 mb0, mb1, mb2, mb3; - // first 16n dims - for (int k = 0; k < dim / 16; k++, 
result += 16, data += 16) { - ma0 = _mm_load_ps(result); - ma1 = _mm_load_ps(result + 4); - ma2 = _mm_load_ps(result + 8); - ma3 = _mm_load_ps(result + 12); - for (int i = 1; i < numSamples; i++) { - mb0 = _mm_load_ps(data + i * dim); - mb1 = _mm_load_ps(data + i * dim + 4); - mb2 = _mm_load_ps(data + i * dim + 8); - mb3 = _mm_load_ps(data + i * dim + 12); - ma0 = _mm_max_ps(ma0, mb0); - ma1 = _mm_max_ps(ma1, mb1); - ma2 = _mm_max_ps(ma2, mb2); - ma3 = _mm_max_ps(ma3, mb3); - } - _mm_store_ps(result, ma0); - _mm_store_ps(result + 4, ma1); - _mm_store_ps(result + 8, ma2); - _mm_store_ps(result + 12, ma3); - } - // last dims - for (int d = 0; d < offset; ++d) { - float sm = data[d]; - for (int i = 1; i < numSamples; ++i) { - sm = std::max(sm, data[i * dim + d]); - } - result[d] = sm; - } -} - -#else +#ifdef __AVX__ static void addto_avx(float* a, const float* b, size_t len) { int offset = len % 32; @@ -355,17 +248,128 @@ static void decayL1_avx( } } +#elif defined(__SSE3__) + +static void addto_sse(float* a, const float* b, size_t len) { + int offset = len % 16; + __m128 ma0, ma1, ma2, ma3; + __m128 mb0, mb1, mb2, mb3; + + for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) { + ma0 = _mm_load_ps(a); + ma1 = _mm_load_ps(a + 4); + ma2 = _mm_load_ps(a + 8); + ma3 = _mm_load_ps(a + 12); + + mb0 = _mm_load_ps(b); + mb1 = _mm_load_ps(b + 4); + mb2 = _mm_load_ps(b + 8); + mb3 = _mm_load_ps(b + 12); + + ma0 = _mm_add_ps(ma0, mb0); + ma1 = _mm_add_ps(ma1, mb1); + ma2 = _mm_add_ps(ma2, mb2); + ma3 = _mm_add_ps(ma3, mb3); + + _mm_store_ps(a, ma0); + _mm_store_ps(a + 4, ma1); + _mm_store_ps(a + 8, ma2); + _mm_store_ps(a + 12, ma3); + } + + for (int i = 0; i < offset; i++) a[i] += b[i]; +} + +static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) { + int offset = len % 16; + + __m128 ma0, ma1, ma2, ma3; + __m128 mb0, mb1, mb2, mb3; + + for (unsigned int k = 0; k < len / 16; k++, a += 16) { + ma0 = _mm_load_ps(a); + ma1 = _mm_load_ps(a + 4); + ma2 
= _mm_load_ps(a + 8); + ma3 = _mm_load_ps(a + 12); + + for (int i = 0; i < batch; i++) { + mb0 = _mm_load_ps(b[i]); + mb1 = _mm_load_ps(b[i] + 4); + mb2 = _mm_load_ps(b[i] + 8); + mb3 = _mm_load_ps(b[i] + 12); + ma0 = _mm_add_ps(ma0, mb0); + ma1 = _mm_add_ps(ma1, mb1); + ma2 = _mm_add_ps(ma2, mb2); + ma3 = _mm_add_ps(ma3, mb3); + b[i] += 16; + } + + _mm_store_ps(a, ma0); + _mm_store_ps(a + 4, ma1); + _mm_store_ps(a + 8, ma2); + _mm_store_ps(a + 12, ma3); + } + + for (int i = 0; i < offset; i++) { + for (int k = 0; k < batch; k++) a[i] += b[k][i]; + } + return; +} + +static void col_max_sse(float* result, + const float* data, + int dim, + int numSamples) { + // first sample, direct copy + for (int d = 0; d < dim; ++d) { + result[d] = data[d]; + } + int offset = dim % 16; + __m128 ma0, ma1, ma2, ma3; + __m128 mb0, mb1, mb2, mb3; + // first 16n dims + for (int k = 0; k < dim / 16; k++, result += 16, data += 16) { + ma0 = _mm_load_ps(result); + ma1 = _mm_load_ps(result + 4); + ma2 = _mm_load_ps(result + 8); + ma3 = _mm_load_ps(result + 12); + for (int i = 1; i < numSamples; i++) { + mb0 = _mm_load_ps(data + i * dim); + mb1 = _mm_load_ps(data + i * dim + 4); + mb2 = _mm_load_ps(data + i * dim + 8); + mb3 = _mm_load_ps(data + i * dim + 12); + ma0 = _mm_max_ps(ma0, mb0); + ma1 = _mm_max_ps(ma1, mb1); + ma2 = _mm_max_ps(ma2, mb2); + ma3 = _mm_max_ps(ma3, mb3); + } + _mm_store_ps(result, ma0); + _mm_store_ps(result + 4, ma1); + _mm_store_ps(result + 8, ma2); + _mm_store_ps(result + 12, ma3); + } + // last dims + for (int d = 0; d < offset; ++d) { + float sm = data[d]; + for (int i = 1; i < numSamples; ++i) { + sm = std::max(sm, data[i * dim + d]); + } + result[d] = sm; + } +} + #endif -#ifndef __AVX__ -#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__) -#else +#if defined(__AVX__) #define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__) +#elif defined(__SSE3__) +#define SIMD_INVOKE(func, ...) 
func##_sse(__VA_ARGS__) #endif namespace paddle { namespace simd { namespace internal { +#ifdef __SSE3__ void addToImpl(float* a, const float* b, size_t len) { SIMD_INVOKE(addto, a, b, len); } @@ -376,6 +380,7 @@ void batchAddToImpl(float* a, const float* b[], int batch, size_t len) { void colMaxImpl(float* result, const float* data, int dim, int numSamples) { SIMD_INVOKE(col_max, result, data, dim, numSamples); } +#endif #ifdef __AVX__ void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len) { @@ -385,8 +390,8 @@ void decayL1AvxImpl( float* dst, float* src, float* lr, float lambda, size_t len) { decayL1_avx(dst, src, lr, lambda, len); } - #endif + } // namespace internal } // namespace simd } // namespace paddle diff --git a/paddle/math/SIMDFunctions.h b/paddle/math/SIMDFunctions.h index 9b0a8719b287a2b88e966484090974586d64521f..439f11b79d134d7054f45f2d0a70fc5a6fde6c13 100644 --- a/paddle/math/SIMDFunctions.h +++ b/paddle/math/SIMDFunctions.h @@ -128,17 +128,29 @@ void decayL1AvxImpl( template <> inline void addTo(float* a, const float* b, size_t len) { +#ifdef __SSE3__ internal::addToImpl(a, b, len); +#else + naive::addTo(a, b, len); +#endif } template <> inline void batchAddTo(float* a, const float* b[], int batch, size_t len) { +#ifdef __SSE3__ internal::batchAddToImpl(a, b, batch, len); +#else + naive::batchAddTo(a, b, batch, len); +#endif } template <> inline void colMax(float* result, const float* data, int dim, int numSamples) { +#ifdef __SSE3__ internal::colMaxImpl(result, data, dim, numSamples); +#else + naive::colMax(result, data, dim, numSamples); +#endif } template <> diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp index 56e5442394b04230c22d668aa734dc0fa44004c2..7ce17a3207becb176a852a16fca52376009db9ee 100644 --- a/paddle/math/Storage.cpp +++ b/paddle/math/Storage.cpp @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "Storage.h" #include "Allocator.h" +#include "paddle/utils/StringUtil.h" #include "paddle/utils/Util.h" DEFINE_int32(pool_limit_size, @@ -62,7 +63,7 @@ PoolAllocator* StorageEngine::getGpuAllocator(int deviceId) { } if (gpuAllocator_[deviceId] == nullptr) { std::string name = - "gpu" + std::to_string(deviceId) + std::string("_pool"); + "gpu" + str::to_string(deviceId) + std::string("_pool"); gpuAllocator_[deviceId] = new PoolAllocator(new GpuAllocator(), FLAGS_pool_limit_size, name); } diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp index 877cbb86ec112739a5c7eeee969ca48ef491ee87..19ff40ba7e9584f772043f939bcb31caf666163d 100644 --- a/paddle/pserver/ParameterServer2.cpp +++ b/paddle/pserver/ParameterServer2.cpp @@ -29,6 +29,7 @@ limitations under the License. */ #include "paddle/utils/Flags.h" #include "paddle/utils/GlobalConstants.h" #include "paddle/utils/Stat.h" +#include "paddle/utils/StringUtil.h" DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec"); DEFINE_double(async_lagged_ratio_min, @@ -218,7 +219,8 @@ void ParameterServer2::setConfig(const SetConfigRequest& request, callback(response); /// always defined, barrier slowest node function need it. - statSet_.reset(new StatSet("ParameterServer" + std::to_string(serverId_))); + statSet_.reset(new StatSet("ParameterServer" + + str::to_string(static_cast(serverId_)))); } real bufferSum(const std::vector& buffers) { diff --git a/paddle/utils/CpuId.cpp b/paddle/utils/CpuId.cpp index 8eefdd2980e7f56a836df6fd2ff8c31b81a55555..edd33c454122d95078e0fde2a2e9d68903951ee8 100644 --- a/paddle/utils/CpuId.cpp +++ b/paddle/utils/CpuId.cpp @@ -19,7 +19,7 @@ limitations under the License. */ /// for MSVC #define CPUID(info, x) __cpuidex(info, x, 0) -#else +#elif !defined(__ANDROID__) #include @@ -31,6 +31,7 @@ limitations under the License. 
*/ namespace paddle { SIMDFlags::SIMDFlags() { +#if !defined(__ANDROID__) unsigned int cpuInfo[4]; // CPUID: https://en.wikipedia.org/wiki/CPUID // clang-format off @@ -51,6 +52,9 @@ SIMDFlags::SIMDFlags() { CPUID(cpuInfo, 0x80000001); simd_flags_ |= cpuInfo[2] & (1 << 16) ? SIMD_FMA4 : SIMD_NONE; // clang-fotmat on +#else + simd_flags_ = SIMD_NEON; +#endif } SIMDFlags const* SIMDFlags::instance() { diff --git a/paddle/utils/CpuId.h b/paddle/utils/CpuId.h index 5fc610964d4f5b8064f16ebf1b26bbb002264ce1..869be5be541dafd699a87a8e8893aadadf59b711 100644 --- a/paddle/utils/CpuId.h +++ b/paddle/utils/CpuId.h @@ -30,6 +30,7 @@ enum simd_t { SIMD_AVX = 1 << 8, ///< AVX SIMD_AVX2 = 1 << 9, ///< AVX 2 SIMD_AVX512 = 1 << 10, ///< AVX 512 + SIMD_NEON = 1 << 11, /// NEON }; // clang-format on @@ -96,6 +97,7 @@ private: #define HAS_AVX HAS_SIMD(SIMD_AVX) #define HAS_AVX2 HAS_SIMD(SIMD_AVX2) #define HAS_AVX512 HAS_SIMD(SIMD_AVX512) +#define HAS_NEON HAS_SIMD(SIMD_NEON) // clang-format on /** diff --git a/paddle/utils/Logging.cpp b/paddle/utils/Logging.cpp index 5a1c6ecb2219f7983609c27f3215c7fc1e9e9ef2..ea96bad240ad81c4c29b7dab35b015549052e2bb 100644 --- a/paddle/utils/Logging.cpp +++ b/paddle/utils/Logging.cpp @@ -18,6 +18,7 @@ limitations under the License. */ */ #include "Logging.h" +#include namespace paddle { diff --git a/paddle/utils/StringUtil.h b/paddle/utils/StringUtil.h index 0b4f4c9113ae9d714b634b67931e51b408bbe777..95f071cb7de87d87f6988c136d7993c66fa9dde1 100644 --- a/paddle/utils/StringUtil.h +++ b/paddle/utils/StringUtil.h @@ -54,6 +54,25 @@ inline T toWithStatus(const std::string& s, bool* ok = nullptr) { return v; } +/** + * Cast type T to string with status. + * + * @param [in] v input value of type T. + * @param [out] ok status, return true if there is no error in casting. Set + * nullptr if user don't care error at all. + * @return result of casting. If error occurred, a empty string will be + * returned. 
+ */ +template +inline std::string toWithStatus(const T v, bool* ok = nullptr) { + std::ostringstream sout; + sout << v; + if (ok) { + *ok = !sout.fail(); + } + return sout.str(); +} + /// Convert string to type T. It makes sure all the characters in s are used. /// Otherwise it will abort. /// @@ -67,6 +86,18 @@ inline T to(const std::string& s) { return v; } +/// Convert type T to string. +/// +/// @tparam T type of input value +/// @param v input value of type T +template +std::string to_string(T v) { + bool ok; + std::string s = toWithStatus(v, &ok); + CHECK(ok) << "Cannot convert v(" << v << ") to type std::string"; + return s; +} + } // namespace str #undef DEFINE_STRING_CONVERSION diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp index 1f56b6b8a96602d298507452fc7182d46179de41..b18b73e06a6c39c3bf9717280bc6323917c80efb 100644 --- a/paddle/utils/Util.cpp +++ b/paddle/utils/Util.cpp @@ -15,11 +15,16 @@ limitations under the License. */ #include "Util.h" #include -#include #include #include #include + +#ifdef __SSE__ #include +#endif +#ifdef __SSE3__ +#include +#endif #include #include @@ -163,8 +168,12 @@ void initMain(int argc, char** argv) { installProfilerSwitch(); +#ifdef __SSE__ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); +#endif +#ifdef __SSE3__ _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); +#endif if (FLAGS_seed == 0) { unsigned int t = time(NULL); diff --git a/paddle/utils/arch/linux/Locks.cpp b/paddle/utils/arch/linux/Locks.cpp index 2a6f96e04d024ac3977bc154dbeeb69ce9ab3a5d..310c9a6542563891d4ba5888e58406ea28d6a2ce 100644 --- a/paddle/utils/arch/linux/Locks.cpp +++ b/paddle/utils/arch/linux/Locks.cpp @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/utils/Locks.h" #include #include +#include "paddle/utils/Logging.h" namespace paddle { class SemaphorePrivate { @@ -26,7 +27,10 @@ Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) { sem_init(&m->sem, 0, initValue); } -Semaphore::~Semaphore() { sem_destroy(&m->sem); } +Semaphore::~Semaphore() { + sem_destroy(&m->sem); + delete m; +} bool Semaphore::timeWait(struct timespec* ts) { return (0 == sem_timedwait(&m->sem, ts)); @@ -36,36 +40,101 @@ void Semaphore::wait() { sem_wait(&m->sem); } void Semaphore::post() { sem_post(&m->sem); } +#ifdef PADDLE_USE_PTHREAD_SPINLOCK + class SpinLockPrivate { public: inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); } inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); } + + inline void lock() { pthread_spin_lock(&lock_); } + inline void unlock() { pthread_spin_unlock(&lock_); } + pthread_spinlock_t lock_; char padding_[64 - sizeof(pthread_spinlock_t)]; }; -SpinLock::SpinLock() : m(new SpinLockPrivate()) {} +#else -SpinLock::~SpinLock() { delete m; } +#include +class SpinLockPrivate { +public: + inline void lock() { + while (lock_.test_and_set(std::memory_order_acquire)) { + } + } + inline void unlock() { lock_.clear(std::memory_order_release); } + + std::atomic_flag lock_ = ATOMIC_FLAG_INIT; + char padding_[64 - sizeof(lock_)]; // Padding to cache line size +}; -void SpinLock::lock() { pthread_spin_lock(&m->lock_); } +#endif -void SpinLock::unlock() { pthread_spin_unlock(&m->lock_); } +SpinLock::SpinLock() : m(new SpinLockPrivate()) {} +SpinLock::~SpinLock() { delete m; } +void SpinLock::lock() { m->lock(); } +void SpinLock::unlock() { m->unlock(); } + +#ifdef PADDLE_USE_PTHREAD_BARRIER class ThreadBarrierPrivate { public: pthread_barrier_t barrier_; + + inline explicit ThreadBarrierPrivate(int count) { + pthread_barrier_init(&barrier_, nullptr, count); + } + + inline ~ThreadBarrierPrivate() { pthread_barrier_destroy(&barrier_); } + + inline void wait() { 
pthread_barrier_wait(&barrier_); } }; -ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate()) { - pthread_barrier_init(&m->barrier_, nullptr, count); -} +#else -ThreadBarrier::~ThreadBarrier() { - pthread_barrier_destroy(&m->barrier_); - delete m; -} +class ThreadBarrierPrivate { +public: + pthread_mutex_t mutex_; + pthread_cond_t cond_; + int count_; + int tripCount_; + + inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) { + CHECK_NE(cnt, 0); + CHECK_GE(pthread_mutex_init(&mutex_, 0), 0); + CHECK_GE(pthread_cond_init(&cond_, 0), 0); + } + + inline ~ThreadBarrierPrivate() { + pthread_cond_destroy(&cond_); + pthread_mutex_destroy(&mutex_); + } + + /** + * @brief wait + * @return true if the last wait + */ + inline bool wait() { + pthread_mutex_lock(&mutex_); + ++count_; + if (count_ >= tripCount_) { + count_ = 0; + pthread_cond_broadcast(&cond_); + pthread_mutex_unlock(&mutex_); + return true; + } else { + pthread_cond_wait(&cond_, &mutex_); + pthread_mutex_unlock(&mutex_); + return false; + } + } +}; + +#endif -void ThreadBarrier::wait() { pthread_barrier_wait(&m->barrier_); } +ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {} +ThreadBarrier::~ThreadBarrier() { delete m; } +void ThreadBarrier::wait() { m->wait(); } } // namespace paddle diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp index 378788bcecd579fff1c762702a8c27f54cee94bf..b5d9f93f1376048eabd726331006b0bb848bce11 100644 --- a/paddle/utils/tests/test_CustomStackTrace.cpp +++ b/paddle/utils/tests/test_CustomStackTrace.cpp @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include "paddle/utils/CustomStackTrace.h" #include "paddle/utils/Locks.h" +#include "paddle/utils/StringUtil.h" #include "paddle/utils/Util.h" DEFINE_int32(test_thread_num, 10, "testing thread number"); @@ -69,11 +70,11 @@ TEST(CustomStackTrace, normalTrain) { while (countDown-- > 0) { start.wait(); for (size_t i = 0; i < layerSize; ++i) { - tracer.push("layer_" + std::to_string(i)); + tracer.push("layer_" + paddle::str::to_string(i)); } tracer.pop(""); for (size_t i = 0; i < layerSize; ++i) { - tracer.pop("layer_" + std::to_string(layerSize - 1 - i)); + tracer.pop("layer_" + paddle::str::to_string(layerSize - 1 - i)); } finish.wait(); } @@ -89,7 +90,7 @@ TEST(CustomStackTrace, normalTest) { while (countDown-- > 0) { start.wait(); for (size_t i = 0; i < layerSize; ++i) { - tracer.push("layer_" + std::to_string(i)); + tracer.push("layer_" + paddle::str::to_string(i)); } tracer.clear(); // in forward test, tracer will clear after forward. finish.wait(); diff --git a/paddle/utils/tests/test_CustomStackTracePrint.cpp b/paddle/utils/tests/test_CustomStackTracePrint.cpp index 611b16aa7116d03ee51ba0095d043b78df1742ba..360c61c88a757da708b01d2bb54068b948b235cc 100644 --- a/paddle/utils/tests/test_CustomStackTracePrint.cpp +++ b/paddle/utils/tests/test_CustomStackTracePrint.cpp @@ -13,13 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/utils/CustomStackTrace.h" +#include "paddle/utils/StringUtil.h" #include "paddle/utils/Util.h" int main(int argc, char** argv) { paddle::initMain(argc, argv); for (size_t i = 0; i < 1000; ++i) { - paddle::gLayerStackTrace.push("layer_" + std::to_string(i)); + paddle::gLayerStackTrace.push("layer_" + paddle::str::to_string(i)); if (i == 998) { throw "Unhandle exception"; } diff --git a/paddle/utils/tests/test_SIMDFlags.cpp b/paddle/utils/tests/test_SIMDFlags.cpp index 8200a24ce7b7df75b48a89fbb7af15f304c5957f..185789c927be19385d6ddc7a1889b6cc56109d38 100644 --- a/paddle/utils/tests/test_SIMDFlags.cpp +++ b/paddle/utils/tests/test_SIMDFlags.cpp @@ -18,7 +18,8 @@ limitations under the License. */ using namespace paddle; // NOLINT TEST(SIMDFlags, gccTest) { -#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) +#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) && \ + !defined(__arm__) // clang-format off CHECK(!__builtin_cpu_supports("sse") != HAS_SSE); CHECK(!__builtin_cpu_supports("sse2") != HAS_SSE2); @@ -43,4 +44,5 @@ TEST(SIMDFlags, normalPrint) { LOG(INFO) << "Has AVX: " << std::boolalpha << HAS_AVX; LOG(INFO) << "Has AVX2: " << std::boolalpha << HAS_AVX2; LOG(INFO) << "Has AVX512: " << std::boolalpha << HAS_AVX512; + LOG(INFO) << "Has NEON: " << std::boolalpha << HAS_NEON; } diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 03f9fc2707aaec7c997e7c41ded331087e181582..d055cddee9ec42912738c62c074d2d713fed5403 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -1940,7 +1940,7 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None): @layer_support() def hsigmoid(input, label, - num_classes, + num_classes=None, name=None, bias_attr=None, param_attr=None, @@ -1956,8 +1956,7 @@ def hsigmoid(input, .. 
code-block:: python cost = hsigmoid(input=[layer1, layer2], - label=data_layer, - num_classes=3) + label=data_layer) :param input: Input layers. It could be a LayerOutput or list/tuple of LayerOutput. @@ -1965,12 +1964,14 @@ def hsigmoid(input, :param label: Label layer. :type label: LayerOutput :param num_classes: number of classes. - :type num_classes: int + :type num_classes: int|None :param name: layer name :type name: basestring :param bias_attr: Bias attribute. None means default bias. False means no bias. :type bias_attr: ParameterAttribute|False + :param param_attr: Parameter Attribute. None means default parameter. + :type param_attr: ParameterAttribute|None :param layer_attr: Extra Layer Attribute. :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. @@ -1990,6 +1991,11 @@ def hsigmoid(input, assert isinstance(label, LayerOutput) assert label.layer_type == LayerType.DATA + if num_classes is None: + num_classes = label.size + if num_classes is None or num_classes <= 2: + raise ValueError("hsigmoid label size must larger than 2.") + ipts_for_layer = [] parents = [] for each_input, each_param_attr in zip(input, param_attr):