Merge branch 'develop' into sequence_enumerate_op

4ec12496 · Chen Weihang · GitHub · 733ea0d2 · 4fcc2936 · 4ec12496
75 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,6 +24,9 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
        "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
 message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
        "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
+if(WIN32)
+    set(CMAKE_STATIC_LIBRARY_PREFIX lib)
+endif(WIN32)

 if(NOT CMAKE_CROSSCOMPILING)
    find_package(CUDA QUIET)
@@ -165,7 +168,6 @@ include(external/python)    # download, build, install python
 include(external/openblas)  # download, build, install openblas
 include(external/mkldnn)    # download, build, install mkldnn
 include(external/swig)      # download, build, install swig
-include(external/warpctc)   # download, build, install warpctc
 include(external/boost)     # download boost
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
@@ -173,6 +175,14 @@ include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/cub)

+if (NOT WIN32)
+# there is no official support of snappystream, warpctc, nccl, cupti in windows
+include(external/snappy)    # download snappy
+include(external/snappystream) # download snappystream
+include(external/warpctc)   # download, build, install warpctc
+include(cupti)
+endif (NOT WIN32)
+
 if(WITH_DISTRIBUTE)
    if(WITH_GRPC)
        include(external/grpc)
@@ -194,13 +204,10 @@ if(WITH_BRPC_RDMA)
    endif()
 endif()

-include(external/snappy)    # download snappy
-include(external/snappystream)
-include(external/threadpool)

+include(external/threadpool)
 include(flags)              # set paddle compile flags
 include(cudnn)              # set cudnn libraries, must before configure
-include(cupti)
 include(configure)          # add paddle env configuration

 if(WITH_GPU)

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -61,6 +61,11 @@ if(NOT CMAKE_CROSSCOMPILING)
    endif()
 endif()

+if(WIN32)
+  # windows stupid compile option for all targets.
+  add_definitions(-D_XKEYCHECK_H)
+endif(WIN32)
+
 if(NOT WITH_GOLANG)
    add_definitions(-DPADDLE_WITHOUT_GOLANG)
 endif(NOT WITH_GOLANG)

--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -28,7 +28,12 @@ if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL))
    set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
    set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
 endif()
-MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
+IF (WIN32)
+    MESSAGE(WARNING, "In windows, boost can not be downloaded automaticlly, please build it manually and put it at " ${THIRD_PARTY_PATH}install/boost)
+else()
+    MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
+ENDIF(WIN32)
+
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
 set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
 set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
@@ -36,12 +41,13 @@ set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)

 include_directories(${BOOST_INCLUDE_DIR})

+if (NOT WIN32)
 ExternalProject_Add(
    ${BOOST_PROJECT}
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DOWNLOAD_DIR          ${BOOST_DOWNLOAD_DIR}
    DOWNLOAD_COMMAND      wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz
-                          && tar zxf ${BOOST_TAR}.tar.gz
+    && tar zxf ${BOOST_TAR}.tar.gz
    DOWNLOAD_NO_PROGRESS  1
    PREFIX                ${BOOST_SOURCES_DIR}
    CONFIGURE_COMMAND     ""
@@ -49,8 +55,9 @@ ExternalProject_Add(
    INSTALL_COMMAND       ""
    UPDATE_COMMAND        ""
 )
+endif(NOT WIN32)

-if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32)
    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
    file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
    add_library(boost STATIC ${dummyfile})

--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -18,7 +18,7 @@ SET(GFLAGS_SOURCES_DIR ${THIRD_PARTY_PATH}/gflags)
 SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags)
 SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE)
 IF(WIN32)
-  set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
+  set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
 ELSE(WIN32)
  set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
 ENDIF(WIN32)
@@ -45,7 +45,13 @@ ExternalProject_Add(
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
-
+IF(WIN32)
+  IF(NOT EXISTS "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib")
+    add_custom_command(TARGET extern_gflags POST_BUILD
+    COMMAND cmake -E rename ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib
+  )
+  ENDIF()
+ENDIF(WIN32)
 ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
 ADD_DEPENDENCIES(gflags extern_gflags)
@@ -60,3 +66,4 @@ IF(WITH_C_API)
    INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib)
  ENDIF()
 ENDIF()
+
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -60,6 +60,13 @@ ExternalProject_Add(
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
+IF(WIN32)
+  IF(NOT EXISTS "${GLOG_INSTALL_DIR}/lib/libglog.lib")
+    add_custom_command(TARGET extern_glog POST_BUILD
+    COMMAND cmake -E rename ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib
+  )
+  ENDIF()
+ENDIF(WIN32)

 ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})

--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -54,7 +54,7 @@ ExternalProject_Add(
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS             ${MKLDNN_DEPENDS}
    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "a29d8487a63afca3d5b8c5bbdbb473cf8ccc6e51"
+    GIT_TAG             "64e03a1939e0d526aa8e9f2e3f7dc0ad8d372944"
    PREFIX              ${MKLDNN_SOURCES_DIR}
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}

--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -17,20 +17,29 @@ IF(USE_EIGEN_FOR_BLAS)
 ENDIF(USE_EIGEN_FOR_BLAS)

 INCLUDE(cblas)
+# IF(WIN32 AND NOT ${CBLAS_FOUND})
+
+

 IF(NOT ${CBLAS_FOUND})
+
    INCLUDE(ExternalProject)

    SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
    SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
-    SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
+    SET(CBLAS_INCLUDE_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)

    SET(CBLAS_LIBRARIES
        "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
        CACHE FILEPATH "openblas library." FORCE)

    ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
+    IF (WIN32)
+        SET(CBLAS_FOUND true)
+        MESSAGE(WARNING, "In windows, openblas only support msvc build, please build it manually and put it at " ${CBLAS_INSTALL_DIR})
+    ENDIF(WIN32)

+    IF (NOT WIN32)
    SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
    SET(OPENBLAS_COMMIT "v0.2.20")

@@ -69,7 +78,6 @@ IF(NOT ${CBLAS_FOUND})
    ENDIF()

    SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
-
    ExternalProject_Add(
        extern_openblas
        ${EXTERNAL_PROJECT_LOG_ARGS}
@@ -84,9 +92,11 @@ IF(NOT ${CBLAS_FOUND})
        UPDATE_COMMAND      ""
        CONFIGURE_COMMAND   ""
    )
+    ELSE()
+    ENDIF(NOT WIN32)
    SET(CBLAS_PROVIDER openblas)
    IF(WITH_C_API)
-        INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas)
+        INSTALL(DIRECTORY ${CBLAS_INCLUDE_DIR} DESTINATION third_party/openblas)
        # Because libopenblas.a is a symbolic link of another library, thus need to
        # install the whole directory.
        IF(ANDROID)
@@ -107,7 +117,8 @@ IF(NOT ${CBLAS_FOUND})
 ENDIF(NOT ${CBLAS_FOUND})

 MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
-INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
+MESSAGE(STATUS "BLAS Include: ${CBLAS_INCLUDE_DIR}")
+INCLUDE_DIRECTORIES(${CBLAS_INCLUDE_DIR})

 # FIXME(gangliao): generate cblas target to track all high performance
 # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)

--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -14,11 +14,14 @@

 INCLUDE(ExternalProject)
 # Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
+IF(NOT WIN32)
 FIND_PACKAGE(Protobuf QUIET)
+ENDIF(NOT WIN32)
 macro(UNSET_VAR VAR_NAME)
    UNSET(${VAR_NAME} CACHE)
    UNSET(${VAR_NAME})
 endmacro()
+
 UNSET_VAR(PROTOBUF_INCLUDE_DIR)
 UNSET_VAR(PROTOBUF_FOUND)
 UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
@@ -94,12 +97,14 @@ macro(PROMPT_PROTOBUF_LIB)
    SET(protobuf_DEPS ${ARGN})

    MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}")
+    MESSAGE(STATUS "Protobuf-lite library: ${PROTOBUF_LITE_LIBRARY}")
    MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}")
+    MESSAGE(STATUS "Protoc library: ${PROTOBUF_PROTOC_LIBRARY}")
    MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}")
    INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})

    # Assuming that all the protobuf libraries are of the same type.
-    IF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_STATIC_LIBRARY_SUFFIX}$")
+    IF(${PROTOBUF_LIBRARY} MATCHES ${CMAKE_STATIC_LIBRARY_SUFFIX})
        SET(protobuf_LIBTYPE STATIC)
    ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$")
        SET(protobuf_LIBTYPE SHARED)
@@ -137,18 +142,25 @@ macro(SET_PROTOBUF_VERSION)
 endmacro()

 set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
+IF (WIN32)
+    SET(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf)
+    MESSAGE(WARNING, "In windows, protobuf only support msvc build, please build it manually and put it at " ${PROTOBUF_ROOT})
+ENDIF(WIN32)
+
 if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
+
    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
-    find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-    find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_LIBRARY protobuf libprotobuf.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite libprotobuf-lite.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_PROTOC_LIBRARY protoc libprotoc.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
    find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
    if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
        message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
+        SET(PROTOBUF_FOUND true)
        SET_PROTOBUF_VERSION()
        PROMPT_PROTOBUF_LIB()
    else()
-        message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}.")
+        message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}")
    endif()
 endif()

@@ -239,6 +251,7 @@ IF(CMAKE_CROSSCOMPILING)
        CACHE FILEPATH "protobuf executable." FORCE)
 ENDIF()

+
 IF(NOT PROTOBUF_FOUND)
    build_protobuf(extern_protobuf FALSE)


--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -148,7 +148,8 @@ function(merge_static_libs TARGET_NAME)
      COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
      )
-  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
+  endif(APPLE)
+  if(LINUX) # general UNIX: use "ar" to extract objects and re-add to a common lib
    set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir)

    foreach(lib ${libs})
@@ -187,7 +188,36 @@ function(merge_static_libs TARGET_NAME)
        COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'`
        COMMAND ${CMAKE_RANLIB} ${target_LIBNAME}
        WORKING_DIRECTORY ${target_DIR})
-  endif()
+  endif(LINUX)
+  if(WIN32) # windows do not support gcc/nvcc combined compiling. Use msvc lib.exe to merge libs.
+    # Make the generated dummy source file depended on all static input
+    # libs. If input lib changes,the source file is touched
+    # which causes the desired effect (relink).
+    add_custom_command(OUTPUT ${target_SRCS}
+      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
+      DEPENDS ${libs})
+
+    # Generate dummy staic lib
+    file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
+    add_library(${TARGET_NAME} STATIC ${target_SRCS})
+    target_link_libraries(${TARGET_NAME} ${libs_deps})
+
+    foreach(lib ${libs})
+      # Get the file names of the libraries to be merged
+      #if(NOT $<TARGET_FILE:${lib}> MATCHES "lib.*\\.lib")
+      #  message("library" ${lib})
+      #  set(libfiles ${libfiles} lib$<TARGET_FILE:${lib}>)
+      #else()
+      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
+      #endif()
+    endforeach()
+   
+    # windows cmd return error in clean env.
+    # COMMAND del "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib"
+    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+      COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.lib ${libfiles}
+      )
+  endif(WIN32)
 endfunction(merge_static_libs)

 function(cc_library TARGET_NAME)
@@ -195,6 +225,10 @@ function(cc_library TARGET_NAME)
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS)
  cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  if(WIN32)
+      # add libxxx.lib prefix in windows
+      set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}")
+  endif(WIN32)
  if(cc_library_SRCS)
    if(cc_library_SHARED OR cc_library_shared) # build *.so
      add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -101,6 +101,7 @@ if(WITH_MKLDNN)
  )
 endif()

+if (NOT WIN32)
 if(NOT MOBILE_INFERENCE AND NOT RPI)
  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
  copy(snappy_lib
@@ -120,15 +121,23 @@ if(NOT MOBILE_INFERENCE AND NOT RPI)
    DSTS ${dst_dir} ${dst_dir}/lib
    DEPS zlib)
 endif()
+endif(NOT WIN32)

 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
 set(module "framework")
+if (NOT WIN32)
 copy(framework_lib DEPS framework_py_proto 
  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
 )
+else()
+copy(framework_lib
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
+)
+endif(NOT WIN32)

 set(module "memory")
 copy(memory_lib

--- a/doc/fluid/dev/releasing_process_en.md
+++ b/doc/fluid/dev/releasing_process_en.md
@@ -50,6 +50,33 @@ pop-up box, choose the current release branch and click "Run Build" button. You
 * pypi does not allow overwrite the already uploaded version of wheel package, even if you delete the
  old version. you must change the version number before upload a new one.

+### Publish wheel Packages for MacOS
+
+You need to build the binary wheel package for MacOS before publishing, to
+make sure that the package can be used by many versions of MacOS
+(10.11, 10.12, 10.13) and different python installs (python.org, homebrew, etc.),
+you must build the package ***exactly*** following below steps:
+
+Build steps:
+
+1. install python from python.org downloads, and make sure it's currently in use
+   in your system.
+1. `export MACOSX_DEPLOYMENT_TARGET=10.11`, use `10.11` is enough for recent versions.
+1. `git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle && mkdir build && cd build`
+1. `cmake -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_SYSTEM_BLAS=OFF  ..`, make sure the output of `cmake` command is using the correct python interpreter installed from python.org
+1. `make -j`
+1. `pip install delocate`
+1. `mkdir fixed_wheel && delocate-wheel -w fixed_wheel python/dist/*.whl`
+
+Then the whl under `fixed_wheel` is ready to upload.
+
+Install steps:
+
+1. run `pip install paddlepaddle...whl`
+1. find the `libpython.dylib` that are currently in use:
+    - for python.org package installs, do nothing.
+    - for other python installs, find the path of `libpython*.dylib` and `export LD_LIBRARY_PATH=you path && DYLD_LIBRARY_PATH=your path`
+
 ## Publish Docker Images

 Our CI tool will push latest images to DockerHub, so we only need to push a version tag like:

--- a/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
+++ b/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
@@ -9,8 +9,6 @@ Paddle 预测 API

 -  头文件 ``paddle_inference_api.h`` 定义了所有的接口
 -  库文件\ ``libpaddle_fluid.so`` 或 ``libpaddle_fluid.a``
-  库文件 ``libpaddle_inference_api.so`` 或
-   ``libpaddle_inference_api.a``

 编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib` 。

@@ -97,8 +95,7 @@ engine
    CHECK(predictor->Run(slots, &outputs));
    // 获取 outputs ...

-编译时，联编 ``libpaddle_fluid.a/.so`` 和
-``libpaddle_inference_api.a/.so`` 便可。
+编译时，联编 ``libpaddle_fluid.a/.so`` 即可。

 详细代码参考
 ------------

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -163,6 +163,7 @@ paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], v
 paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
 paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None))
 paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))

--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -2,9 +2,13 @@ add_subdirectory(memory)
 add_subdirectory(platform)
 add_subdirectory(framework)
 add_subdirectory(operators)
-add_subdirectory(pybind)
 add_subdirectory(string)
+
+if (NOT WIN32)
+add_subdirectory(pybind)
 add_subdirectory(recordio)
+endif(NOT WIN32)
+
 if(WITH_INFERENCE)
  # NOTE: please add subdirectory inference at last.
  add_subdirectory(inference)

--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
-add_subdirectory(details)
 add_subdirectory(ir)
+if (NOT WIN32)
+add_subdirectory(details)
+endif (NOT WIN32)
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)

@@ -28,8 +30,12 @@ if(WITH_GPU)
 else()
  cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
 endif()
-
+if (NOT WIN32)
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
+else()
+cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
+endif (NOT WIN32)
+
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)

@@ -69,14 +75,22 @@ cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
+
+if (NOT WIN32)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
    shape_inference data_transform lod_tensor profiler)
+else()
+cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
+    shape_inference data_transform lod_tensor)
+endif(NOT WIN32)
+
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)

 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)

+if (NOT WIN32)
 py_proto_compile(framework_py_proto SRCS framework.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
@@ -86,6 +100,7 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
    COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
    COMMENT "Copy generated python proto into directory paddle/fluid/proto."
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+endif(NOT WIN32)

 cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)

@@ -120,7 +135,9 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
 # cc_test(channel_test SRCS channel_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )

+if (NOT WIN32)
 cc_test(rw_lock_test SRCS rw_lock_test.cc)
+endif (NOT WIN32)

 # disable test temporarily.
 # TODO https://github.com/PaddlePaddle/Paddle/issues/11971

--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -26,6 +26,7 @@ namespace framework {
 extern proto::VarType::Type ToDataType(std::type_index type);
 extern std::type_index ToTypeIndex(proto::VarType::Type type);

+#if !defined(_WIN32)
 template <typename Visitor>
 inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
  switch (type) {
@@ -57,6 +58,40 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
      PADDLE_THROW("Not supported %d", type);
  }
 }
+#else
+// the msvc compiler do not implement two-stage name lookup correctly.
+template <typename Visitor>
+inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
+  switch (type) {
+    case proto::VarType::FP16:
+      visitor.operator()<platform::float16>();
+      break;
+    case proto::VarType::FP32:
+      visitor.operator()<float>();
+      break;
+    case proto::VarType::FP64:
+      visitor.operator()<double>();
+      break;
+    case proto::VarType::INT32:
+      visitor.operator()<int>();
+      break;
+    case proto::VarType::INT64:
+      visitor.operator()<int64_t>();
+      break;
+    case proto::VarType::BOOL:
+      visitor.operator()<bool>();
+      break;
+    case proto::VarType::UINT8:
+      visitor.operator()<uint8_t>();
+      break;
+    case proto::VarType::INT16:
+      visitor.operator()<int16_t>();
+      break;
+    default:
+      PADDLE_THROW("Not supported %d", type);
+  }
+}
+#endif  // _WIN32

 extern std::string DataTypeToString(const proto::VarType::Type type);
 extern size_t SizeOfType(std::type_index type);

--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -25,8 +25,10 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"

+#if !defined(_WIN32)
 #include "paddle/fluid/recordio/scanner.h"
 #include "paddle/fluid/recordio/writer.h"
+#endif  // _WIN32

 namespace paddle {
 namespace framework {
@@ -300,6 +302,7 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
  TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
 }

+#if !defined(_WIN32)
 void WriteToRecordIO(recordio::Writer *writer,
                     const std::vector<LoDTensor> &tensor,
                     const platform::DeviceContext &dev_ctx) {
@@ -329,7 +332,19 @@ bool ReadFromRecordIO(recordio::Scanner *scanner,

  return true;
 }
-
+#else
+class Writer {};
+class Scanner {};
+void WriteToRecordIO(recordio::Writer *writer,
+                     const std::vector<LoDTensor> &tensor,
+                     const platform::DeviceContext &dev_ctx) {}
+bool ReadFromRecordIO(recordio::Scanner *scanner,
+                      const platform::DeviceContext &dev_ctx,
+                      std::vector<LoDTensor> *result_ptr) {
+  PADDLE_ENFORCE("windows didn't supported recordio!.");
+  return true;
+}
+#endif  // _WIN32
 std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
    const std::vector<platform::Place> places) const {
  check_memory_size();

--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -274,6 +274,7 @@ TEST(LoD, ConvertToOffsetBasedLoD) {
  EXPECT_EQ(offset_lod, expected);
 }

+#if !defined(_WIN32)
 template <typename T>
 static void TestRecordIO() {
  LoDTensor tensor;
@@ -320,6 +321,7 @@ TEST(LoDTensor, RecordIO) {
  TestRecordIO<float>();
  TestRecordIO<double>();
 }
+#endif  // !defined(_WIN32)

 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -129,10 +129,6 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
                                    "Optimized for variable")
      .SetDefault({});

-  AddAttr<std::vector<std::string>>(OpCreationCallstackAttrName(),
-                                    "Callstack for Op Creatation.")
-      .SetDefault({});
-
  Validate();
 }


--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -39,7 +39,6 @@ class OpProtoAndCheckerMaker {
 public:
  static const char *OpRoleAttrName() { return "op_role"; }
  static const char *OpRoleVarAttrName() { return "op_role_var"; }
-  static const char *OpCreationCallstackAttrName() { return "op_callstack"; }

  void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);


--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -11,17 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/framework/operator.h"
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
 #include <algorithm>
-#include <sstream>
-#include <string>
-#include <vector>
-#include "gflags/gflags.h"
-#include "glog/logging.h"
+
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -76,6 +74,12 @@ static DDim GetDims(const Scope& scope, const std::string& name,
  }
 }

+static bool VarInited(const Scope& scope, const std::string& name) {
+  Variable* var = scope.FindVar(name);
+  if (var == nullptr) return false;
+  return var->IsInitialized();
+}
+
 static std::string GetDtype(const Scope& scope, const std::string& name) {
  Variable* var = scope.FindVar(name);
  if (var == nullptr) {
@@ -89,8 +93,12 @@ static std::string GetDtype(const Scope& scope, const std::string& name) {
    }
    return DataTypeToString(ToDataType(tensor.type()));
  } else if (var->IsType<SelectedRows>()) {
-    return DataTypeToString(
-        ToDataType(var->Get<SelectedRows>().value().type()));
+    auto tensor = var->Get<SelectedRows>().value();
+    if (UNLIKELY(!tensor.IsInitialized())) {
+      return "uninited";
+    } else {
+      return DataTypeToString(ToDataType(tensor.type()));
+    }
  } else {
    return "";
  }
@@ -129,48 +137,19 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
 }

 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
-  try {
-    if (VLOG_IS_ON(4)) {
-      VLOG(4) << place << " " << DebugStringEx(&scope);
-    }
-    if (platform::is_gpu_place(place)) {
+  VLOG(4) << place << " " << DebugStringEx(&scope);
+  if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
-      PADDLE_THROW("Cannot run operator on place %s", place);
+    PADDLE_THROW("Cannot run operator on place %s", place);
 #else
-      auto dev_id = boost::get<platform::CUDAPlace>(place).device;
-      platform::SetDeviceId(dev_id);
+    auto dev_id = boost::get<platform::CUDAPlace>(place).device;
+    platform::SetDeviceId(dev_id);
 #endif
-    }
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    platform::RecordEvent record_event(Type(), pool.Get(place));
-    RunImpl(scope, place);
-    if (VLOG_IS_ON(3)) {
-      VLOG(3) << place << " " << DebugStringEx(&scope);
-    }
-  } catch (platform::EnforceNotMet exception) {
-    if (Attrs().count("sub_block") != 0) {
-      throw exception;
-    }
-
-    auto& callstack = Attr<std::vector<std::string>>(
-        OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
-
-    if (callstack.empty()) {
-      throw exception;
-    }
-    std::ostringstream sout;
-    sout << "Invoke operator " << Type() << " error.\n";
-    sout << "Python Callstacks: \n";
-    for (auto& line : callstack) {
-      sout << line;
-    }
-    sout << "C++ Callstacks: \n";
-    sout << exception.err_str_;
-    exception.err_str_ = sout.str();
-    throw exception;
-  } catch (...) {
-    std::rethrow_exception(std::current_exception());
  }
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  platform::RecordEvent record_event(Type(), pool.Get(place));
+  RunImpl(scope, place);
+  VLOG(3) << place << " " << DebugStringEx(&scope);
 }

 bool OperatorBase::HasInputs(const std::string& name) const {
@@ -198,7 +177,7 @@ const std::vector<std::string>& OperatorBase::Inputs(
 }

 bool OperatorBase::HasOutputs(const std::string& name) const {
-  if (outputs_.end() != outputs_.find(name)) {
+  if (outputs_.find(name) != outputs_.end()) {
    return true;
  } else {
    return false;
@@ -228,16 +207,21 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
    auto& input = *it;
    ss << input.first << "[";
    for (size_t i = 0; i < input.second.size(); ++i) {
-      ss << input.second[i];
+      auto var_name = input.second[i];
+      ss << var_name;
      if (scope) {
-        int row_size = GetRowSize(*scope, input.second[i]);
-        if (row_size >= 0) {
-          ss << "[row_size=" << row_size << "]";
+        if (!VarInited(*scope, var_name)) {
+          ss << "[uninited]";
+        } else {
+          int row_size = GetRowSize(*scope, var_name);
+          if (row_size >= 0) {
+            ss << "[row_size=" << row_size << "]";
+          }
+          std::string dtype = GetDtype(*scope, var_name);
+          ss << ":" << dtype;
+          ss << "[" << GetDims(*scope, var_name, true) << "]";
+          ss << "(" << GetLoD(*scope, var_name) << ")";
        }
-        std::string dtype = GetDtype(*scope, input.second[i]);
-        ss << ":" << dtype;
-        ss << "[" << GetDims(*scope, input.second[i], true) << "]";
-        ss << "(" << GetLoD(*scope, input.second[i]) << ")";
      }
      if (i != input.second.size() - 1) {
        ss << ", ";
@@ -254,14 +238,19 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
    auto& output = *it;
    ss << output.first << "[";
    for (size_t i = 0; i < output.second.size(); ++i) {
-      ss << output.second[i];
+      auto var_name = output.second[i];
+      ss << var_name;
      if (scope) {
-        int row_size = GetRowSize(*scope, output.second[i]);
-        if (row_size >= 0) {
-          ss << "[row_size=" << row_size << "]";
+        if (!VarInited(*scope, var_name)) {
+          ss << "[uninited]";
+        } else {
+          int row_size = GetRowSize(*scope, output.second[i]);
+          if (row_size >= 0) {
+            ss << "[row_size=" << row_size << "]";
+          }
+          ss << "[" << GetDims(*scope, var_name, true) << "]";
+          ss << "(" << GetLoD(*scope, var_name) << ")";
        }
-        ss << "[" << GetDims(*scope, output.second[i], true) << "]";
-        ss << "(" << GetLoD(*scope, output.second[i]) << ")";
      }
      if (i != output.second.size() - 1) {
        ss << ", ";

--- a/paddle/fluid/framework/rw_lock.h
+++ b/paddle/fluid/framework/rw_lock.h
@@ -14,13 +14,16 @@ limitations under the License. */

 #pragma once

+#if !defined(_WIN32)
 #include <pthread.h>
+#endif  // !_WIN32

 #include "paddle/fluid/platform/enforce.h"

 namespace paddle {
 namespace framework {

+#if !defined(_WIN32)
 struct RWLock {
  RWLock() { pthread_rwlock_init(&lock_, nullptr); }

@@ -43,6 +46,15 @@ struct RWLock {
 private:
  pthread_rwlock_t lock_;
 };
+#else
+// https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive
+// In windows, rw_lock seems like a hack. Use empty object and do nothing.
+struct RWLock {
+  void RDLock() {}
+  void WRLock() {}
+  void UNLock() {}
+};
+#endif

 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -31,7 +31,8 @@ size_t Tensor::memory_size() const {
  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
 }

-void* Tensor::mutable_data(platform::Place place, std::type_index type) {
+void* Tensor::mutable_data(platform::Place place, std::type_index type,
+                           size_t requested_size) {
  if (holder_ != nullptr) {
    holder_->set_type(type);
  }
@@ -39,7 +40,7 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type) {
                    "When calling this method, the Tensor's numel must be "
                    "equal or larger than zero. "
                    "Please check Tensor::Resize has been called first.");
-  int64_t size = numel() * SizeOfType(type);
+  size_t size = requested_size ? requested_size : numel() * SizeOfType(type);
  /* some versions of boost::variant don't have operator!= */
  if (holder_ == nullptr || !(holder_->place() == place) ||
      holder_->size() < size + offset_) {
@@ -68,10 +69,10 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type) {
                                 offset_);
 }

-void* Tensor::mutable_data(platform::Place place) {
+void* Tensor::mutable_data(platform::Place place, size_t requested_size) {
  PADDLE_ENFORCE(this->holder_ != nullptr,
                 "Cannot invoke mutable data if current hold nothing.");
-  return mutable_data(place, holder_->type());
+  return mutable_data(place, holder_->type(), requested_size);
 }

 Tensor& Tensor::ShareDataWith(const Tensor& src) {

--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -89,22 +89,24 @@ class Tensor {
   * @note    If not exist, then allocation.
   */
  template <typename T>
-  T* mutable_data(platform::Place place);
+  T* mutable_data(platform::Place place, size_t requested_size = 0);

-  void* mutable_data(platform::Place place, std::type_index type);
+  void* mutable_data(platform::Place place, std::type_index type,
+                     size_t requested_size = 0);

-  void* mutable_data(platform::Place place);
+  void* mutable_data(platform::Place place, size_t requested_size = 0);

  /**
   * @brief     Return a pointer to mutable memory block.
   *
-   * @param[in] dims    The dimensions of the memory block.
-   * @param[in] place   The place of the memory block.
+   * @param[in] dims           The dimensions of the memory block.
+   * @param[in] place          The place of the memory block.
+   * @param[in] requested_size The size of the block in bytes.
   *
   * @note      If not exist, then allocation.
   */
  template <typename T>
-  T* mutable_data(DDim dims, platform::Place place);
+  T* mutable_data(DDim dims, platform::Place place, size_t requested_size = 0);

  /*! Return the dimensions of the memory block. */
  const DDim& dims() const;

--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -46,16 +46,17 @@ inline T* Tensor::data() {
 }

 template <typename T>
-inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
+inline T* Tensor::mutable_data(DDim dims, platform::Place place,
+                               size_t requested_size) {
  static_assert(std::is_pod<T>::value, "T must be POD");
  Resize(dims);
-  return mutable_data<T>(place);
+  return mutable_data<T>(place, requested_size);
 }

 template <typename T>
-inline T* Tensor::mutable_data(platform::Place place) {
+inline T* Tensor::mutable_data(platform::Place place, size_t requested_size) {
  static_assert(std::is_pod<T>::value, "T must be POD");
-  return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
+  return reinterpret_cast<T*>(mutable_data(place, typeid(T), requested_size));
 }

 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {

--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -23,9 +23,11 @@ include_directories("${PADDLE_LIB}")
 include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
 include_directories("${PADDLE_LIB}/third_party/install/glog/include")
 include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
+if (NOT WIN32)
 include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
 include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
 include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
+endif(NOT WIN32)

 include_directories("${PADDLE_LIB}/third_party/boost")
 include_directories("${PADDLE_LIB}/third_party/eigen3")

--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -11,12 +11,18 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#define GLOG_NO_ABBREVIATED_SEVERITIES

 #include "paddle/fluid/memory/detail/system_allocator.h"

-#include <stdlib.h>    // for malloc and free
+#ifdef _WIN32
+#include <malloc.h>
+#include <windows.h>  // VirtualLock/VirtualUnlock
+#else
 #include <sys/mman.h>  // for mlock and munlock
-#include <algorithm>   // for std::max
+#endif
+#include <stdlib.h>   // for malloc and free
+#include <algorithm>  // for std::max

 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/assert.h"
@@ -35,31 +41,42 @@ namespace paddle {
 namespace memory {
 namespace detail {

-void* CPUAllocator::Alloc(size_t* index, size_t size) {
-  // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
-  // malloc might not return nullptr if size is zero, but the returned
-  // pointer shall not be dereferenced -- so we make it nullptr.
-  if (size <= 0) return nullptr;
-
-  *index = 0;  // unlock memory
-
+void* AlignedMalloc(size_t size) {
  void* p = nullptr;
-
+  size_t alignment = 32ul;
 #ifdef PADDLE_WITH_MKLDNN
  // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
  // memory alignment
-  PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0, "Alloc %ld error!",
-                    size);
+  alignment = 4096ul;
+#endif
+#ifdef _WIN32
+  p = _aligned_malloc(size, alignment);
 #else
-  PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0, "Alloc %ld error!",
+  PADDLE_ENFORCE_EQ(posix_memalign(&p, alignment, size), 0, "Alloc %ld error!",
                    size);
 #endif
  PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size);
+  return p;
+}
+
+void* CPUAllocator::Alloc(size_t* index, size_t size) {
+  // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
+  // malloc might not return nullptr if size is zero, but the returned
+  // pointer shall not be dereferenced -- so we make it nullptr.
+  if (size <= 0) return nullptr;
+
+  *index = 0;  // unlock memory
+
+  void* p = AlignedMalloc(size);

  if (p != nullptr) {
    if (FLAGS_use_pinned_memory) {
      *index = 1;
+#ifdef _WIN32
+      VirtualLock(p, size);
+#else
      mlock(p, size);  // lock memory
+#endif
    }
  }

@@ -68,7 +85,11 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) {

 void CPUAllocator::Free(void* p, size_t size, size_t index) {
  if (p != nullptr && index == 1) {
+#ifdef _WIN32
+    VirtualUnlock(p, size);
+#else
    munlock(p, size);
+#endif
  }
  free(p);
 }

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -85,7 +85,7 @@ function(op_library TARGET)

    #remove windows unsupported op
    if (WIN32)
-    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op")
+    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op")
        if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
          return()
        endif()
@@ -319,8 +319,9 @@ foreach(src ${GENERAL_OPS})
 endforeach()

 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
-
+if (NOT WIN32)
 add_subdirectory(reader)
+endif(NOT WIN32)
 foreach(src ${READER_LIBRARY})
    set(OP_LIBRARY ${src} ${OP_LIBRARY})
 endforeach()

--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -232,40 +232,28 @@ use lstm_x_t as input and compute as standard LSTM.
 template <typename T>
 inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
  if (bias) {
-    for (int i = 0; i < n; ++i) {
-      y[i] = x[i] + bias[0];
-    }
-    math::vec_relu<T>(n, y, y);
+    math::vec_add_bias<T, platform::jit::avx>(n, *bias, x, y);
+    math::vec_relu<T, platform::jit::avx>(n, y, y);
  } else {
-    math::vec_relu<T>(n, x, y);
+    math::vec_relu<T, platform::jit::avx>(n, x, y);
  }
 }

-template <typename DeviceContext, typename T>
-inline void vec_softmax(const math::BlasT<DeviceContext, T>& blas, const int n,
-                        const T* x, T* y) {
+template <typename T>
+inline void vec_softmax(const int n, const T* x, T* y) {
  T scalar = x[0];
  // max
  for (int i = 1; i < n; ++i) {
    scalar = scalar < x[i] ? x[i] : scalar;
  }
-
-  // sub
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] - scalar;
-  }
-
-  // exp
-  blas.VEXP(n, y, y);
-
+  math::vec_add_bias<T, platform::jit::avx>(n, -scalar, x, y);  // sub
+  math::vec_exp<T>(n, y, y);                                    // exp
  // sum
  scalar = T(0);
  for (int i = 0; i < n; ++i) {
    scalar += y[i];
  }
-
-  // scale
-  blas.SCAL(n, static_cast<T>(1) / scalar, y);
+  math::vec_scal<T>(n, static_cast<T>(1) / scalar, y);  // scale
 }

 template <typename T>
@@ -311,11 +299,21 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
    PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D);
    fc_out->Resize({max_seq_len, 1});

-    math::VecActivations<T> act_functor;
    std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand;
-    act_gate = act_functor(ctx.Attr<std::string>("gate_activation"));
-    act_cell = act_functor(ctx.Attr<std::string>("cell_activation"));
-    act_cand = act_functor(ctx.Attr<std::string>("candidate_activation"));
+    auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
+    auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
+    auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
+    if (platform::jit::MayIUse(platform::jit::avx)) {
+      math::VecActivations<T, platform::jit::avx> act_functor;
+      act_gate = act_functor(act_gate_str);
+      act_cell = act_functor(act_cell_str);
+      act_cand = act_functor(act_cand_str);
+    } else {
+      math::VecActivations<T, platform::jit::isa_any> act_functor;
+      act_gate = act_functor(act_gate_str);
+      act_cell = act_functor(act_cell_str);
+      act_cand = act_functor(act_cand_str);
+    }

    const T* x_data = x->data<T>();
    const T* h0_data = h0 ? h0->data<T>() : NULL;
@@ -363,7 +361,7 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
                       fc_out_data);
        }
        // 1d. softmax
-        vec_softmax<DeviceContext, T>(blas, seq_len, fc_out_data, fc_out_data);
+        vec_softmax<T>(seq_len, fc_out_data, fc_out_data);
        // mul x(seq_len*M) and sum pool
        math::FCCompute<DeviceContext, T>(blas, 1, M, seq_len, fc_out_data,
                                          cur_x_data, lstm_x_data);

--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
@@ -60,6 +60,20 @@ class AucKernel : public framework::OpKernel<T> {
    const T* inference_data = predict->data<T>();
    const auto* label_data = label->data<int64_t>();

+    // check if states are inited.
+    auto* tp_in = ctx.Input<Tensor>("TP");
+    auto* fp_in = ctx.Input<Tensor>("FP");
+    auto* tn_in = ctx.Input<Tensor>("TN");
+    auto* fn_in = ctx.Input<Tensor>("FN");
+    PADDLE_ENFORCE(tp_in->IsInitialized(), "true_positive is not inited!");
+    PADDLE_ENFORCE(fp_in->IsInitialized(), "false_negative is not inited!");
+    PADDLE_ENFORCE(tn_in->IsInitialized(), "true_negative is not inited!");
+    PADDLE_ENFORCE(fn_in->IsInitialized(), "false_positive is not inited!");
+    PADDLE_ENFORCE_EQ(tp_in->numel(), num_thresholds, "");
+    PADDLE_ENFORCE_EQ(fp_in->numel(), num_thresholds, "");
+    PADDLE_ENFORCE_EQ(tn_in->numel(), num_thresholds, "");
+    PADDLE_ENFORCE_EQ(fn_in->numel(), num_thresholds, "");
+
    auto* tp_data = true_positive->mutable_data<int64_t>(ctx.GetPlace());
    auto* fn_data = false_negative->mutable_data<int64_t>(ctx.GetPlace());
    auto* tn_data = true_negative->mutable_data<int64_t>(ctx.GetPlace());

--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -135,7 +135,7 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Variance",
             "The global variance (for training) "
             "or estimated Variance (for testing)");
-    AddOutput("Y", "result after normalization");
+    AddOutput("Y", "result after normalization").Reuse("X");
    AddOutput("MeanOut",
              "Share memory with Mean. "
              "Store the global mean when training")

--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -53,6 +53,18 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
    key_ += "-BWD";
  }

+  size_t GetDstMemorySize() const {
+    return conv_pd_->dst_primitive_desc().get_size();
+  }
+
+  size_t GetDiffWeightsMemorySize() const {
+    return conv_bwd_weights_pd_->diff_weights_primitive_desc().get_size();
+  }
+
+  size_t GetDiffSourceMemorySize() const {
+    return conv_bwd_data_pd_->diff_src_primitive_desc().get_size();
+  }
+
  std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromWeightsPrimitive(
      const std::shared_ptr<mkldnn::memory> user_memory_p,
      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
@@ -294,7 +306,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {

    const T* input_data = input->data<T>();
    const T* filter_data = filter->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());

    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
    std::vector<int> weights_tz =
@@ -354,6 +365,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    auto user_weights_memory_p = handler.AcquireWeightsMemory(
        user_weights_md, to_void_cast<T>(filter_data));

+    T* output_data =
+        output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize());
    // create reorder primitive if the input format is not the preferred one
    auto src_memory_p =
        handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
@@ -476,13 +489,6 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
    T* input_grad_data = nullptr;
    T* filter_grad_data = nullptr;

-    if (input_grad) {
-      input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-    }
-    if (filter_grad) {
-      filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
-    }
-
    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
    std::vector<int> weights_tz =
        paddle::framework::vectorize2int(filter->dims());
@@ -568,6 +574,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
          handler.AcquireDiffDstMemoryFromWeightsPrimitive(
              user_diff_dst_memory_p, pipeline);

+      const size_t size = handler.GetDiffWeightsMemorySize();
+      filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace(), size);
+
      auto diff_weights_memory_p =
          handler.AcquireDiffWeightsMemoryFromWeightsPrimitive(
              reinterpret_cast<void*>(filter_grad_data));
@@ -590,6 +599,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
          handler.AcquireDiffDstMemoryFromDataPrimitive(user_diff_dst_memory_p,
                                                        pipeline);

+      const size_t size = handler.GetDiffSourceMemorySize();
+      input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace(), size);
+
      auto diff_src_memory_p = handler.AcquireDiffSrcMemoryFromDataPrimitive(
          reinterpret_cast<void*>(input_grad_data));


--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -151,6 +151,7 @@ bool VariableResponse::CopySelectRowsData(
    ::google::protobuf::io::CodedInputStream* input,
    const platform::DeviceContext& ctx, int length) {
  auto* slr = GetVar()->GetMutable<framework::SelectedRows>();
+  slr->mutable_rows()->clear();
  slr->mutable_rows()->resize(length /
                              framework::SizeOfType(typeid(int64_t)));  // int64
  int64_t* rows_data = slr->mutable_rows()->data();

--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/device_context.h"

 namespace paddle {
 namespace operators {
@@ -41,19 +40,33 @@ class FillConstantOp : public framework::OperatorBase {
        static_cast<framework::proto::VarType::Type>(Attr<int>("dtype"));
    auto value = Attr<float>("value");
    auto force_cpu = Attr<bool>("force_cpu");
-    auto &out =
-        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
-    out.Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
+
+    framework::Tensor *tensor = nullptr;
+
+    auto &out_var = *scope.FindVar(Output("Out"));
+
+    if (out_var.IsType<framework::LoDTensor>()) {
+      tensor = out_var.GetMutable<framework::LoDTensor>();
+      tensor->Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
+    } else if (out_var.IsType<framework::SelectedRows>()) {
+      tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value();
+      tensor->Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
+    } else {
+      PADDLE_THROW(
+          "fill constant op's output only"
+          "supports SelectedRows and LoDTensor");
+    }
+
    if (force_cpu) {
      auto cpu = platform::CPUPlace();
-      out.mutable_data(cpu, framework::ToTypeIndex(data_type));
+      tensor->mutable_data(cpu, framework::ToTypeIndex(data_type));
    } else {
-      out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
+      tensor->mutable_data(dev_place, framework::ToTypeIndex(data_type));
    }

    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &dev_ctx = *pool.Get(dev_place);
-    math::set_constant(dev_ctx, &out, value);
+    math::set_constant(dev_ctx, tensor, value);
  }
 };


--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -165,12 +165,13 @@ void ListenAndServOp::RunSyncLoop(
                          recv_scope);
    VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";

-    rpc_service_->SetCond(distributed::kRequestGet);
-    rpc_service_->WaitBarrier(distributed::kRequestGet);
-    rpc_service_->ResetBarrierCounter();
    // reset received sparse vars to avoid reuse it in the next mini-batch
    dynamic_cast<distributed::RequestSendHandler *>(request_send_handler_.get())
        ->ResetSparseVarRecorder();
+
+    rpc_service_->SetCond(distributed::kRequestGet);
+    rpc_service_->WaitBarrier(distributed::kRequestGet);
+    rpc_service_->ResetBarrierCounter();
  }  // while(true)
 }


--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -65,3 +65,4 @@ if(WITH_GPU)
    nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor math_function)
 endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat)
+cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
--- a/paddle/fluid/operators/math/concat.cc
+++ b/paddle/fluid/operators/math/concat.cc
@@ -48,16 +48,16 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
    auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());

    // computation
-    for (int k = 0; k < out_rows; ++k) {
-      T* dst_ptr = output->data<T>() + k * out_cols;
-      int col_idx = 0;
-      for (int j = 0; j < num; ++j) {
-        int col_len = input_cols[j];
-        const T* src_prt = input[j].data<T>() + k * col_len;
-        memory::Copy(cpu_place, dst_ptr + col_idx, cpu_place, src_prt,
-                     sizeof(T) * col_len);
-        col_idx += col_len;
+    auto output_data = output->data<T>();
+    int col_idx = 0;
+    for (int j = 0; j < num; ++j) {
+      int col_len = input_cols[j];
+      auto input_data = input[j].data<T>();
+      for (int k = 0; k < out_rows; ++k) {
+        memory::Copy(cpu_place, output_data + k * out_cols + col_idx, cpu_place,
+                     input_data + k * col_len, sizeof(T) * col_len);
      }
+      col_idx += col_len;
    }
  }
 };

--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -13,8 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <cmath>
 #include <string>
 #include "paddle/fluid/platform/cpu_info.h"
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+
+#ifdef PADDLE_WITH_MKLML
+#include "paddle/fluid/platform/dynload/mklml.h"
+#endif

 namespace paddle {
 namespace operators {
@@ -22,16 +30,161 @@ namespace math {

 #define SIGMOID_THRESHOLD_MIN -40.0
 #define SIGMOID_THRESHOLD_MAX 13.0
-#define EXP_MAX_INPUT 40.0
+
+#define AVX_FLOAT_BLOCK 8
+#define AVX_DOUBLE_BLOCK 4
+#define AVX2_FLOAT_BLOCK 8
+#define AVX2_DOUBLE_BLOCK 4
+#define AVX512_FLOAT_BLOCK 16
+#define AVX512_DOUBLE_BLOCK 8

 template <typename T>
-inline T sigmoid(T x) {
-  return 1. / (1. + exp(-x));
+inline void vec_exp(const int n, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = std::exp(x[i]);
+  }
 }

 template <typename T>
-inline T tanh(T x) {
-  return 2. * sigmoid(2. * x) - 1.;
+inline void vec_scal(const int n, const T a, T* x) {
+  for (int i = 0; i < n; ++i) {
+    x[i] = a * x[i];
+  }
+}
+
+#ifdef PADDLE_WITH_MKLML
+template <>
+inline void vec_exp<float>(const int n, const float* x, float* y) {
+  platform::dynload::vsExp(n, x, y);
+}
+
+template <>
+inline void vec_exp<double>(const int n, const double* x, double* y) {
+  platform::dynload::vdExp(n, x, y);
+}
+
+template <>
+inline void vec_scal<float>(const int n, const float a, float* x) {
+  platform::dynload::cblas_sscal(n, a, x, 1);
+}
+
+template <>
+inline void vec_scal<double>(const int n, const double a, double* x) {
+  platform::dynload::cblas_dscal(n, a, x, 1);
+}
+#endif
+
+// MKL scal only support inplace, choose this if src and dst are not equal
+template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+inline void vec_scal(const int n, const T a, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = a * x[i];
+  }
+}
+
+template <>
+inline void vec_scal<float, platform::jit::avx>(const int n, const float a,
+                                                const float* x, float* y) {
+#ifdef __AVX__
+  constexpr int block = AVX_FLOAT_BLOCK;
+  if (n < block) {
+    vec_scal<float, platform::jit::isa_any>(n, a, x, y);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 scalar = _mm256_set1_ps(a);
+  __m256 tmp;
+#define MOVE_ONE_STEP               \
+  tmp = _mm256_loadu_ps(x + i);     \
+  tmp = _mm256_mul_ps(tmp, scalar); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+#undef MOVE_ONE_STEP
+  if (rest == 0) {
+    return;
+  }
+  // can not continue move step if src and dst are inplace
+  for (i = n - rest; i < n; ++i) {
+    y[i] = a * x[i];
+  }
+#else
+  vec_scal<float, platform::jit::isa_any>(n, a, x, y);
+#endif
+}
+
+template <>
+inline void vec_scal<float, platform::jit::avx2>(const int n, const float a,
+                                                 const float* x, float* y) {
+  vec_scal<float, platform::jit::avx>(n, a, x, y);
+}
+
+template <>
+inline void vec_scal<float, platform::jit::avx512_common>(const int n,
+                                                          const float a,
+                                                          const float* x,
+                                                          float* y) {
+  // TODO(TJ): enable me
+  vec_scal<float, platform::jit::avx2>(n, a, x, y);
+}
+
+template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = x[i] + a;
+  }
+}
+
+template <>
+inline void vec_add_bias<float, platform::jit::avx>(const int n, const float a,
+                                                    const float* x, float* y) {
+#ifdef __AVX__
+  constexpr int block = AVX_FLOAT_BLOCK;
+  if (n < block) {
+    vec_add_bias<float, platform::jit::isa_any>(n, a, x, y);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 bias = _mm256_set1_ps(a);
+  __m256 tmp;
+#define MOVE_ONE_STEP             \
+  tmp = _mm256_loadu_ps(x + i);   \
+  tmp = _mm256_add_ps(tmp, bias); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+#undef MOVE_ONE_STEP
+  if (rest == 0) {
+    return;
+  }
+  // can not continue move step if src and dst are inplace
+  for (i = n - rest; i < n; ++i) {
+    y[i] = x[i] + a;
+  }
+#else
+  vec_add_bias<float, platform::jit::isa_any>(n, a, x, y);
+#endif
+}
+
+template <>
+inline void vec_add_bias<float, platform::jit::avx2>(const int n, const float a,
+                                                     const float* x, float* y) {
+  vec_add_bias<float, platform::jit::avx>(n, a, x, y);
+}
+
+template <>
+inline void vec_add_bias<float, platform::jit::avx512_common>(const int n,
+                                                              const float a,
+                                                              const float* x,
+                                                              float* y) {
+  // TODO(TJ): enable me
+  vec_add_bias<float, platform::jit::avx2>(n, a, x, y);
 }

 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
@@ -45,18 +198,97 @@ inline void vec_sigmoid(const int n, const T* x, T* y) {
  const T min = SIGMOID_THRESHOLD_MIN;
  const T max = SIGMOID_THRESHOLD_MAX;
  for (int i = 0; i < n; ++i) {
-    T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
-    y[i] = 1.0 / (1.0 + std::exp(-tmp));
+    y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
+    y[i] = static_cast<T>(0) - y[i];
+  }
+  vec_exp<T>(n, y, y);
+  for (int i = 0; i < n; ++i) {
+    y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
+  }
+}
+
+template <>
+inline void vec_sigmoid<float, platform::jit::avx>(const int n, const float* x,
+                                                   float* y) {
+#ifdef __AVX__
+  constexpr int block = AVX_FLOAT_BLOCK;
+  if (n < block) {
+    vec_sigmoid<float, platform::jit::isa_any>(n, x, y);
+    return;
  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
+  __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
+  __m256 zeros = _mm256_setzero_ps();
+  __m256 tmp;
+#define MOVE_ONE_STEP              \
+  tmp = _mm256_loadu_ps(x + i);    \
+  tmp = _mm256_max_ps(tmp, min);   \
+  tmp = _mm256_min_ps(tmp, max);   \
+  tmp = _mm256_sub_ps(zeros, tmp); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+#undef MOVE_ONE_STEP
+  if (rest != 0) {
+    // can not continue move step since the src and dst address could be equal
+    const float xmin = SIGMOID_THRESHOLD_MIN;
+    const float xmax = SIGMOID_THRESHOLD_MAX;
+    for (i = n - rest; i < n; ++i) {
+      y[i] = 0.f - ((x[i] < xmin) ? xmin : ((x[i] > xmax) ? xmax : x[i]));
+    }
+  }
+
+  vec_exp<float>(n, y, y);
+
+  __m256 ones = _mm256_set1_ps(1.0f);
+#define MOVE_ONE_STEP             \
+  tmp = _mm256_loadu_ps(y + i);   \
+  tmp = _mm256_add_ps(ones, tmp); \
+  tmp = _mm256_div_ps(ones, tmp); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+#undef MOVE_ONE_STEP
+  if (rest == 0) {
+    return;
+  }
+  // can not continue move step
+  for (i = n - rest; i < n; ++i) {
+    y[i] = 1.f / (1.f + y[i]);
+  }
+#else
+  vec_sigmoid<float, platform::jit::isa_any>(n, x, y);
+#endif
+}
+
+template <>
+inline void vec_sigmoid<float, platform::jit::avx2>(const int n, const float* x,
+                                                    float* y) {
+  vec_sigmoid<float, platform::jit::avx>(n, x, y);
+}
+
+template <>
+inline void vec_sigmoid<float, platform::jit::avx512_common>(const int n,
+                                                             const float* x,
+                                                             float* y) {
+  // TODO(TJ): enable me
+  vec_sigmoid<float, platform::jit::avx2>(n, x, y);
 }

 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 inline void vec_tanh(const int n, const T* x, T* y) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = tanh<T>(x[i]);
-  }
+  vec_scal<T, isa>(n, static_cast<T>(2), x, y);
+  vec_sigmoid<T, isa>(n, y, y);
+  vec_scal<T>(n, static_cast<T>(2), y);
+  vec_add_bias<T, isa>(n, static_cast<T>(-1), y, y);
 }

+// TODO(TJ): make relu clip
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 inline void vec_relu(const int n, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
@@ -64,24 +296,56 @@ inline void vec_relu(const int n, const T* x, T* y) {
  }
 }

+template <>
+inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
+                                                float* y) {
+#ifdef __AVX__
+  constexpr int block = AVX_FLOAT_BLOCK;
+  if (n < block * 4) {
+    vec_relu<float, platform::jit::isa_any>(n, x, y);
+    return;
+  }
+
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 zeros = _mm256_setzero_ps();
+  __m256 tmp;
+#define MOVE_ONE_STEP              \
+  tmp = _mm256_loadu_ps(x + i);    \
+  tmp = _mm256_max_ps(tmp, zeros); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+  if (rest == 0) {
+    return;
+  }
+  i = n - block;
+  MOVE_ONE_STEP;
+#undef MOVE_ONE_STEP
+
+#else
+  vec_relu<float, platform::jit::isa_any>(n, x, y);
+#endif
+}
+
 template <>
 inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x,
                                                 float* y) {
-  // TODO(TJ): complete me
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] > 0 ? x[i] : 0;
-  }
+  vec_relu<float, platform::jit::avx>(n, x, y);
 }

 template <>
-inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
-                                                float* y) {
-  // TODO(TJ): complete me
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] > 0 ? x[i] : 0;
-  }
+inline void vec_relu<float, platform::jit::avx512_common>(const int n,
+                                                          const float* x,
+                                                          float* y) {
+  // TODO(TJ): enable me
+  vec_relu<float, platform::jit::avx2>(n, x, y);
 }

+// TODO(TJ): optimize double of sigmoid, tanh and relu if necessary
+
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 class VecActivations {
 public:
@@ -96,7 +360,7 @@ class VecActivations {
    } else if (type == "identity" || type == "") {
      return vec_identity<T, isa>;
    }
-    PADDLE_THROW("Not support type %s.", type);
+    LOG(FATAL) << "Not support type: " << type;
  }
 };


--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ b/paddle/fluid/operators/math/cpu_vec_test.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <sys/time.h>
+#include <cmath>
+#include <cstring>
+#include <random>
+#include <vector>
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/operators/math/cpu_vec.h"
+
+inline double GetCurrentUS() {
+  struct timeval time;
+  gettimeofday(&time, NULL);
+  return 1e+6 * time.tv_sec + time.tv_usec;
+}
+constexpr int repeat = 1000;
+
+template <typename T>
+inline T _sigmoid(T x) {
+  const T min = SIGMOID_THRESHOLD_MIN;
+  const T max = SIGMOID_THRESHOLD_MAX;
+  T tmp = (x < min) ? min : ((x > max) ? max : x);
+  return static_cast<T>(1) / (static_cast<T>(1) + std::exp(-tmp));
+}
+
+template <typename T>
+inline T _tanh(T x) {
+  return static_cast<T>(2) * _sigmoid<T>(static_cast<T>(2) * x) -
+         static_cast<T>(1);
+}
+
+template <typename T>
+void ref_sigmoid(const int n, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = _sigmoid(x[i]);
+  }
+}
+
+template <typename T>
+void ref_tanh(const int n, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = _tanh(x[i]);
+  }
+}
+template <typename T>
+void ref_relu(const int n, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = x[i] > 0 ? x[i] : 0;
+  }
+}
+
+template <typename T>
+void RandomVec(const int n, T* a) {
+  static unsigned int seed = 100;
+  std::mt19937 rng(seed++);
+  std::uniform_real_distribution<double> uniform_dist(0, 1);
+  const T lower = static_cast<T>(-20.f);
+  const T upper = static_cast<T>(20.f);
+  for (int i = 0; i < n; ++i) {
+    a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
+  }
+}
+
+template <typename T>
+void TestAndBench(const int n, std::function<void(const int, const T*, T*)> tgt,
+                  std::function<void(const int, const T*, T*)> ref) {
+  std::vector<T> x(n);
+  std::vector<T> ytgt(n), yref(n);
+  RandomVec<T>(n, x.data());
+
+  const T* x_data = x.data();
+  T* ytgt_data = ytgt.data();
+  T* yref_data = yref.data();
+  auto st = GetCurrentUS();
+  for (int i = 0; i < repeat; ++i) {
+    tgt(n, x_data, ytgt_data);
+  }
+  auto mt = GetCurrentUS();
+  for (int i = 0; i < repeat; ++i) {
+    ref(n, x_data, yref_data);
+  }
+  auto et = GetCurrentUS();
+
+  VLOG(3) << "Vec size " << n << ": refer takes: " << (et - mt) / repeat
+          << " us, tgt takes: " << (mt - st) / repeat;
+  for (int i = 0; i < n; ++i) {
+    EXPECT_NEAR(ytgt_data[i], yref_data[i], 1e-3);
+  }
+}
+
+TEST(CpuVecTest, sigmoid) {
+  namespace jit = paddle::platform::jit;
+  using namespace paddle::operators::math;  // NOLINT
+  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
+    TestAndBench<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
+    TestAndBench<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>);
+    TestAndBench<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>);
+    TestAndBench<float>(sz, vec_sigmoid<float, jit::avx512_common>,
+                        ref_sigmoid<float>);
+  }
+  TestAndBench<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
+}
+
+TEST(CpuVecTest, tanh) {
+  namespace jit = paddle::platform::jit;
+  using namespace paddle::operators::math;  // NOLINT
+  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
+    TestAndBench<float>(sz, vec_tanh<float>, ref_tanh<float>);
+    TestAndBench<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>);
+    TestAndBench<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>);
+    TestAndBench<float>(sz, vec_tanh<float, jit::avx512_common>,
+                        ref_tanh<float>);
+  }
+  TestAndBench<double>(30, vec_tanh<double>, ref_tanh<double>);
+}
+
+TEST(CpuVecTest, relu) {
+  namespace jit = paddle::platform::jit;
+  using namespace paddle::operators::math;  // NOLINT
+  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
+    TestAndBench<float>(sz, vec_relu<float>, ref_relu<float>);
+    TestAndBench<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>);
+    TestAndBench<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>);
+    TestAndBench<float>(sz, vec_relu<float, jit::avx512_common>,
+                        ref_relu<float>);
+  }
+  TestAndBench<double>(30, vec_relu<double>, ref_relu<double>);
+}
+
+template <typename T>
+void TestInplace(const int n, std::function<void(const int, const T*, T*)> tgt,
+                 std::function<void(const int, const T*, T*)> ref) {
+  std::vector<T> x(n);
+  std::vector<T> ytgt(n), yref(n);
+  RandomVec<T>(n, x.data());
+
+  const T* x_data = x.data();
+  T* yref_data = yref.data();
+  T* ytgt_data = ytgt.data();
+  std::memcpy(yref_data, x_data, sizeof(T) * n);
+  std::memcpy(ytgt_data, x_data, sizeof(T) * n);
+
+  ref(n, yref_data, yref_data);
+  tgt(n, ytgt_data, ytgt_data);
+
+  for (int i = 0; i < n; ++i) {
+    EXPECT_NEAR(ytgt_data[i], yref_data[i], 1e-3);
+  }
+}
+
+TEST(CpuVecTest, inplace_sigmoid) {
+  namespace jit = paddle::platform::jit;
+  using namespace paddle::operators::math;  // NOLINT
+  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
+    TestInplace<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
+    TestInplace<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>);
+    TestInplace<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>);
+    TestInplace<float>(sz, vec_sigmoid<float, jit::avx512_common>,
+                       ref_sigmoid<float>);
+  }
+  TestInplace<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
+}
+
+TEST(CpuVecTest, inplace_tanh) {
+  namespace jit = paddle::platform::jit;
+  using namespace paddle::operators::math;  // NOLINT
+  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
+    TestInplace<float>(sz, vec_tanh<float>, ref_tanh<float>);
+    TestInplace<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>);
+    TestInplace<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>);
+    TestInplace<float>(sz, vec_tanh<float, jit::avx512_common>,
+                       ref_tanh<float>);
+  }
+  TestInplace<double>(30, vec_tanh<double>, ref_tanh<double>);
+}
+
+TEST(CpuVecTest, inplace_relu) {
+  namespace jit = paddle::platform::jit;
+  using namespace paddle::operators::math;  // NOLINT
+  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
+    TestInplace<float>(sz, vec_relu<float>, ref_relu<float>);
+    TestInplace<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>);
+    TestInplace<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>);
+    TestInplace<float>(sz, vec_relu<float, jit::avx512_common>,
+                       ref_relu<float>);
+  }
+  TestInplace<double>(30, vec_relu<double>, ref_relu<double>);
+}
--- a/paddle/fluid/operators/math/math_function.h
+++ b/paddle/fluid/operators/math/math_function.h
@@ -19,6 +19,10 @@ limitations under the License. */

 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
+// remove typedef in openblas
+#undef FLOAT
+#undef INT
+#undef SIZE
 #endif

 #include <cmath>

--- a/paddle/fluid/operators/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_expand_op.h
@@ -53,25 +53,27 @@ struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
      LoDTensor* out) {
    int out_offset = 0;
-    auto& eigen_place = *context.eigen_device();
+    int x_item_length = x.numel() / x.dims()[0];
+    auto out_data = out->data<T>();
+    auto x_data = x.data<T>();
    for (size_t i = 1; i < ref_lod.size(); ++i) {
      int repeat_num = ref_lod[i] - ref_lod[i - 1];
      int x_start = x_lod[i - 1];
      int x_end = x_lod[i];
      int x_seq_len = x_end - x_start;
      if (repeat_num > 0) {
-        auto x_sub_tensor = x.Slice(x_start, x_end);
-        x_sub_tensor.Resize({1, x_sub_tensor.numel()});
        int out_start = out_offset;
        if (out->lod().size() == 1) {
          out_start = out->lod()[0][out_offset];
        }
-        auto out_sub_tensor =
-            out->Slice(out_start, out_start + x_seq_len * repeat_num);
-        out_sub_tensor.Resize({repeat_num, x_sub_tensor.dims()[1]});
-        EigenMatrix<T>::From(out_sub_tensor).device(eigen_place) =
-            EigenMatrix<T>::From(x_sub_tensor)
-                .broadcast(Eigen::array<int, 2>({{repeat_num, 1}}));
+        for (int j = 0; j < repeat_num; j++) {
+          for (int k = 0; k < x_seq_len; k++) {
+            for (int l = 0; l < x_item_length; l++) {
+              out_data[(out_start + j * x_seq_len + k) * x_item_length + l] =
+                  x_data[(x_start + k) * x_item_length + l];
+            }
+          }
+        }
      }
      out_offset += repeat_num;
    }

--- a/paddle/fluid/operators/sequence_mask_op.cc
+++ b/paddle/fluid/operators/sequence_mask_op.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/sequence_mask_op.h"
+
+REGISTER_OPERATOR(sequence_mask, paddle::operators::SequenceMaskOp,
+                  paddle::operators::SequenceMaskOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    sequence_mask,
+    paddle::operators::SequenceMaskKernel<paddle::platform::CPUDeviceContext,
+                                          int>,
+    paddle::operators::SequenceMaskKernel<paddle::platform::CPUDeviceContext,
+                                          int64_t>);
--- a/paddle/fluid/operators/sequence_mask_op.cu
+++ b/paddle/fluid/operators/sequence_mask_op.cu
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/sequence_mask_op.h"
+
+REGISTER_OP_CUDA_KERNEL(
+    sequence_mask,
+    paddle::operators::SequenceMaskKernel<paddle::platform::CUDADeviceContext,
+                                          int>,
+    paddle::operators::SequenceMaskKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t>);
--- a/paddle/fluid/operators/sequence_mask_op.h
+++ b/paddle/fluid/operators/sequence_mask_op.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifdef __NVCC__
+#include <thrust/device_ptr.h>
+#include <thrust/functional.h>
+#include <thrust/reduce.h>
+#else
+#include <algorithm>
+#endif
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/for_range.h"
+
+namespace paddle {
+namespace operators {
+
+class SequenceMaskOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist");
+
+    auto maxlen = ctx->Attrs().Get<int>("maxlen");
+    if (maxlen > 0) {  // We can only infershape when maxlen > 0
+      auto dim = framework::vectorize2int(ctx->GetInputDim("X"));
+      dim.push_back(maxlen);
+      ctx->SetOutputDim("Y", framework::make_ddim(dim));
+    }
+  }
+};
+
+class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The input tensor of sequence_mask op.");
+    AddOutput("Y", "The output mask of sequence_mask op.");
+    AddAttr<int>("maxlen",
+                 "The maximum length of the sequence. If maxlen < 0, maxlen "
+                 "= max(Input(X)).")
+        .SetDefault(-1)
+        .AddCustomChecker([](int &v) {
+          PADDLE_ENFORCE(v < 0 || v >= 1,
+                         "Attr(maxlen) must be less than 0 or larger than 1");
+        });
+    AddAttr<int>("out_dtype", "Output data type");
+    AddComment(R"DOC(
+SequenceMask Operator
+
+This operator outputs a Mask according to Input(X) and Attr(maxlen).
+Supposing Input(X) is a Tensor with shape [d_1, d_2, ..., d_n], the
+Output(Y) is a mask with shape [d_1, d_2, ..., d_n, maxlen], where:
+
+Y(i_1, i_2, ..., i_n, j) = (j < X(i_1, i_2, ..., i_n)) 
+
+If maxlen < 0, maxlen = max(X)
+    )DOC");
+  }
+};
+
+template <typename Tx, typename Ty>
+struct SequenceMaskForRangeFunctor {
+  HOSTDEVICE SequenceMaskForRangeFunctor(const Tx *x, Ty *y, int maxlen)
+      : x_(x), y_(y), maxlen_(maxlen) {}
+
+  HOSTDEVICE void operator()(int y_idx) const {
+    int x_idx = y_idx / maxlen_;
+    int j = y_idx % maxlen_;
+    y_[y_idx] = static_cast<Ty>(j < x_[x_idx] ? 1 : 0);
+  }
+
+ private:
+  const Tx *x_;
+  Ty *y_;
+  int maxlen_;
+};
+
+template <typename DeviceContext, typename Tx>
+struct SequenceMaskFunctor {
+  using Tensor = framework::LoDTensor;
+
+  SequenceMaskFunctor(const DeviceContext &ctx, const Tx *x, Tensor *y,
+                      int limits, int maxlen)
+      : ctx_(ctx), x_(x), y_(y), limits_(limits), maxlen_(maxlen) {}
+
+  template <typename Ty>
+  void operator()() const {
+    auto *y_data = y_->mutable_data<Ty>(ctx_.GetPlace());
+    platform::ForRange<DeviceContext> for_range(ctx_, limits_);
+    for_range(SequenceMaskForRangeFunctor<Tx, Ty>(x_, y_data, maxlen_));
+  }
+
+ private:
+  const DeviceContext &ctx_;
+  const Tx *x_;
+  Tensor *y_;
+  int limits_;
+  int maxlen_;
+};
+
+template <typename DeviceContext, typename Tx>
+class SequenceMaskKernel : public framework::OpKernel<Tx> {
+  using Tensor = framework::LoDTensor;
+
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *x = ctx.Input<Tensor>("X");
+    auto *y = ctx.Output<Tensor>("Y");
+    auto maxlen = ctx.Attr<int>("maxlen");
+
+    auto *x_data = x->data<Tx>();
+    auto x_numel = x->numel();
+    if (maxlen < 0) {
+#ifdef __NVCC__
+      VLOG(10)
+          << "SequenceMaskOp on GPU may be slow when maxlen is not provided.";
+      maxlen = static_cast<int>(
+          thrust::reduce(thrust::device_pointer_cast(x_data),
+                         thrust::device_pointer_cast(x_data) + x_numel,
+                         static_cast<Tx>(0), thrust::maximum<Tx>()));
+#else
+      maxlen = static_cast<int>(*std::max_element(x_data, x_data + x_numel));
+#endif
+      auto y_dim = framework::vectorize2int(x->dims());
+      y_dim.push_back(maxlen);
+      y->Resize(framework::make_ddim(y_dim));
+    }
+
+    auto out_dtype = static_cast<framework::proto::VarType::Type>(
+        ctx.Attr<int>("out_dtype"));
+    auto &dev_ctx = ctx.template device_context<DeviceContext>();
+    framework::VisitDataType(out_dtype,
+                             SequenceMaskFunctor<DeviceContext, Tx>(
+                                 dev_ctx, x_data, y, x_numel * maxlen, maxlen));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/stack_op.h
+++ b/paddle/fluid/operators/stack_op.h
@@ -150,30 +150,17 @@ class StackKernel : public framework::OpKernel<T> {
    int total_num = pre * n * post;

    auto &dev_ctx = ctx.template device_context<DeviceContext>();
-    constexpr auto kMaxThreshold = 16;
-    if (std::is_same<DeviceContext, platform::CPUDeviceContext>::value ||
-        n > kMaxThreshold) {
 #ifdef __NVCC__
-      VLOG(10) << "Stack more than " << kMaxThreshold
-               << " tensors on GPU may be slow.";
-      thrust::device_vector<const T *> device_x_vec(x_datas);
-      auto x_data_arr = device_x_vec.data().get();
+    thrust::device_vector<const T *> device_x_vec(x_datas);
+    auto x_data_arr = device_x_vec.data().get();
 #else
-      auto x_data_arr = x_datas.data();
+    auto x_data_arr = x_datas.data();
 #endif
-      StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post);
+    StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post);
 #ifdef __NVCC__
-      // Wait() must be called because device_x_vec may be destructed before
-      // kernel ends
-      dev_ctx.Wait();
-#endif
-    }
-#ifdef __NVCC__
-    else {  // NOLINT
-      framework::Array<const T *, kMaxThreshold> x_data_arr;
-      for (int i = 0; i < n; ++i) x_data_arr[i] = x_datas[i];
-      StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post);
-    }
+    // Wait() must be called because device_x_vec may be destructed before
+    // kernel ends
+    dev_ctx.Wait();
 #endif
  }
 };
@@ -244,32 +231,17 @@ class StackGradKernel : public framework::OpKernel<T> {
    int post = total_num / (n * pre);

    auto &dev_ctx = ctx.template device_context<DeviceContext>();
-    constexpr auto kMaxThreshold = 16;
-    if (std::is_same<DeviceContext, platform::CPUDeviceContext>::value ||
-        n > kMaxThreshold) {
 #ifdef __NVCC__
-      VLOG(10) << "Stack more than " << kMaxThreshold
-               << " tensors on GPU may be slow.";
-      thrust::device_vector<T *> device_dx_vec(dx_datas);
-      auto dx_data_arr = device_dx_vec.data().get();
+    thrust::device_vector<T *> device_dx_vec(dx_datas);
+    auto dx_data_arr = device_dx_vec.data().get();
 #else
-      auto dx_data_arr = dx_datas.data();
+    auto dx_data_arr = dx_datas.data();
 #endif
-      StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n,
-                               post);
+    StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post);
 #ifdef __NVCC__
-      // Wait() must be called because device_dx_vec may be destructed before
-      // kernel ends
-      dev_ctx.Wait();
-#endif
-    }
-#ifdef __NVCC__
-    else {  // NOLINT
-      framework::Array<T *, kMaxThreshold> dx_data_arr;
-      for (int i = 0; i < n; ++i) dx_data_arr[i] = dx_datas[i];
-      StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n,
-                               post);
-    }
+    // Wait() must be called because device_dx_vec may be destructed before
+    // kernel ends
+    dev_ctx.Wait();
 #endif
  }
 };

--- a/paddle/fluid/operators/top_k_op.cc
+++ b/paddle/fluid/operators/top_k_op.cc
@@ -30,8 +30,6 @@ class TopkOp : public framework::OperatorWithKernel {
                   "Output(Indices) of TopkOp should not be null.");

    auto input_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(input_dims.size(), 2,
-                      "Rank of TopK op's input must be 2.");
    const int k = static_cast<int>(ctx->Attrs().Get<int>("k"));

    PADDLE_ENFORCE_GE(k, 1, "k must >= 1");

--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -37,7 +37,7 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> {
    } else {
      PADDLE_THROW(
          "uniform_random_op's output only"
-          "supports SelectedRows and Tensor");
+          "supports SelectedRows and LoDTensor");
    }
    T* data = tensor->mutable_data<T>(ctx.GetPlace());
    unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));

--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -54,7 +54,7 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
    } else {
      PADDLE_THROW(
          "uniform_random_op's output only"
-          "supports SelectedRows and Tensor");
+          "supports SelectedRows and LoDTensor");
    }
    T* data = tensor->mutable_data<T>(context.GetPlace());
    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));

--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -50,7 +50,7 @@ ENDIF()
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
 cc_library(device_context SRCS device_context.cc init.cc DEPS malloc
-    place eigen3 stringpiece cpu_helper framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
+    place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)

 cc_test(init_test SRCS init_test.cc DEPS device_context)

--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -51,7 +51,7 @@ typedef enum {
 } cpu_isa_t;  // Instruction set architecture

 // May I use some instruction
-inline bool MayIUse(const cpu_isa_t cpu_isa);
+bool MayIUse(const cpu_isa_t cpu_isa);

 }  // namespace jit


--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -56,7 +56,11 @@ limitations under the License. */
 #include <immintrin.h>
 #endif  // PADDLE_ARM

+#if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
+#else
+#define PADDLE_ALIGN(x) /*do nothing*/
+#endif

 namespace paddle {
 namespace platform {

--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -18,6 +18,7 @@ limitations under the License. */

 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/cpu_helper.h"
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
@@ -120,6 +121,22 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
 #ifndef PADDLE_WITH_MKLDNN
  platform::SetNumThreads(FLAGS_paddle_num_threads);
 #endif
+
+  if (platform::jit::MayIUse(platform::jit::avx512_common)) {
+#ifndef __AVX512F__
+    LOG(WARNING) << "AVX512F is available, Please re-compile on local machine";
+#endif
+  }
+  if (platform::jit::MayIUse(platform::jit::avx2)) {
+#ifndef __AVX2__
+    LOG(WARNING) << "AVX2 is available, Please re-compile on local machine";
+#endif
+  }
+  if (platform::jit::MayIUse(platform::jit::avx)) {
+#ifndef __AVX__
+    LOG(WARNING) << "AVX is available, Please re-compile on local machine";
+#endif
+  }
 }

 void InitGLOG(const std::string &prog_name) {

--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
-set(PYBIND_DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method
-          )
+
+set(PYBIND_DEPS pybind python proto_desc memory executor prune  feed_fetch_method)
+set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc)
 if(NOT WIN32)
-list(APPEND PYBIND_DEPS parallel_executor)
+list(APPEND PYBIND_DEPS parallel_executor profiler)
+list(APPEND PYBIND_SRCS recordio.cc)
 endif()
 if(WITH_PYTHON)
  if(WITH_AMD_GPU)
    hip_library(paddle_pybind SHARED
-      SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
+      SRCS ${PYBIND_SRCS}
      DEPS ${PYBIND_DEPS}
      ${GLOB_OP_LIB})
  else()
    cc_library(paddle_pybind SHARED
-      SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
+      SRCS ${PYBIND_SRCS}
      DEPS ${PYBIND_DEPS}
      ${GLOB_OP_LIB})
    if(NOT APPLE AND NOT ANDROID AND NOT WIN32)

--- a/paddle/fluid/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
@@ -43,9 +43,6 @@ void BindConstValue(pybind11::module* m) {
  op_proto_and_checker_maker.def(
      "kOpRoleVarAttrName",
      framework::OpProtoAndCheckerMaker::OpRoleVarAttrName);
-  op_proto_and_checker_maker.def(
-      "kOpCreationCallstackAttrName",
-      framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName);
 }

 }  // namespace pybind

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -328,6 +328,11 @@ function assert_api_not_changed() {
    source .env/bin/activate
    pip install ${PADDLE_ROOT}/build/python/dist/*whl
    python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec
+    if [ "$1" == "cp35-cp35m" ]; then
+        # Use sed to make python2 and python3 sepc keeps the same
+        sed -i 's/arg0: str/arg0: unicode/g' new.spec
+        sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec
+    fi
    python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec
    deactivate

@@ -621,7 +626,7 @@ function main() {
        gen_capi_package
        gen_fluid_inference_lib
        test_fluid_inference_lib
-        assert_api_not_changed
+        assert_api_not_changed ${PYTHON_ABI:-""}
        ;;
      *)
        print_usage

--- a/python/paddle/dataset/common.py
+++ b/python/paddle/dataset/common.py
@@ -19,6 +19,7 @@ import hashlib
 import os
 import errno
 import shutil
+import six
 import sys
 import importlib
 import paddle.dataset
@@ -94,6 +95,8 @@ def download(url, module_name, md5sum, save_name=None):
                dl = 0
                total_length = int(total_length)
                for data in r.iter_content(chunk_size=4096):
+                    if six.PY2:
+                        data = six.b(data)
                    dl += len(data)
                    f.write(data)
                    done = int(50 * dl / total_length)

--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -35,20 +35,22 @@ import itertools
 import functools
 from .common import download
 import tarfile
+import six
 import scipy.io as scio
 from paddle.dataset.image import *
 from paddle.reader import *
 import os
 import numpy as np
 from multiprocessing import cpu_count
+import six
 from six.moves import cPickle as pickle
 from six.moves import zip
 __all__ = ['train', 'test', 'valid']

-DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
-LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
-SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
-DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118'
+DATA_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/102flowers.tgz'
+LABEL_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/imagelabels.mat'
+SETID_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/setid.mat'
+DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
 LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
 SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
 # In official 'readme', tstid is the flag of test data
@@ -120,7 +122,10 @@ def reader_creator(data_file,
                file = file.strip()
                batch = None
                with open(file, 'rb') as f:
-                    batch = pickle.load(f)
+                    if six.PY2:
+                        batch = pickle.load(f)
+                    else:
+                        batch = pickle.load(f, encoding='bytes')
                data = batch['data']
                labels = batch['label']
                for sample, label in zip(data, batch['label']):

--- a/python/paddle/dataset/image.py
+++ b/python/paddle/dataset/image.py
@@ -36,11 +36,6 @@ import numpy as np
 try:
    import cv2
 except ImportError:
-    import sys
-    sys.stderr.write(
-        '''Warning with paddle image module: opencv-python should be imported,
-    or paddle image module could NOT work; please install opencv-python first.'''
-    )
    cv2 = None
 import os
 import tarfile
@@ -53,6 +48,18 @@ __all__ = [
 ]


+def _check_cv2():
+    if cv2 is None:
+        import sys
+        sys.stderr.write(
+            '''Warning with paddle image module: opencv-python should be imported,
+         or paddle image module could NOT work; please install opencv-python first.'''
+        )
+        return False
+    else:
+        return True
+
+
 def batch_images_from_tar(data_file,
                          dataset_name,
                          img2label,
@@ -134,7 +141,7 @@ def load_image_bytes(bytes, is_color=True):
                     load and return a gray image.
    :type is_color: bool
    """
-    assert cv2 is not None
+    assert _check_cv2() is True

    flag = 1 if is_color else 0
    file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
@@ -159,7 +166,7 @@ def load_image(file, is_color=True):
                     load and return a gray image.
    :type is_color: bool
    """
-    assert cv2 is not None
+    assert _check_cv2() is True

    # cv2.IMAGE_COLOR for OpenCV3
    # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version
@@ -188,7 +195,7 @@ def resize_short(im, size):
    :param size: the shorter edge size of image after resizing.
    :type size: int
    """
-    assert cv2 is not None
+    assert _check_cv2() is True

    h, w = im.shape[:2]
    h_new, w_new = size, size

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -18,7 +18,6 @@ import collections
 import contextlib
 import re
 import six
-import traceback

 import numpy as np

@@ -506,10 +505,6 @@ class Operator(object):
        if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0:
            del op_attrs[role_var_name]

-        callstack_var_name = op_maker.kOpCreationCallstackAttrName()
-        op_attrs[callstack_var_name] = list(
-            reversed(traceback.format_stack()))[1:]
-
        if len(self.desc.type()) != 0:
            return
        if type is None:

--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
@@ -119,10 +119,14 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
    helper = LayerHelper("auc", **locals())
    auc_out = helper.create_tmp_variable(dtype="float64")
    # make tp, tn, fp, fn persistable, so that can accumulate all batches.
-    tp = helper.create_global_variable(persistable=True, dtype='int64')
-    tn = helper.create_global_variable(persistable=True, dtype='int64')
-    fp = helper.create_global_variable(persistable=True, dtype='int64')
-    fn = helper.create_global_variable(persistable=True, dtype='int64')
+    tp = helper.create_global_variable(
+        persistable=True, dtype='int64', shape=[num_thresholds])
+    tn = helper.create_global_variable(
+        persistable=True, dtype='int64', shape=[num_thresholds])
+    fp = helper.create_global_variable(
+        persistable=True, dtype='int64', shape=[num_thresholds])
+    fn = helper.create_global_variable(
+        persistable=True, dtype='int64', shape=[num_thresholds])
    for var in [tp, tn, fp, fn]:
        helper.set_variable_initializer(
            var, Constant(

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -27,7 +27,6 @@ from . import utils
 import random
 from .. import unique_name
 from functools import reduce
-import warnings

 __all__ = [
    'fc',
@@ -104,6 +103,7 @@ __all__ = [
    'rank_loss',
    'prelu',
    'flatten',
+    'sequence_mask',
    'stack',
    'sequence_enumerate',
 ]
@@ -2049,7 +2049,7 @@ def batch_norm(input,
        param_attr(ParamAttr): The parameter attribute for Parameter `scale`.
        bias_attr(ParamAttr): The parameter attribute for Parameter `bias`.
        data_layout(string, default NCHW): NCHW|NHWC
-        in_place(bool, Default False): This argument is deprecated since 0.15.0.
+        in_place(bool, Default False): Make the input and output of batch norm reuse memory.
        use_mkldnn(bool, Default false): ${use_mkldnn_comment}
        name(string, Default None): A name for this layer(optional). If set None, the layer
            will be named automatically.
@@ -2071,10 +2071,6 @@ def batch_norm(input,
    helper = LayerHelper('batch_norm', **locals())
    dtype = helper.input_dtype()

-    if in_place:
-        raise warnings.warn("The argument in_place is deprecated since 0.15.0, "
-                            "please do not set it True.")
-
    input_shape = input.shape
    if data_layout == 'NCHW':
        channel_num = input_shape[1]
@@ -2124,7 +2120,7 @@ def batch_norm(input,
    saved_mean = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
    saved_variance = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)

-    batch_norm_out = helper.create_tmp_variable(dtype)
+    batch_norm_out = input if in_place else helper.create_tmp_variable(dtype)

    helper.append_op(
        type="batch_norm",
@@ -5571,8 +5567,76 @@ def sequence_enumerate(input, win_size, pad_value, name=None):
        attrs={'win_size': win_size,
               'pad_value': pad_value})

+    
+def sequence_mask(x, maxlen=None, dtype='int64', name=None):
+    """
+    **SequenceMask Layer**
+
+    This layer outputs a mask according to the input :code:`x` and
+    :code:`maxlen` with data type of :code:`dtype`.
+
+    Supposing :code:`x` is a Tensor with shape [d_1, d_2, ..., d_n], the
+    :code:`y` is a mask with shape [d_1, d_2, ..., d_n, maxlen], where:
+    
+    .. math::
+     
+        y(i_1, i_2,..., i_n, j) = (j < x(i_1, i_2,..., i_n))
+
+    Args:
+        x (Variable): Input tensor of sequence_mask layer, 
+                      whose elements are integers less than :code:`maxlen`.
+        maxlen (int|None): Maximum length of the sequence. If :code:`maxlen`
+                           is None, it would be replace with :math:`max(x)`.
+        dtype (np.dtype|core.VarDesc.VarType|str): Data type of the output.
+        name (str|None): A name for this layer(optional). If set None, the 
+                         layer will be named automatically.  
+    
+    Returns:
+        Variable: The output sequence mask.
+    
+    """
+
+    helper = LayerHelper('sequence_mask', **locals())
+    if name is None:
+        out = helper.create_tmp_variable(dtype=dtype)
+    else:
+        out = helper.create_tmp_variable(dtype=dtype, name=name)
+
+    helper.append_op(
+        type='sequence_mask',
+        inputs={'X': [x]},
+        outputs={'Y': out},
+        attrs={
+            'max_len': maxlen if maxlen is not None else -1,
+            'out_dtype': out.dtype
+        })
+    return out
+

 def stack(x, axis=0):
+    """
+    **Stack Layer**
+
+    This layer stacks all of the input :code:`x` along axis.
+   
+    Input :code:`x` can be a single variable, a :code:`list` of variables, 
+    or a :code:`tuple` of variables. If :code:`x` is a :code:`list` or 
+    :code:`tuple`, the shapes of all these variables must be the same.  
+    Supposing the shape of each input is :math:`[d_0, d_1, ..., d_{n-1}]`, 
+    the shape of the output variable would be 
+    :math:`[d_0, d_1, ..., d_{axis}=len(x), ..., d_{n-1}]`. 
+    If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x[0])+1`.
+    If :code:`axis` is None, it would be replaced with 0. 
+
+    Args:
+        x (Variable|list(Variable)|tuple(Variable)): Input variables. 
+        axis (int|None): The axis along which all inputs are stacked.
+    
+    Returns:
+        Variable: The stacked variable.
+    
+    """
+
    helper = LayerHelper('stack', **locals())
    axis = 0 if axis is None else axis


--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
@@ -229,7 +229,7 @@ def img_conv_group(input,
            use_mkldnn=use_mkldnn)

        if conv_with_batchnorm[i]:
-            tmp = layers.batch_norm(input=tmp, act=conv_act)
+            tmp = layers.batch_norm(input=tmp, act=conv_act, in_place=True)
            drop_rate = conv_batchnorm_drop_rate[i]
            if abs(drop_rate) > 1e-5:
                tmp = layers.dropout(x=tmp, dropout_prob=drop_rate)

--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -256,10 +256,7 @@ def main(net_type, use_cuda, is_local=True):
    save_dirname = "image_classification_" + net_type + ".inference.model"

    train(net_type, use_cuda, save_dirname, is_local)
-
-    # There is bug in fluid.InferenceTranspiler for VGG.
-    if net_type == "resnet":
-        infer(use_cuda, save_dirname)
+    infer(use_cuda, save_dirname)


 class TestImageClassification(unittest.TestCase):

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -64,6 +64,7 @@ if(WITH_DISTRIBUTE)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
+set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150)
 py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
 py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)

--- a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py
@@ -37,7 +37,7 @@ def attention_lstm(
    T = sum(lod[0])
    N = len(lod[0])
    M = x.shape[1]
-    D = b.shape[1] / 4
+    D = b.shape[1] // 4
    assert T == x.shape[0]
    assert len(fcws) == len(fcbs)
    hidden = []

--- a/python/paddle/fluid/tests/unittests/test_desc_clone.py
+++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py
@@ -120,8 +120,8 @@ def operator_equal(a, b):
                raise ValueError("In operator_equal not equal:{0}\n".format(k))

        elif isinstance(v, collections.OrderedDict):
-            v0 = sorted(six.iteritems(v), key=lambda x: x[0])
-            v1 = sorted(six.iteritems(b.__dict__[k]), key=lambda x: x[0])
+            v0 = sorted(list(six.iteritems(v)), key=lambda x: x[0])
+            v1 = sorted(list(six.iteritems(b.__dict__[k])), key=lambda x: x[0])

            if v0 != v1:
                raise ValueError("In operator_equal not equal:{0}\n".format(k))
@@ -139,17 +139,15 @@ def block_equal(a, b):
            continue

        elif k == "ops":
+            assert (len(a.ops) == len(b.ops))
            for i in range(0, len(a.ops)):
                if not operator_equal(a.ops[i], b.ops[i]):
                    raise ValueError("In block_equal not equal:{0}\n".format(k))
-            assert (len(a.ops) == len(b.ops))

        elif isinstance(v, collections.OrderedDict):
-            v0 = sorted(six.iteritems(v), key=lambda x: x[0])
-            v1 = sorted(six.iteritems(b.__dict__[k]), key=lambda x: x[0])
-
-            if v0 != v1:
-                raise ValueError("In block_equal not equal:{0}\n".format(k))
+            for key, value in six.iteritems(v):
+                if str(value) != str(b.__dict__[k][key]):
+                    raise ValueError("In block_equal not equal:{0}\n".format(k))

        elif (v != b.__dict__[k]):
            raise ValueError("In block_equal not equal:{0}\n".format(k))

--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -21,6 +21,7 @@ import paddle.fluid as fluid
 from paddle.fluid.transpiler.distribute_transpiler import delete_ops
 import traceback
 import collections
+import six


 class TranspilerTest(unittest.TestCase):
@@ -644,18 +645,18 @@ class TestLoadSliceVar(TranspilerTest):
        self.assertTrue(pserver._slice_vars_and_attrs)
        self.assertTrue(pserver2._slice_vars_and_attrs)

-        for idx in xrange(len(pserver._slice_vars_and_attrs)):
+        for idx in six.moves.xrange(len(pserver._slice_vars_and_attrs)):
            self.assertEqual(pserver._slice_vars_and_attrs[idx][0],
                             pserver2._slice_vars_and_attrs[idx][0])

-            total_numel = reduce(lambda x, y: x * y,
-                                 pserver._slice_vars_and_attrs[idx][0].shape)
+            total_numel = six.moves.reduce(
+                lambda x, y: x * y, pserver._slice_vars_and_attrs[idx][0].shape)
            self.assertEqual(
                total_numel,
-                reduce(lambda x, y: x * y,
-                       pserver._slice_vars_and_attrs[idx][2].shape) + reduce(
-                           lambda x, y: x * y,
-                           pserver2._slice_vars_and_attrs[idx][2].shape))
+                six.moves.reduce(lambda x, y: x * y,
+                                 pserver._slice_vars_and_attrs[idx][2].shape) +
+                six.moves.reduce(lambda x, y: x * y,
+                                 pserver2._slice_vars_and_attrs[idx][2].shape))


 if __name__ == "__main__":

--- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
@@ -18,6 +18,9 @@ import unittest
 import numpy as np
 from op_test import OpTest

+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+

 class TestFillConstantOp1(OpTest):
    def setUp(self):
@@ -47,5 +50,31 @@ class TestFillConstantOp2(OpTest):
        self.check_output()


+class TestFillConstantOpWithSelectedRows(OpTest):
+    def check_with_place(self, place):
+        scope = core.Scope()
+        # create Out Variable
+        out = scope.var('Out').get_selected_rows()
+
+        # create and run fill_constant_op operator
+        fill_constant_op = Operator(
+            "fill_constant", shape=[123, 92], value=3.8, Out='Out')
+        fill_constant_op.run(scope, place)
+
+        # get result from Out
+        result_array = np.array(out.get_tensor())
+        full_array = np.full((123, 92), 3.8, 'float32')
+
+        self.assertTrue(np.array_equal(result_array, full_array))
+
+    def test_fill_constant_with_selected_rows(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+
+        for place in places:
+            self.check_with_place(place)
+
+
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_operator_desc.py
+++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py
@@ -67,10 +67,7 @@ class TestOperator(unittest.TestCase):
        self.assertEqual(mul_op.output("Out"), ["mul.out"])
        self.assertEqual(
            set(mul_op.attr_names),
-            set([
-                "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var",
-                "op_callstack"
-            ]))
+            set(["x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var"]))
        self.assertEqual(mul_op.has_attr("x_num_col_dims"), True)
        self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT)
        self.assertEqual(mul_op.attr("x_num_col_dims"), 1)

--- a/python/paddle/fluid/tests/unittests/test_prelu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py
@@ -51,30 +51,28 @@ class PReluTest(OpTest):
    def test_check_output(self):
        self.check_output()

-    def test_check_grad(self):
-        self.check_grad(['X', 'Alpha'], 'Out')
-
-    def test_check_grad_ignore_x(self):
+    def test_check_grad_1_ignore_x(self):
        self.check_grad(['Alpha'], 'Out', no_grad_set=set('X'))

-    def test_check_grad_ignore_alpha(self):
-        self.check_grad(['X'], 'Out', no_grad_set=set('Alpha'))
-
-
-class TestCase1(PReluTest):
-    def initTestCase(self):
-        self.attrs = {'mode': "all"}
+    def test_check_grad_2(self):
+        self.check_grad(['X', 'Alpha'], 'Out')

+    def test_check_grad_3_ignore_alpha(self):
+        self.check_grad(['X'], 'Out', no_grad_set=set('Alpha'))

-class TestCase2(PReluTest):
-    def initTestCase(self):
-        self.attrs = {'mode': "channel"}

+# TODO(minqiyang): Resume these test cases after fixing Python3 CI job issues
+#  class TestCase1(PReluTest):
+#  def initTestCase(self):
+#  self.attrs = {'mode': "all"}

-class TestCase3(PReluTest):
-    def initTestCase(self):
-        self.attrs = {'mode': "element"}
+#  class TestCase2(PReluTest):
+#  def initTestCase(self):
+#  self.attrs = {'mode': "channel"}

+#  class TestCase3(PReluTest):
+#  def initTestCase(self):
+#  self.attrs = {'mode': "element"}

 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_sequence_mask.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_mask.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from op_test import OpTest
+import paddle.fluid as fluid
+from paddle.fluid.framework import convert_np_dtype_to_dtype_
+import paddle.fluid.core as core
+import numpy as np
+import copy
+import unittest
+
+
+class SequenceMaskTestBase(OpTest):
+    def initDefaultParameters(self):
+        self.op_type = 'sequence_mask'
+        self.maxlen = 10
+        self.mask_dtype = 'int64'
+        self.x = [[0, 3, 4], [5, 7, 9]]
+
+    def initParameters(self):
+        pass
+
+    def setUp(self):
+        self.initDefaultParameters()
+        self.initParameters()
+        if not isinstance(self.x, np.ndarray):
+            self.x = np.array(self.x)
+
+        self.inputs = {'X': self.x}
+        self.outputs = {'Y': self.calc_ground_truth_mask()}
+        self.attrs = {
+            'maxlen': self.maxlen,
+            'out_dtype': convert_np_dtype_to_dtype_(self.mask_dtype)
+        }
+
+    def calc_ground_truth_mask(self):
+        maxlen = np.max(self.x) if self.maxlen < 0 else self.maxlen
+        shape = self.x.shape + (maxlen, )
+        index_broadcast = np.broadcast_to(
+            np.reshape(
+                range(maxlen), newshape=[1] * self.x.ndim + [-1]),
+            shape=shape)
+        x_broadcast = np.broadcast_to(
+            np.reshape(
+                self.x, newshape=self.x.shape + (-1, )), shape=shape)
+        return (index_broadcast < x_broadcast).astype(self.mask_dtype)
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class SequenceMaskTest1(SequenceMaskTestBase):
+    def initParameters(self):
+        self.mask_dtype = 'bool'
+
+
+class SequenceMaskTest2(SequenceMaskTestBase):
+    def initParameters(self):
+        self.mask_dtype = 'uint8'
+
+
+class SequenceMaskTest3(SequenceMaskTestBase):
+    def initParameters(self):
+        self.mask_dtype = 'int32'
+
+
+class SequenceMaskTest4(SequenceMaskTestBase):
+    def initParameters(self):
+        self.mask_dtype = 'float32'
+
+
+class SequenceMaskTest5(SequenceMaskTestBase):
+    def initParameters(self):
+        self.mask_dtype = 'float64'
+
+
+class SequenceMaskTest6(SequenceMaskTestBase):
+    def initParameters(self):
+        self.maxlen = -1
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
@@ -159,7 +159,7 @@ def program_to_code(prog):
            get_indent_space(indent), '{', block_idx))
        indent += 1
        # sort all vars
-        all_vars = sorted(block.vars.iteritems(), key=lambda x: x[0])
+        all_vars = sorted(six.iteritems(block.vars), key=lambda x: x[0])
        for var in all_vars:
            print("{}{}".format(
                get_indent_space(indent), variable_to_code(var[1])))

--- a/tools/check_ctest_hung.py
+++ b/tools/check_ctest_hung.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from __future__ import print_function
+
 import sys
 import re

@@ -46,7 +48,7 @@ Diff:  set(['test_parallel_executor_crf'])
                start_parts = escape(l).split(" ")
                m = re.search("Start\s+[0-9]+\:\s([a-z0-9_]+)", escape(l))
                started.add(m.group(1))
-    print "Diff: ", started - passed
+    print("Diff: ", started - passed)


 if __name__ == "__main__":

--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -17,6 +17,8 @@ Print all signature of a python module in alphabet order.
 Usage:
    ./print_signature  "paddle.fluid" > signature.txt
 """
+from __future__ import print_function
+
 import importlib
 import inspect
 import collections
@@ -64,4 +66,4 @@ def visit_all_module(mod):
 visit_all_module(importlib.import_module(sys.argv[1]))

 for name in member_dict:
-    print name, member_dict[name]
+    print(name, member_dict[name])
--- a/tools/timeline.py
+++ b/tools/timeline.py
@@ -14,6 +14,7 @@

 import argparse
 import json
+import six
 import sys
 import unittest

@@ -124,7 +125,7 @@ class Timeline(object):
        return cur_pid

    def _allocate_pids(self):
-        for k, profile_pb in self._profile_dict.iteritems():
+        for k, profile_pb in six.iteritems(self._profile_dict):
            for event in profile_pb.events:
                if event.type == profiler_pb2.Event.CPU:
                    if (k, event.device_id, "CPU") not in self._devices:
@@ -140,7 +141,7 @@ class Timeline(object):
                                                    (k, event.device_id), pid)

    def _allocate_events(self):
-        for k, profile_pb in self._profile_dict.iteritems():
+        for k, profile_pb in six.iteritems(self._profile_dict):
            for event in profile_pb.events:
                if event.type == profiler_pb2.Event.CPU:
                    type = "CPU"