Commit a7ebfe06 authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_default_att

Too many changes to display.

To preserve performance only 1000 of 1000+ files are displayed.
......@@ -56,4 +56,4 @@ For example: no sample code; The sample code is not helpful; The sample code not
For example:Chinese API in this doc is inconsistent with English API, including params, description, sample code, formula, etc.
#### Other
For example: The doc link is broken; The doc page is missing; Dead link in docs.
\ No newline at end of file
For example: The doc link is broken; The doc page is missing; Dead link in docs.
......@@ -13,6 +13,7 @@
# limitations under the License
cmake_minimum_required(VERSION 3.10)
cmake_policy(VERSION 3.10)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
......@@ -21,9 +22,6 @@ include(system)
project(paddle CXX C)
include(init)
include(generic) # simplify cmake module
# enable language CUDA
# TODO(Shibo Tao): remove find_package(CUDA) completely.
find_package(CUDA QUIET)
......@@ -32,16 +30,23 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF)
# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON
# to develop some acl related functionality on x86
option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
# Note(zhouwei): It use option above, so put here
include(init)
include(generic) # simplify cmake module
if (WITH_GPU AND WITH_XPU)
message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
endif()
if (WITH_GPU AND WITH_ASCEND)
if (WITH_GPU AND WITH_ASCEND)
message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time")
endif()
# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15))
message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. "
"You can use cmake 3.16 (recommended), 3.10, 3.11, 3.15 or 3.17. Please refer to the install document: https://cmake.org/install/")
if (WITH_GPU AND WITH_ROCM)
message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time")
endif()
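
The same mutual-exclusion check is repeated for the XPU, ASCEND and ROCM backends above. As a sketch only (the helper name below is invented, not part of this change), the pattern could be expressed once:

```cmake
# Hypothetical helper that captures the repeated backend-exclusion checks.
function(check_exclusive_backends opt_a opt_b)
  if(${opt_a} AND ${opt_b})
    message(FATAL_ERROR "Error when compile ${opt_a} and ${opt_b} at the same time")
  endif()
endfunction()

check_exclusive_backends(WITH_GPU WITH_XPU)
check_exclusive_backends(WITH_GPU WITH_ASCEND)
check_exclusive_backends(WITH_GPU WITH_ROCM)
```
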
if(WITH_GPU AND NOT APPLE)
......@@ -61,6 +66,10 @@ if(WITH_MUSL)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy")
endif()
if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
endif()
if(WIN32)
option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
......@@ -72,6 +81,13 @@ if(WIN32)
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj")
if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /Zc:inline")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zc:inline")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Zc:inline")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zc:inline")
endif()
if (MSVC_STATIC_CRT)
message(STATUS "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /MTd")
......@@ -89,8 +105,8 @@ if(WIN32)
endforeach(flag_var)
endif()
# NOTE(Avin0323): Less parallel count result in faster compilation.
math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3")
# windows build turn off warnings, use parallel compiling.
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
......@@ -98,7 +114,10 @@ if(WIN32)
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}")
# NOTE(zhouwei25): GPU compilation has too high memory utilization when compiling in parallel
if(NOT WITH_GPU)
set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}")
endif()
endforeach(flag_var)
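
The /MP value appended above is two thirds of the detected core count, so a 12-core builder ends up with /MP8. A minimal standalone sketch of the same computation (here CPU_CORES is queried directly; the real build detects it elsewhere):

```cmake
# Illustrative only: reproduce the parallel-compile cap used above.
cmake_host_system_information(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES)
math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3")   # e.g. 12 cores -> 8
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP${PROCESS_MAX}")
message(STATUS "MSVC parallel compile jobs: ${PROCESS_MAX}")
```
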
foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
set(${flag_var} "${${flag_var}} /w")
......@@ -116,6 +135,13 @@ if(WIN32)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838")
foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS)
set(${flag_var} "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221")
if(MSVC_STATIC_CRT)
set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB")
endif()
endforeach(flag_var)
if (WITH_WIN_DUMP_DBG)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Zi")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi")
......@@ -153,8 +179,6 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocol" OFF)
option(ON_INFER "Turn on inference optimization and inference-lib generation" OFF)
################################ Internal Configurations #######################################
option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF)
option(WITH_RCCL "Compile PaddlePaddle with RCCL support" OFF)
option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF)
option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF)
option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
......@@ -165,14 +189,15 @@ option(WITH_PSLIB "Compile with pslib support" OFF)
option(WITH_BOX_PS "Compile with box_ps support" OFF)
option(WITH_XBYAK "Compile with xbyak support" ON)
option(WITH_CONTRIB "Compile the third-party contribution" OFF)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE})
option(WITH_HETERPS "Compile with heterps" OFF)
option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF)
option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE})
option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF)
option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF)
option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON)
option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON)
option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF)
option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON)
option(WITH_ARM "Compile PaddlePaddle with arm support" OFF)
......@@ -180,6 +205,7 @@ option(WITH_SW "Compile PaddlePaddle with sw support" OFF)
option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF)
option(WITH_MUSL "Compile with musl libc instead of glibc" OFF)
option(WITH_UNITY_BUILD "Compile with UnityBuild mode" OFF)
option(WITH_STRIP "Strip so files of Whl packages" OFF)
# PY_VERSION
if(NOT PY_VERSION)
......@@ -240,9 +266,6 @@ endif()
if(WITH_BRPC_RDMA)
message(STATUS "Use brpc with rdma.")
if(WITH_GRPC)
message(FATAL_ERROR "Can't use grpc with brpc rdma.")
endif()
if(NOT WITH_DISTRIBUTE)
message(FATAL_ERROR "Can't use brpc rdma in no distribute env.")
endif()
......@@ -290,9 +313,9 @@ endif(WITH_ROCM)
if (NOT WITH_ROCM AND WITH_RCCL)
MESSAGE(WARNING
"Disable RCCL when compiling without GPU. Force WITH_RCCL=OFF.")
set(WITH_NCCL OFF CACHE STRING
"Disable RCCL when compiling without GPU" FORCE)
"Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.")
set(WITH_RCCL OFF CACHE STRING
"Disable RCCL when compiling without ROCM" FORCE)
endif()
if(WITH_RCCL)
......@@ -330,6 +353,11 @@ if (WITH_MIPS)
add_definitions(-DPADDLE_WITH_MIPS)
endif()
if (WITH_HETERPS)
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new")
endif()
endif()
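
-faligned-new is only accepted by newer compilers, which is why it is gated on the compiler version above. A hedged alternative is to probe the flag directly (the check variable name below is illustrative):

```cmake
# Sketch: feature-test the flag instead of relying on the compiler version alone.
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-faligned-new" COMPILER_SUPPORTS_FALIGNED_NEW)
if(WITH_HETERPS AND COMPILER_SUPPORTS_FALIGNED_NEW)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new")
endif()
```
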
set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
......@@ -347,6 +375,13 @@ else()
message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
endif()
if(WITH_STRIP)
find_program(STRIP_PATH strip)
if(NOT STRIP_PATH OR NOT LINUX)
set(WITH_STRIP OFF CACHE STRING "Command strip is only used on Linux when it exists." FORCE)
endif()
endif()
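
WITH_STRIP only stays ON when a strip binary is found on Linux. How the option is consumed is outside this hunk; a typical post-build use would look roughly like the sketch below (the target name is hypothetical):

```cmake
# Hypothetical consumer of WITH_STRIP: strip symbols from a freshly built shared library.
if(WITH_STRIP)
  add_custom_command(TARGET example_paddle_lib POST_BUILD
    COMMAND ${STRIP_PATH} $<TARGET_FILE:example_paddle_lib>
    COMMENT "Stripping symbols from example_paddle_lib")
endif()
```
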
add_subdirectory(paddle)
if(WITH_PYTHON)
add_subdirectory(python)
......

<p align="center">
<p align="center">
<img align="center" src="doc/imgs/logo.png", width=1600>
<p>
......
......@@ -82,6 +82,10 @@ if(WITH_ASCEND)
add_definitions(-DPADDLE_WITH_ASCEND)
endif()
if(WITH_ASCEND_CL)
add_definitions(-DPADDLE_WITH_ASCEND_CL)
endif()
if(WITH_XPU)
message(STATUS "Compile with XPU!")
add_definitions(-DPADDLE_WITH_XPU)
......@@ -93,13 +97,18 @@ if(WITH_GPU)
FIND_PACKAGE(CUDA REQUIRED)
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 7)
message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 10.1)
message(FATAL_ERROR "Paddle needs CUDA >= 10.1 to compile")
endif()
if(NOT CUDNN_FOUND)
message(FATAL_ERROR "Paddle needs cudnn to compile")
endif()
if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
message(FATAL_ERROR "Paddle needs CUDNN >= 7.0 to compile")
endif()
if(CUPTI_FOUND)
include_directories(${CUPTI_INCLUDE_DIR})
add_definitions(-DPADDLE_WITH_CUPTI)
......@@ -164,10 +173,9 @@ if(WITH_PSCORE)
add_definitions(-DPADDLE_WITH_PSCORE)
endif()
if(WITH_GRPC)
add_definitions(-DPADDLE_WITH_GRPC)
endif(WITH_GRPC)
if(WITH_HETERPS)
add_definitions(-DPADDLE_WITH_HETERPS)
endif()
if(WITH_BRPC_RDMA)
add_definitions(-DPADDLE_WITH_BRPC_RDMA)
......
......@@ -6,15 +6,9 @@ endif()
if (WITH_NV_JETSON)
add_definitions(-DWITH_NV_JETSON)
set(paddle_known_gpu_archs "53 62 72")
set(paddle_known_gpu_archs7 "53")
set(paddle_known_gpu_archs8 "53 62")
set(paddle_known_gpu_archs9 "53 62")
set(paddle_known_gpu_archs10 "53 62 72")
else()
set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs7 "30 35 50 52")
set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80")
set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75")
set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
endif()
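
Each entry in paddle_known_gpu_archs eventually becomes an nvcc -gencode pair; the real mapping happens in select_nvcc_arch_flags further down, but the idea is roughly this sketch:

```cmake
# Rough sketch of how an arch list such as "52 60 61 70 75 80" is expanded for nvcc.
set(NVCC_FLAGS_EXTRA "")
foreach(arch ${paddle_known_gpu_archs})
  string(APPEND NVCC_FLAGS_EXTRA " -gencode arch=compute_${arch},code=sm_${arch}")
endforeach()
# -> " -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 ..."
```
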
......@@ -74,7 +68,7 @@ endfunction()
# select_nvcc_arch_flags(out_variable)
function(select_nvcc_arch_flags out_variable)
# List of arch names
set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual")
set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "Ampere" "All" "Manual")
set(archs_name_default "Auto")
list(APPEND archs_names "Auto")
......@@ -91,7 +85,7 @@ function(select_nvcc_arch_flags out_variable)
if(${CUDA_ARCH_NAME} STREQUAL "Manual")
set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
set(CUDA_ARCH_PTX "" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
else()
unset(CUDA_ARCH_BIN CACHE)
......@@ -108,6 +102,8 @@ function(select_nvcc_arch_flags out_variable)
set(cuda_arch_bin "70")
elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
set(cuda_arch_bin "75")
elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
set(cuda_arch_bin "80")
elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(cuda_arch_bin ${paddle_known_gpu_archs})
elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
......@@ -158,31 +154,21 @@ function(select_nvcc_arch_flags out_variable)
endfunction()
message(STATUS "CUDA detected: " ${CMAKE_CUDA_COMPILER_VERSION})
if (${CMAKE_CUDA_COMPILER_VERSION} LESS 7.0)
set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 8.0) # CUDA 7.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 9.0) # CUDA 8.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs10})
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
# CUDA 8 may complain that sm_20 is no longer supported. Suppress the
# warning for now.
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) # CUDA 9.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs9})
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs10})
elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.2) # CUDA 11.0/11.1
set(paddle_known_gpu_archs ${paddle_known_gpu_archs11})
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs11})
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.2+
set(paddle_known_gpu_archs "${paddle_known_gpu_archs11} 86")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
endif()
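
As a worked example of the version chain above: nvcc 11.2 fails the LESS 11.0 and LESS 11.2 tests but passes LESS 12.0, so the CUDA 11 arch list gains sm_86:

```cmake
# Worked example (illustrative) of what CUDA 11.2 selects with the logic above.
# paddle_known_gpu_archs11 is "52 60 61 70 75 80", so the result is
#   paddle_known_gpu_archs = "52 60 61 70 75 80 86"
message(STATUS "CUDA ${CMAKE_CUDA_COMPILER_VERSION} -> archs: ${paddle_known_gpu_archs}")
```
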
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
......@@ -198,14 +184,11 @@ select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA}")
message(STATUS "NVCC_FLAGS_EXTRA: ${NVCC_FLAGS_EXTRA}")
# Set C++11 support
# Set C++14 support
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
if (NOT WIN32) # windows msvc2015 support c++11 natively.
# -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake.
set(CMAKE_CUDA_STANDARD 11)
endif(NOT WIN32)
set(CMAKE_CUDA_STANDARD 14)
# (Note) For Windows, if /W[1-4] is deleted, /W1 will be added by default and conflict with -w
# So replace /W[1-4] with /W0
......
......@@ -94,7 +94,7 @@ macro(find_cudnn_version cudnn_header_file)
"${CUDNN_MAJOR_VERSION} * 1000 +
${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
message(STATUS "Current cuDNN header is ${cudnn_header_file} "
"Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ")
"Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCHLEVEL_VERSION}. ")
endif()
endif()
endmacro()
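
The change above adds the patch level to the status message; the packed integer computed just before it encodes major*1000 + minor*100 + patch, so cuDNN 7.6.5 becomes 7605. A standalone sketch:

```cmake
# Worked example of the cuDNN version encoding (the version values are illustrative).
set(CUDNN_MAJOR_VERSION 7)
set(CUDNN_MINOR_VERSION 6)
set(CUDNN_PATCHLEVEL_VERSION 5)
math(EXPR CUDNN_VERSION
     "${CUDNN_MAJOR_VERSION} * 1000 + ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
message(STATUS "Current cuDNN version is v7.6.5 (encoded as ${CUDNN_VERSION})")  # 7605
```
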
......
......@@ -12,50 +12,78 @@
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)
SET(ASCEND_PROJECT "extern_ascend")
IF((NOT DEFINED ASCEND_VER) OR (NOT DEFINED ASCEND_URL))
MESSAGE(STATUS "use pre defined download url")
SET(ASCEND_VER "0.1.1" CACHE STRING "" FORCE)
SET(ASCEND_NAME "ascend" CACHE STRING "" FORCE)
SET(ASCEND_URL "http://paddle-ascend.bj.bcebos.com/ascend.tar.gz" CACHE STRING "" FORCE)
ENDIF()
MESSAGE(STATUS "ASCEND_NAME: ${ASCEND_NAME}, ASCEND_URL: ${ASCEND_URL}")
SET(ASCEND_SOURCE_DIR "${THIRD_PARTY_PATH}/ascend")
SET(ASCEND_DOWNLOAD_DIR "${ASCEND_SOURCE_DIR}/src/${ASCEND_PROJECT}")
SET(ASCEND_DST_DIR "ascend")
SET(ASCEND_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
SET(ASCEND_INSTALL_DIR ${ASCEND_INSTALL_ROOT}/${ASCEND_DST_DIR})
SET(ASCEND_ROOT ${ASCEND_INSTALL_DIR})
SET(ASCEND_INC_DIR ${ASCEND_ROOT}/include)
SET(ASCEND_LIB_DIR ${ASCEND_ROOT}/lib)
SET(ASCEND_LIB ${ASCEND_LIB_DIR}/libge_runner.so)
SET(ASCEND_GRAPH_LIB ${ASCEND_LIB_DIR}/libgraph.so)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${ASCEND_ROOT}/lib")
INCLUDE_DIRECTORIES(${ASCEND_INC_DIR})
FILE(WRITE ${ASCEND_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(ASCEND)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY ${ASCEND_NAME}/include ${ASCEND_NAME}/lib \n"
" DESTINATION ${ASCEND_DST_DIR})\n")
ExternalProject_Add(
${ASCEND_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${ASCEND_SOURCE_DIR}
DOWNLOAD_DIR ${ASCEND_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${ASCEND_URL} -c -q -O ${ASCEND_NAME}.tar.gz
&& tar zxvf ${ASCEND_NAME}.tar.gz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ASCEND_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ASCEND_INSTALL_ROOT}
)
ADD_LIBRARY(ascend SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend PROPERTY IMPORTED_LOCATION ${ASCEND_LIB})
ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${ASCEND_GRAPH_LIB})
ADD_DEPENDENCIES(ascend ascend_graph ${ASCEND_PROJECT})
#NOTE: Logic is from
# https://github.com/mindspore-ai/graphengine/blob/master/CMakeLists.txt
if(DEFINED ENV{ASCEND_CUSTOM_PATH})
set(ASCEND_DIR $ENV{ASCEND_CUSTOM_PATH})
else()
set(ASCEND_DIR /usr/local/Ascend)
endif()
if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h)
# It means CANN 20.2 +
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif()
if(WITH_ASCEND OR WITH_ASCEND_CL)
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})
ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
endif()
if(WITH_ASCEND_CL)
set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so)
set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}")
message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
INCLUDE_DIRECTORIES(${FWKACLLIB_INC_DIR})
INCLUDE_DIRECTORIES(${ACLLIB_INC_DIR})
ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib})
ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib})
add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler)
endif()
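
The block above only defines IMPORTED libraries plus a phony extern_ascend_cl target; a consumer elsewhere in the tree would link them roughly as in this sketch (the executable name is made up):

```cmake
# Hypothetical consumer of the imported Ascend CL libraries defined above.
add_executable(my_npu_tool npu_tool.cc)
add_dependencies(my_npu_tool extern_ascend_cl)
target_link_libraries(my_npu_tool ascendcl ascend_hccl acl_op_compiler)
```
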
......@@ -39,9 +39,9 @@ set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/
ExternalProject_Add(
extern_brpc
${EXTERNAL_PROJECT_LOG_ARGS}
# TODO(gongwb): change to de newst repo when they changed.
# TODO(gongwb): change to de newst repo when they changed
GIT_REPOSITORY "https://github.com/wangjiawei04/brpc"
GIT_TAG "6d79e0b17f25107c35b705ea58d888083f59ff47"
GIT_TAG "e203afb794caf027da0f1e0776443e7d20c0c28e"
PREFIX ${BRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......
......@@ -14,11 +14,11 @@
include(ExternalProject)
# update eigen to the commit id 4da2c6b1 on 03/19/2020
# update eigen to the commit id f612df27 on 03/16/2021
set(EIGEN_PREFIX_DIR ${THIRD_PARTY_PATH}/eigen3)
set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3)
set(EIGEN_REPOSITORY https://gitlab.com/libeigen/eigen.git)
set(EIGEN_TAG 4da2c6b1974827b1999bab652a3d4703e1992d26)
set(EIGEN_TAG f612df273689a19d25b45ca4f8269463207c4fee)
cache_third_party(extern_eigen3
REPOSITORY ${EIGEN_REPOSITORY}
......@@ -27,47 +27,15 @@ cache_third_party(extern_eigen3
if(WIN32)
add_definitions(-DEIGEN_STRONG_INLINE=inline)
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Half.h native_src)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/arch/CUDA/Half.h native_dst)
# For Windows
# which will cause a compilation error in Tensor:74:
# "can not open file 'unistd.h'"
# so use following patch to solve compilation error On Windows.
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Tensor native_src2)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/Tensor native_dst2)
# For VS2015
# which will cause a compilation error in TensorBlock.h:1028:
# "syntax error"
# so use following patch to solve compilation error On Windows.
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorBlock.h native_src3)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h native_dst3)
set(EIGEN_PATCH_COMMAND copy ${native_src} ${native_dst} /Y && copy ${native_src2} ${native_dst2} /Y && copy ${native_src3} ${native_dst3} /Y)
elseif(LINUX)
# For gxx=4.8, __GXX_ABI_VERSION is less than 1004
# which will cause a compilation error in Geometry_SSE.h:38:
# "no matching function for call to 'pmul(Eigen::internal::Packet4f&, __m128)"
# refer to: https://gitlab.com/libeigen/eigen/-/blob/4da2c6b1974827b1999bab652a3d4703e1992d26/Eigen/src/Core/arch/SSE/PacketMath.h#L33-60
# add -fabi-version=4 could avoid above error, but will cause "double free corruption" when compile with gcc8
# so use following patch to solve compilation error with different version of gcc.
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Geometry_SSE.h native_src1)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Geometry/arch/Geometry_SSE.h native_dst1)
# The compiler fully support const expressions since c++14,
# but Eigen use some const expressions such as std::max and std::min, which are not supported in c++11
# add patch to avoid compilation error in c++11
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/MathFunctions.h native_src2)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/MathFunctions.h native_dst2)
if(WITH_ROCM)
# For HIPCC Eigen::internal::device::numeric_limits is not EIGEN_DEVICE_FUNC
# which will cause compiler error of using __host__ funciont in __host__ __device__
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src3)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst3)
# For HIPCC Eigen::internal::scalar_sum_op<bool,bool> is not EIGEN_DEVICE_FUNC
# which will cause compiler error of using __host__ funciont in __host__ __device__
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/BinaryFunctors.h native_src4)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/functors/BinaryFunctors.h native_dst4)
set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2} && cp ${native_src3} ${native_dst3} && cp ${native_src4} ${native_dst4})
else()
set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2})
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst)
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorReductionGpu.h native_src1)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h native_dst1)
set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst} && cp ${native_src1} ${native_dst1})
endif()
endif()
......@@ -82,7 +50,7 @@ ExternalProject_Add(
PREFIX ${EIGEN_PREFIX_DIR}
SOURCE_DIR ${EIGEN_SOURCE_DIR}
UPDATE_COMMAND ""
PATCH_COMMAND ${EIGEN_PATCH_COMMAND}
PATCH_COMMAND ${EIGEN_PATCH_COMMAND}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
......
......@@ -32,21 +32,39 @@ cache_third_party(extern_gloo
TAG ${GLOO_TAG}
DIR GLOO_SOURCE_DIR)
ExternalProject_Add(
extern_gloo
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
"${GLOO_DOWNLOAD_CMD}"
PREFIX "${GLOO_PREFIX_DIR}"
SOURCE_DIR "${GLOO_SOURCE_DIR}"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build
&& cd ${GLOO_SOURCE_DIR}/build && cmake .. && make
&& mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo"
)
if(WITH_ASCEND OR WITH_ASCEND_CL)
ExternalProject_Add(
extern_gloo
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
"${GLOO_DOWNLOAD_CMD}"
PREFIX "${GLOO_PREFIX_DIR}"
SOURCE_DIR "${GLOO_SOURCE_DIR}"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build
&& cd ${GLOO_SOURCE_DIR}/build && cmake .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && make
&& mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo"
)
else()
ExternalProject_Add(
extern_gloo
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
"${GLOO_DOWNLOAD_CMD}"
PREFIX "${GLOO_PREFIX_DIR}"
SOURCE_DIR "${GLOO_SOURCE_DIR}"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build
&& cd ${GLOO_SOURCE_DIR}/build && cmake .. && make
&& mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo"
)
endif()
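
The two ExternalProject_Add branches differ only in the -DCMAKE_CXX_FLAGS argument handed to gloo's cmake step. A sketch of how the duplication could be folded into one call (the variable name is illustrative):

```cmake
# Sketch: keep a single ExternalProject_Add and vary only the extra cmake argument.
set(GLOO_EXTRA_CMAKE_ARGS "")
if(WITH_ASCEND OR WITH_ASCEND_CL)
  set(GLOO_EXTRA_CMAKE_ARGS "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}")
endif()
# BUILD_COMMAND ... && cmake .. ${GLOO_EXTRA_CMAKE_ARGS} && make ...
```
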
ADD_LIBRARY(gloo STATIC IMPORTED GLOBAL)
......
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
include (ExternalProject)
SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
include(ProcessorCount)
ProcessorCount(NUM_OF_PROCESSOR)
IF(APPLE)
SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
SET(GRPC_INSTALL_CMD make prefix=${GRPC_INSTALL_DIR} install)
ELSE()
SET(GRPC_CFLAGS "-Wno-error -std=c11 ${CLFAGS}")
SET(GRPC_CXXFLAGS "-Wno-error -std=c++11 ${CXXFLAGS}")
SET(BUILD_CMD make CFLAGS=${GRPC_CFLAGS} CXXFLAGS=${GRPC_CXXFLAGS} HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin)
SET(GRPC_INSTALL_CMD make prefix=${GRPC_INSTALL_DIR} install CFLAGS=${GRPC_CFLAGS} CXXFLAGS=${GRPC_CXXFLAGS})
ENDIF()
# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them
ExternalProject_Add(
extern_grpc
DEPENDS protobuf zlib
# NOTE(wuyi):
# this package is generated by following steps:
# 1. git clone -b v1.8.x https://github.com/grpc/grpc.git
# 2. git submodule update --init
# 3. keep only zlib, cares, protobuf, boringssl under "third_party",
# checkout and clean other dirs under third_party
# 4. remove .git, and package the directory.
URL http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x_paddle.tar.gz
URL_MD5 f5442d137ddccee252e194b1bc90f98c
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
# NOTE(yuyang18):
# Disable -Werror, otherwise the compile will fail in MacOS.
# It seems that we cannot configure that by make command.
# Just dry run make command and remove `-Werror`, then use a shell to run make commands
BUILD_COMMAND ${BUILD_CMD}
INSTALL_COMMAND ${GRPC_INSTALL_CMD}
)
ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")
ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgrpc++.a")
ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgpr.a")
ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a")
include_directories(${GRPC_INCLUDE_DIR})
ADD_DEPENDENCIES(grpc++_unsecure extern_grpc)
......@@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn)
SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git)
SET(MKLDNN_TAG 72efa005effb49595933e033cc732f215ef0445a)
SET(MKLDNN_TAG f58682cd8bd0615f41d879f8afc8f1511ab42d24)
# Introduce variables:
# * CMAKE_INSTALL_LIBDIR
......
......@@ -198,8 +198,16 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
"-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}")
ENDIF()
if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
SET(PROTOBUF_TAG v3.8.0)
else()
SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546)
endif()
cache_third_party(${TARGET_NAME}
REPOSITORY ${PROTOBUF_REPOSITORY}
......@@ -234,7 +242,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
)
ENDFUNCTION()
SET(PROTOBUF_VERSION 3.1.0)
if(WITH_ASCEND OR WITH_ASCEND_CL)
SET(PROTOBUF_VERSION 3.8.0)
else()
SET(PROTOBUF_VERSION 3.1.0)
endif()
IF(NOT PROTOBUF_FOUND)
build_protobuf(extern_protobuf FALSE)
......
......@@ -16,7 +16,11 @@ INCLUDE(ExternalProject)
SET(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool)
SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool)
SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git)
if(WITH_ASCEND OR WITH_ASCEND_CL)
SET(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git)
else()
SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git)
endif()
SET(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040)
cache_third_party(extern_threadpool
......
......@@ -14,11 +14,17 @@
INCLUDE(ExternalProject)
IF(WITH_ROCM)
add_definitions(-DWARPCTC_WITH_HIP)
ENDIF()
SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
# in case of low internet speed
#set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git)
set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
set(WARPCTC_TAG 95a461eddeabd51099ef059dcfada1117eb1bfb8)
set(WARPCTC_TAG c690fc5755abbdbdc98ef78d51ec10a6748a8cd1)
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
CACHE PATH "Warp-ctc Directory" FORCE)
......@@ -37,38 +43,92 @@ cache_third_party(extern_warpctc
TAG ${WARPCTC_TAG}
DIR WARPCTC_SOURCE_DIR)
ExternalProject_Add(
extern_warpctc
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
"${WARPCTC_DOWNLOAD_CMD}"
PREFIX ${WARPCTC_PREFIX_DIR}
SOURCE_DIR ${WARPCTC_SOURCE_DIR}
#UPDATE_COMMAND ""
PATCH_COMMAND ""
BUILD_ALWAYS 1
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-DWITH_GPU=${WITH_GPU}
-DWITH_OMP=${USE_OMP}
-DWITH_TORCH=OFF
-DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-DBUILD_SHARED=ON
-DBUILD_TESTS=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
)
if(WITH_ASCEND OR WITH_ASCEND_CL)
ExternalProject_Add(
extern_warpctc
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
"${WARPCTC_DOWNLOAD_CMD}"
PREFIX ${WARPCTC_PREFIX_DIR}
SOURCE_DIR ${WARPCTC_SOURCE_DIR}
#UPDATE_COMMAND ""
PATCH_COMMAND ""
BUILD_ALWAYS 1
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
"-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-DWITH_GPU=${WITH_GPU}
-DWITH_ROCM=${WITH_ROCM}
-DWITH_OMP=${USE_OMP}
-DWITH_TORCH=OFF
-DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-DBUILD_SHARED=ON
-DBUILD_TESTS=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
)
else()
if(WIN32)
set(WARPCTC_C_FLAGS $<FILTER:${CMAKE_C_FLAGS},EXCLUDE,/Zc:inline>)
set(WARPCTC_C_FLAGS_DEBUG $<FILTER:${CMAKE_C_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
set(WARPCTC_C_FLAGS_RELEASE $<FILTER:${CMAKE_C_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
set(WARPCTC_CXX_FLAGS $<FILTER:${CMAKE_CXX_FLAGS},EXCLUDE,/Zc:inline>)
set(WARPCTC_CXX_FLAGS_RELEASE $<FILTER:${CMAKE_CXX_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
set(WARPCTC_CXX_FLAGS_DEBUG $<FILTER:${CMAKE_CXX_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
else()
set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS})
set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS})
set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
endif()
ExternalProject_Add(
extern_warpctc
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
"${WARPCTC_DOWNLOAD_CMD}"
PREFIX ${WARPCTC_PREFIX_DIR}
SOURCE_DIR ${WARPCTC_SOURCE_DIR}
#UPDATE_COMMAND ""
PATCH_COMMAND ""
BUILD_ALWAYS 1
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-DWITH_GPU=${WITH_GPU}
-DWITH_ROCM=${WITH_ROCM}
-DWITH_OMP=${USE_OMP}
-DWITH_TORCH=OFF
-DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-DBUILD_SHARED=ON
-DBUILD_TESTS=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
)
endif()
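
The Windows branch strips /Zc:inline from the host flags with a $<FILTER:...,EXCLUDE,...> generator expression before handing them to warp-ctc's own cmake run. A minimal self-contained illustration of that expression (requires CMake 3.15+; the flag list is made up):

```cmake
# $<FILTER:list,EXCLUDE,regex> drops every list element matching the regex.
set(demo_flags "/W3;/Zc:inline;/EHsc")
file(GENERATE OUTPUT "${CMAKE_BINARY_DIR}/filtered_flags.txt"
     CONTENT "$<FILTER:${demo_flags},EXCLUDE,/Zc:inline>")
# filtered_flags.txt ends up containing: /W3;/EHsc
```
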
IF(WIN32)
SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
CACHE FILEPATH "Warp-ctc Library" FORCE)
......
......@@ -13,7 +13,7 @@ if(NOT XPU_SDK_ROOT)
elseif(WITH_SUNWAY)
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
else()
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_02_27.tar.gz" CACHE STRING "" FORCE)
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_04_09.tar.gz" CACHE STRING "" FORCE)
endif()
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
......
......@@ -4,10 +4,10 @@ include(CheckCCompilerFlag)
include(CheckCXXSymbolExists)
include(CheckTypeSize)
function(CheckCompilerCXX11Flag)
function(CheckCompilerCXX14Flag)
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.4)
message(FATAL_ERROR "Unsupported GCC version. GCC >= 5.4 required.")
elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.2)
message(WARNING "Found GCC ${CMAKE_CXX_COMPILER_VERSION} which is too high, recommended to use GCC 8.2")
endif()
......@@ -20,23 +20,15 @@ function(CheckCompilerCXX11Flag)
message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.")
endif()
else()
if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3)
message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.4)
message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.4 required.")
endif()
endif()
endif()
endfunction()
CheckCompilerCXX11Flag()
if (WITH_GPU)
if (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
endif()
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
endif()
CheckCompilerCXX14Flag()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
# safe_set_flag
#
# Set a compile flag only if compiler is support
......
......@@ -447,9 +447,20 @@ function(cc_test TARGET_NAME)
cc_test_build(${TARGET_NAME}
SRCS ${cc_test_SRCS}
DEPS ${cc_test_DEPS})
cc_test_run(${TARGET_NAME}
COMMAND ${TARGET_NAME}
ARGS ${cc_test_ARGS})
# we don't test hcom ops here, because they need a complex configuration
# with more than one machine
if(NOT ("${TARGET_NAME}" STREQUAL "c_broadcast_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_allreduce_sum_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_allreduce_max_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_reducescatter_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_allgather_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "send_v2_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_reduce_sum_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "recv_v2_op_npu_test"))
cc_test_run(${TARGET_NAME}
COMMAND ${TARGET_NAME}
ARGS ${cc_test_ARGS})
endif()
endif()
endfunction(cc_test)
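
The new guard lists eight NPU collective tests in one long boolean expression; an equivalent sketch using IN_LIST keeps the same names but is easier to extend:

```cmake
# Equivalent form of the guard above, written as a list membership test (sketch only).
set(multi_machine_npu_tests
    c_broadcast_op_npu_test c_allreduce_sum_op_npu_test c_allreduce_max_op_npu_test
    c_reducescatter_op_npu_test c_allgather_op_npu_test send_v2_op_npu_test
    c_reduce_sum_op_npu_test recv_v2_op_npu_test)
if(NOT "${TARGET_NAME}" IN_LIST multi_machine_npu_tests)
  cc_test_run(${TARGET_NAME} COMMAND ${TARGET_NAME} ARGS ${cc_test_ARGS})
endif()
```
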
......@@ -492,10 +503,8 @@ function(nv_library TARGET_NAME)
message(FATAL "Please specify source file or library in nv_library.")
endif()
endif(nv_library_SRCS)
if (WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
if(${MSVC_VERSION} LESS_EQUAL 1900)
set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
endif()
if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910))
set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
endif()
endif()
endfunction(nv_library)
......@@ -512,7 +521,7 @@ function(nv_binary TARGET_NAME)
add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
common_link(${TARGET_NAME})
endif()
if (WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910))
set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
endif()
endif()
......@@ -539,7 +548,7 @@ function(nv_test TARGET_NAME)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
if (WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910))
set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
endif()
endif()
......@@ -809,7 +818,7 @@ function(py_test TARGET_NAME)
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
if (WIN32)
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
endif()
......
......@@ -192,6 +192,15 @@ include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/extension/include/*
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex64.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex128.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/float16.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
# CAPI inference library for only inference
set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING
......@@ -202,11 +211,11 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
if(WIN32)
set(paddle_inference_c_lib $<TARGET_FILE_DIR:paddle_inference_c>/paddle_inference_c.*)
else(WIN32)
set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_inference_c.*)
set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi_exp/libpaddle_inference_c.*)
endif(WIN32)
copy(inference_lib_dist
SRCS ${src_dir}/inference/capi/paddle_c_api.h ${paddle_inference_c_lib}
SRCS ${src_dir}/inference/capi_exp/pd_*.h ${paddle_inference_c_lib}
DSTS ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib)
# fluid library for both train and inference
......
......@@ -18,6 +18,10 @@ if(NOT WIN32)
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG")
else()
# The CUDA compile flags can be specified manually here;
# this is used to remove /Zi to reduce the GPU static library size. But it is risky,
# because NVIDIA may update CUDA and the hard-coded flags may then break the build.
# Now, it's only used in VS2015 + CUDA:[10.0, 10.2]
set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props)
endif()
......
......@@ -11,6 +11,7 @@ function(op_library TARGET)
set(cu_cc_srcs)
set(hip_cc_srcs)
set(xpu_cc_srcs)
set(npu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(miopen_cu_cc_srcs)
set(cudnn_cu_srcs)
......@@ -20,6 +21,9 @@ function(op_library TARGET)
set(mkldnn_cc_srcs)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function layer common_infer_shape_functions)
if (WITH_ASCEND_CL)
set(op_common_deps ${op_common_deps} npu_op_runner)
endif()
# Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build.
set(options UNITY)
set(oneValueArgs "")
......@@ -40,6 +44,9 @@ function(op_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu)
endif()
if (WITH_NV_JETSON)
list(REMOVE_ITEM cu_srcs "decode_jpeg_op.cu")
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
......@@ -85,6 +92,12 @@ function(op_library TARGET)
list(APPEND xpu_cc_srcs ${XPU_FILE}.cc)
endif()
endif()
if(WITH_ASCEND_CL)
string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc)
list(APPEND npu_cc_srcs ${NPU_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu$")
......@@ -107,6 +120,8 @@ function(op_library TARGET)
list(APPEND cu_cc_srcs ${src})
elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
list(APPEND xpu_cc_srcs ${src})
elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
list(APPEND npu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
......@@ -168,15 +183,15 @@ function(op_library TARGET)
list(REMOVE_ITEM miopen_cu_cc_srcs "affine_grid_cudnn_op.cu.cc")
list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc")
list(REMOVE_ITEM hip_srcs "cholesky_op.cu")
list(REMOVE_ITEM hip_srcs "correlation_op.cu")
list(REMOVE_ITEM hip_srcs "multinomial_op.cu")
list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu")
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
# Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
if(WITH_UNITY_BUILD AND op_library_UNITY)
# Combine the cc source files.
compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs})
compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs})
if(TARGET ${UNITY_TARGET})
# If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`.
target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources})
......@@ -187,7 +202,7 @@ function(op_library TARGET)
# Add alias library to handle dependencies.
add_library(${TARGET} ALIAS ${UNITY_TARGET})
else()
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS}
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
endif()
......@@ -207,6 +222,7 @@ function(op_library TARGET)
# The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
# Note that it's enough to just add one operator to pybind in a *_op.cc file.
# And for detail pybind information, please see generated paddle/pybind/pybind.h.
set(ORIGINAL_TARGET ${TARGET})
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
# [ \t\r\n]* is used for blank characters
......@@ -239,8 +255,9 @@ function(op_library TARGET)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len)
list(LENGTH npu_cc_srcs npu_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0)
${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0 AND ${npu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
......@@ -280,6 +297,26 @@ function(op_library TARGET)
if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
endif()
if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
file(READ ${ORIGINAL_TARGET}_npu.cc TARGET_NPU_CONTENT)
# It is different from the logic above, be careful
string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\(.*" multi_npu_register "${TARGET_NPU_CONTENT}")
# [ \t\r\n]* is used for blank characters
string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\([ \t\r\n]*[a-z0-9_]*," one_npu_register "${multi_npu_register}")
if (one_npu_register STREQUAL "")
string(REPLACE "_op" "" NPU_TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OP_NPU_KERNEL(" "" NPU_TARGET "${one_npu_register}")
string(REPLACE "," "" NPU_TARGET "${NPU_TARGET}")
# [ \t\r\n]+ is used for blank characters.
# Here we use '+' instead of '*' since it is a REPLACE operation.
string(REGEX REPLACE "[ \t\r\n]+" "" NPU_TARGET "${NPU_TARGET}")
endif()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${NPU_TARGET}, NPU);\n")
endif()
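
To make the regex chain above concrete, here is a self-contained trace with an invented file content; it recovers the op name that ends up in the generated USE_OP_DEVICE_KERNEL line:

```cmake
# Standalone trace of the NPU kernel name extraction (TARGET_NPU_CONTENT is invented).
set(TARGET_NPU_CONTENT "REGISTER_OP_NPU_KERNEL(elementwise_add, ops::AddNPUKernel<float>);")
string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\([ \t\r\n]*[a-z0-9_]*," one_npu_register "${TARGET_NPU_CONTENT}")
string(REPLACE "REGISTER_OP_NPU_KERNEL(" "" NPU_TARGET "${one_npu_register}")
string(REPLACE "," "" NPU_TARGET "${NPU_TARGET}")
string(REGEX REPLACE "[ \t\r\n]+" "" NPU_TARGET "${NPU_TARGET}")
message(STATUS "USE_OP_DEVICE_KERNEL(${NPU_TARGET}, NPU);")  # prints elementwise_add
```
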
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
......@@ -330,6 +367,7 @@ function(register_operators)
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" OPS "${OPS}")
string(REPLACE "_xpu" "" OPS "${OPS}")
string(REPLACE "_npu" "" OPS "${OPS}")
string(REPLACE ".cc" "" OPS "${OPS}")
list(REMOVE_DUPLICATES OPS)
list(LENGTH register_operators_DEPS register_operators_DEPS_len)
......
......@@ -15,7 +15,7 @@
<Warning>InheritFromHost</Warning>
<BaseCommandLineTemplate>-ccbin "%(VCBinDir)" -x cu [GenerateRelocatableDeviceCode] [Include] [RequiredIncludes] [InterleaveSourceInPTX] [GPUDebugInfo] [GenerateLineInfo] [Keep] [KeepDir] [MaxRegCount] [PtxAsOptionV] [TargetMachinePlatform] [NvccCompilation] [CudaRuntime] [AdditionalOptions]</BaseCommandLineTemplate>
<BuildCommandLineTemplate>--use-local-env --cl-version $(CudaClVersion)</BuildCommandLineTemplate>
<BuildCommandLineTemplate>--use-local-env $(CudaClVersion)</BuildCommandLineTemplate>
<BuildDynamicCommandLineTemplate>[CodeGeneration]</BuildDynamicCommandLineTemplate>
<CleanCommandLineTemplate>-clean</CleanCommandLineTemplate>
<!-- <HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] $(CudaForceSynchronousPdbWrites) /Zi [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate> -->
......@@ -88,4 +88,3 @@ set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)"
</ClCompile>
</ItemDefinitionGroup>
</Project>
......@@ -29,9 +29,9 @@ set(third_party_deps)
# 2. REPOSITORY: specify git REPOSITORY of 3rd party
# 3. TAG: specify git tag/branch/commitID of 3rd party
# 4. DIR: overwrite the original SOURCE_DIR when cache directory
#
#
# The function Return 1 PARENT_SCOPE variables:
# - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add,
# - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add,
# and you no longer need to set any download steps in ExternalProject_Add.
# For example:
# Cache_third_party(${TARGET}
......@@ -52,7 +52,7 @@ FUNCTION(cache_third_party TARGET)
SET(${TARGET_NAME}_DOWNLOAD_CMD
GIT_REPOSITORY ${cache_third_party_REPOSITORY})
IF(cache_third_party_TAG)
LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD
LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD
GIT_TAG ${cache_third_party_TAG})
ENDIF()
ELSEIF(cache_third_party_URL)
......@@ -130,7 +130,7 @@ ENDFUNCTION()
# Correction of flags on different Platform(WIN/MAC) and Print Warning Message
if (APPLE)
if(WITH_MKL)
MESSAGE(WARNING
MESSAGE(WARNING
"Mac is not supported with MKL in Paddle yet. Force WITH_MKL=OFF.")
set(WITH_MKL OFF CACHE STRING "Disable MKL for building on mac" FORCE)
endif()
......@@ -141,7 +141,7 @@ if(WIN32 OR APPLE)
SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE)
if(WITH_LIBXSMM)
MESSAGE(WARNING
MESSAGE(WARNING
"Windows, Mac are not supported with libxsmm in Paddle yet."
"Force WITH_LIBXSMM=OFF")
SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM in Windows and MacOS" FORCE)
......@@ -261,6 +261,14 @@ if(WITH_PSLIB)
if(WITH_PSLIB_BRPC)
include(external/pslib_brpc) # download, build, install pslib_brpc
list(APPEND third_party_deps extern_pslib_brpc)
else()
include(external/snappy)
list(APPEND third_party_deps extern_snappy)
include(external/leveldb)
list(APPEND third_party_deps extern_leveldb)
include(external/brpc)
list(APPEND third_party_deps extern_brpc)
endif()
endif(WITH_PSLIB)
......@@ -274,10 +282,15 @@ if(WITH_BOX_PS)
list(APPEND third_party_deps extern_box_ps)
endif(WITH_BOX_PS)
if(WITH_ASCEND)
if(WITH_ASCEND OR WITH_ASCEND_CL)
include(external/ascend)
list(APPEND third_party_deps extern_ascend)
endif (WITH_ASCEND)
if(WITH_ASCEND OR WITH_ASCEND_CL)
list(APPEND third_party_deps extern_ascend)
endif()
if(WITH_ASCEND_CL)
list(APPEND third_party_deps extern_ascend_cl)
endif()
endif ()
if (WITH_PSCORE)
include(external/snappy)
......@@ -285,7 +298,7 @@ if (WITH_PSCORE)
include(external/leveldb)
list(APPEND third_party_deps extern_leveldb)
include(external/brpc)
list(APPEND third_party_deps extern_brpc)
......
......@@ -50,6 +50,7 @@ output_data := value.Interface().([][]float32)
Run
```bash
go mod init github.com/paddlepaddle
export LD_LIBRARY_PATH=`pwd`/paddle_c/paddle/lib:$LD_LIBRARY_PATH
go run ./demo/mobilenet.go
```
......@@ -13,7 +13,7 @@
// limitations under the License.
package main
import "../paddle"
import "github.com/paddlepaddle/paddle"
import "strings"
import "io/ioutil"
import "strconv"
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <pd_inference_api.h>
#include <stdio.h>
#include <stdlib.h>
void ReadData(float* data, int size);
int main(int argc, char* argv[]) {
PD_Config* config = PD_ConfigCreate();
PD_ConfigSetModel(config, "data/model/__model__", "data/model/__params__");
PD_ConfigDisableGlogInfo(config);
PD_Predictor* predictor = PD_PredictorCreate(config);
// config has destroyed in PD_PredictorCreate
config = NULL;
int input_num = PD_PredictorGetInputNum(predictor);
printf("Input num: %d\n", input_num);
int output_num = PD_PredictorGetOutputNum(predictor);
printf("Output num: %d\n", output_num);
PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor);
PD_Tensor* input_tensor =
PD_PredictorGetInputHandle(predictor, input_names->data[0]);
PD_OneDimArrayCstrDestroy(input_names);
input_names = NULL;
int32_t shape[] = {1, 3, 300, 300};
float* data = (float*)malloc(sizeof(float) * 1 * 3 * 300 * 300); // NOLINT
ReadData(data, 1 * 3 * 300 * 300); // NOLINT
PD_TensorReshape(input_tensor, 4, shape);
PD_TensorCopyFromCpuFloat(input_tensor, data);
free(data);
data = NULL;
PD_PredictorRun(predictor);
PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor);
PD_Tensor* output_tensor =
PD_PredictorGetOutputHandle(predictor, output_names->data[0]);
PD_OneDimArrayCstrDestroy(output_names);
output_names = nullptr;
PD_OneDimArrayInt32* out_shape = PD_TensorGetShape(output_tensor);
int32_t size = 1;
for (size_t index = 0; index < out_shape->size; ++index) {
size = size * out_shape->data[index];
}
PD_OneDimArrayInt32Destroy(out_shape);
out_shape = NULL;
data = (float*)malloc(sizeof(float) * size); // NOLINT
PD_TensorCopyToCpuFloat(output_tensor, data);
free(data);
data = NULL;
PD_TensorDestroy(output_tensor);
output_tensor = NULL;
PD_TensorDestroy(input_tensor);
input_tensor = NULL;
PD_PredictorDestroy(predictor);
predictor = NULL;
return 0;
}
void ReadData(float* data, int n) {
FILE* fp = fopen("data/data.txt", "r");
for (int i = 0; i < n; i++) {
fscanf(fp, "%f", &data[i]);
}
fclose(fp);
}
......@@ -15,7 +15,7 @@
package paddle
// #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c
// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
// #include <stdbool.h>
// #include <paddle_c_api.h>
import "C"
......
......@@ -15,7 +15,7 @@
package paddle
// #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c
// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
// #include <stdbool.h>
// #include <stdlib.h>
// #include <paddle_c_api.h>
......
......@@ -15,7 +15,7 @@
package paddle
// #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c
// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
// #include <stdbool.h>
// #include "paddle_c_api.h"
import "C"
......@@ -88,7 +88,7 @@ func (predictor *Predictor) GetInputNames() []string {
}
func (predictor *Predictor) GetOutputNames() []string {
names := make([]string, predictor.GetInputNum())
names := make([]string, predictor.GetOutputNum())
for i := 0; i < len(names); i++ {
names[i] = predictor.GetOutputName(i)
}
......
......@@ -15,7 +15,7 @@
package paddle
// #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include
// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c
// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c
// #include <stdbool.h>
// #include <stdlib.h>
// #include <string.h>
......@@ -209,7 +209,7 @@ func DecodeTensor(r *bytes.Reader, shape []int32, t reflect.Type, ptr reflect.Va
value := reflect.Indirect(ptr)
value.Set(reflect.MakeSlice(t, int(shape[0]), int(shape[0])))
if len(shape) == 1 && value.Len() > 0 {
switch value.Index(1).Kind() {
switch value.Index(0).Kind() {
case reflect.Uint8, reflect.Int32, reflect.Int64, reflect.Float32:
binary.Read(r, Endian(), value.Interface())
return
......
......@@ -15,4 +15,4 @@ limitations under the License. */
#pragma once
// All paddle apis in C++ frontend
#include "paddle/fluid/extension/include/ext_all.h"
#include "paddle/extension/include/ext_all.h"
......@@ -9,4 +9,3 @@ add_subdirectory(pybind)
# NOTE: please add the inference subdirectory last.
add_subdirectory(inference)
add_subdirectory(train)
......@@ -11,9 +11,10 @@ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
"${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
endif()
add_subdirectory(table)
add_subdirectory(service)
add_subdirectory(table)
add_subdirectory(test)
add_subdirectory(index_dataset)
get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
......
......@@ -177,8 +177,11 @@ std::future<int32_t> FleetWrapper::PullSparseVarsAsync(
for (auto& t : *fea_values) {
pull_result_ptr.push_back(t.data());
}
return pserver_ptr_->_worker_ptr->pull_sparse(
pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size());
bool training = true;
return pserver_ptr_->_worker_ptr->pull_sparse(pull_result_ptr.data(),
table_id, fea_keys->data(),
fea_keys->size(), training);
}
void FleetWrapper::PullSparseVarsSync(
......@@ -224,8 +227,10 @@ void FleetWrapper::PullSparseVarsSync(
for (auto& t : *fea_values) {
pull_result_ptr.push_back(t.data());
}
bool training = true;
auto status = pserver_ptr_->_worker_ptr->pull_sparse(
pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size());
pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size(),
training);
pull_sparse_status.push_back(std::move(status));
for (auto& t : pull_sparse_status) {
t.wait();
......@@ -238,9 +243,13 @@ void FleetWrapper::PullSparseVarsSync(
}
}
// is_training is true means training, false means inference, the behavior is
// different on pserver
void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim,
uint64_t padding_id,
platform::Place place,
bool is_training,
std::vector<const LoDTensor*>* inputs,
std::vector<LoDTensor*>* outputs) {
std::vector<uint64_t> fea_keys;
......@@ -279,7 +288,8 @@ void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim,
}
auto* communicator = Communicator::GetInstance();
auto status = communicator->_worker_ptr->pull_sparse(
pull_result_ptr.data(), table_id, fea_keys.data(), fea_keys.size());
pull_result_ptr.data(), table_id, fea_keys.data(), fea_keys.size(),
is_training);
status.wait();
auto ret = status.get();
if (ret != 0) {
......
......@@ -95,8 +95,12 @@ class FleetWrapper {
// Pull sparse variables from server in sync mode
// pull immediately to tensors
// is_training is true means training, false means inference, the behavior is
// different on pserver
void PullSparseToTensorSync(const uint64_t table_id, int fea_dim,
uint64_t padding_id, platform::Place place,
bool is_training,
std::vector<const LoDTensor*>* inputs, // NOLINT
std::vector<LoDTensor*>* outputs); // NOLINT
......
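Below is a minimal, hedged sketch of how a caller might pass the new `is_training` flag into `PullSparseToTensorSync`. The header path, table id, feature dimension, and tensor variables are illustrative assumptions, not taken from this commit.

```cpp
// Sketch only: names marked "assumed" are illustrative, not part of this diff.
#include <vector>
#include "paddle/fluid/distributed/fleet.h"  // assumed header for FleetWrapper
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/place.h"

void PullEmbeddingsSketch(paddle::distributed::FleetWrapper* fleet,
                          const paddle::framework::LoDTensor* ids,
                          paddle::framework::LoDTensor* embeddings,
                          bool is_training) {
  std::vector<const paddle::framework::LoDTensor*> inputs = {ids};
  std::vector<paddle::framework::LoDTensor*> outputs = {embeddings};
  // is_training = true requests the training behavior on the pserver,
  // false requests the inference behavior (see the comment added in fleet.cc).
  fleet->PullSparseToTensorSync(/*table_id=*/0, /*fea_dim=*/9, /*padding_id=*/0,
                                paddle::platform::CPUPlace(), is_training,
                                &inputs, &outputs);
}
```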
proto_library(index_dataset_proto SRCS index_dataset.proto)
cc_library(index_wrapper SRCS index_wrapper.cc DEPS index_dataset_proto fs)
cc_library(index_sampler SRCS index_sampler.cc DEPS index_wrapper)
if(WITH_PYTHON)
py_proto_compile(index_dataset_py_proto SRCS index_dataset.proto)
endif()
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package paddle.distributed;
message IndexNode {
required uint64 id = 1;
required bool is_leaf = 2;
required float probability = 3;
}
message TreeMeta {
required int32 height = 1;
required int32 branch = 2;
}
message KVItem {
required bytes key = 1;
required bytes value = 2;
}
\ No newline at end of file
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/index_dataset/index_sampler.h"
namespace paddle {
namespace distributed {
std::vector<std::vector<uint64_t>> LayerWiseSampler::sample(
const std::vector<std::vector<uint64_t>>& user_inputs,
const std::vector<uint64_t>& target_ids, bool with_hierarchy) {
auto input_num = target_ids.size();
auto user_feature_num = user_inputs[0].size();
std::vector<std::vector<uint64_t>> outputs(
input_num * layer_counts_sum_,
std::vector<uint64_t>(user_feature_num + 2));
auto max_layer = tree_->Height();
size_t idx = 0;
for (size_t i = 0; i < input_num; i++) {
auto travel_codes =
tree_->GetTravelCodes(target_ids[i], start_sample_layer_);
auto travel_path = tree_->GetNodes(travel_codes);
for (size_t j = 0; j < travel_path.size(); j++) {
// user
if (j > 0 && with_hierarchy) {
auto ancestor_codes =
tree_->GetAncestorCodes(user_inputs[i], max_layer - j - 1);
auto hierarchical_user = tree_->GetNodes(ancestor_codes);
for (int idx_offset = 0; idx_offset <= layer_counts_[j]; idx_offset++) {
for (size_t k = 0; k < user_feature_num; k++) {
outputs[idx + idx_offset][k] = hierarchical_user[k].id();
}
}
} else {
for (int idx_offset = 0; idx_offset <= layer_counts_[j]; idx_offset++) {
for (size_t k = 0; k < user_feature_num; k++) {
outputs[idx + idx_offset][k] = user_inputs[i][k];
}
}
}
// positive sample: the travel-path node for this layer, labeled 1
outputs[idx][user_feature_num] = travel_path[j].id();
outputs[idx][user_feature_num + 1] = 1.0;
idx += 1;
for (int idx_offset = 0; idx_offset < layer_counts_[j]; idx_offset++) {
int sample_res = 0;
do {
sample_res = sampler_vec_[j]->Sample();
} while (layer_ids_[j][sample_res].id() == travel_path[j].id());
outputs[idx + idx_offset][user_feature_num] =
layer_ids_[j][sample_res].id();
outputs[idx + idx_offset][user_feature_num + 1] = 0;
}
idx += layer_counts_[j];
}
}
return outputs;
}
} // end namespace distributed
} // end namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/distributed/index_dataset/index_wrapper.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/math/sampler.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace distributed {
class IndexSampler {
public:
virtual ~IndexSampler() {}
IndexSampler() {}
template <typename T>
static std::shared_ptr<IndexSampler> Init(const std::string& name) {
std::shared_ptr<IndexSampler> instance = nullptr;
instance.reset(new T(name));
return instance;
}
virtual void init_layerwise_conf(const std::vector<int>& layer_sample_counts,
int start_sample_layer = 1, int seed = 0) {}
virtual void init_beamsearch_conf(const int64_t k) {}
virtual std::vector<std::vector<uint64_t>> sample(
const std::vector<std::vector<uint64_t>>& user_inputs,
const std::vector<uint64_t>& input_targets,
bool with_hierarchy = false) = 0;
};
class LayerWiseSampler : public IndexSampler {
public:
virtual ~LayerWiseSampler() {}
explicit LayerWiseSampler(const std::string& name) {
tree_ = IndexWrapper::GetInstance()->get_tree_index(name);
}
void init_layerwise_conf(const std::vector<int>& layer_sample_counts,
int start_sample_layer, int seed) override {
seed_ = seed;
start_sample_layer_ = start_sample_layer;
PADDLE_ENFORCE_GT(
start_sample_layer_, 0,
paddle::platform::errors::InvalidArgument(
"start sampler layer = [%d], it should greater than 0.",
start_sample_layer_));
PADDLE_ENFORCE_LT(start_sample_layer_, tree_->Height(),
paddle::platform::errors::InvalidArgument(
"start sampler layer = [%d], it should less than "
"max_layer, which is [%d].",
start_sample_layer_, tree_->Height()));
size_t i = 0;
layer_counts_sum_ = 0;
layer_counts_.clear();
int cur_layer = start_sample_layer_;
while (cur_layer < tree_->Height()) {
int layer_sample_num = 1;
if (i < layer_sample_counts.size()) {
layer_sample_num = layer_sample_counts[i];
}
layer_counts_sum_ += layer_sample_num + 1;
layer_counts_.push_back(layer_sample_num);
VLOG(3) << "[INFO] level " << cur_layer
<< " sample_layer_counts.push_back: " << layer_sample_num;
cur_layer += 1;
i += 1;
}
reverse(layer_counts_.begin(), layer_counts_.end());
VLOG(3) << "sample counts sum: " << layer_counts_sum_;
auto max_layer = tree_->Height();
sampler_vec_.clear();
layer_ids_.clear();
auto layer_index = max_layer - 1;
size_t idx = 0;
while (layer_index >= start_sample_layer_) {
auto layer_codes = tree_->GetLayerCodes(layer_index);
layer_ids_.push_back(tree_->GetNodes(layer_codes));
auto sampler_temp =
std::make_shared<paddle::operators::math::UniformSampler>(
layer_ids_[idx].size() - 1, seed_);
sampler_vec_.push_back(sampler_temp);
layer_index--;
idx++;
}
}
std::vector<std::vector<uint64_t>> sample(
const std::vector<std::vector<uint64_t>>& user_inputs,
const std::vector<uint64_t>& target_ids, bool with_hierarchy) override;
private:
std::vector<int> layer_counts_;
int64_t layer_counts_sum_{0};
std::shared_ptr<TreeIndex> tree_{nullptr};
int seed_{0};
int start_sample_layer_{1};
std::vector<std::shared_ptr<paddle::operators::math::Sampler>> sampler_vec_;
std::vector<std::vector<IndexNode>> layer_ids_;
};
} // end namespace distributed
} // end namespace paddle
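A short usage sketch for the sampler declared above. The tree name, file path, user features, and target ids are hypothetical placeholders; it assumes a tree file in the format read by `TreeIndex::Load` (next file) has already been produced offline.

```cpp
// Sketch under assumptions: "demo_tree" and tree_path are placeholders.
#include <cstdint>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/index_dataset/index_sampler.h"
#include "paddle/fluid/distributed/index_dataset/index_wrapper.h"

using paddle::distributed::IndexSampler;
using paddle::distributed::IndexWrapper;
using paddle::distributed::LayerWiseSampler;

std::vector<std::vector<uint64_t>> LayerWiseSampleSketch(
    const std::string& tree_path) {
  // LayerWiseSampler looks the tree up by name, so register it first.
  IndexWrapper::GetInstance()->insert_tree_index("demo_tree", tree_path);

  auto sampler = IndexSampler::Init<LayerWiseSampler>("demo_tree");
  // Per-layer negative sample counts, starting from layer 1, with seed 0.
  sampler->init_layerwise_conf({1, 2, 4}, /*start_sample_layer=*/1, /*seed=*/0);

  std::vector<std::vector<uint64_t>> user_inputs = {{101, 102}};  // user features
  std::vector<uint64_t> target_ids = {5};                         // leaf item ids
  // Each row of the result is: user features, candidate node id, 0/1 label.
  return sampler->sample(user_inputs, target_ids, /*with_hierarchy=*/false);
}
```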
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/io/fs.h"
#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
#include "paddle/fluid/distributed/index_dataset/index_wrapper.h"
namespace paddle {
namespace distributed {
std::shared_ptr<IndexWrapper> IndexWrapper::s_instance_(nullptr);
int TreeIndex::Load(const std::string filename) {
int err_no;
auto fp = paddle::framework::fs_open_read(filename, &err_no, "");
PADDLE_ENFORCE_NE(
fp, nullptr,
platform::errors::InvalidArgument(
"Open file %s failed. Please check whether the file exists.",
filename));
int num = 0;
max_id_ = 0;
fake_node_.set_id(0);
fake_node_.set_is_leaf(false);
fake_node_.set_probability(0.0);
max_code_ = 0;
size_t ret = fread(&num, sizeof(num), 1, fp.get());
while (ret == 1 && num > 0) {
std::string content(num, '\0');
size_t read_num =
fread(const_cast<char*>(content.data()), 1, num, fp.get());
PADDLE_ENFORCE_EQ(
read_num, static_cast<size_t>(num),
platform::errors::InvalidArgument(
"Read from file: %s failed. Valid Format is "
"an integer representing the length of the following string, "
"and the string itself.We got an iteger[% d], "
"but the following string's length is [%d].",
filename, num, read_num));
KVItem item;
PADDLE_ENFORCE_EQ(
item.ParseFromString(content), true,
platform::errors::InvalidArgument("Parse from file: %s failed. It's "
"content can't be parsed by KVItem.",
filename));
if (item.key() == ".tree_meta") {
meta_.ParseFromString(item.value());
} else {
auto code = boost::lexical_cast<uint64_t>(item.key());
IndexNode node;
node.ParseFromString(item.value());
PADDLE_ENFORCE_NE(node.id(), 0,
platform::errors::InvalidArgument(
"Node'id should not be equel to zero."));
if (node.is_leaf()) {
id_codes_map_[node.id()] = code;
}
data_[code] = node;
if (node.id() > max_id_) {
max_id_ = node.id();
}
if (code > max_code_) {
max_code_ = code;
}
}
ret = fread(&num, sizeof(num), 1, fp.get());
}
total_nodes_num_ = data_.size();
max_code_ += 1;
return 0;
}
std::vector<IndexNode> TreeIndex::GetNodes(const std::vector<uint64_t>& codes) {
std::vector<IndexNode> nodes;
nodes.reserve(codes.size());
for (size_t i = 0; i < codes.size(); i++) {
if (CheckIsValid(codes[i])) {
nodes.push_back(data_.at(codes[i]));
} else {
nodes.push_back(fake_node_);
}
}
return nodes;
}
std::vector<uint64_t> TreeIndex::GetLayerCodes(int level) {
uint64_t level_num = static_cast<uint64_t>(std::pow(meta_.branch(), level));
uint64_t level_offset = level_num - 1;
std::vector<uint64_t> res;
res.reserve(level_num);
for (uint64_t i = 0; i < level_num; i++) {
auto code = level_offset + i;
if (CheckIsValid(code)) {
res.push_back(code);
}
}
return res;
}
std::vector<uint64_t> TreeIndex::GetAncestorCodes(
const std::vector<uint64_t>& ids, int level) {
std::vector<uint64_t> res;
res.reserve(ids.size());
int cur_level;
for (size_t i = 0; i < ids.size(); i++) {
if (id_codes_map_.find(ids[i]) == id_codes_map_.end()) {
res.push_back(max_code_);
} else {
auto code = id_codes_map_.at(ids[i]);
cur_level = meta_.height() - 1;
while (level >= 0 && cur_level > level) {
code = (code - 1) / meta_.branch();
cur_level--;
}
res.push_back(code);
}
}
return res;
}
std::vector<uint64_t> TreeIndex::GetChildrenCodes(uint64_t ancestor,
int level) {
auto level_code_num = static_cast<uint64_t>(std::pow(meta_.branch(), level));
auto code_min = level_code_num - 1;
auto code_max = meta_.branch() * level_code_num - 1;
std::vector<uint64_t> parent;
parent.push_back(ancestor);
std::vector<uint64_t> res;
size_t p_idx = 0;
while (true) {
size_t p_size = parent.size();
for (; p_idx < p_size; p_idx++) {
for (int i = 0; i < meta_.branch(); i++) {
auto code = parent[p_idx] * meta_.branch() + i + 1;
if (data_.find(code) != data_.end()) parent.push_back(code);
}
}
if ((code_min <= parent[p_idx]) && (parent[p_idx] < code_max)) {
break;
}
}
return std::vector<uint64_t>(parent.begin() + p_idx, parent.end());
}
std::vector<uint64_t> TreeIndex::GetTravelCodes(uint64_t id, int start_level) {
std::vector<uint64_t> res;
PADDLE_ENFORCE_NE(id_codes_map_.find(id), id_codes_map_.end(),
paddle::platform::errors::InvalidArgument(
"id = %d doesn't exist in Tree.", id));
auto code = id_codes_map_.at(id);
int level = meta_.height() - 1;
while (level >= start_level) {
res.push_back(code);
code = (code - 1) / meta_.branch();
level--;
}
return res;
}
std::vector<IndexNode> TreeIndex::GetAllLeafs() {
std::vector<IndexNode> res;
res.reserve(id_codes_map_.size());
for (auto& ite : id_codes_map_) {
auto code = ite.second;
res.push_back(data_.at(code));
}
return res;
}
} // end namespace distributed
} // end namespace paddle
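The methods above all rely on the array-style encoding of a complete `branch`-ary tree. The two helpers below are an illustration of that arithmetic only; they are not part of the `TreeIndex` API.

```cpp
// Illustration of the code scheme used by GetAncestorCodes / GetChildrenCodes.
#include <cstdint>
#include <vector>

// Parent of a node code in a complete branch-ary tree whose root has code 0.
inline uint64_t ParentCode(uint64_t code, int branch) {
  return (code - 1) / branch;  // same step as GetTravelCodes / GetAncestorCodes
}

// Codes of the direct children of `code`.
inline std::vector<uint64_t> ChildCodes(uint64_t code, int branch) {
  std::vector<uint64_t> res;
  for (int i = 0; i < branch; ++i) {
    res.push_back(code * branch + i + 1);  // same step as GetChildrenCodes
  }
  return res;
}
// Example: with branch = 2, node 0 has children {1, 2}, ParentCode(2, 2) == 0,
// and level L starts at code 2^L - 1, matching GetLayerCodes' level_offset.
```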
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cmath>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/distributed/index_dataset/index_dataset.pb.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace distributed {
class Index {
public:
Index() {}
~Index() {}
};
class TreeIndex : public Index {
public:
TreeIndex() {}
~TreeIndex() {}
int Height() { return meta_.height(); }
int Branch() { return meta_.branch(); }
uint64_t TotalNodeNums() { return total_nodes_num_; }
uint64_t EmbSize() { return max_id_ + 1; }
int Load(const std::string path);
inline bool CheckIsValid(int code) {
if (data_.find(code) != data_.end()) {
return true;
} else {
return false;
}
}
std::vector<IndexNode> GetNodes(const std::vector<uint64_t>& codes);
std::vector<uint64_t> GetLayerCodes(int level);
std::vector<uint64_t> GetAncestorCodes(const std::vector<uint64_t>& ids,
int level);
std::vector<uint64_t> GetChildrenCodes(uint64_t ancestor, int level);
std::vector<uint64_t> GetTravelCodes(uint64_t id, int start_level);
std::vector<IndexNode> GetAllLeafs();
std::unordered_map<uint64_t, IndexNode> data_;
std::unordered_map<uint64_t, uint64_t> id_codes_map_;
uint64_t total_nodes_num_;
TreeMeta meta_;
uint64_t max_id_;
uint64_t max_code_;
IndexNode fake_node_;
};
using TreePtr = std::shared_ptr<TreeIndex>;
class IndexWrapper {
public:
virtual ~IndexWrapper() {}
IndexWrapper() {}
void clear_tree() { tree_map.clear(); }
TreePtr get_tree_index(const std::string name) {
PADDLE_ENFORCE_NE(tree_map.find(name), tree_map.end(),
paddle::platform::errors::InvalidArgument(
"tree [%s] doesn't exist. Please insert it firstly "
"by API[\' insert_tree_index \'].",
name));
return tree_map[name];
}
void insert_tree_index(const std::string name, const std::string tree_path) {
if (tree_map.find(name) != tree_map.end()) {
VLOG(0) << "Tree " << name << " has already existed.";
return;
}
TreePtr tree = std::make_shared<TreeIndex>();
int ret = tree->Load(tree_path);
PADDLE_ENFORCE_EQ(ret, 0, paddle::platform::errors::InvalidArgument(
"Load tree[%s] from path[%s] failed. Please "
"check whether the file exists.",
name, tree_path));
tree_map.insert(std::pair<std::string, TreePtr>{name, tree});
}
static std::shared_ptr<IndexWrapper> GetInstancePtr() {
if (NULL == s_instance_) {
s_instance_.reset(new paddle::distributed::IndexWrapper());
}
return s_instance_;
}
static IndexWrapper* GetInstance() {
if (NULL == s_instance_) {
s_instance_.reset(new paddle::distributed::IndexWrapper());
}
return s_instance_.get();
}
private:
static std::shared_ptr<IndexWrapper> s_instance_;
std::unordered_map<std::string, TreePtr> tree_map;
};
} // end namespace distributed
} // end namespace paddle
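For context, a hedged sketch of a writer that produces the length-prefixed `KVItem` stream `TreeIndex::Load` reads: an `int` record length followed by the serialized item. Only the record framing is taken from the Load code above; the writer itself (and its use of stdio) is an assumption.

```cpp
// Sketch of the on-disk framing expected by TreeIndex::Load (assumed writer).
#include <cstdio>
#include <string>
#include "paddle/fluid/distributed/index_dataset/index_dataset.pb.h"

void AppendKVItem(FILE* fp, const std::string& key, const std::string& value) {
  paddle::distributed::KVItem item;
  item.set_key(key);
  item.set_value(value);
  std::string content;
  item.SerializeToString(&content);
  int num = static_cast<int>(content.size());
  fwrite(&num, sizeof(num), 1, fp);               // record length, as Load() reads it
  fwrite(content.data(), 1, content.size(), fp);  // serialized KVItem payload
}
// The ".tree_meta" record carries a serialized TreeMeta; every other record's
// key is the node code (as a decimal string) and its value a serialized IndexNode.
```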
......@@ -16,6 +16,7 @@ set_source_files_properties(communicator.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUT
set_source_files_properties(service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(brpc_ps_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(brpc_ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(ps_local_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(brpc_utils.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(heter_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
......@@ -24,11 +25,13 @@ set_source_files_properties(heter_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUT
set_source_files_properties(client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(graph_brpc_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(graph_brpc_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_library(brpc_utils SRCS brpc_utils.cc DEPS tensor device_context ${COMMON_DEPS} ${RPC_DEPS})
cc_library(downpour_server SRCS brpc_ps_server.cc DEPS boost eigen3 table brpc_utils ${RPC_DEPS})
cc_library(downpour_client SRCS brpc_ps_client.cc DEPS boost eigen3 table brpc_utils ${RPC_DEPS})
cc_library(downpour_server SRCS graph_brpc_server.cc brpc_ps_server.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS})
cc_library(downpour_client SRCS graph_brpc_client.cc brpc_ps_client.cc
ps_local_client.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS})
cc_library(client SRCS ps_client.cc DEPS downpour_client boost ${RPC_DEPS})
cc_library(server SRCS server.cc DEPS downpour_server boost ${RPC_DEPS})
......@@ -38,3 +41,6 @@ cc_library(ps_service SRCS service.cc DEPS communicator client server boost ${RP
cc_library(heter_server SRCS heter_server.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS})
cc_library(heter_client SRCS heter_client.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS})
set_source_files_properties(graph_py_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_library(graph_py_service SRCS graph_py_service.cc DEPS ps_service)
......@@ -768,8 +768,8 @@ std::future<int32_t> BrpcPsClient::push_global_step(int table_id,
std::future<int32_t> BrpcPsClient::pull_sparse(float **select_values,
size_t table_id,
const uint64_t *keys,
size_t num) {
const uint64_t *keys, size_t num,
bool is_training) {
size_t request_call_num = _server_channels.size();
auto shard_sorted_kvs = std::make_shared<
......@@ -837,16 +837,27 @@ std::future<int32_t> BrpcPsClient::pull_sparse(float **select_values,
uint32_t kv_request_count = 0;
size_t sorted_kv_size = sorted_kvs.size();
auto &request_buffer = closure->cntl(i)->request_attachment();
request_buffer.append((void *)&is_training, sizeof(bool));
std::vector<uint32_t> keys_counter;
keys_counter.reserve(sorted_kv_size);
for (size_t kv_idx = 0; kv_idx < sorted_kv_size; ++kv_idx) {
++kv_request_count;
uint32_t keys = 1;
last_key = sorted_kvs[kv_idx].first;
request_buffer.append((void *)&last_key, sizeof(uint64_t));
while (kv_idx < sorted_kv_size - 1 &&
last_key == sorted_kvs[kv_idx + 1].first) {
++kv_idx;
++keys;
}
keys_counter.push_back(keys);
}
request_buffer.append((void *)keys_counter.data(),
sizeof(uint32_t) * keys_counter.size());
if (kv_request_count == 0) {
closure->Run();
} else {
......@@ -869,8 +880,8 @@ std::future<int32_t> BrpcPsClient::send_client2client_msg(
auto promise = std::make_shared<std::promise<int32_t>>();
std::future<int> fut = promise->get_future();
if (to_client_id >= _client_channels.size()) {
LOG(FATAL) << "to_client_id is out of range clients, which size is "
<< _client_channels.size();
VLOG(0) << "to_client_id is out of range clients, which size is "
<< _client_channels.size();
promise->set_value(-1);
return fut;
}
......@@ -956,7 +967,7 @@ int32_t BrpcPsClient::recv_and_save_table(const uint64_t table_id,
}
auto status = pull_sparse((float **)save_vec.data(), table_id,
save_key.data(), save_key.size());
save_key.data(), save_key.size(), true);
status.wait();
// create lod tensor
......
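As the client change above shows, the sparse pull request attachment now starts with the `is_training` flag, followed by the de-duplicated sorted keys and then one `uint32` counter per unique key. The function below is only a sketch of that layout using `std::string` instead of the brpc `IOBuf`; it is not the client implementation.

```cpp
// Layout sketch: [ bool is_training | N x uint64 unique keys | N x uint32 counts ].
#include <cstdint>
#include <string>
#include <vector>

std::string PackPullSparseRequestSketch(bool is_training,
                                        const std::vector<uint64_t>& unique_keys,
                                        const std::vector<uint32_t>& key_counts) {
  std::string buf;
  buf.append(reinterpret_cast<const char*>(&is_training), sizeof(bool));
  buf.append(reinterpret_cast<const char*>(unique_keys.data()),
             sizeof(uint64_t) * unique_keys.size());
  buf.append(reinterpret_cast<const char*>(key_counts.data()),
             sizeof(uint32_t) * key_counts.size());
  return buf;
}
```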
......@@ -148,7 +148,8 @@ class BrpcPsClient : public PSClient {
virtual std::future<int32_t> pull_sparse(float **select_values,
size_t table_id,
const uint64_t *keys, size_t num);
const uint64_t *keys, size_t num,
bool is_training);
virtual std::future<int32_t> print_table_stat(uint32_t table_id);
......@@ -170,9 +171,22 @@ class BrpcPsClient : public PSClient {
virtual int32_t recv_and_save_table(const uint64_t table_id,
const std::string &path);
private:
protected:
virtual size_t get_server_nums() { return _server_channels.size(); }
inline brpc::Channel *get_sparse_channel(size_t server_id) {
return _server_channels[server_id][0].get();
}
inline brpc::Channel *get_dense_channel(size_t server_id) {
return _server_channels[server_id][1].get();
}
inline brpc::Channel *get_cmd_channel(size_t server_id) {
return _server_channels[server_id][2].get();
}
virtual int32_t initialize() override;
private:
// virtual int32_t initialize() override;
inline uint32_t dense_dim_per_shard(uint32_t dense_dim_total,
uint32_t shard_num) {
return dense_dim_total / shard_num + 1;
......@@ -184,16 +198,6 @@ class BrpcPsClient : public PSClient {
std::future<int32_t> send_save_cmd(uint32_t table_id, int cmd_id,
const std::vector<std::string> &param);
inline brpc::Channel *get_sparse_channel(size_t server_id) {
return _server_channels[server_id][0].get();
}
inline brpc::Channel *get_dense_channel(size_t server_id) {
return _server_channels[server_id][1].get();
}
inline brpc::Channel *get_cmd_channel(size_t server_id) {
return _server_channels[server_id][2].get();
}
bool _running = false;
bool _flushing = false;
std::atomic<uint32_t> _async_call_num;  // count of in-flight async requests
......@@ -220,8 +224,6 @@ class BrpcPsClient : public PSClient {
size_t num,
void *done) override;
virtual size_t get_server_nums() { return _server_channels.size(); }
private:
int32_t start_client_service();
......
......@@ -14,6 +14,8 @@
#include "paddle/fluid/distributed/service/brpc_ps_server.h"
#include <thread> // NOLINT
#include "butil/object_pool.h"
#include "paddle/fluid/distributed/table/depends/sparse_utils.h"
#include "paddle/fluid/distributed/table/table.h"
#include "paddle/fluid/framework/archive.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -60,7 +62,8 @@ uint64_t BrpcPsServer::start(const std::string &ip, uint32_t port) {
std::unique_lock<std::mutex> lock(mutex_);
std::string ip_port = ip + ":" + std::to_string(port);
VLOG(3) << "server of rank " << _rank << " starts at " << ip_port;
VLOG(0) << "running server with rank id: " << _rank
<< ", endpoint: " << ip_port;
brpc::ServerOptions options;
int num_threads = std::thread::hardware_concurrency();
......@@ -194,12 +197,13 @@ int32_t BrpcPsService::pull_dense(Table *table, const PsRequestMessage &request,
return 0;
}
std::vector<float> res_data;
res_data.resize(num * table->value_accesor()->select_size() / sizeof(float));
table->pull_dense(res_data.data(), num);
auto res_data = butil::get_object<std::vector<float>>();
res_data->resize(num * table->value_accesor()->select_size() / sizeof(float));
table->pull_dense(res_data->data(), num);
cntl->response_attachment().append((char *)res_data.data(),
res_data.size() * sizeof(float));
cntl->response_attachment().append((char *)(res_data->data()),
res_data->size() * sizeof(float));
butil::return_object(res_data);
return 0;
}
......@@ -336,35 +340,42 @@ int32_t BrpcPsService::pull_sparse(Table *table,
brpc::Controller *cntl) {
platform::RecordEvent record_event("PsService->pull_sparse");
CHECK_TABLE_EXIST(table, request, response)
thread_local std::string push_sparse_request_buffer;
auto &req_io_buffer = cntl->request_attachment();
auto req_buffer_size = req_io_buffer.size();
if (req_buffer_size < 1) {
set_response_code(response, -1, "req attachment is empty");
return 0;
}
if (request.params_size() < 1) {
set_response_code(response, -1,
"PsRequestMessage.params is requeired at "
"least 1 for num of sparse_key");
return 0;
}
uint32_t num = *(uint32_t *)(request.params(0).c_str());
push_sparse_request_buffer.resize(0);
push_sparse_request_buffer.reserve(req_buffer_size);
const char *data = (const char *)cntl->request_attachment().fetch(
const_cast<char *>(push_sparse_request_buffer.data()), req_buffer_size);
/*
Attachment Content:
|---keysData---|
|---8*{num}B---|
*/
const uint64_t *keys = (const uint64_t *)data;
std::vector<float> res_data;
res_data.resize(num * table->value_accesor()->select_size() / sizeof(float));
table->pull_sparse(res_data.data(), keys, num);
cntl->response_attachment().append((char *)res_data.data(),
res_data.size() * sizeof(float));
auto dim = table->value_accesor()->select_dim();
thread_local std::string req_buffer;
req_buffer.reserve(req_buffer_size);
const void *data = cntl->request_attachment().fetch(
const_cast<char *>(req_buffer.data()), req_buffer_size);
auto value = PullSparseValue(num, dim);
value.DeserializeFromBytes(const_cast<void *>(data));
auto res_data = butil::get_object<std::vector<float>>();
res_data->resize(num * dim);
table->pull_sparse(res_data->data(), value);
cntl->response_attachment().append((char *)(res_data->data()),
res_data->size() * sizeof(float));
butil::return_object(res_data);
return 0;
}
......@@ -538,7 +549,7 @@ int32_t BrpcPsService::stop_server(Table *table,
auto *p_server = _server;
std::thread t_stop([p_server]() {
p_server->stop();
LOG(INFO) << "Server Stoped";
VLOG(3) << "Server Stoped";
});
t_stop.detach();
return 0;
......
......@@ -324,7 +324,7 @@ std::string GetIntTypeEndpoint(const std::string& ip, const uint32_t& port) {
while (hp->h_addr_list[i] != NULL) {
int_ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]);
VLOG(0) << "Brpc Get host by name, host:" << ip << " -> ip: " << int_ip;
VLOG(3) << "Brpc Get host by name, host:" << ip << " -> ip: " << int_ip;
break;
}
......
......@@ -320,9 +320,11 @@ void Communicator::RpcRecvSparse(const std::string &varname, int table_id,
push_g_vec.push_back(tensor->data<float>() + i * dim);
}
bool training = true;
auto status = _worker_ptr->pull_sparse(
(float **)push_g_vec.data(), table_id, // NOLINT
sparse_push_keys.data(), sparse_push_keys.size());
sparse_push_keys.data(), sparse_push_keys.size(), training);
status.wait();
return;
}
......
......@@ -310,6 +310,8 @@ class Communicator {
return _worker_ptr;
}
RecvCtxMap &GetRecvCtxMap() { return recv_varname_to_ctx_; }
std::shared_ptr<PSClient> _worker_ptr; // pointer to worker
protected:
......
......@@ -39,7 +39,7 @@ struct PSHost {
// |---ip---|---port---|--rank--|
// |-32bit--|--20bit---|--12bit-|
// for pslib
uint64_t serialize_to_uint64() {
uint64_t host_label = 0;
host_label = inet_addr(ip.c_str());
......@@ -175,14 +175,12 @@ class PSEnvironment {
host.ip = ip;
host.port = port;
host.rank = rank;
if (sign_set.count(rank) > 0) {
LOG(WARNING) << "ps-host :" << host.ip << ":" << host.port
<< ", rank:" << host.rank
<< " already register, ignore register";
} else {
if (sign_set.count(rank) == 0) {
host_list.push_back(host);
sign_set.insert(rank);
}
return 0;
}
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/service/graph_brpc_client.h"
#include <algorithm>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include "Eigen/Dense"
#include "paddle/fluid/distributed/service/brpc_ps_client.h"
#include "paddle/fluid/distributed/table/table.h"
#include "paddle/fluid/framework/archive.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace distributed {
void GraphPsService_Stub::service(
::google::protobuf::RpcController *controller,
const ::paddle::distributed::PsRequestMessage *request,
::paddle::distributed::PsResponseMessage *response,
::google::protobuf::Closure *done) {
if (graph_service != NULL && local_channel == channel()) {
// VLOG(0)<<"use local";
task_pool->enqueue([this, controller, request, response, done]() -> int {
this->graph_service->service(controller, request, response, done);
return 0;
});
} else {
// VLOG(0)<<"use server";
PsService_Stub::service(controller, request, response, done);
}
}
int GraphBrpcClient::get_server_index_by_id(uint64_t id) {
int shard_num = get_shard_num();
int shard_per_server = shard_num % server_size == 0
? shard_num / server_size
: shard_num / server_size + 1;
return id % shard_num / shard_per_server;
}
std::future<int32_t> GraphBrpcClient::get_node_feat(
const uint32_t &table_id, const std::vector<uint64_t> &node_ids,
const std::vector<std::string> &feature_names,
std::vector<std::vector<std::string>> &res) {
std::vector<int> request2server;
std::vector<int> server2request(server_size, -1);
for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) {
int server_index = get_server_index_by_id(node_ids[query_idx]);
if (server2request[server_index] == -1) {
server2request[server_index] = request2server.size();
request2server.push_back(server_index);
}
}
size_t request_call_num = request2server.size();
std::vector<std::vector<uint64_t>> node_id_buckets(request_call_num);
std::vector<std::vector<int>> query_idx_buckets(request_call_num);
for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) {
int server_index = get_server_index_by_id(node_ids[query_idx]);
int request_idx = server2request[server_index];
node_id_buckets[request_idx].push_back(node_ids[query_idx]);
query_idx_buckets[request_idx].push_back(query_idx);
}
DownpourBrpcClosure *closure = new DownpourBrpcClosure(
request_call_num,
[&, node_id_buckets, query_idx_buckets, request_call_num](void *done) {
int ret = 0;
auto *closure = (DownpourBrpcClosure *)done;
int fail_num = 0;
for (int request_idx = 0; request_idx < request_call_num;
++request_idx) {
if (closure->check_response(request_idx,
PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) {
++fail_num;
} else {
auto &res_io_buffer =
closure->cntl(request_idx)->response_attachment();
butil::IOBufBytesIterator io_buffer_itr(res_io_buffer);
size_t bytes_size = io_buffer_itr.bytes_left();
std::unique_ptr<char[]> buffer_wrapper(new char[bytes_size]);
char *buffer = buffer_wrapper.get();
io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size);
for (size_t feat_idx = 0; feat_idx < feature_names.size();
++feat_idx) {
for (size_t node_idx = 0;
node_idx < query_idx_buckets.at(request_idx).size();
++node_idx) {
int query_idx = query_idx_buckets.at(request_idx).at(node_idx);
size_t feat_len = *(size_t *)(buffer);
buffer += sizeof(size_t);
auto feature = std::string(buffer, feat_len);
res[feat_idx][query_idx] = feature;
buffer += feat_len;
}
}
}
if (fail_num == request_call_num) {
ret = -1;
}
}
closure->set_promise_value(ret);
});
auto promise = std::make_shared<std::promise<int32_t>>();
closure->add_promise(promise);
std::future<int> fut = promise->get_future();
for (int request_idx = 0; request_idx < request_call_num; ++request_idx) {
int server_index = request2server[request_idx];
closure->request(request_idx)->set_cmd_id(PS_GRAPH_GET_NODE_FEAT);
closure->request(request_idx)->set_table_id(table_id);
closure->request(request_idx)->set_client_id(_client_id);
size_t node_num = node_id_buckets[request_idx].size();
closure->request(request_idx)
->add_params((char *)node_id_buckets[request_idx].data(),
sizeof(uint64_t) * node_num);
std::string joint_feature_name =
paddle::string::join_strings(feature_names, '\t');
closure->request(request_idx)
->add_params(joint_feature_name.c_str(), joint_feature_name.size());
GraphPsService_Stub rpc_stub =
getServiceStub(get_cmd_channel(server_index));
closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms());
rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx),
closure->response(request_idx), closure);
}
return fut;
}
// char* &buffer,int &actual_size
std::future<int32_t> GraphBrpcClient::batch_sample_neighboors(
uint32_t table_id, std::vector<uint64_t> node_ids, int sample_size,
std::vector<std::vector<std::pair<uint64_t, float>>> &res) {
std::vector<int> request2server;
std::vector<int> server2request(server_size, -1);
res.clear();
for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) {
int server_index = get_server_index_by_id(node_ids[query_idx]);
if (server2request[server_index] == -1) {
server2request[server_index] = request2server.size();
request2server.push_back(server_index);
}
res.push_back(std::vector<std::pair<uint64_t, float>>());
}
size_t request_call_num = request2server.size();
std::vector<std::vector<uint64_t>> node_id_buckets(request_call_num);
std::vector<std::vector<int>> query_idx_buckets(request_call_num);
for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) {
int server_index = get_server_index_by_id(node_ids[query_idx]);
int request_idx = server2request[server_index];
node_id_buckets[request_idx].push_back(node_ids[query_idx]);
query_idx_buckets[request_idx].push_back(query_idx);
}
DownpourBrpcClosure *closure = new DownpourBrpcClosure(
request_call_num,
[&, node_id_buckets, query_idx_buckets, request_call_num](void *done) {
int ret = 0;
auto *closure = (DownpourBrpcClosure *)done;
int fail_num = 0;
for (int request_idx = 0; request_idx < request_call_num;
++request_idx) {
if (closure->check_response(request_idx,
PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) {
++fail_num;
} else {
auto &res_io_buffer =
closure->cntl(request_idx)->response_attachment();
butil::IOBufBytesIterator io_buffer_itr(res_io_buffer);
size_t bytes_size = io_buffer_itr.bytes_left();
std::unique_ptr<char[]> buffer_wrapper(new char[bytes_size]);
char *buffer = buffer_wrapper.get();
io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size);
size_t node_num = *(size_t *)buffer;
int *actual_sizes = (int *)(buffer + sizeof(size_t));
char *node_buffer =
buffer + sizeof(size_t) + sizeof(int) * node_num;
int offset = 0;
for (size_t node_idx = 0; node_idx < node_num; ++node_idx) {
int query_idx = query_idx_buckets.at(request_idx).at(node_idx);
int actual_size = actual_sizes[node_idx];
int start = 0;
while (start < actual_size) {
res[query_idx].push_back(
{*(uint64_t *)(node_buffer + offset + start),
*(float *)(node_buffer + offset + start +
GraphNode::id_size)});
start += GraphNode::id_size + GraphNode::weight_size;
}
offset += actual_size;
}
}
if (fail_num == request_call_num) {
ret = -1;
}
}
closure->set_promise_value(ret);
});
auto promise = std::make_shared<std::promise<int32_t>>();
closure->add_promise(promise);
std::future<int> fut = promise->get_future();
for (int request_idx = 0; request_idx < request_call_num; ++request_idx) {
int server_index = request2server[request_idx];
closure->request(request_idx)->set_cmd_id(PS_GRAPH_SAMPLE_NEIGHBOORS);
closure->request(request_idx)->set_table_id(table_id);
closure->request(request_idx)->set_client_id(_client_id);
size_t node_num = node_id_buckets[request_idx].size();
closure->request(request_idx)
->add_params((char *)node_id_buckets[request_idx].data(),
sizeof(uint64_t) * node_num);
closure->request(request_idx)
->add_params((char *)&sample_size, sizeof(int));
// PsService_Stub rpc_stub(get_cmd_channel(server_index));
GraphPsService_Stub rpc_stub =
getServiceStub(get_cmd_channel(server_index));
closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms());
rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx),
closure->response(request_idx), closure);
}
return fut;
}
std::future<int32_t> GraphBrpcClient::random_sample_nodes(
uint32_t table_id, int server_index, int sample_size,
std::vector<uint64_t> &ids) {
DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) {
int ret = 0;
auto *closure = (DownpourBrpcClosure *)done;
if (closure->check_response(0, PS_GRAPH_SAMPLE_NODES) != 0) {
ret = -1;
} else {
auto &res_io_buffer = closure->cntl(0)->response_attachment();
butil::IOBufBytesIterator io_buffer_itr(res_io_buffer);
size_t bytes_size = io_buffer_itr.bytes_left();
char buffer[bytes_size];
auto size = io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size);
int index = 0;
while (index < bytes_size) {
ids.push_back(*(uint64_t *)(buffer + index));
index += GraphNode::id_size;
}
}
closure->set_promise_value(ret);
});
auto promise = std::make_shared<std::promise<int32_t>>();
closure->add_promise(promise);
std::future<int> fut = promise->get_future();
closure->request(0)->set_cmd_id(PS_GRAPH_SAMPLE_NODES);
closure->request(0)->set_table_id(table_id);
closure->request(0)->set_client_id(_client_id);
closure->request(0)->add_params((char *)&sample_size, sizeof(int));
// PsService_Stub rpc_stub(get_cmd_channel(server_index));
GraphPsService_Stub rpc_stub = getServiceStub(get_cmd_channel(server_index));
closure->cntl(0)->set_log_id(butil::gettimeofday_ms());
rpc_stub.service(closure->cntl(0), closure->request(0), closure->response(0),
closure);
return fut;
}
std::future<int32_t> GraphBrpcClient::pull_graph_list(
uint32_t table_id, int server_index, int start, int size, int step,
std::vector<FeatureNode> &res) {
DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) {
int ret = 0;
auto *closure = (DownpourBrpcClosure *)done;
if (closure->check_response(0, PS_PULL_GRAPH_LIST) != 0) {
ret = -1;
} else {
auto &res_io_buffer = closure->cntl(0)->response_attachment();
butil::IOBufBytesIterator io_buffer_itr(res_io_buffer);
size_t bytes_size = io_buffer_itr.bytes_left();
char buffer[bytes_size];
io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size);
int index = 0;
while (index < bytes_size) {
FeatureNode node;
node.recover_from_buffer(buffer + index);
index += node.get_size(false);
res.push_back(node);
}
}
closure->set_promise_value(ret);
});
auto promise = std::make_shared<std::promise<int32_t>>();
closure->add_promise(promise);
std::future<int> fut = promise->get_future();
closure->request(0)->set_cmd_id(PS_PULL_GRAPH_LIST);
closure->request(0)->set_table_id(table_id);
closure->request(0)->set_client_id(_client_id);
closure->request(0)->add_params((char *)&start, sizeof(int));
closure->request(0)->add_params((char *)&size, sizeof(int));
closure->request(0)->add_params((char *)&step, sizeof(int));
// PsService_Stub rpc_stub(get_cmd_channel(server_index));
GraphPsService_Stub rpc_stub = getServiceStub(get_cmd_channel(server_index));
closure->cntl(0)->set_log_id(butil::gettimeofday_ms());
rpc_stub.service(closure->cntl(0), closure->request(0), closure->response(0),
closure);
return fut;
}
int32_t GraphBrpcClient::initialize() {
// set_shard_num(_config.shard_num());
BrpcPsClient::initialize();
server_size = get_server_nums();
graph_service = NULL;
local_channel = NULL;
return 0;
}
}  // namespace distributed
}  // namespace paddle
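The shard-to-server mapping in `get_server_index_by_id` can be read as the small helper below; it is a restatement for illustration only, with an example worked out in the trailing comment.

```cpp
// Restatement of get_server_index_by_id's arithmetic (illustrative only).
#include <cstdint>

int ServerIndexForIdSketch(uint64_t id, int shard_num, int server_size) {
  int shard_per_server = shard_num % server_size == 0
                             ? shard_num / server_size
                             : shard_num / server_size + 1;  // ceiling division
  return static_cast<int>(id % shard_num) / shard_per_server;
}
// Example: shard_num = 10, server_size = 3 -> shard_per_server = 4,
// so id 7 maps to shard 7 and is served by server 7 / 4 = 1.
```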
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <ThreadPool.h>
#include <memory>
#include <string>
#include <vector>
#include <utility>
#include "ThreadPool.h"
#include "brpc/channel.h"
#include "brpc/controller.h"
#include "brpc/server.h"
#include "paddle/fluid/distributed/service/brpc_ps_client.h"
#include "paddle/fluid/distributed/service/graph_brpc_server.h"
#include "paddle/fluid/distributed/service/ps_client.h"
#include "paddle/fluid/distributed/table/table.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace paddle {
namespace distributed {
class GraphPsService_Stub : public PsService_Stub {
public:
GraphPsService_Stub(::google::protobuf::RpcChannel* channel,
::google::protobuf::RpcChannel* local_channel = NULL,
GraphBrpcService* service = NULL, int thread_num = 1)
: PsService_Stub(channel) {
this->local_channel = local_channel;
this->graph_service = service;
task_pool.reset(new ::ThreadPool(thread_num));
}
virtual ~GraphPsService_Stub() {}
// implements PsService ------------------------------------------
GraphBrpcService* graph_service;
std::shared_ptr<::ThreadPool> task_pool;
::google::protobuf::RpcChannel* local_channel;
GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(GraphPsService_Stub);
void service(::google::protobuf::RpcController* controller,
const ::paddle::distributed::PsRequestMessage* request,
::paddle::distributed::PsResponseMessage* response,
::google::protobuf::Closure* done);
};
class GraphBrpcClient : public BrpcPsClient {
public:
GraphBrpcClient() {}
virtual ~GraphBrpcClient() {}
// Given a batch of nodes, sample graph neighbors for each of them.
virtual std::future<int32_t> batch_sample_neighboors(
uint32_t table_id, std::vector<uint64_t> node_ids, int sample_size,
std::vector<std::vector<std::pair<uint64_t, float>>>& res);
virtual std::future<int32_t> pull_graph_list(uint32_t table_id,
int server_index, int start,
int size, int step,
std::vector<FeatureNode>& res);
virtual std::future<int32_t> random_sample_nodes(uint32_t table_id,
int server_index,
int sample_size,
std::vector<uint64_t>& ids);
virtual std::future<int32_t> get_node_feat(
const uint32_t& table_id, const std::vector<uint64_t>& node_ids,
const std::vector<std::string>& feature_names,
std::vector<std::vector<std::string>>& res);
virtual int32_t initialize();
int get_shard_num() { return shard_num; }
void set_shard_num(int shard_num) { this->shard_num = shard_num; }
int get_server_index_by_id(uint64_t id);
void set_local_channel(int index) {
this->local_channel = get_cmd_channel(index);
}
void set_local_graph_service(GraphBrpcService* graph_service) {
this->graph_service = graph_service;
}
GraphPsService_Stub getServiceStub(::google::protobuf::RpcChannel* channel,
int thread_num = 1) {
return GraphPsService_Stub(channel, local_channel, graph_service,
thread_num);
}
private:
int shard_num;
size_t server_size;
::google::protobuf::RpcChannel* local_channel;
GraphBrpcService* graph_service;
};
} // namespace distributed
} // namespace paddle
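A hedged usage sketch for the sampling API declared above; it assumes a `GraphBrpcClient` that has already been configured and initialized elsewhere, and the table id and node ids are placeholders.

```cpp
// Sketch only: client setup is assumed to have happened elsewhere.
#include <cstdint>
#include <utility>
#include <vector>
#include "paddle/fluid/distributed/service/graph_brpc_client.h"

void SampleNeighborsSketch(paddle::distributed::GraphBrpcClient* client) {
  std::vector<uint64_t> node_ids = {37, 96};  // placeholder node ids
  // res[i] receives (neighbor_id, weight) pairs sampled for node_ids[i].
  std::vector<std::vector<std::pair<uint64_t, float>>> res;
  auto fut = client->batch_sample_neighboors(/*table_id=*/0, node_ids,
                                             /*sample_size=*/5, res);
  fut.wait();  // the RPC fan-out is asynchronous; block until it completes
  // fut.get() == 0 indicates all shards responded successfully.
}
```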
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/service/graph_brpc_server.h"
#include "paddle/fluid/distributed/service/brpc_ps_server.h"
#include <thread> // NOLINT
#include "butil/endpoint.h"
#include "iomanip"
#include "paddle/fluid/distributed/service/brpc_ps_client.h"
#include "paddle/fluid/framework/archive.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace distributed {
int32_t GraphBrpcServer::initialize() {
auto &service_config = _config.downpour_server_param().service_param();
if (!service_config.has_service_class()) {
LOG(ERROR) << "miss service_class in ServerServiceParameter";
return -1;
}
auto *service =
CREATE_PSCORE_CLASS(PsBaseService, service_config.service_class());
if (service == NULL) {
LOG(ERROR) << "service is unregistered, service_name:"
<< service_config.service_class();
return -1;
}
_service.reset(service);
if (service->configure(this) != 0 || service->initialize() != 0) {
LOG(ERROR) << "service initialize failed, service_name:"
<< service_config.service_class();
return -1;
}
if (_server.AddService(service, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) {
LOG(ERROR) << "service add to brpc failed, service:"
<< service_config.service_class();
return -1;
}
return 0;
}
uint64_t GraphBrpcServer::start(const std::string &ip, uint32_t port) {
std::unique_lock<std::mutex> lock(mutex_);
std::string ip_port = ip + ":" + std::to_string(port);
VLOG(3) << "server of rank " << _rank << " starts at " << ip_port;
brpc::ServerOptions options;
int num_threads = std::thread::hardware_concurrency();
auto trainers = _environment->get_trainers();
options.num_threads = trainers > num_threads ? trainers : num_threads;
if (_server.Start(ip_port.c_str(), &options) != 0) {
LOG(ERROR) << "GraphBrpcServer start failed, ip_port=" << ip_port;
return 0;
}
_environment->registe_ps_server(ip, port, _rank);
return 0;
}
int32_t GraphBrpcServer::port() { return _server.listen_address().port; }
int32_t GraphBrpcService::initialize() {
_is_initialize_shard_info = false;
_service_handler_map[PS_STOP_SERVER] = &GraphBrpcService::stop_server;
_service_handler_map[PS_LOAD_ONE_TABLE] = &GraphBrpcService::load_one_table;
_service_handler_map[PS_LOAD_ALL_TABLE] = &GraphBrpcService::load_all_table;
_service_handler_map[PS_PRINT_TABLE_STAT] =
&GraphBrpcService::print_table_stat;
_service_handler_map[PS_BARRIER] = &GraphBrpcService::barrier;
_service_handler_map[PS_START_PROFILER] = &GraphBrpcService::start_profiler;
_service_handler_map[PS_STOP_PROFILER] = &GraphBrpcService::stop_profiler;
_service_handler_map[PS_PULL_GRAPH_LIST] = &GraphBrpcService::pull_graph_list;
_service_handler_map[PS_GRAPH_SAMPLE_NEIGHBOORS] =
&GraphBrpcService::graph_random_sample_neighboors;
_service_handler_map[PS_GRAPH_SAMPLE_NODES] =
&GraphBrpcService::graph_random_sample_nodes;
_service_handler_map[PS_GRAPH_GET_NODE_FEAT] =
&GraphBrpcService::graph_get_node_feat;
// Shard initialization: the shard info of server_list can only be fetched from env after the server has started.
initialize_shard_info();
return 0;
}
#define CHECK_TABLE_EXIST(table, request, response) \
if (table == NULL) { \
std::string err_msg("table not found with table_id:"); \
err_msg.append(std::to_string(request.table_id())); \
set_response_code(response, -1, err_msg.c_str()); \
return -1; \
}
int32_t GraphBrpcService::initialize_shard_info() {
if (!_is_initialize_shard_info) {
std::lock_guard<std::mutex> guard(_initialize_shard_mutex);
if (_is_initialize_shard_info) {
return 0;
}
size_t shard_num = _server->environment()->get_ps_servers().size();
auto &table_map = *(_server->table());
for (auto itr : table_map) {
itr.second->set_shard(_rank, shard_num);
}
_is_initialize_shard_info = true;
}
return 0;
}
void GraphBrpcService::service(google::protobuf::RpcController *cntl_base,
const PsRequestMessage *request,
PsResponseMessage *response,
google::protobuf::Closure *done) {
brpc::ClosureGuard done_guard(done);
std::string log_label("ReceiveCmd-");
if (!request->has_table_id()) {
set_response_code(*response, -1, "PsRequestMessage.tabel_id is required");
return;
}
response->set_err_code(0);
response->set_err_msg("");
auto *table = _server->table(request->table_id());
brpc::Controller *cntl = static_cast<brpc::Controller *>(cntl_base);
auto itr = _service_handler_map.find(request->cmd_id());
if (itr == _service_handler_map.end()) {
std::string err_msg(
"undefined cmd_id, should match PsCmdID in ps.proto, cmd_id:");
err_msg.append(std::to_string(request->cmd_id()));
set_response_code(*response, -1, err_msg.c_str());
return;
}
serviceFunc handler_func = itr->second;
int service_ret = (this->*handler_func)(table, *request, *response, cntl);
if (service_ret != 0) {
response->set_err_code(service_ret);
response->set_err_msg("server internal error");
}
}
int32_t GraphBrpcService::barrier(Table *table, const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
CHECK_TABLE_EXIST(table, request, response)
if (request.params_size() < 1) {
set_response_code(response, -1,
"PsRequestMessage.params is requeired at "
"least 1 for num of sparse_key");
return 0;
}
auto trainer_id = request.client_id();
auto barrier_type = request.params(0);
table->barrier(trainer_id, barrier_type);
return 0;
}
int32_t GraphBrpcService::print_table_stat(Table *table,
const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
CHECK_TABLE_EXIST(table, request, response)
std::pair<int64_t, int64_t> ret = table->print_table_stat();
paddle::framework::BinaryArchive ar;
ar << ret.first << ret.second;
std::string table_info(ar.Buffer(), ar.Length());
response.set_data(table_info);
return 0;
}
int32_t GraphBrpcService::load_one_table(Table *table,
const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
CHECK_TABLE_EXIST(table, request, response)
if (request.params_size() < 2) {
set_response_code(
response, -1,
"PsRequestMessage.datas is requeired at least 2 for path & load_param");
return -1;
}
if (table->load(request.params(0), request.params(1)) != 0) {
set_response_code(response, -1, "table load failed");
return -1;
}
return 0;
}
int32_t GraphBrpcService::load_all_table(Table *table,
const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
auto &table_map = *(_server->table());
for (auto &itr : table_map) {
if (load_one_table(itr.second.get(), request, response, cntl) != 0) {
LOG(ERROR) << "load table[" << itr.first << "] failed";
return -1;
}
}
return 0;
}
int32_t GraphBrpcService::stop_server(Table *table,
const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
GraphBrpcServer *p_server = (GraphBrpcServer *)_server;
std::thread t_stop([p_server]() {
p_server->stop();
LOG(INFO) << "Server Stoped";
});
p_server->export_cv()->notify_all();
t_stop.detach();
return 0;
}
int32_t GraphBrpcService::stop_profiler(Table *table,
const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
platform::DisableProfiler(platform::EventSortingKey::kDefault,
string::Sprintf("server_%s_profile", _rank));
return 0;
}
int32_t GraphBrpcService::start_profiler(Table *table,
const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
platform::EnableProfiler(platform::ProfilerState::kCPU);
return 0;
}
int32_t GraphBrpcService::pull_graph_list(Table *table,
const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
CHECK_TABLE_EXIST(table, request, response)
if (request.params_size() < 3) {
set_response_code(response, -1,
"pull_graph_list request requires at least 3 arguments");
return 0;
}
int start = *(int *)(request.params(0).c_str());
int size = *(int *)(request.params(1).c_str());
int step = *(int *)(request.params(2).c_str());
std::unique_ptr<char[]> buffer;
int actual_size;
((GraphTable *)table)
->pull_graph_list(start, size, buffer, actual_size, false, step);
cntl->response_attachment().append(buffer.get(), actual_size);
return 0;
}
int32_t GraphBrpcService::graph_random_sample_neighboors(
Table *table, const PsRequestMessage &request, PsResponseMessage &response,
brpc::Controller *cntl) {
CHECK_TABLE_EXIST(table, request, response)
if (request.params_size() < 2) {
set_response_code(
response, -1,
"graph_random_sample request requires at least 2 arguments");
return 0;
}
size_t node_num = request.params(0).size() / sizeof(uint64_t);
uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
int sample_size = *(uint64_t *)(request.params(1).c_str());
std::vector<std::unique_ptr<char[]>> buffers(node_num);
std::vector<int> actual_sizes(node_num, 0);
((GraphTable *)table)
->random_sample_neighboors(node_data, sample_size, buffers, actual_sizes);
cntl->response_attachment().append(&node_num, sizeof(size_t));
cntl->response_attachment().append(actual_sizes.data(),
sizeof(int) * node_num);
for (size_t idx = 0; idx < node_num; ++idx) {
cntl->response_attachment().append(buffers[idx].get(), actual_sizes[idx]);
}
return 0;
}
int32_t GraphBrpcService::graph_random_sample_nodes(
Table *table, const PsRequestMessage &request, PsResponseMessage &response,
brpc::Controller *cntl) {
size_t size = *(uint64_t *)(request.params(0).c_str());
std::unique_ptr<char[]> buffer;
int actual_size;
if (((GraphTable *)table)->random_sample_nodes(size, buffer, actual_size) ==
0) {
cntl->response_attachment().append(buffer.get(), actual_size);
} else
cntl->response_attachment().append(NULL, 0);
return 0;
}
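// graph_get_node_feat: params(0) carries packed uint64 node ids and params(1)
// a tab-separated feature-name list; the response attachment is a flat stream
// of [feat_len (size_t)][feat bytes] entries, ordered feature-major then
// node-major, matching the loops below.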
int32_t GraphBrpcService::graph_get_node_feat(Table *table,
const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl) {
CHECK_TABLE_EXIST(table, request, response)
if (request.params_size() < 2) {
set_response_code(
response, -1,
"graph_get_node_feat request requires at least 2 arguments");
return 0;
}
size_t node_num = request.params(0).size() / sizeof(uint64_t);
uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
std::vector<uint64_t> node_ids(node_data, node_data + node_num);
std::vector<std::string> feature_names =
paddle::string::split_string<std::string>(request.params(1), "\t");
std::vector<std::vector<std::string>> feature(
feature_names.size(), std::vector<std::string>(node_num));
((GraphTable *)table)->get_node_feat(node_ids, feature_names, feature);
for (size_t feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) {
for (size_t node_idx = 0; node_idx < node_num; ++node_idx) {
size_t feat_len = feature[feat_idx][node_idx].size();
cntl->response_attachment().append(&feat_len, sizeof(size_t));
cntl->response_attachment().append(feature[feat_idx][node_idx].data(),
feat_len);
}
}
return 0;
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "brpc/channel.h"
#include "brpc/controller.h"
#include "brpc/server.h"
#include <memory>
#include <vector>
#include "paddle/fluid/distributed/service/brpc_ps_server.h"
#include "paddle/fluid/distributed/service/server.h"
#include "paddle/fluid/distributed/table/common_graph_table.h"
#include "paddle/fluid/distributed/table/table.h"
namespace paddle {
namespace distributed {
class GraphBrpcServer : public PSServer {
public:
GraphBrpcServer() {}
virtual ~GraphBrpcServer() {}
PsBaseService *get_service() { return _service.get(); }
virtual uint64_t start(const std::string &ip, uint32_t port);
virtual int32_t stop() {
std::unique_lock<std::mutex> lock(mutex_);
if (stoped_) return 0;
stoped_ = true;
// cv_.notify_all();
_server.Stop(1000);
_server.Join();
return 0;
}
virtual int32_t port();
std::condition_variable *export_cv() { return &cv_; }
private:
virtual int32_t initialize();
mutable std::mutex mutex_;
std::condition_variable cv_;
bool stoped_ = false;
brpc::Server _server;
std::shared_ptr<PsBaseService> _service;
std::vector<std::shared_ptr<brpc::Channel>> _pserver_channels;
};
class GraphBrpcService;
typedef int32_t (GraphBrpcService::*serviceFunc)(
Table *table, const PsRequestMessage &request, PsResponseMessage &response,
brpc::Controller *cntl);
class GraphBrpcService : public PsBaseService {
public:
virtual int32_t initialize() override;
virtual void service(::google::protobuf::RpcController *controller,
const PsRequestMessage *request,
PsResponseMessage *response,
::google::protobuf::Closure *done) override;
protected:
std::unordered_map<int32_t, serviceFunc> _service_handler_map;
int32_t initialize_shard_info();
int32_t pull_graph_list(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t graph_random_sample_neighboors(Table *table,
const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl);
int32_t graph_random_sample_nodes(Table *table,
const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl);
int32_t graph_get_node_feat(Table *table, const PsRequestMessage &request,
PsResponseMessage &response,
brpc::Controller *cntl);
int32_t barrier(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t load_one_table(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t load_all_table(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t stop_server(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t start_profiler(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t stop_profiler(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
int32_t print_table_stat(Table *table, const PsRequestMessage &request,
PsResponseMessage &response, brpc::Controller *cntl);
private:
bool _is_initialize_shard_info;
std::mutex _initialize_shard_mutex;
std::unordered_map<int32_t, serviceHandlerFunc> _msg_handler_map;
std::vector<float> _ori_values;
const int sample_nodes_ranges = 23;
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/service/graph_py_service.h"
#include <thread> // NOLINT
#include "butil/endpoint.h"
#include "iomanip"
#include "paddle/fluid/distributed/table/table.h"
#include "paddle/fluid/framework/archive.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace distributed {
std::vector<std::string> GraphPyService::split(std::string& str,
const char pattern) {
std::vector<std::string> res;
std::stringstream input(str);
std::string temp;
while (std::getline(input, temp, pattern)) {
res.push_back(temp);
}
return res;
}
void GraphPyService::add_table_feat_conf(std::string table_name,
std::string feat_name,
std::string feat_dtype,
int32_t feat_shape) {
if (this->table_id_map.count(table_name)) {
this->table_feat_conf_table_name.push_back(table_name);
this->table_feat_conf_feat_name.push_back(feat_name);
this->table_feat_conf_feat_dtype.push_back(feat_dtype);
this->table_feat_conf_feat_shape.push_back(feat_shape);
}
}
void GraphPyService::set_up(std::string ips_str, int shard_num,
std::vector<std::string> node_types,
std::vector<std::string> edge_types) {
set_shard_num(shard_num);
set_num_node_types(node_types.size());
for (size_t table_id = 0; table_id < node_types.size(); table_id++) {
this->table_id_map[node_types[table_id]] = this->table_id_map.size();
}
for (size_t table_id = 0; table_id < edge_types.size(); table_id++) {
this->table_id_map[edge_types[table_id]] = this->table_id_map.size();
}
std::istringstream stream(ips_str);
std::string ip;
server_size = 0;
std::vector<std::string> ips_list = split(ips_str, ';');
int index = 0;
for (auto ips : ips_list) {
auto ip_and_port = split(ips, ':');
server_list.push_back(ip_and_port[0]);
port_list.push_back(ip_and_port[1]);
uint32_t port = stoul(ip_and_port[1]);
auto ph_host = paddle::distributed::PSHost(ip_and_port[0], port, index);
host_sign_list.push_back(ph_host.serialize_to_string());
index++;
}
}
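// Illustrative example (values are hypothetical): with
//   ips_str = "127.0.0.1:4211;127.0.0.1:4212",
//   node_types = {"user"}, edge_types = {"user2user"},
// set_up assigns table ids {user: 0, user2user: 1} and serializes one PSHost
// per "ip:port" entry into host_sign_list.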
void GraphPyClient::start_client() {
std::map<uint64_t, std::vector<paddle::distributed::Region>> dense_regions;
dense_regions.insert(
std::pair<uint64_t, std::vector<paddle::distributed::Region>>(0, {}));
auto regions = dense_regions[0];
::paddle::distributed::PSParameter worker_proto = GetWorkerProto();
paddle::distributed::PaddlePSEnvironment _ps_env;
auto servers_ = host_sign_list.size();
_ps_env = paddle::distributed::PaddlePSEnvironment();
_ps_env.set_ps_servers(&host_sign_list, servers_);
worker_ptr = std::shared_ptr<paddle::distributed::GraphBrpcClient>(
(paddle::distributed::GraphBrpcClient*)
paddle::distributed::PSClientFactory::create(worker_proto));
worker_ptr->configure(worker_proto, dense_regions, _ps_env, client_id);
worker_ptr->set_shard_num(get_shard_num());
}
void GraphPyServer::start_server(bool block) {
std::string ip = server_list[rank];
uint32_t port = std::stoul(port_list[rank]);
::paddle::distributed::PSParameter server_proto = this->GetServerProto();
auto _ps_env = paddle::distributed::PaddlePSEnvironment();
_ps_env.set_ps_servers(&this->host_sign_list,
this->host_sign_list.size()); // test
pserver_ptr = std::shared_ptr<paddle::distributed::GraphBrpcServer>(
(paddle::distributed::GraphBrpcServer*)
paddle::distributed::PSServerFactory::create(server_proto));
VLOG(0) << "pserver-ptr created ";
std::vector<framework::ProgramDesc> empty_vec;
framework::ProgramDesc empty_prog;
empty_vec.push_back(empty_prog);
pserver_ptr->configure(server_proto, _ps_env, rank, empty_vec);
pserver_ptr->start(ip, port);
std::condition_variable* cv_ = pserver_ptr->export_cv();
if (block) {
std::mutex mutex_;
std::unique_lock<std::mutex> lock(mutex_);
cv_->wait(lock);
}
}
::paddle::distributed::PSParameter GraphPyServer::GetServerProto() {
// Generate server proto desc
::paddle::distributed::PSParameter server_fleet_desc;
::paddle::distributed::ServerParameter* server_proto =
server_fleet_desc.mutable_server_param();
::paddle::distributed::DownpourServerParameter* downpour_server_proto =
server_proto->mutable_downpour_server_param();
::paddle::distributed::ServerServiceParameter* server_service_proto =
downpour_server_proto->mutable_service_param();
server_service_proto->set_service_class("GraphBrpcService");
server_service_proto->set_server_class("GraphBrpcServer");
server_service_proto->set_client_class("GraphBrpcClient");
server_service_proto->set_start_server_port(0);
server_service_proto->set_server_thread_num(12);
for (auto& tuple : this->table_id_map) {
VLOG(0) << " make a new table " << tuple.second;
::paddle::distributed::TableParameter* sparse_table_proto =
downpour_server_proto->add_downpour_table_param();
std::vector<std::string> feat_name;
std::vector<std::string> feat_dtype;
std::vector<int32_t> feat_shape;
for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) {
if (tuple.first == table_feat_conf_table_name[i]) {
feat_name.push_back(table_feat_conf_feat_name[i]);
feat_dtype.push_back(table_feat_conf_feat_dtype[i]);
feat_shape.push_back(table_feat_conf_feat_shape[i]);
}
}
std::string table_type;
if (tuple.second < this->num_node_types) {
table_type = "node";
} else {
table_type = "edge";
}
GetDownpourSparseTableProto(sparse_table_proto, tuple.second, tuple.first,
table_type, feat_name, feat_dtype, feat_shape);
}
return server_fleet_desc;
}
::paddle::distributed::PSParameter GraphPyClient::GetWorkerProto() {
::paddle::distributed::PSParameter worker_fleet_desc;
::paddle::distributed::WorkerParameter* worker_proto =
worker_fleet_desc.mutable_worker_param();
::paddle::distributed::DownpourWorkerParameter* downpour_worker_proto =
worker_proto->mutable_downpour_worker_param();
for (auto& tuple : this->table_id_map) {
VLOG(0) << " make a new table " << tuple.second;
::paddle::distributed::TableParameter* worker_sparse_table_proto =
downpour_worker_proto->add_downpour_table_param();
std::vector<std::string> feat_name;
std::vector<std::string> feat_dtype;
std::vector<int32_t> feat_shape;
for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) {
if (tuple.first == table_feat_conf_table_name[i]) {
feat_name.push_back(table_feat_conf_feat_name[i]);
feat_dtype.push_back(table_feat_conf_feat_dtype[i]);
feat_shape.push_back(table_feat_conf_feat_shape[i]);
}
}
std::string table_type;
if (tuple.second < this->num_node_types) {
table_type = "node";
} else {
table_type = "edge";
}
GetDownpourSparseTableProto(worker_sparse_table_proto, tuple.second,
tuple.first, table_type, feat_name, feat_dtype,
feat_shape);
}
::paddle::distributed::ServerParameter* server_proto =
worker_fleet_desc.mutable_server_param();
::paddle::distributed::DownpourServerParameter* downpour_server_proto =
server_proto->mutable_downpour_server_param();
::paddle::distributed::ServerServiceParameter* server_service_proto =
downpour_server_proto->mutable_service_param();
server_service_proto->set_service_class("GraphBrpcService");
server_service_proto->set_server_class("GraphBrpcServer");
server_service_proto->set_client_class("GraphBrpcClient");
server_service_proto->set_start_server_port(0);
server_service_proto->set_server_thread_num(12);
for (auto& tuple : this->table_id_map) {
VLOG(0) << " make a new table " << tuple.second;
::paddle::distributed::TableParameter* sparse_table_proto =
downpour_server_proto->add_downpour_table_param();
std::vector<std::string> feat_name;
std::vector<std::string> feat_dtype;
std::vector<int32_t> feat_shape;
for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) {
if (tuple.first == table_feat_conf_table_name[i]) {
feat_name.push_back(table_feat_conf_feat_name[i]);
feat_dtype.push_back(table_feat_conf_feat_dtype[i]);
feat_shape.push_back(table_feat_conf_feat_shape[i]);
}
}
std::string table_type;
if (tuple.second < this->num_node_types) {
table_type = "node";
} else {
table_type = "edge";
}
GetDownpourSparseTableProto(sparse_table_proto, tuple.second, tuple.first,
table_type, feat_name, feat_dtype, feat_shape);
}
return worker_fleet_desc;
}
void GraphPyClient::load_edge_file(std::string name, std::string filepath,
bool reverse) {
// 'e' means load edge
std::string params = "e";
if (reverse) {
// 'e<' means load edges from $2 to $1
params += "<";
} else {
// 'e>' means load edges from $1 to $2
params += ">";
}
if (this->table_id_map.count(name)) {
VLOG(0) << "loadding data with type " << name << " from " << filepath;
uint32_t table_id = this->table_id_map[name];
auto status =
get_ps_client()->load(table_id, std::string(filepath), params);
status.wait();
}
}
void GraphPyClient::load_node_file(std::string name, std::string filepath) {
// 'n' means load nodes and 'node_type' follows
std::string params = "n" + name;
if (this->table_id_map.count(name)) {
uint32_t table_id = this->table_id_map[name];
auto status =
get_ps_client()->load(table_id, std::string(filepath), params);
status.wait();
}
}
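// Illustrative usage (table names and file paths are hypothetical):
//   client.load_node_file("user", "/data/user_nodes.txt");         // params = "nuser"
//   client.load_edge_file("user2item", "/data/edges.txt", false);  // params = "e>"
//   client.load_edge_file("user2item", "/data/edges.txt", true);   // params = "e<"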
std::vector<std::vector<std::pair<uint64_t, float>>>
GraphPyClient::batch_sample_neighboors(std::string name,
std::vector<uint64_t> node_ids,
int sample_size) {
std::vector<std::vector<std::pair<uint64_t, float>>> v;
if (this->table_id_map.count(name)) {
uint32_t table_id = this->table_id_map[name];
auto status =
worker_ptr->batch_sample_neighboors(table_id, node_ids, sample_size, v);
status.wait();
}
return v;
}
std::vector<uint64_t> GraphPyClient::random_sample_nodes(std::string name,
int server_index,
int sample_size) {
std::vector<uint64_t> v;
if (this->table_id_map.count(name)) {
uint32_t table_id = this->table_id_map[name];
auto status =
worker_ptr->random_sample_nodes(table_id, server_index, sample_size, v);
status.wait();
}
return v;
}
// (name, dtype, ndarray)
std::vector<std::vector<std::string>> GraphPyClient::get_node_feat(
std::string node_type, std::vector<uint64_t> node_ids,
std::vector<std::string> feature_names) {
std::vector<std::vector<std::string>> v(
feature_names.size(), std::vector<std::string>(node_ids.size()));
if (this->table_id_map.count(node_type)) {
uint32_t table_id = this->table_id_map[node_type];
auto status =
worker_ptr->get_node_feat(table_id, node_ids, feature_names, v);
status.wait();
}
return v;
}
std::vector<FeatureNode> GraphPyClient::pull_graph_list(std::string name,
int server_index,
int start, int size,
int step) {
std::vector<FeatureNode> res;
if (this->table_id_map.count(name)) {
uint32_t table_id = this->table_id_map[name];
auto status = worker_ptr->pull_graph_list(table_id, server_index, start,
size, step, res);
status.wait();
}
return res;
}
void GraphPyClient::stop_server() {
VLOG(0) << "going to stop server";
std::unique_lock<std::mutex> lock(mutex_);
if (stoped_) return;
auto status = this->worker_ptr->stop_server();
if (status.get() == 0) stoped_ = true;
}
void GraphPyClient::finalize_worker() { this->worker_ptr->finalize_worker(); }
}
}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <unistd.h>
#include <condition_variable> // NOLINT
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <thread> // NOLINT
#include <unordered_map>
#include <vector>
#include "google/protobuf/text_format.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/distributed/ps.pb.h"
#include "paddle/fluid/distributed/service/env.h"
#include "paddle/fluid/distributed/service/graph_brpc_client.h"
#include "paddle/fluid/distributed/service/graph_brpc_server.h"
#include "paddle/fluid/distributed/service/sendrecv.pb.h"
#include "paddle/fluid/distributed/service/service.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
namespace distributed {
class GraphPyService {
protected:
std::vector<std::string> server_list, port_list, host_sign_list;
int server_size, shard_num;
int num_node_types;
std::unordered_map<std::string, uint32_t> table_id_map;
std::vector<std::string> table_feat_conf_table_name;
std::vector<std::string> table_feat_conf_feat_name;
std::vector<std::string> table_feat_conf_feat_dtype;
std::vector<int32_t> table_feat_conf_feat_shape;
public:
int get_shard_num() { return shard_num; }
void set_shard_num(int shard_num) { this->shard_num = shard_num; }
void GetDownpourSparseTableProto(
::paddle::distributed::TableParameter* sparse_table_proto,
uint32_t table_id, std::string table_name, std::string table_type,
std::vector<std::string> feat_name, std::vector<std::string> feat_dtype,
std::vector<int32_t> feat_shape) {
sparse_table_proto->set_table_id(table_id);
sparse_table_proto->set_table_class("GraphTable");
sparse_table_proto->set_shard_num(shard_num);
sparse_table_proto->set_type(::paddle::distributed::PS_SPARSE_TABLE);
::paddle::distributed::TableAccessorParameter* accessor_proto =
sparse_table_proto->mutable_accessor();
::paddle::distributed::CommonAccessorParameter* common_proto =
sparse_table_proto->mutable_common();
// Set GraphTable Parameter
common_proto->set_table_name(table_name);
common_proto->set_name(table_type);
for (size_t i = 0; i < feat_name.size(); i++) {
common_proto->add_params(feat_dtype[i]);
common_proto->add_dims(feat_shape[i]);
common_proto->add_attributes(feat_name[i]);
}
accessor_proto->set_accessor_class("CommMergeAccessor");
}
void set_server_size(int server_size) { this->server_size = server_size; }
void set_num_node_types(int num_node_types) {
this->num_node_types = num_node_types;
}
  int get_server_size() { return server_size; }
std::vector<std::string> split(std::string& str, const char pattern);
void set_up(std::string ips_str, int shard_num,
std::vector<std::string> node_types,
std::vector<std::string> edge_types);
void add_table_feat_conf(std::string node_type, std::string feat_name,
std::string feat_dtype, int32_t feat_shape);
};
class GraphPyServer : public GraphPyService {
public:
GraphPyServer() {}
void set_up(std::string ips_str, int shard_num,
std::vector<std::string> node_types,
std::vector<std::string> edge_types, int rank) {
set_rank(rank);
GraphPyService::set_up(ips_str, shard_num, node_types, edge_types);
}
int get_rank() { return rank; }
void set_rank(int rank) { this->rank = rank; }
void start_server(bool block = true);
::paddle::distributed::PSParameter GetServerProto();
std::shared_ptr<paddle::distributed::GraphBrpcServer> get_ps_server() {
return pserver_ptr;
}
protected:
int rank;
std::shared_ptr<paddle::distributed::GraphBrpcServer> pserver_ptr;
std::thread* server_thread;
};
class GraphPyClient : public GraphPyService {
public:
void set_up(std::string ips_str, int shard_num,
std::vector<std::string> node_types,
std::vector<std::string> edge_types, int client_id) {
set_client_id(client_id);
GraphPyService::set_up(ips_str, shard_num, node_types, edge_types);
}
std::shared_ptr<paddle::distributed::GraphBrpcClient> get_ps_client() {
return worker_ptr;
}
void bind_local_server(int local_channel_index, GraphPyServer& server) {
worker_ptr->set_local_channel(local_channel_index);
worker_ptr->set_local_graph_service(
(paddle::distributed::GraphBrpcService*)server.get_ps_server()
->get_service());
}
void stop_server();
void finalize_worker();
void load_edge_file(std::string name, std::string filepath, bool reverse);
void load_node_file(std::string name, std::string filepath);
int get_client_id() { return client_id; }
void set_client_id(int client_id) { this->client_id = client_id; }
void start_client();
std::vector<std::vector<std::pair<uint64_t, float>>> batch_sample_neighboors(
std::string name, std::vector<uint64_t> node_ids, int sample_size);
std::vector<uint64_t> random_sample_nodes(std::string name, int server_index,
int sample_size);
std::vector<std::vector<std::string>> get_node_feat(
std::string node_type, std::vector<uint64_t> node_ids,
std::vector<std::string> feature_names);
std::vector<FeatureNode> pull_graph_list(std::string name, int server_index,
int start, int size, int step = 1);
::paddle::distributed::PSParameter GetWorkerProto();
protected:
mutable std::mutex mutex_;
int client_id;
std::shared_ptr<paddle::distributed::GraphBrpcClient> worker_ptr;
std::thread* client_thread;
bool stoped_ = false;
};
}
}
......@@ -15,11 +15,15 @@
#include "paddle/fluid/distributed/service/ps_client.h"
#include "glog/logging.h"
#include "paddle/fluid/distributed/service/brpc_ps_client.h"
#include "paddle/fluid/distributed/service/graph_brpc_client.h"
#include "paddle/fluid/distributed/service/ps_local_client.h"
#include "paddle/fluid/distributed/table/table.h"
namespace paddle {
namespace distributed {
REGISTER_PSCORE_CLASS(PSClient, BrpcPsClient);
REGISTER_PSCORE_CLASS(PSClient, PsLocalClient);
REGISTER_PSCORE_CLASS(PSClient, GraphBrpcClient);
int32_t PSClient::configure(
const PSParameter &config,
......@@ -78,8 +82,7 @@ PSClient *PSClientFactory::create(const PSParameter &ps_config) {
}
TableManager::instance().initialize();
LOG(INFO) << "Create PSClient[" << service_param.client_class()
<< "] success";
VLOG(3) << "Create PSClient[" << service_param.client_class() << "] success";
return client;
}
} // namespace distributed
......
......@@ -24,16 +24,11 @@
#include "paddle/fluid/distributed/service/env.h"
#include "paddle/fluid/distributed/service/sendrecv.pb.h"
#include "paddle/fluid/distributed/table/accessor.h"
#include "paddle/fluid/distributed/table/graph/graph_node.h"
namespace paddle {
namespace distributed {
class PSEnvironment;
class PsRequestMessage;
class PsResponseMessage;
class ValueAccessor;
struct Region;
using paddle::distributed::PsRequestMessage;
using paddle::distributed::PsResponseMessage;
......@@ -117,10 +112,22 @@ class PSClient {
// the keys and values buffers must not be reused before the future completes
// keys requested by multiple threads are merged, then gathered and scattered
// to the servers
// once results return, the buffer is traversed and values are filled in
// is_training distinguishes training from prediction requests; the server
// handles features and admission differently for each.
virtual std::future<int32_t> pull_sparse(float **select_values,
size_t table_id,
const uint64_t *keys,
size_t num) = 0;
const uint64_t *keys, size_t num,
bool is_training) = 0;
virtual ::std::future<int32_t> pull_sparse_ptr(char **select_values,
size_t table_id,
const uint64_t *keys,
size_t num) {
VLOG(0) << "Did not implement";
std::promise<int32_t> promise;
std::future<int> fut = promise.get_future();
promise.set_value(-1);
return fut;
}
virtual std::future<int32_t> print_table_stat(uint32_t table_id) = 0;
......@@ -154,12 +161,13 @@ class PSClient {
virtual std::future<int32_t> send_client2client_msg(int msg_type,
int to_client_id,
const std::string &msg) {
LOG(FATAL) << "Did not implement";
VLOG(0) << "Did not implement";
std::promise<int32_t> promise;
std::future<int> fut = promise.get_future();
promise.set_value(-1);
return fut;
}
// client2client message handling: std::function<int32_t(int, int, const std::string&)>
// -> ret (msg_type, from_client_id, msg)
typedef std::function<int32_t(int, int, const std::string &)> MsgHandlerFunc;
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/service/ps_local_client.h"
#include "paddle/fluid/distributed/table/table.h"
//#define pslib_debug_dense_compress
namespace paddle {
namespace distributed {
int32_t PsLocalClient::initialize() {
const auto& downpour_param = _config.server_param().downpour_server_param();
TableManager::instance().initialize();
for (size_t i = 0; i < downpour_param.downpour_table_param_size(); ++i) {
auto* table = CREATE_PSCORE_CLASS(
Table, downpour_param.downpour_table_param(i).table_class());
table->initialize(downpour_param.downpour_table_param(i),
_config.fs_client_param());
table->set_shard(0, 1);
_table_map[downpour_param.downpour_table_param(i).table_id()].reset(table);
}
return 0;
}
::std::future<int32_t> PsLocalClient::shrink(uint32_t table_id,
const std::string threshold) {
// TODO
return done();
}
::std::future<int32_t> PsLocalClient::load(const std::string& epoch,
const std::string& mode) {
// TODO
// for (auto& it : _table_map) {
// load(it.first, epoch, mode);
//}
return done();
}
::std::future<int32_t> PsLocalClient::load(uint32_t table_id,
const std::string& epoch,
const std::string& mode) {
// TODO
// auto* table_ptr = table(table_id);
// table_ptr->load(epoch, mode);
return done();
}
::std::future<int32_t> PsLocalClient::save(const std::string& epoch,
const std::string& mode) {
// TODO
for (auto& it : _table_map) {
save(it.first, epoch, mode);
}
return done();
}
::std::future<int32_t> PsLocalClient::save(uint32_t table_id,
const std::string& epoch,
const std::string& mode) {
// TODO
auto* table_ptr = table(table_id);
table_ptr->flush();
table_ptr->save(epoch, mode);
return done();
}
::std::future<int32_t> PsLocalClient::clear() {
// TODO
return done();
}
::std::future<int32_t> PsLocalClient::clear(uint32_t table_id) {
// TODO
return done();
}
::std::future<int32_t> PsLocalClient::flush() {
// no need
return done();
}
::std::future<int32_t> PsLocalClient::stop_server() {
// no need
return done();
}
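// pull_dense reads the full dense value buffer from the local table into a
// temporary, then scatters it across the caller-provided regions in order,
// copying up to region.size bytes into each region until the buffer is
// exhausted.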
::std::future<int32_t> PsLocalClient::pull_dense(Region* regions,
size_t region_num,
size_t table_id) {
auto* accessor = table_accessor(table_id);
auto* table_ptr = table(table_id);
uint32_t num_per_shard = dense_dim_per_shard(accessor->fea_dim(), 1);
std::vector<float> region_buffer;
region_buffer.resize(num_per_shard);
table_ptr->pull_dense(region_buffer.data(), region_buffer.size());
size_t region_idx = 0;
size_t region_data_idx = 0;
size_t shard_data_size = num_per_shard;
size_t shard_buffer_remain = shard_data_size * sizeof(float);
PADDLE_ENFORCE_EQ(
shard_buffer_remain, region_buffer.size() * sizeof(float),
platform::errors::PreconditionNotMet("pull dense size error."));
size_t index = 0;
while (shard_buffer_remain > 0 && region_idx < region_num) {
auto& region = regions[region_idx];
if (region.size - region_data_idx >= shard_buffer_remain) {
memcpy((void*)(region.data + region_data_idx),
(uint8_t*)(void*)(region_buffer.data()) + index,
shard_buffer_remain);
region_data_idx += shard_buffer_remain;
shard_buffer_remain = 0;
} else if (region.size - region_data_idx == 0) {
++region_idx;
region_data_idx = 0;
} else {
memcpy((void*)(region.data + region_data_idx),
(uint8_t*)(void*)(region_buffer.data()) + index,
region.size - region_data_idx);
shard_buffer_remain -= (region.size - region_data_idx);
index += (region.size - region_data_idx);
++region_idx;
region_data_idx = 0;
}
}
return done();
}
::std::future<int32_t> PsLocalClient::push_dense_param(const Region* regions,
size_t region_num,
size_t table_id) {
auto* accessor = table_accessor(table_id);
auto* table_ptr = table(table_id);
std::vector<float> region_buffer;
region_buffer.resize(dense_dim_per_shard(accessor->fea_dim(), 1), 0);
for (size_t i = 0, offset = 0; i < region_num; ++i) {
uint32_t data_num = regions[i].size / sizeof(float);
memcpy(region_buffer.data() + offset, regions[i].data, regions[i].size);
offset += data_num;
}
// table_ptr->push_dense_param(region_buffer.data(), region_buffer.size());
return done();
}
::std::future<int32_t> PsLocalClient::push_dense_raw_gradient(
int table_id, float* total_send_data, size_t total_send_data_size,
void* callback) {
VLOG(1) << "wxx push_dense_raw_gradient";
PSClientClosure* closure = reinterpret_cast<PSClientClosure*>(callback);
auto* table_ptr = table(table_id);
table_ptr->push_dense(total_send_data, total_send_data_size);
delete closure;
return done();
}
::std::future<int32_t> PsLocalClient::push_dense(const Region* regions,
size_t region_num,
size_t table_id) {
auto* accessor = table_accessor(table_id);
auto* table_ptr = table(table_id);
std::vector<float> region_buffer;
region_buffer.resize(dense_dim_per_shard(accessor->fea_dim(), 1));
size_t data_size = region_buffer.size();
for (size_t i = 0, offset = 0; i < region_num; ++i) {
uint32_t data_num = regions[i].size / sizeof(float);
PADDLE_ENFORCE_LE(
offset + data_num, data_size,
platform::errors::PreconditionNotMet(
"invalid dense size, cur pos[%d] data_num[%d] size[%d]", offset,
data_num, data_size));
memcpy(region_buffer.data() + offset, regions[i].data, regions[i].size);
offset += data_num;
}
table_ptr->push_dense(region_buffer.data(), region_buffer.size());
return done();
}
//::std::future<int32_t> PsLocalClient::pull_sparse(float** select_values,
// size_t table_id,
// const uint64_t* keys,
// size_t num) {
// // FIXME
// // auto timer =
// // std::make_shared<CostTimer>("pslib_downpour_client_pull_sparse");
// // auto local_timer =
// // std::make_shared<CostTimer>("pslib_downpour_client_pull_sparse_local");
// split keys into per-shard requests and record the original value pointers
// auto* accessor = table_accessor(table_id);
// auto* table_ptr = table(table_id);
// size_t value_size = accessor->select_size();
//
// // table_ptr->pull_sparse(keys, num);
// std::vector<float> res_data;
// res_data.resize(num * value_size / sizeof(float));
// table_ptr->pull_sparse(res_data.data(), keys, num);
// // memcpy(select_values[0], res_data->data(), res_data->size() *
// // sizeof(float));
// size_t offset = 0;
// for (int i = 0; i < num; ++i) {
// memcpy(select_values[i], (char*)res_data.data() + offset, value_size);
// offset += value_size;
// }
//
// // return fut;
// return done();
//}
::std::future<int32_t> PsLocalClient::pull_sparse_ptr(char** select_values,
size_t table_id,
const uint64_t* keys,
size_t num) {
// FIXME
// auto timer =
// std::make_shared<CostTimer>("pslib_downpour_client_pull_sparse");
// auto local_timer =
// std::make_shared<CostTimer>("pslib_downpour_client_pull_sparse_local");
  // split keys into per-shard requests and record the original value pointers
auto* table_ptr = table(table_id);
table_ptr->pull_sparse_ptr(select_values, keys, num);
return done();
}
::std::future<int32_t> PsLocalClient::push_sparse_raw_gradient(
size_t table_id, const uint64_t* keys, const float** update_values,
size_t num, void* callback) {
VLOG(1) << "wxx push_sparse_raw_gradient";
PSClientClosure* closure = reinterpret_cast<PSClientClosure*>(callback);
auto* accessor = table_accessor(table_id);
auto* table_ptr = table(table_id);
table_ptr->push_sparse(keys, update_values, num);
delete closure;
return done();
}
::std::future<int32_t> PsLocalClient::push_sparse(size_t table_id,
const uint64_t* keys,
const float** update_values,
size_t num) {
auto* accessor = table_accessor(table_id);
auto* table_ptr = table(table_id);
table_ptr->push_sparse(keys, update_values, num);
return done();
}
}
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "brpc/channel.h"
#include "brpc/controller.h"
#include "brpc/server.h"
#include "paddle/fluid/distributed/service/ps_client.h"
namespace paddle {
namespace distributed {
class Table;
class PsLocalClient : public PSClient {
public:
PsLocalClient() {}
virtual ~PsLocalClient() { _running = false; }
virtual int32_t create_client2client_connection(int pslib_timeout_ms,
int pslib_connect_timeout_ms,
int max_retry) {
return 0;
}
virtual ::std::future<int32_t> shrink(uint32_t table_id,
const std::string threshold) override;
virtual ::std::future<int32_t> load(const std::string& epoch,
const std::string& mode) override;
virtual ::std::future<int32_t> load(uint32_t table_id,
const std::string& epoch,
const std::string& mode) override;
virtual ::std::future<int32_t> save(const std::string& epoch,
const std::string& mode) override;
virtual ::std::future<int32_t> save(uint32_t table_id,
const std::string& epoch,
const std::string& mode) override;
virtual ::std::future<int32_t> clear() override;
virtual ::std::future<int32_t> clear(uint32_t table_id) override;
virtual ::std::future<int32_t> stop_server() override;
virtual void finalize_worker() override {}
virtual ::std::future<int32_t> pull_dense(Region* regions, size_t region_num,
size_t table_id);
virtual ::std::future<int32_t> push_dense(const Region* regions,
size_t region_num, size_t table_id);
virtual ::std::future<int32_t> push_dense_param(const Region* regions,
size_t region_num,
size_t table_id);
virtual ::std::future<int32_t> pull_sparse(float** select_values,
size_t table_id,
const uint64_t* keys, size_t num,
bool is_training) {
std::promise<int32_t> prom;
std::future<int32_t> fut = prom.get_future();
prom.set_value(0);
return fut;
}
virtual ::std::future<int32_t> pull_sparse_ptr(char** select_values,
size_t table_id,
const uint64_t* keys,
size_t num);
virtual ::std::future<int32_t> print_table_stat(uint32_t table_id) {
std::promise<int32_t> prom;
std::future<int32_t> fut = prom.get_future();
prom.set_value(0);
return fut;
}
virtual ::std::future<int32_t> push_sparse(size_t table_id,
const uint64_t* keys,
const float** update_values,
size_t num);
virtual ::std::future<int32_t> flush();
  // server profiler
virtual std::future<int32_t> start_profiler() {
std::promise<int32_t> prom;
std::future<int32_t> fut = prom.get_future();
prom.set_value(0);
return fut;
};
virtual std::future<int32_t> stop_profiler() {
std::promise<int32_t> prom;
std::future<int32_t> fut = prom.get_future();
prom.set_value(0);
return fut;
}
virtual std::future<int32_t> barrier(size_t table_id, uint32_t barrier_type) {
std::promise<int32_t> prom;
std::future<int32_t> fut = prom.get_future();
prom.set_value(0);
return fut;
}
virtual std::future<int32_t> pull_geo_param(size_t table_id,
std::vector<float>* values,
std::vector<uint64_t>* keys,
int pserver_idx) {
std::promise<int32_t> prom;
std::future<int32_t> fut = prom.get_future();
prom.set_value(0);
return fut;
}
virtual std::future<int32_t> push_global_step(int table_id,
int64_t* total_send_data,
void* done) {
std::promise<int32_t> prom;
std::future<int32_t> fut = prom.get_future();
prom.set_value(0);
return fut;
}
// recv table from server and save it in LodTensor
virtual int32_t recv_and_save_table(const uint64_t table_id,
const std::string& path) {
return 0;
}
virtual ::std::future<int32_t> send_client2client_msg(
int msg_type, int to_client_id, const std::string& msg) override {
std::promise<int32_t> prom;
std::future<int32_t> fut = prom.get_future();
prom.set_value(0);
return fut;
}
virtual size_t get_server_nums() { return 1; }
virtual std::future<int32_t> push_dense_raw_gradient(
int table_id, float* total_send_data, size_t total_send_data_size,
void* callback) override;
virtual std::future<int32_t> push_sparse_raw_gradient(
size_t table_id, const uint64_t* keys, const float** update_values,
size_t num, void* callback) override;
virtual std::future<int32_t> push_sparse_raw_gradient_partial(
size_t table_id, const uint64_t* keys, const float** update_values,
uint32_t num, void* done, int pserver_idx) override {
std::promise<int32_t> prom;
std::future<int32_t> fut = prom.get_future();
prom.set_value(0);
return fut;
}
virtual std::future<int32_t> push_sparse_param(size_t table_id,
const uint64_t* keys,
const float** update_values,
size_t num,
void* done) override {
std::promise<int32_t> prom;
std::future<int32_t> fut = prom.get_future();
prom.set_value(0);
return fut;
}
private:
virtual int32_t initialize() override;
std::future<int32_t> done() {
std::shared_ptr<std::promise<int32_t>> prom =
std::make_shared<std::promise<int32_t>>();
std::future<int32_t> fut = prom->get_future();
prom->set_value(0);
return fut;
}
inline uint32_t dense_dim_per_shard(uint32_t dense_dim_total,
uint32_t shard_num) {
return dense_dim_total / shard_num + 1;
}
inline std::unordered_map<uint32_t, std::shared_ptr<Table>>* table() {
return &_table_map;
}
inline Table* table(size_t table_id) {
auto itr = _table_map.find(table_id);
if (itr != _table_map.end()) {
return itr->second.get();
}
LOG(ERROR) << "table not found " << table_id;
return NULL;
}
std::unordered_map<uint32_t, std::shared_ptr<Table>> _table_map;
bool _running = false;
bool _flushing = false;
private:
float _mae = 0;
float _mse = 0;
uint16_t _push_times = 0;
};
}
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -14,22 +14,24 @@
#pragma once
#include <string>
#include <memory>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/distributed/communicator_common.h"
#include "paddle/fluid/distributed/service/server.h"
namespace paddle {
namespace operators {
namespace distributed {
template <typename T>
struct ParameterSend {
void operator()(const CommContext &rpc_ctx, const framework::Scope &scope,
bool sync, int multi_parts);
};
class PsLocalServer : public PSServer {
public:
PsLocalServer() {}
virtual ~PsLocalServer() {}
virtual uint64_t start() { return 0; }
virtual uint64_t start(const std::string& ip, uint32_t port) { return 0; }
virtual int32_t stop() { return 0; }
virtual int32_t port() { return 0; }
}; // namespace distributed
}; // namespace operators
}; // namespace paddle
private:
virtual int32_t initialize() { return 0; }
};
}
}
......@@ -48,6 +48,10 @@ enum PsCmdID {
PS_START_PROFILER = 27;
PS_STOP_PROFILER = 28;
PS_PUSH_GLOBAL_STEP = 29;
PS_PULL_GRAPH_LIST = 30;
PS_GRAPH_SAMPLE_NEIGHBOORS = 31;
PS_GRAPH_SAMPLE_NODES = 32;
PS_GRAPH_GET_NODE_FEAT = 33;
}
message PsRequestMessage {
......@@ -111,4 +115,4 @@ message MultiVariableMessage {
service PsService {
rpc service(PsRequestMessage) returns (PsResponseMessage);
rpc SendAndRecvVariable(MultiVariableMessage) returns (MultiVariableMessage);
};
\ No newline at end of file
};
......@@ -16,13 +16,18 @@
#include "glog/logging.h"
#include "paddle/fluid/distributed/service/brpc_ps_server.h"
#include "paddle/fluid/distributed/service/graph_brpc_server.h"
#include "paddle/fluid/distributed/service/ps_local_server.h"
#include "paddle/fluid/distributed/table/table.h"
namespace paddle {
namespace distributed {
REGISTER_PSCORE_CLASS(PSServer, BrpcPsServer);
REGISTER_PSCORE_CLASS(PSServer, PsLocalServer);
REGISTER_PSCORE_CLASS(PsBaseService, BrpcPsService);
REGISTER_PSCORE_CLASS(PSServer, GraphBrpcServer);
REGISTER_PSCORE_CLASS(PsBaseService, GraphBrpcService);
PSServer *PSServerFactory::create(const PSParameter &ps_config) {
const auto &config = ps_config.server_param();
......
......@@ -47,7 +47,7 @@ paddle::distributed::PSParameter load_from_prototxt(
}
void PSCore::init_gflag(const std::string& gflags) {
LOG(INFO) << "Init With Gflags:" << gflags;
VLOG(3) << "Init With Gflags:" << gflags;
std::vector<std::string> flags = paddle::string::split_string(gflags);
if (flags.size() < 1) {
flags.push_back("-max_body_size=314217728");
......
set_property(GLOBAL PROPERTY TABLE_DEPS string_helper)
set(graphDir graph)
get_property(TABLE_DEPS GLOBAL PROPERTY TABLE_DEPS)
set_source_files_properties(${graphDir}/graph_edge.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_library(graph_edge SRCS ${graphDir}/graph_edge.cc)
set_source_files_properties(${graphDir}/graph_weighted_sampler.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_library(WeightedSampler SRCS ${graphDir}/graph_weighted_sampler.cc DEPS graph_edge)
set_source_files_properties(${graphDir}/graph_node.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_library(graph_node SRCS ${graphDir}/graph_node.cc DEPS WeightedSampler)
set_source_files_properties(common_dense_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(common_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(barrier_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(common_graph_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc DEPS ${TABLE_DEPS} device_context string_helper simple_threadpool xxhash generator)
cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc
sparse_geo_table.cc barrier_table.cc common_graph_table.cc DEPS ${TABLE_DEPS}
${RPC_DEPS} graph_edge graph_node device_context string_helper simple_threadpool xxhash generator)
set_source_files_properties(tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(tensor_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/table/common_graph_table.h"
#include <time.h>
#include <algorithm>
#include <set>
#include <sstream>
#include "paddle/fluid/distributed/common/utils.h"
#include "paddle/fluid/distributed/table/graph/graph_node.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace distributed {
std::vector<Node *> GraphShard::get_batch(int start, int end, int step) {
if (start < 0) start = 0;
std::vector<Node *> res;
for (int pos = start; pos < std::min(end, (int)bucket.size()); pos += step) {
res.push_back(bucket[pos]);
}
return res;
}
size_t GraphShard::get_size() { return bucket.size(); }
GraphNode *GraphShard::add_graph_node(uint64_t id) {
if (node_location.find(id) == node_location.end()) {
node_location[id] = bucket.size();
bucket.push_back(new GraphNode(id));
}
return (GraphNode *)bucket[node_location[id]];
}
FeatureNode *GraphShard::add_feature_node(uint64_t id) {
if (node_location.find(id) == node_location.end()) {
node_location[id] = bucket.size();
bucket.push_back(new FeatureNode(id));
}
return (FeatureNode *)bucket[node_location[id]];
}
void GraphShard::add_neighboor(uint64_t id, uint64_t dst_id, float weight) {
find_node(id)->add_edge(dst_id, weight);
}
Node *GraphShard::find_node(uint64_t id) {
auto iter = node_location.find(id);
return iter == node_location.end() ? nullptr : bucket[iter->second];
}
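// Example param strings accepted by load(): "e>" loads edges as given, "e<"
// loads them reversed, and "n" + node_type (e.g. "nuser", the type name is
// illustrative) loads nodes of that type.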
int32_t GraphTable::load(const std::string &path, const std::string &param) {
bool load_edge = (param[0] == 'e');
bool load_node = (param[0] == 'n');
if (load_edge) {
bool reverse_edge = (param[1] == '<');
return this->load_edges(path, reverse_edge);
}
if (load_node) {
std::string node_type = param.substr(1);
return this->load_nodes(path, node_type);
}
return 0;
}
int32_t GraphTable::get_nodes_ids_by_ranges(
std::vector<std::pair<int, int>> ranges, std::vector<uint64_t> &res) {
int start = 0, end, index = 0, total_size = 0;
res.clear();
std::vector<std::future<std::vector<uint64_t>>> tasks;
// std::string temp = "";
// for(int i = 0;i < shards.size();i++)
// temp+= std::to_string((int)shards[i].get_size()) + " ";
// VLOG(0)<<"range distribution "<<temp;
for (int i = 0; i < shards.size() && index < ranges.size(); i++) {
end = total_size + shards[i].get_size();
start = total_size;
while (start < end && index < ranges.size()) {
if (ranges[index].second <= start)
index++;
else if (ranges[index].first >= end) {
break;
} else {
int first = std::max(ranges[index].first, start);
int second = std::min(ranges[index].second, end);
start = second;
first -= total_size;
second -= total_size;
// VLOG(0)<<" FIND RANGE "<<i<<" "<<first<<" "<<second;
tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue(
[this, first, second, i]() -> std::vector<uint64_t> {
return shards[i].get_ids_by_range(first, second);
}));
}
}
total_size += shards[i].get_size();
}
for (int i = 0; i < tasks.size(); i++) {
auto vec = tasks[i].get();
for (auto &id : vec) {
res.push_back(id);
std::swap(res[rand() % res.size()], res[(int)res.size() - 1]);
}
}
return 0;
}
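// Node files are tab-separated: "node_type\tid\tfeat1\tfeat2...". For example
// (illustrative), the line "user\t37\ta 0.5 0.5" adds node 37 of type "user"
// with a single feature field parsed by parse_feature below.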
int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) {
auto paths = paddle::string::split_string<std::string>(path, ";");
int64_t count = 0;
int64_t valid_count = 0;
for (auto path : paths) {
std::ifstream file(path);
std::string line;
while (std::getline(file, line)) {
count++;
auto values = paddle::string::split_string<std::string>(line, "\t");
if (values.size() < 2) continue;
auto id = std::stoull(values[1]);
size_t shard_id = id % shard_num;
if (shard_id >= shard_end || shard_id < shard_start) {
VLOG(4) << "will not load " << id << " from " << path
<< ", please check id distribution";
continue;
}
if (count % 1000000 == 0) {
VLOG(0) << count << " nodes are loaded from filepath";
}
std::string nt = values[0];
if (nt != node_type) {
continue;
}
size_t index = shard_id - shard_start;
auto node = shards[index].add_feature_node(id);
node->set_feature_size(feat_name.size());
for (size_t slice = 2; slice < values.size(); slice++) {
auto feat = this->parse_feature(values[slice]);
if (feat.first >= 0) {
node->set_feature(feat.first, feat.second);
} else {
VLOG(4) << "Node feature: " << values[slice]
<< " not in feature_map.";
}
}
valid_count++;
}
}
VLOG(0) << valid_count << "/" << count << " nodes in type " << node_type
<< " are loaded successfully in " << path;
return 0;
}
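// Edge files are tab-separated: "src_id\tdst_id" or "src_id\tdst_id\tweight";
// when a weight column is present the table switches to weighted sampling.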
int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
auto paths = paddle::string::split_string<std::string>(path, ";");
int64_t count = 0;
std::string sample_type = "random";
bool is_weighted = false;
int valid_count = 0;
for (auto path : paths) {
std::ifstream file(path);
std::string line;
while (std::getline(file, line)) {
auto values = paddle::string::split_string<std::string>(line, "\t");
count++;
if (values.size() < 2) continue;
auto src_id = std::stoull(values[0]);
auto dst_id = std::stoull(values[1]);
if (reverse_edge) {
std::swap(src_id, dst_id);
}
float weight = 1;
if (values.size() == 3) {
weight = std::stof(values[2]);
sample_type = "weighted";
is_weighted = true;
}
size_t src_shard_id = src_id % shard_num;
if (src_shard_id >= shard_end || src_shard_id < shard_start) {
VLOG(4) << "will not load " << src_id << " from " << path
<< ", please check id distribution";
continue;
}
if (count % 1000000 == 0) {
VLOG(0) << count << " edges are loaded from filepath";
}
size_t index = src_shard_id - shard_start;
shards[index].add_graph_node(src_id)->build_edges(is_weighted);
shards[index].add_neighboor(src_id, dst_id, weight);
valid_count++;
}
}
VLOG(0) << valid_count << "/" << count << " edges are loaded successfully in "
<< path;
  // Build Sampler
for (auto &shard : shards) {
auto bucket = shard.get_bucket();
for (int i = 0; i < bucket.size(); i++) {
bucket[i]->build_sampler(sample_type);
}
}
return 0;
}
Node *GraphTable::find_node(uint64_t id) {
size_t shard_id = id % shard_num;
if (shard_id >= shard_end || shard_id < shard_start) {
return nullptr;
}
size_t index = shard_id - shard_start;
Node *node = shards[index].find_node(id);
return node;
}
uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) {
return node_id % shard_num % shard_num_per_table % task_pool_size_;
}
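// random_sample_nodes draws up to random_sample_nodes_ranges contiguous index
// ranges whose lengths sum to sample_size, places them at random offsets over
// the [0, total_size) node index space (wrapping around start_index), and then
// gathers the covered ids via get_nodes_ids_by_ranges.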
int32_t GraphTable::random_sample_nodes(int sample_size,
std::unique_ptr<char[]> &buffer,
int &actual_size) {
bool need_feature = false;
int total_size = 0;
for (int i = 0; i < shards.size(); i++) {
total_size += shards[i].get_size();
}
if (sample_size > total_size) sample_size = total_size;
int range_num = random_sample_nodes_ranges;
if (range_num > sample_size) range_num = sample_size;
if (sample_size == 0 || range_num == 0) return 0;
std::vector<int> ranges_len, ranges_pos;
int remain = sample_size, last_pos = -1, num;
std::set<int> separator_set;
for (int i = 0; i < range_num - 1; i++) {
while (separator_set.find(num = rand() % (sample_size - 1)) !=
separator_set.end())
;
separator_set.insert(num);
}
for (auto p : separator_set) {
ranges_len.push_back(p - last_pos);
last_pos = p;
}
ranges_len.push_back(sample_size - 1 - last_pos);
remain = total_size - sample_size + range_num;
separator_set.clear();
for (int i = 0; i < range_num; i++) {
while (separator_set.find(num = rand() % remain) != separator_set.end())
;
separator_set.insert(num);
}
int used = 0, index = 0;
last_pos = -1;
for (auto p : separator_set) {
used += p - last_pos - 1;
last_pos = p;
ranges_pos.push_back(used);
used += ranges_len[index++];
}
std::vector<std::pair<int, int>> first_half, second_half;
int start_index = rand() % total_size;
for (int i = 0; i < ranges_len.size() && i < ranges_pos.size(); i++) {
if (ranges_pos[i] + ranges_len[i] - 1 + start_index < total_size)
first_half.push_back({ranges_pos[i] + start_index,
ranges_pos[i] + ranges_len[i] + start_index});
else if (ranges_pos[i] + start_index >= total_size) {
second_half.push_back(
{ranges_pos[i] + start_index - total_size,
ranges_pos[i] + ranges_len[i] + start_index - total_size});
} else {
first_half.push_back({ranges_pos[i] + start_index, total_size});
second_half.push_back(
{0, ranges_pos[i] + ranges_len[i] + start_index - total_size});
}
}
for (auto &pair : first_half) second_half.push_back(pair);
std::vector<uint64_t> res;
get_nodes_ids_by_ranges(second_half, res);
actual_size = res.size() * sizeof(uint64_t);
buffer.reset(new char[actual_size]);
char *pointer = buffer.get();
memcpy(pointer, res.data(), actual_size);
return 0;
}
int32_t GraphTable::random_sample_neighboors(
uint64_t *node_ids, int sample_size,
std::vector<std::unique_ptr<char[]>> &buffers,
std::vector<int> &actual_sizes) {
size_t node_num = buffers.size();
std::vector<std::future<int>> tasks;
for (size_t idx = 0; idx < node_num; ++idx) {
uint64_t &node_id = node_ids[idx];
std::unique_ptr<char[]> &buffer = buffers[idx];
int &actual_size = actual_sizes[idx];
tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue(
[&]() -> int {
Node *node = find_node(node_id);
if (node == nullptr) {
actual_size = 0;
return 0;
}
std::vector<int> res = node->sample_k(sample_size);
actual_size = res.size() * (Node::id_size + Node::weight_size);
int offset = 0;
uint64_t id;
float weight;
char *buffer_addr = new char[actual_size];
buffer.reset(buffer_addr);
for (int &x : res) {
id = node->get_neighbor_id(x);
weight = node->get_neighbor_weight(x);
memcpy(buffer_addr + offset, &id, Node::id_size);
offset += Node::id_size;
memcpy(buffer_addr + offset, &weight, Node::weight_size);
offset += Node::weight_size;
}
return 0;
}));
}
for (size_t idx = 0; idx < node_num; ++idx) {
tasks[idx].get();
}
return 0;
}
int32_t GraphTable::get_node_feat(const std::vector<uint64_t> &node_ids,
const std::vector<std::string> &feature_names,
std::vector<std::vector<std::string>> &res) {
size_t node_num = node_ids.size();
std::vector<std::future<int>> tasks;
for (size_t idx = 0; idx < node_num; ++idx) {
uint64_t node_id = node_ids[idx];
tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue(
[&, idx, node_id]() -> int {
Node *node = find_node(node_id);
if (node == nullptr) {
return 0;
}
for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) {
const std::string &feature_name = feature_names[feat_idx];
if (feat_id_map.find(feature_name) != feat_id_map.end()) {
// res[feat_idx][idx] =
// node->get_feature(feat_id_map[feature_name]);
auto feat = node->get_feature(feat_id_map[feature_name]);
res[feat_idx][idx] = feat;
}
}
return 0;
}));
}
for (size_t idx = 0; idx < node_num; ++idx) {
tasks[idx].get();
}
return 0;
}
std::pair<int32_t, std::string> GraphTable::parse_feature(
std::string feat_str) {
  // Return (feat_id, bytes) if the name is in this->feat_name, else return
  // (-1, "")
auto fields = paddle::string::split_string<std::string>(feat_str, " ");
if (this->feat_id_map.count(fields[0])) {
int32_t id = this->feat_id_map[fields[0]];
std::string dtype = this->feat_dtype[id];
int32_t shape = this->feat_shape[id];
std::vector<std::string> values(fields.begin() + 1, fields.end());
if (dtype == "feasign") {
return std::make_pair<int32_t, std::string>(
int32_t(id), paddle::string::join_strings(values, ' '));
} else if (dtype == "string") {
return std::make_pair<int32_t, std::string>(
int32_t(id), paddle::string::join_strings(values, ' '));
} else if (dtype == "float32") {
return std::make_pair<int32_t, std::string>(
int32_t(id), FeatureNode::parse_value_to_bytes<float>(values));
} else if (dtype == "float64") {
return std::make_pair<int32_t, std::string>(
int32_t(id), FeatureNode::parse_value_to_bytes<double>(values));
} else if (dtype == "int32") {
return std::make_pair<int32_t, std::string>(
int32_t(id), FeatureNode::parse_value_to_bytes<int32_t>(values));
} else if (dtype == "int64") {
return std::make_pair<int32_t, std::string>(
int32_t(id), FeatureNode::parse_value_to_bytes<int64_t>(values));
}
}
return std::make_pair<int32_t, std::string>(-1, "");
}
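// Illustrative note, not part of the original file: parse_feature expects a
// space-separated string whose first token is a registered feature name and
// whose remaining tokens are that feature's values. For a hypothetical config
// that registers feature "a" with dtype "float32",
//
//   auto parsed = table.parse_feature("a 0.5 1.5");
//
// would presumably yield {feat_id_map["a"],
// FeatureNode::parse_value_to_bytes<float>({"0.5", "1.5"})}, while an
// unregistered name falls through to the (-1, "") pair above.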
int32_t GraphTable::pull_graph_list(int start, int total_size,
std::unique_ptr<char[]> &buffer,
int &actual_size, bool need_feature,
int step) {
if (start < 0) start = 0;
int size = 0, cur_size;
std::vector<std::future<std::vector<Node *>>> tasks;
for (size_t i = 0; i < shards.size() && total_size > 0; i++) {
cur_size = shards[i].get_size();
if (size + cur_size <= start) {
size += cur_size;
continue;
}
int count = std::min(1 + (size + cur_size - start - 1) / step, total_size);
int end = start + (count - 1) * step + 1;
tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue(
[this, i, start, end, step, size]() -> std::vector<Node *> {
return this->shards[i].get_batch(start - size, end - size, step);
}));
start += count * step;
total_size -= count;
size += cur_size;
}
for (size_t i = 0; i < tasks.size(); ++i) {
tasks[i].wait();
}
size = 0;
std::vector<std::vector<Node *>> res;
for (size_t i = 0; i < tasks.size(); i++) {
res.push_back(tasks[i].get());
for (size_t j = 0; j < res.back().size(); j++) {
size += res.back()[j]->get_size(need_feature);
}
}
char *buffer_addr = new char[size];
buffer.reset(buffer_addr);
int index = 0;
for (size_t i = 0; i < res.size(); i++) {
for (size_t j = 0; j < res[i].size(); j++) {
res[i][j]->to_buffer(buffer_addr + index, need_feature);
index += res[i][j]->get_size(need_feature);
}
}
actual_size = size;
return 0;
}
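// Illustrative walk-through, not part of the original file, assuming
// GraphShard::get_batch(start, end, step) returns the bucket entries at
// positions start, start + step, ... below end. With two shards of size 5,
// start = 3, step = 2 and total_size = 4, the loop above does:
//   shard 0: count = min(1 + (0 + 5 - 3 - 1) / 2, 4) = 1,
//            get_batch(3, 4, 2) -> global position 3; start becomes 5.
//   shard 1: count = min(1 + (5 + 5 - 5 - 1) / 2, 3) = 3,
//            get_batch(0, 5, 2) -> global positions 5, 7, 9.
// so the nodes at positions 3, 5, 7 and 9 are serialized into the buffer.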
int32_t GraphTable::initialize() {
_shards_task_pool.resize(task_pool_size_);
for (size_t i = 0; i < _shards_task_pool.size(); ++i) {
_shards_task_pool[i].reset(new ::ThreadPool(1));
}
server_num = _shard_num;
// VLOG(0) << "in init graph table server num = " << server_num;
  /*
  _shard_num is actually the server count here: when a server initializes its
  tables, it sets each table's _shard_num to server_num and _shard_idx to the
  server's rank.
  */
auto common = _config.common();
this->table_name = common.table_name();
this->table_type = common.name();
VLOG(0) << " init graph table type " << this->table_type << " table name "
<< this->table_name;
int feat_conf_size = static_cast<int>(common.attributes().size());
for (int i = 0; i < feat_conf_size; i++) {
auto &f_name = common.attributes()[i];
auto &f_shape = common.dims()[i];
auto &f_dtype = common.params()[i];
this->feat_name.push_back(f_name);
this->feat_shape.push_back(f_shape);
this->feat_dtype.push_back(f_dtype);
this->feat_id_map[f_name] = i;
VLOG(0) << "init graph table feat conf name:" << f_name
<< " shape:" << f_shape << " dtype:" << f_dtype;
}
shard_num = _config.shard_num();
VLOG(0) << "in init graph table shard num = " << shard_num << " shard_idx"
<< _shard_idx;
shard_num_per_table = sparse_local_shard_num(shard_num, server_num);
shard_start = _shard_idx * shard_num_per_table;
shard_end = shard_start + shard_num_per_table;
VLOG(0) << "in init graph table shard idx = " << _shard_idx << " shard_start "
<< shard_start << " shard_end " << shard_end;
// shards.resize(shard_num_per_table);
shards = std::vector<GraphShard>(shard_num_per_table, GraphShard(shard_num));
return 0;
}
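// Illustrative example, not part of the original file: assuming
// sparse_local_shard_num() is the usual per-server shard count (shard_num
// divided over server_num), a config with shard_num = 16, server_num = 4 and
// _shard_idx = 1 gives shard_num_per_table = 4, shard_start = 4 and
// shard_end = 8, i.e. this server owns the global shards [4, 8).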
}  // namespace distributed
};  // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <ThreadPool.h>
#include <assert.h>
#include <pthread.h>
#include <list>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/distributed/table/accessor.h"
#include "paddle/fluid/distributed/table/common_table.h"
#include "paddle/fluid/distributed/table/graph/graph_node.h"
#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace distributed {
class GraphShard {
public:
size_t get_size();
GraphShard() {}
GraphShard(int shard_num) { this->shard_num = shard_num; }
std::vector<Node *> &get_bucket() { return bucket; }
std::vector<Node *> get_batch(int start, int end, int step);
std::vector<uint64_t> get_ids_by_range(int start, int end) {
std::vector<uint64_t> res;
for (int i = start; i < end && i < bucket.size(); i++) {
res.push_back(bucket[i]->get_id());
}
return res;
}
GraphNode *add_graph_node(uint64_t id);
FeatureNode *add_feature_node(uint64_t id);
Node *find_node(uint64_t id);
void add_neighboor(uint64_t id, uint64_t dst_id, float weight);
std::unordered_map<uint64_t, int> get_node_location() {
return node_location;
}
private:
std::unordered_map<uint64_t, int> node_location;
int shard_num;
std::vector<Node *> bucket;
};
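// Illustrative note, not part of the original file: a GraphShard stores its
// nodes contiguously in `bucket` and presumably records each id's position in
// `node_location`, so id-based lookups such as find_node() stay O(1) while
// get_batch() and get_ids_by_range() iterate by position.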
class GraphTable : public SparseTable {
public:
GraphTable() {}
virtual ~GraphTable() {}
virtual int32_t pull_graph_list(int start, int size,
std::unique_ptr<char[]> &buffer,
int &actual_size, bool need_feature,
int step);
virtual int32_t random_sample_neighboors(
uint64_t *node_ids, int sample_size,
std::vector<std::unique_ptr<char[]>> &buffers,
std::vector<int> &actual_sizes);
int32_t random_sample_nodes(int sample_size, std::unique_ptr<char[]> &buffers,
int &actual_sizes);
virtual int32_t get_nodes_ids_by_ranges(
std::vector<std::pair<int, int>> ranges, std::vector<uint64_t> &res);
virtual int32_t initialize();
int32_t load(const std::string &path, const std::string &param);
int32_t load_edges(const std::string &path, bool reverse);
int32_t load_nodes(const std::string &path, std::string node_type);
Node *find_node(uint64_t id);
virtual int32_t pull_sparse(float *values,
const PullSparseValue &pull_value) {
return 0;
}
virtual int32_t push_sparse(const uint64_t *keys, const float *values,
size_t num) {
return 0;
}
virtual void clear() {}
virtual int32_t flush() { return 0; }
virtual int32_t shrink(const std::string &param) { return 0; }
  // Save to the specified path; currently a no-op for the graph table.
virtual int32_t save(const std::string &path, const std::string &converter) {
return 0;
}
virtual int32_t initialize_shard() { return 0; }
virtual uint32_t get_thread_pool_index(uint64_t node_id);
virtual std::pair<int32_t, std::string> parse_feature(std::string feat_str);
virtual int32_t get_node_feat(const std::vector<uint64_t> &node_ids,
const std::vector<std::string> &feature_names,
std::vector<std::vector<std::string>> &res);
protected:
std::vector<GraphShard> shards;
size_t shard_start, shard_end, server_num, shard_num_per_table, shard_num;
const int task_pool_size_ = 24;
const int random_sample_nodes_ranges = 3;
std::vector<std::string> feat_name;
std::vector<std::string> feat_dtype;
std::vector<int32_t> feat_shape;
std::unordered_map<std::string, int32_t> feat_id_map;
std::string table_name;
std::string table_type;
std::vector<std::shared_ptr<::ThreadPool>> _shards_task_pool;
};
} // namespace distributed
}; // namespace paddle
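// Illustrative usage sketch, not part of the original file: once the
// parameter-server framework has configured and initialize()d a GraphTable, a
// caller could exercise the interfaces declared above roughly like this (all
// local names are hypothetical):
//
//   std::unique_ptr<char[]> list_buf;
//   int list_size = 0;
//   table->pull_graph_list(/*start=*/0, /*size=*/128, list_buf, list_size,
//                          /*need_feature=*/false, /*step=*/1);
//
//   uint64_t ids[2] = {10, 20};
//   std::vector<std::unique_ptr<char[]>> sample_bufs(2);
//   std::vector<int> sample_sizes(2);
//   table->random_sample_neighboors(ids, /*sample_size=*/5, sample_bufs,
//                                   sample_sizes);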