Commit 5b5941c7 authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into move_sgd_to_phi

Too many changes to display.

To preserve performance, only 1000 of 1000+ files are displayed.
......@@ -20,6 +20,7 @@ build/
build_doc/
*.user
*.tmp
*.pyc
.vscode
.idea
......
......@@ -25,7 +25,7 @@ repos:
description: Format files with ClangFormat.
entry: bash ./tools/codestyle/clang_format.hook -i
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps)$
- repo: local
hooks:
- id: cpplint-cpp-source
......@@ -48,7 +48,7 @@ repos:
name: copyright_checker
entry: python ./tools/codestyle/copyright.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py|sh)$
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps|py|sh)$
exclude: |
(?x)^(
paddle/utils/.*
......
......@@ -330,6 +330,7 @@ if(WITH_BRPC_RDMA)
endif()
endif()
if(WITH_GPU)
include(cuda)
# lite subgraph compilation depends on CUDNN_ROOT,
......
......@@ -99,7 +99,7 @@ endfunction()
function(mlir_add_rewriter td_base)
set(LLVM_TARGET_DEFINITIONS ${td_base}.td)
mlir_tablegen(${td_base}.hpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass")
mlir_tablegen(${td_base}.cpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass")
add_public_tablegen_target(${td_base}_IncGen)
add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen)
endfunction()
......
......@@ -36,7 +36,7 @@ ENDIF()
if(NOT DEFINED XPU_BASE_URL)
SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220215")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220219")
else()
SET(XPU_BASE_URL "${XPU_BASE_URL}")
endif()
......
......@@ -116,19 +116,19 @@ function(find_fluid_modules TARGET_NAME)
endif()
endfunction(find_fluid_modules)
set_property(GLOBAL PROPERTY PTEN_MODULES "")
# find all pten modules used for the paddle static library
set_property(GLOBAL PROPERTY PHI_MODULES "")
# find all phi modules used for the paddle static library
# for building inference libs
function(find_pten_modules TARGET_NAME)
function(find_phi_modules TARGET_NAME)
get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
string(FIND "${__target_path}" "phi" pos)
if(pos GREATER 1)
get_property(pten_modules GLOBAL PROPERTY PTEN_MODULES)
set(pten_modules ${pten_modules} ${TARGET_NAME})
set_property(GLOBAL PROPERTY PTEN_MODULES "${pten_modules}")
get_property(phi_modules GLOBAL PROPERTY PHI_MODULES)
set(phi_modules ${phi_modules} ${TARGET_NAME})
set_property(GLOBAL PROPERTY PHI_MODULES "${phi_modules}")
endif()
endfunction(find_pten_modules)
endfunction(find_phi_modules)
function(common_link TARGET_NAME)
if (WITH_PROFILER)
......@@ -324,7 +324,7 @@ function(cc_library TARGET_NAME)
else()
add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
find_fluid_modules(${TARGET_NAME})
find_pten_modules(${TARGET_NAME})
find_phi_modules(${TARGET_NAME})
endif()
if(cc_library_DEPS)
# Don't need link libwarpctc.so
......@@ -497,7 +497,7 @@ function(nv_library TARGET_NAME)
else()
add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
find_fluid_modules(${TARGET_NAME})
find_pten_modules(${TARGET_NAME})
find_phi_modules(${TARGET_NAME})
endif()
if (nv_library_DEPS)
add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
......@@ -588,7 +588,7 @@ function(hip_library TARGET_NAME)
else()
hip_add_library(${TARGET_NAME} STATIC ${hip_library_SRCS})
find_fluid_modules(${TARGET_NAME})
find_pten_modules(${TARGET_NAME})
find_phi_modules(${TARGET_NAME})
endif()
if (hip_library_DEPS)
add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
......
......@@ -224,7 +224,7 @@ copy(inference_lib_dist
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/crypto/)
include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io)
# copy api headers for pten & custom op
# copy api headers for phi & custom op
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/api/ext/*.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext/)
......@@ -244,11 +244,11 @@ copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/extension.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
# the header file of pten is copied to the experimental directory,
# the include path of pten needs to be changed to adapt to inference api path
# the header file of phi is copied to the experimental directory,
# the include path of phi needs to be changed to adapt to inference api path
add_custom_command(TARGET inference_lib_dist POST_BUILD
COMMAND ${CMAKE_COMMAND} -P "${PADDLE_SOURCE_DIR}/cmake/pten_header.cmake"
COMMENT "Change pten header include path to adapt to inference api path")
COMMAND ${CMAKE_COMMAND} -P "${PADDLE_SOURCE_DIR}/cmake/phi_header.cmake"
COMMENT "Change phi header include path to adapt to inference api path")
# CAPI inference library for only inference
set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING
......
......@@ -73,6 +73,12 @@ function(op_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu)
endif()
# rename in KP: .kps -> .cu
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps)
file(COPY ${TARGET}.kps DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.kps ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu)
endif()
if (WITH_NV_JETSON)
list(REMOVE_ITEM cu_srcs "decode_jpeg_op.cu")
endif()
......@@ -96,6 +102,12 @@ function(op_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND hip_srcs ${TARGET}.cu)
endif()
# rename in KP: .kps -> .cu
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps)
file(COPY ${TARGET}.kps DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.kps ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu)
list(APPEND hip_srcs ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
......@@ -125,6 +137,9 @@ function(op_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.xpu)
list(APPEND xpu_kp_cc_srcs ${TARGET}.xpu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps)
list(APPEND xpu_kp_cc_srcs ${TARGET}.kps)
endif()
endif()
if(WITH_ASCEND_CL)
string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}")
......@@ -162,6 +177,8 @@ function(op_library TARGET)
list(APPEND xpu_cc_srcs ${src})
elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.xpu$")
list(APPEND xpu_kp_cc_srcs ${src})
elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.kps$")
list(APPEND xpu_kp_cc_srcs ${src})
elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
list(APPEND npu_cc_srcs ${src})
elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$")
......@@ -384,7 +401,15 @@ function(op_library TARGET)
# pybind USE_OP_DEVICE_KERNEL for XPU KP
if (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, KP);\n")
foreach(xpu_kp_src ${xpu_kp_cc_srcs})
set(op_name "")
find_register(${xpu_kp_src} "REGISTER_OP_KERNEL" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, KP);\n")
message(STATUS "Building KP Target: ${op_name}")
set(pybind_flag 1)
endif()
endforeach()
endif()
# pybind USE_OP_DEVICE_KERNEL for NPU
......
......@@ -51,33 +51,41 @@ function(generate_unify_header DIR_NAME)
endforeach()
# append header into extension.h
string(REPLACE "${PADDLE_SOURCE_DIR}\/" "" header_file "${header_file}")
file(APPEND ${pten_extension_header_file} "#include \"${header_file}\"\n")
file(APPEND ${phi_extension_header_file} "#include \"${header_file}\"\n")
endfunction()
# callers of kernel_declare need to make sure the input target exists
function(kernel_declare TARGET_LIST)
foreach(kernel_path ${TARGET_LIST})
file(READ ${kernel_path} kernel_impl)
# TODO(chenweihang): rename PT_REGISTER_KERNEL to PT_REGISTER_KERNEL
# NOTE(chenweihang): for now, we don't recommend using digits in kernel names
string(REGEX MATCH "(PT_REGISTER_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}")
string(REGEX MATCH "(PD_REGISTER_KERNEL|PD_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*,[ \t\r\n\/]*[a-z0-9_]*" first_registry "${kernel_impl}")
if (NOT first_registry STREQUAL "")
# some gpu kernel only can run on cuda, not support rocm, so we add this branch
if (WITH_ROCM)
string(FIND "${first_registry}" "cuda_only" pos)
if(pos GREATER 1)
continue()
endif()
endif()
# parse the first kernel name
string(REPLACE "PT_REGISTER_KERNEL(" "" kernel_name "${first_registry}")
string(REPLACE "PT_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}")
string(REPLACE "PD_REGISTER_KERNEL(" "" kernel_name "${first_registry}")
string(REPLACE "PD_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}")
string(REPLACE "," "" kernel_name "${kernel_name}")
string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}")
string(REGEX REPLACE "//cuda_only" "" kernel_name "${kernel_name}")
# append kernel declare into declarations.h
# TODO(chenweihang): default declare ALL_LAYOUT for each kernel
if (${kernel_path} MATCHES "./cpu\/")
file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
elseif (${kernel_path} MATCHES "./gpu\/")
file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n")
file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n")
elseif (${kernel_path} MATCHES "./xpu\/")
file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n")
file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n")
elseif (${kernel_path} MATCHES "./gpudnn\/")
file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n")
else ()
# deal with device-independent kernels; we use CPU temporarily for now
file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
endif()
endif()
endforeach()
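For reference, a hedged sketch of the registration/declaration pair this regex handles (the kernel name scale, the function phi::ScaleKernel, and the dtype list are hypothetical, used only to illustrate the pattern):
// In a kernel source such as paddle/phi/kernels/cpu/scale_kernel.cc (illustrative):
PD_REGISTER_KERNEL(scale, CPU, ALL_LAYOUT, phi::ScaleKernel, float, double) {}
// kernel_declare() matches the first registration in the file and appends the
// corresponding declaration to the generated declarations.h:
PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT);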
......@@ -88,6 +96,7 @@ function(kernel_library TARGET)
set(cpu_srcs)
set(gpu_srcs)
set(xpu_srcs)
set(gpudnn_srcs)
set(selected_rows_srcs)
# parse and save the deps kernel targets
set(all_srcs)
......@@ -95,6 +104,8 @@ function(kernel_library TARGET)
set(oneValueArgs SUB_DIR)
set(multiValueArgs SRCS DEPS)
set(target_build_flag 1)
cmake_parse_arguments(kernel_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
......@@ -117,6 +128,9 @@ function(kernel_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc)
list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu)
list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu)
endif()
endif()
if (WITH_XPU)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc)
......@@ -135,6 +149,7 @@ function(kernel_library TARGET)
list(APPEND all_srcs ${cpu_srcs})
list(APPEND all_srcs ${gpu_srcs})
list(APPEND all_srcs ${xpu_srcs})
list(APPEND all_srcs ${gpudnn_srcs})
foreach(src ${all_srcs})
file(READ ${src} target_content)
string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
......@@ -160,21 +175,22 @@ function(kernel_library TARGET)
list(LENGTH cpu_srcs cpu_srcs_len)
list(LENGTH gpu_srcs gpu_srcs_len)
list(LENGTH xpu_srcs xpu_srcs_len)
list(LENGTH gpudnn_srcs gpudnn_srcs_len)
list(LENGTH selected_rows_srcs selected_rows_srcs_len)
# Build Target according different src organization
if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
${xpu_srcs_len} GREATER 0) AND (${common_srcs_len} GREATER 0 OR
${selected_rows_srcs_len} GREATER 0))
${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) AND
(${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0))
# If the common_srcs/selected_rows_srcs depend on specific device srcs, build the target using this rule.
if (WITH_GPU)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
nv_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
endif()
elseif (WITH_ROCM)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
endif()
else()
......@@ -184,14 +200,14 @@ function(kernel_library TARGET)
endif()
endif()
# If there are only specific device srcs, build target using this rule.
elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
if (WITH_GPU)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
elseif (WITH_ROCM)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
else()
if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
......@@ -228,35 +244,40 @@ function(kernel_library TARGET)
cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
else()
message(FATAL_ERROR "Cannot find any implementation for ${TARGET}")
set(target_build_flag 0)
endif()
if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR
${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR
${selected_rows_srcs_len} GREATER 0)
# append target into PTEN_KERNELS property
get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
set(pten_kernels ${pten_kernels} ${TARGET})
set_property(GLOBAL PROPERTY PTEN_KERNELS ${pten_kernels})
endif()
if (${target_build_flag} EQUAL 1)
if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR
${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR
${gpudnn_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)
# append target into PHI_KERNELS property
get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
set(phi_kernels ${phi_kernels} ${TARGET})
set_property(GLOBAL PROPERTY PHI_KERNELS ${phi_kernels})
endif()
# parse kernel name and auto generate kernel declaration
# here, we don't need to check WITH_XXX, because if not WITH_XXX, the
# xxx_srcs_len will be equal to 0
if (${common_srcs_len} GREATER 0)
kernel_declare(${common_srcs})
endif()
if (${cpu_srcs_len} GREATER 0)
kernel_declare(${cpu_srcs})
endif()
if (${gpu_srcs_len} GREATER 0)
kernel_declare(${gpu_srcs})
endif()
if (${xpu_srcs_len} GREATER 0)
kernel_declare(${xpu_srcs})
endif()
if (${selected_rows_srcs_len} GREATER 0)
kernel_declare(${selected_rows_srcs})
# parse kernel name and auto generate kernel declaration
# here, we don't need to check WITH_XXX, because if not WITH_XXX, the
# xxx_srcs_len will be equal to 0
if (${common_srcs_len} GREATER 0)
kernel_declare(${common_srcs})
endif()
if (${cpu_srcs_len} GREATER 0)
kernel_declare(${cpu_srcs})
endif()
if (${gpu_srcs_len} GREATER 0)
kernel_declare(${gpu_srcs})
endif()
if (${xpu_srcs_len} GREATER 0)
kernel_declare(${xpu_srcs})
endif()
if (${gpudnn_srcs_len} GREATER 0)
kernel_declare(${gpudnn_srcs})
endif()
if (${selected_rows_srcs_len} GREATER 0)
kernel_declare(${selected_rows_srcs})
endif()
endif()
endfunction()
......@@ -285,9 +306,9 @@ endfunction()
function(append_op_util_declare TARGET)
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET} target_content)
string(REGEX MATCH "(PT_REGISTER_BASE_KERNEL_NAME|PT_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}")
string(REPLACE "PT_REGISTER_ARG_MAPPING_FN" "PT_DECLARE_ARG_MAPPING_FN" util_declare "${util_registrar}")
string(REPLACE "PT_REGISTER_BASE_KERNEL_NAME" "PT_DECLARE_BASE_KERNEL_NAME" util_declare "${util_declare}")
string(REGEX MATCH "(PD_REGISTER_BASE_KERNEL_NAME|PD_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}")
string(REPLACE "PD_REGISTER_ARG_MAPPING_FN" "PD_DECLARE_ARG_MAPPING_FN" util_declare "${util_registrar}")
string(REPLACE "PD_REGISTER_BASE_KERNEL_NAME" "PD_DECLARE_BASE_KERNEL_NAME" util_declare "${util_declare}")
string(APPEND util_declare ");\n")
file(APPEND ${op_utils_header} "${util_declare}")
endfunction()
......
......@@ -14,8 +14,8 @@
set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_dir")
function(pten_header_path_compat TARGET_PATH)
message(STATUS "pten header path compat processing: ${TARGET_PATH}")
function(phi_header_path_compat TARGET_PATH)
message(STATUS "phi header path compat processing: ${TARGET_PATH}")
string(FIND ${TARGET_PATH} "experimental" pos)
if (pos GREATER 1)
file(GLOB HEADERS "${TARGET_PATH}/*" "*.h")
......@@ -25,17 +25,17 @@ if (pos GREATER 1)
string(REPLACE "paddle/phi/" "paddle/include/experimental/phi/" HEADER_CONTENT "${HEADER_CONTENT}")
string(REPLACE "paddle/utils/" "paddle/include/experimental/utils/" HEADER_CONTENT "${HEADER_CONTENT}")
file(WRITE ${header} "${HEADER_CONTENT}")
message(STATUS "pten header path compat processing complete: ${header}")
message(STATUS "phi header path compat processing complete: ${header}")
endif()
endforeach()
endif()
endfunction()
pten_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental)
pten_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api)
pten_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext)
pten_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/include)
pten_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common)
phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental)
phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api)
phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext)
phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/include)
phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common)
# In order to be compatible with the original behavior, the header file name needs to be changed
file(RENAME ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/extension.h
......
......@@ -17,7 +17,7 @@ if(NOT WITH_XPU_KP)
endif()
if(NOT XPU_TOOLCHAIN)
set(XPU_TOOLCHAIN /workspace/paddle/xpu-demo/XTDK)
set(XPU_TOOLCHAIN /workspace/output/XTDK-ubuntu_x86_64)
get_filename_component(XPU_TOOLCHAIN ${XPU_TOOLCHAIN} REALPATH)
endif()
if(NOT IS_DIRECTORY ${XPU_TOOLCHAIN})
......@@ -102,7 +102,7 @@ macro(compile_kernel COMPILE_ARGS)
set(XTDK_DIR ${XPU_TOOLCHAIN})
set(CXX_DIR ${HOST_SYSROOT})
set(XPU_CXX_FLAGS -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -DNDEBUG )
set(XPU_CXX_FLAGS -fforce-enable-int128 -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -DNDEBUG )
#include path
get_property(dirs DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES)
......@@ -127,9 +127,11 @@ macro(compile_kernel COMPILE_ARGS)
kernel_build/${kernel_name}.bin.o
COMMAND
${CMAKE_COMMAND} -E make_directory kernel_build
COMMAND
cp ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu
COMMAND
${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES}
-I. -o kernel_build/${kernel_name}.bin.o.sec ${kernel_path}/${kernel_name}.xpu
-I. -o kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.xpu
--xpu-device-only -c -v
COMMAND
${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR}
......@@ -148,9 +150,11 @@ macro(compile_kernel COMPILE_ARGS)
kernel_build/${kernel_name}.host.o
COMMAND
${CMAKE_COMMAND} -E make_directory kernel_build
COMMAND
cp ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu
COMMAND
${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES}
-I. -o kernel_build/${kernel_name}.host.o ${kernel_path}/${kernel_name}.xpu
-I. -o kernel_build/${kernel_name}.host.o kernel_build/${kernel_name}.xpu
--xpu-host-only -c -v
WORKING_DIRECTORY
${CMAKE_CURRENT_BINARY_DIR}
......@@ -185,7 +189,7 @@ macro(xpu_add_library TARGET_NAME)
# Distinguish .xpu file from other files
foreach(cur_xpu_src IN LISTS xpu_srcs_lists)
get_filename_component(language_type_name ${cur_xpu_src} EXT)
if(${language_type_name} STREQUAL ".xpu")
if(${language_type_name} STREQUAL ".kps")
list(APPEND xpu_kernel_lists ${cur_xpu_src})
else()
list(APPEND cc_kernel_lists ${cur_xpu_src})
......
add_subdirectory(collective)
add_subdirectory(store)
if(NOT WITH_PSCORE)
add_subdirectory(fleet_executor)
return()
......
cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api)
if(WITH_NCCL)
cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api)
endif()
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda_runtime.h>
#include <error.h>
#include <string>
#include "boost/variant.hpp"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace distributed {
#define NCCLCHECK(cmd) \
do { \
ncclResult_t r = cmd; \
if (r != ncclSuccess) { \
printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \
platform::dynload::ncclGetErrorString(r)); \
exit(EXIT_FAILURE); \
} \
} while (0)
// NOTE(shenliang03): EventManager is a movable, not copyable, CudaEvent wrapper.
// EventManager is different from paddle::platform::CudaEvent.
// It uses lazy initialization and is only created when the
// Record() method is called for the first time; it also monitors
// device information to ensure that recorded stream and event
// are on the same device.
class EventManager {
public:
EventManager() {}
explicit EventManager(unsigned int flags) : flags_{flags} {}
~EventManager() {
if (is_created_) {
platform::CUDADeviceGuard guard(device_index_);
cudaEventDestroy(event_);
}
}
EventManager(const EventManager&) = delete;
EventManager& operator=(const EventManager&) = delete;
EventManager(EventManager&& other) {
std::swap(flags_, other.flags_);
std::swap(is_created_, other.is_created_);
std::swap(device_index_, other.device_index_);
std::swap(event_, other.event_);
}
EventManager& operator=(EventManager&& other) {
std::swap(flags_, other.flags_);
std::swap(is_created_, other.is_created_);
std::swap(device_index_, other.device_index_);
std::swap(event_, other.event_);
return *this;
}
bool IsCreated() const { return is_created_; }
int8_t DeviceId() const { return device_index_; }
gpuEvent_t GetRawCudaEvent() const { return event_; }
void Record(const paddle::platform::CUDADeviceContext& ctx) {
auto device_index = ctx.GetPlace().device;
if (!is_created_) {
CreateEvent(device_index);
}
PADDLE_ENFORCE_EQ(device_index, device_index_,
platform::errors::PreconditionNotMet(
"CUDADeviceContext's device %d does not match"
"Event's device %d",
device_index, device_index_));
platform::CUDADeviceGuard guard(device_index_);
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, ctx.stream()));
}
bool Query() const {
gpuError_t err = cudaEventQuery(event_);
if (err == cudaSuccess) {
return true;
} else if (err == cudaErrorNotReady) {
return false;
} else {
PADDLE_ENFORCE_GPU_SUCCESS(err);
return false;
}
}
void Synchronize() const {
if (is_created_) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_));
}
}
void Block(const paddle::platform::CUDADeviceContext& ctx) const {
if (is_created_) {
auto device_index = ctx.GetPlace().device;
PADDLE_ENFORCE_EQ(device_index, device_index_,
platform::errors::PreconditionNotMet(
"CUDADeviceContext's device %d does not match"
"Event's device %d",
device_index, device_index_));
platform::CUDADeviceGuard guard(device_index_);
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(ctx.stream(), event_, 0));
}
}
private:
unsigned int flags_ = cudaEventDefault;
bool is_created_{false};
gpuEvent_t event_{};
int8_t device_index_{0};
private:
void CreateEvent(int device_index) {
device_index_ = device_index;
platform::CUDADeviceGuard guard(device_index);
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags(&event_, flags_));
is_created_ = true;
}
};
// NOTE(shenliang03): NCCLCommManager is more lightweight than
// platform::NCCLComm
class NCCLCommManager {
public:
explicit NCCLCommManager(ncclComm_t ncclComm) : nccl_comm_(ncclComm) {}
NCCLCommManager() : NCCLCommManager(nullptr) {}
~NCCLCommManager() noexcept {
std::unique_lock<std::mutex> lock(mutex_);
if (nccl_comm_) {
platform::dynload::ncclCommDestroy(nccl_comm_);
}
}
static std::shared_ptr<NCCLCommManager> Create(int num_ranks, int rank,
ncclUniqueId comm_id) {
auto nccl_manager = std::make_shared<NCCLCommManager>();
NCCLCHECK(platform::dynload::ncclCommInitRank(&(nccl_manager->nccl_comm_),
num_ranks, comm_id, rank));
nccl_manager->nccl_id_ = comm_id;
nccl_manager->rank_ = rank;
return nccl_manager;
}
ncclUniqueId GetNcclId() const {
std::unique_lock<std::mutex> lock(mutex_);
return nccl_id_;
}
ncclComm_t GetNcclComm() const {
std::unique_lock<std::mutex> lock(mutex_);
return nccl_comm_;
}
NCCLCommManager(const NCCLCommManager&) = delete;
NCCLCommManager& operator=(const NCCLCommManager&) = delete;
NCCLCommManager& operator=(NCCLCommManager&& other) = delete;
NCCLCommManager(NCCLCommManager&& other) {
std::unique_lock<std::mutex> lock(other.mutex_);
std::swap(nccl_comm_, other.nccl_comm_);
}
protected:
ncclComm_t nccl_comm_;
ncclUniqueId nccl_id_;
int rank_;
mutable std::mutex mutex_;
};
} // namespace distributed
} // namespace paddle
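A minimal usage sketch for EventManager (assuming a CUDA build; comm_ctx and compute_ctx are hypothetical, already-initialized CUDADeviceContext instances on the same device):
// Record where the communication work was enqueued, then make the compute
// stream wait for it without blocking the host.
paddle::distributed::EventManager event;  // the CUDA event is created lazily on first Record()
event.Record(comm_ctx);    // records on comm_ctx's stream and remembers its device
event.Block(compute_ctx);  // compute_ctx's stream waits on the recorded event
if (event.Query()) {
  // the recorded work has already finished on the device
}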
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
namespace paddle {
namespace distributed {
ProcessGroup::Task::Task(int rank, const std::vector<Tensor>& inputTensors,
CommType comm_type)
: rank_(rank), comm_type_(comm_type) {}
ProcessGroup::Task::~Task() = default;
bool ProcessGroup::Task::IsCompleted() {
std::lock_guard<std::mutex> lock(mutex_);
return is_completed_;
}
bool ProcessGroup::Task::Wait(std::chrono::milliseconds timeout) {
return false;
}
void ProcessGroup::Task::Synchronize() {}
ProcessGroup::ProcessGroup(int rank, int size) : rank_(rank), size_(size) {}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/eager/api/utils/tensor_utils.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/enforce.h"
constexpr auto kWaitTimeout = std::chrono::milliseconds(0);
namespace paddle {
namespace distributed {
using Tensor = paddle::experimental::Tensor;
enum class CommType : std::uint8_t {
BROADCAST = 0,
ALLREDUCE = 1,
ALLREDUCE_SPARSE = 2, // TODO(shenliang03): to support sparse in allreduce
REDUCE = 3,
ALLGATHER = 4,
GATHER = 5,
SCATTER = 6,
REDUCE_SCATTER = 7,
ALLTOALL = 8,
SEND = 9,
RECV = 10,
BARRIER = 11,
UNKNOWN = 100,
};
struct ProcessGroupStrategy {
int nranks_{1};
int local_rank_{0};
std::vector<std::string> trainer_endpoints_{};
std::string current_endpoint_{""};
int nrings_{1};
};
class ProcessGroup {
public:
class Task {
public:
Task(int rank, const std::vector<Tensor>& inputTensors,
CommType opType = CommType::UNKNOWN);
virtual ~Task();
virtual bool IsCompleted();
virtual bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
virtual void Synchronize();
protected:
const int rank_;
CommType comm_type_;
std::mutex mutex_;
bool is_completed_ = false;
};
explicit ProcessGroup(int rank, int size);
virtual ~ProcessGroup() {}
int GetRank() const { return rank_; }
int GetSize() const { return size_; }
virtual const std::string GetBackendName() const = 0;
virtual std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<Tensor>& /* tensors */,
const AllreduceOptions& = AllreduceOptions()) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support allreduce", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<Tensor>& /* tensors */,
const BroadcastOptions& = BroadcastOptions()) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support allreduce", GetBackendName()));
}
protected:
const int rank_;
const int size_;
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
DECLARE_bool(nccl_blocking_wait);
DECLARE_bool(use_stream_safe_cuda_allocator);
constexpr int64_t kWaitBlockTImeout = 10;
namespace paddle {
namespace distributed {
static ncclRedOp_t ToNCCLRedType(ReduceOp reduction) {
static const std::map<ReduceOp, ncclRedOp_t> red_type = {
{ReduceOp::MIN, ncclMin},
{ReduceOp::MAX, ncclMax},
{ReduceOp::SUM, ncclSum},
{ReduceOp::PRODUCT, ncclProd},
};
auto it = red_type.find(reduction);
PADDLE_ENFORCE_EQ(it != red_type.end(), true,
platform::errors::InvalidArgument(
"Invalid nccl reduction. Must be ncclMin | ncclMax | "
"ncclProd | ncclSum"));
return it->second;
}
std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID) {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&ncclID);
std::ostringstream oss;
for (auto i = 0; i < NCCL_UNIQUE_ID_BYTES; ++i) {
oss << std::hex << static_cast<int>(bytes[i]);
}
return oss.str();
}
// Get the list of devices from list of tensors
std::vector<Place> GetPlaceList(const std::vector<Tensor>& tensors) {
std::vector<Place> places;
places.reserve(tensors.size());
for (auto& tensor : tensors) {
places.push_back(tensor.inner_place());
}
return places;
}
// Get the deviceList String from the list of devices
std::string GetKeyFromPlaces(const std::vector<Place>& places) {
std::string placeList;
for (auto& place : places) {
std::stringstream tmp;
tmp << place;
if (placeList.empty()) {
placeList += tmp.str();
} else {
placeList += "," + tmp.str();
}
}
return placeList;
}
bool CheckTensorsInCudaPlace(const std::vector<Tensor>& tensors) {
return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) {
return t.place() == PlaceType::kGPU;
});
}
void SyncDefaultStream(
const std::vector<Place>& places,
std::vector<EventManager>& ncclEvents, // NOLINT
std::vector<std::unique_ptr<CUDADeviceContext>>& dev_ctx) { // NOLINT
for (size_t i = 0; i < places.size(); ++i) {
auto* default_ctx = static_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(places[i]));
ncclEvents[i].Record(*dev_ctx[i]);
ncclEvents[i].Block(*default_ctx);
}
}
std::shared_ptr<ProcessGroupNCCL::NCCLTask> ProcessGroupNCCL::CreateTask(
std::vector<Place> places, int rank, CommType comm_type,
const std::vector<Tensor>& inputs) {
return std::make_shared<ProcessGroupNCCL::NCCLTask>(places, rank, comm_type,
inputs);
}
ProcessGroupNCCL::NCCLTask::NCCLTask(const std::vector<Place>& places, int rank,
CommType CommType,
const std::vector<Tensor>& inputs)
: Task(rank, inputs, CommType), places_(places) {
control_events_.resize(places.size());
ncclComms_.resize(places.size());
}
ProcessGroupNCCL::NCCLTask::~NCCLTask() {}
void ProcessGroupNCCL::NCCLTask::SetOutputs(
std::vector<Tensor>& outputs) { // NOLINT
outputs_ = std::make_shared<std::vector<Tensor>>(outputs);
}
void ProcessGroupNCCL::NCCLTask::SynchronizeStreams() {
for (size_t i = 0; i < places_.size(); ++i) {
auto* default_ctx = static_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(places_[i]));
default_ctx->WaitEvent(control_events_[i].GetRawCudaEvent());
}
}
bool ProcessGroupNCCL::NCCLTask::IsCompleted() {
for (size_t i = 0; i < places_.size(); ++i) {
if (!control_events_[i].Query()) {
return false;
}
}
return true;
}
// TODO(shenliang03): Add timeout for wait; timeout is unused for now
bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) {
SynchronizeStreams();
if (FLAGS_nccl_blocking_wait) {
// NOTE(shenliang03): It will block host for sync
while (!IsCompleted()) {
std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout));
}
}
return true;
}
// Same as Wait
void ProcessGroupNCCL::NCCLTask::Synchronize() { Wait(kWaitTimeout); }
ProcessGroupNCCL::ProcessGroupNCCL(const ProcessGroupStrategy& strategy,
int rank, int size)
: ProcessGroup(rank, size), strategy_(strategy) {}
void ProcessGroupNCCL::BcastNCCLId(
std::vector<ncclUniqueId>& nccl_ids, // NOLINT
int root, int server_fd) {
if (strategy_.local_rank_ == root) {
std::vector<std::string> other_trainers;
for (auto& ep : strategy_.trainer_endpoints_) {
if (ep != strategy_.current_endpoint_) {
other_trainers.push_back(ep);
}
}
platform::SendBroadCastCommID(other_trainers, &nccl_ids);
} else {
platform::RecvBroadCastCommID(server_fd, strategy_.current_endpoint_,
&nccl_ids);
}
}
void ProcessGroupNCCL::BroadcastUniqueNCCLID(
std::vector<ncclUniqueId>& nccl_ids) { // NOLINT
int server_fd = -1;
if (rank_ != 0) {
server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_)
.socket();
}
BcastNCCLId(nccl_ids, 0, server_fd);
}
// create NCCLManager cache for places_key
void ProcessGroupNCCL::CreateNCCLManagerCache(
const std::string& places_key, const std::vector<Place>& places) {
PADDLE_ENFORCE_EQ(places_key.empty(), false,
platform::errors::PreconditionNotMet(
"Not able to create/get the NCCL Communicator since "
"the GPU place are not known"));
std::vector<std::shared_ptr<NCCLCommManager>> nccl_comms;
nccl_comms.resize(places.size());
// using vector just for broadcast
std::vector<ncclUniqueId> nccl_ids;
nccl_ids.resize(1);
auto& nccl_id = nccl_ids.front();
if (rank_ == 0) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id));
}
BroadcastUniqueNCCLID(nccl_ids);
VLOG(3) << "init nccl rank: " << strategy_.local_rank_
<< ", nranks: " << strategy_.nranks_ << ", place: " << places_key
<< ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id);
std::vector<std::unique_ptr<CUDADeviceContext>> dev_ctx;
dev_ctx.resize(places.size());
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (size_t i = 0; i < places.size(); ++i) {
platform::CUDADeviceGuard guard(places[i]);
nccl_comms[i] = NCCLCommManager::Create(GetSize(), GetRank(), nccl_id);
dev_ctx[i].reset(new CUDADeviceContext(places[i]));
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
std::vector<EventManager> events;
events.resize(places.size());
// These caches will be useful to process sync/wait/communicate
places_to_events_.emplace(places_key, std::move(events));
places_to_ncclcomm_.emplace(places_key, std::move(nccl_comms));
places_to_ctx_.emplace(places_key, std::move(dev_ctx));
}
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Collective(
std::vector<Tensor>& inputs, std::vector<Tensor>& outputs, Fn fn,
CommType op_type) {
const auto places = GetPlaceList(inputs);
const auto key = GetKeyFromPlaces(places);
{
std::lock_guard<std::mutex> lock(mutex_);
if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) {
CreateNCCLManagerCache(key, places);
}
}
auto& nccl_comms = places_to_ncclcomm_[key];
SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
auto task = CreateTask(places, rank_, op_type, inputs);
task->SetOutputs(outputs);
// construct an uninitialized guard for the device
platform::CUDADeviceGuard cuda_guard;
if (FLAGS_use_stream_safe_cuda_allocator) {
for (size_t i = 0; i < inputs.size(); ++i) {
cuda_guard.SetDevice(places[i]);
auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(inputs[i].impl());
memory::RecordStream(dense_tensor->Holder(),
places_to_ctx_[key][i]->stream());
}
}
{
platform::NCCLGroupGuard nccl_guard;
for (size_t i = 0; i < inputs.size(); ++i) {
cuda_guard.SetDevice(places[i]);
const auto& nccl_stream = places_to_ctx_[key][i]->stream();
fn(inputs[i], outputs[i], nccl_comms[i]->GetNcclComm(), nccl_stream);
}
}
for (size_t i = 0; i < inputs.size(); ++i) {
cuda_guard.SetDevice(places[i]);
task->control_events_[i].Record(*places_to_ctx_[key][i]);
}
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllReduce(
std::vector<Tensor>& tensors, const AllreduceOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
tensors, tensors,
[&](const Tensor& input, Tensor& output, ncclComm_t comm,
const gpuStream_t& stream) {
auto input_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
auto output_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
return platform::dynload::ncclAllReduce(
input_tensor->data(), output_tensor->data(), input_tensor->numel(),
platform::ToNCCLDataType(input.type()),
ToNCCLRedType(opts.reduce_op), comm, stream);
},
CommType::ALLREDUCE);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Broadcast(
std::vector<Tensor>& tensors, const BroadcastOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
tensors, tensors,
[&](Tensor& input, Tensor& output, ncclComm_t comm,
const gpuStream_t& stream) {
const auto root = opts.source_rank * tensors.size() + opts.source_root;
auto input_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
auto output_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
return platform::dynload::ncclBcast(
input_tensor->data(), input_tensor->numel(),
platform::ToNCCLDataType(input.type()), root, comm, stream);
},
CommType::BROADCAST);
}
} // namespace distributed
} // namespace paddle
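A hedged end-to-end sketch of how the classes above fit together (the world size, rank, endpoints, and gpu_tensor are placeholders; it assumes each process already knows its rank and that the endpoints are reachable so the NCCL unique id can be exchanged):
using paddle::distributed::AllreduceOptions;
using paddle::distributed::ProcessGroupNCCL;
using paddle::distributed::ProcessGroupStrategy;
using paddle::distributed::ReduceOp;

ProcessGroupStrategy strategy;
strategy.nranks_ = 2;         // hypothetical world size
strategy.local_rank_ = rank;  // rank of this process: 0 or 1
strategy.trainer_endpoints_ = {"127.0.0.1:6170", "127.0.0.1:6171"};
strategy.current_endpoint_ = strategy.trainer_endpoints_[rank];

ProcessGroupNCCL pg(strategy, rank, /*size=*/2);

std::vector<Tensor> tensors = {gpu_tensor};  // a GPU-placed paddle::experimental::Tensor
AllreduceOptions opts;
opts.reduce_op = ReduceOp::SUM;
auto task = pg.AllReduce(tensors, opts);  // in-place allreduce on the cached NCCL comm
task->Wait();  // stream-level sync; busy-waits on the host only if FLAGS_nccl_blocking_wait is set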
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/stream/cuda_stream.h"
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/distributed/collective/NCCLTools.h"
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
constexpr const char* NCCL_BACKEND_NAME = "NCCL";
namespace paddle {
namespace distributed {
using Place = paddle::platform::Place;
using CUDAStream = platform::stream::CUDAStream;
using CUDADeviceContext = paddle::platform::CUDADeviceContext;
class ProcessGroupNCCL : public ProcessGroup {
public:
class NCCLTask : public ProcessGroup::Task,
public std::enable_shared_from_this<NCCLTask> {
public:
NCCLTask(const std::vector<Place>& places, int rank, CommType CommType,
const std::vector<Tensor>& inputs);
bool IsCompleted();
void SynchronizeStreams();
bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
void Synchronize();
void SetOutputs(std::vector<Tensor>& outputs); // NOLINT
virtual ~NCCLTask();
std::vector<EventManager> control_events_;
protected:
std::vector<Place> places_;
std::vector<std::shared_ptr<NCCLCommManager>> ncclComms_;
std::shared_ptr<std::vector<Tensor>> outputs_;
private:
};
ProcessGroupNCCL(const ProcessGroupStrategy& strategy, int rank, int size);
const std::string GetBackendName() const override {
return std::string(NCCL_BACKEND_NAME);
}
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<Tensor>& tensors,
const AllreduceOptions& = AllreduceOptions()) override;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<Tensor>& tensors,
const BroadcastOptions& = BroadcastOptions()) override;
protected:
virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
std::vector<Place> places, int rank, CommType opType,
const std::vector<Tensor>& inputs);
protected:
ProcessGroupStrategy strategy_;
std::shared_ptr<NCCLCommManager> nccl_comm_;
std::mutex mutex_;
std::unordered_map<std::string, std::vector<std::shared_ptr<NCCLCommManager>>>
places_to_ncclcomm_;
std::unordered_map<std::string, std::vector<EventManager>> places_to_events_;
std::unordered_map<std::string,
std::vector<std::unique_ptr<CUDADeviceContext>>>
places_to_ctx_;
private:
void BcastNCCLId(std::vector<ncclUniqueId>& nccl_ids, int root, // NOLINT
int server_fd);
void BroadcastUniqueNCCLID(std::vector<ncclUniqueId>& nccl_ids); // NOLINT
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> Collective(
std::vector<Tensor>& inputs, // NOLINT
std::vector<Tensor>& outputs, // NOLINT
Fn fn, CommType op_type);
void CreateNCCLManagerCache(const std::string& places_key,
const std::vector<Place>& places);
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <cstdint>
#include <vector>
namespace paddle {
namespace distributed {
// TODO(shenliang03): To support AVG for reduce
enum class ReduceOp : std::uint8_t { SUM = 0, AVG, MAX, MIN, PRODUCT };
struct AllreduceOptions {
ReduceOp reduce_op = ReduceOp::SUM;
};
struct BroadcastOptions {
int source_rank = 0;
int source_root = 0;
};
} // namespace distributed
} // namespace paddle
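As an illustrative reading of these options against the Broadcast implementation shown earlier (where the NCCL root is computed as source_rank * tensors.size() + source_root):
paddle::distributed::BroadcastOptions bopts;
bopts.source_rank = 1;  // broadcast from rank 1
bopts.source_root = 0;  // first (and, with one tensor per rank, only) device of that rank
// with one tensor per rank this yields root = 1 * 1 + 0 = 1 in the ncclBcast call

paddle::distributed::AllreduceOptions aopts;
aopts.reduce_op = paddle::distributed::ReduceOp::MAX;  // element-wise maximum across ranks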
......@@ -52,6 +52,8 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data,
input_tensor_ptr = input_tensor->mutable_data<float>(dims, place);
} else if (input_data.dtype == DistModelDataType::INT32) {
input_tensor_ptr = input_tensor->mutable_data<int32_t>(dims, place);
} else if (input_data.dtype == DistModelDataType::FLOAT16) {
input_tensor_ptr = input_tensor->mutable_data<float16>(dims, place);
} else {
LOG(ERROR) << "unsupported feed type " << input_data.dtype;
return false;
......@@ -412,6 +414,8 @@ bool DistModel::PrepareFeedAndFetch() {
feeds_to_dtype_.insert({var_name, DistModelDataType::INT32});
} else if (real_var->GetDataType() == framework::proto::VarType::INT64) {
feeds_to_dtype_.insert({var_name, DistModelDataType::INT64});
} else if (real_var->GetDataType() == framework::proto::VarType::FP16) {
feeds_to_dtype_.insert({var_name, DistModelDataType::FLOAT16});
} else {
LOG(ERROR) << "Don't support feed var dtype for: "
<< real_var->GetDataType();
......@@ -503,9 +507,13 @@ bool DistModel::FetchResults(std::vector<DistModelTensor> *output_data,
} else if (type == framework::proto::VarType::INT32) {
rst = FetchResult<int32_t>(fetch, output);
output->dtype = DistModelDataType::INT32;
} else if (type == framework::proto::VarType::FP16) {
rst = FetchResult<float16>(fetch, output);
output->dtype = DistModelDataType::FLOAT16;
} else {
LOG(ERROR) << "DistModel meets unknown fetch data type. DistModel only "
"supports float32, int64 and int32 fetch type for now.";
"supports float32, float16, int64 and int32 fetch type "
"for now.";
}
if (!rst) {
LOG(ERROR) << "DistModel fails to fetch result " << idx_to_fetches_[idx];
......
......@@ -15,6 +15,7 @@
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle {
......@@ -40,6 +41,11 @@ constexpr DistModelDataType DistModelGetDtype<float>() {
return DistModelDataType::FLOAT32;
}
template <>
constexpr DistModelDataType DistModelGetDtype<platform::float16>() {
return DistModelDataType::FLOAT16;
}
class DistModelDataBuf {
public:
explicit DistModelDataBuf(size_t length)
......
......@@ -238,7 +238,7 @@ void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg,
void* tensor_data = tensor->mutable_data(
place,
framework::TransToPtenDataType(VarMessageToVarType(msg.data_type())));
framework::TransToPhiDataType(VarMessageToVarType(msg.data_type())));
// IO Buffer
if (platform::is_cpu_place(place)) {
......@@ -281,7 +281,7 @@ void DeserializeSelectedRows(
tensor->Resize(phi::make_ddim(vec_dim));
void* tensor_data = tensor->mutable_data(
place,
framework::TransToPtenDataType(VarMessageToVarType(msg.data_type())));
framework::TransToPhiDataType(VarMessageToVarType(msg.data_type())));
// IO Buffer
if (platform::is_cpu_place(place)) {
unsigned long data_len; // NOLINT
......
......@@ -31,7 +31,8 @@ struct CommContext {
const std::vector<std::string> &origin_names, int id,
bool merge_add_ = true, bool is_sparse_ = true,
bool is_distributed_ = false, int table_id_ = -1,
bool is_tensor_table_ = false)
bool is_tensor_table_ = false, bool is_datanorm_table_ = false,
int64_t program_id_ = -1)
: var_name(name),
splited_varnames(names),
epmap(emap),
......@@ -42,7 +43,9 @@ struct CommContext {
is_sparse(is_sparse_),
is_distributed(is_distributed_),
table_id(table_id_),
is_tensor_table(is_tensor_table_) {}
program_id(program_id_),
is_tensor_table(is_tensor_table_),
is_datanorm_table(is_datanorm_table_) {}
CommContext(const CommContext &ctx) {
var_name = ctx.var_name;
......@@ -55,7 +58,9 @@ struct CommContext {
origin_varnames = ctx.origin_varnames;
is_distributed = ctx.is_distributed;
table_id = ctx.table_id;
program_id = ctx.program_id;
is_tensor_table = ctx.is_tensor_table;
is_datanorm_table = ctx.is_datanorm_table;
}
std::string print() const {
......@@ -78,7 +83,9 @@ struct CommContext {
ss << " is_sparse: " << is_sparse;
ss << " is_distributed: " << is_distributed << "\n";
ss << " table_id: " << table_id << "\n";
ss << " program_id: " << program_id << "\n";
ss << " is_tensor_table: " << is_tensor_table << "\n";
ss << " is_datanorm_table: " << is_datanorm_table << "\n";
return ss.str();
}
......@@ -93,7 +100,9 @@ struct CommContext {
bool is_sparse;
bool is_distributed;
int table_id;
int64_t program_id;
bool is_tensor_table;
bool is_datanorm_table;
};
} // namespace distributed
......
cc_library(tcp_store SRCS tcp_store.cc tcp_utils.cc DEPS enforce glog)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <iostream>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/store/tcp_utils.h"
namespace paddle {
namespace distributed {
class Store {
public:
Store() = delete;
explicit Store(const std::chrono::seconds& timeout) : _timeout(timeout) {}
virtual ~Store() = default;
virtual int64_t add(const std::string& key, int64_t value) = 0;
virtual std::vector<uint8_t> get(const std::string& key) = 0;
virtual void wait(const std::string& key) = 0;
virtual const std::chrono::seconds& timeout() const { return _timeout; }
private:
std::chrono::seconds _timeout;
};
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <chrono>
#include <iostream>
#include <thread>
#include "paddle/fluid/distributed/store/tcp_store.h"
#include "paddle/fluid/distributed/store/tcp_utils.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace distributed {
namespace detail {
constexpr int INFTIME = -1;
std::unique_ptr<MasterDaemon> MasterDaemon::start(SocketType socket) {
return std::make_unique<MasterDaemon>(socket);
}
MasterDaemon::MasterDaemon(SocketType socket) : _listen_socket(socket) {
_background_thread = std::thread{&MasterDaemon::run, this};
}
MasterDaemon::~MasterDaemon() {
_background_thread.join();
tcputils::close_socket(_listen_socket);
for (SocketType socket : _sockets) {
tcputils::close_socket(socket);
}
}
void MasterDaemon::_do_add(SocketType socket) {
int64_t new_value{};
std::string key = tcputils::receive_string(socket);
new_value = tcputils::receive_value<int64_t>(socket);
std::vector<uint8_t> old_value;
auto it = _store.find(key);
if (it != _store.end()) {
old_value = it->second;
char* buffer = reinterpret_cast<char*>(it->second.data());
size_t len = old_value.size();
new_value += std::stoll(std::string(buffer, len));
}
std::string new_value_str = std::to_string(new_value);
_store[key] =
std::vector<uint8_t>(new_value_str.begin(), new_value_str.end());
VLOG(3) << "TCPStore: new value (" << new_value << ") for key (" << key
<< ").";
tcputils::send_value<int64_t>(socket, new_value);
}
void MasterDaemon::_do_get(SocketType socket) {
std::string key = tcputils::receive_string(socket);
auto iter = _store.find(key);
PADDLE_ENFORCE_NE(
iter, _store.end(),
platform::errors::InvalidArgument("Key %s not found in TCPStore.", key));
std::vector<uint8_t> value = iter->second;
VLOG(3) << "TCPStore: value ("
<< std::stoll(std::string(reinterpret_cast<char*>(value.data()),
value.size()))
<< ") for key (" << key << ").";
tcputils::send_vector<uint8_t>(socket, value);
}
void MasterDaemon::_do_stop(SocketType socket) {
ReplyType value = ReplyType::STOP_WAIT;
_stop = true;
tcputils::send_value<ReplyType>(socket, value);
}
void MasterDaemon::_do_wait(SocketType socket) {
std::string key = tcputils::receive_string(socket);
auto iter = _store.find(key);
auto reply = ReplyType::STOP_WAIT;
if (iter == _store.end()) {
reply = ReplyType::WAITING;
}
VLOG(3) << "TCPStore: wait reply (" << static_cast<int>(reply)
<< ") for key (" << key << ").";
tcputils::send_value<ReplyType>(socket, reply);
}
void MasterDaemon::run() {
std::vector<struct pollfd> fds;
#ifdef _WIN32
fds.push_back({_listen_socket, POLLIN});
#else
fds.push_back({.fd = _listen_socket, .events = POLLIN, .revents = 0});
#endif
while (!_stop) {
for (size_t i = 0; i < fds.size(); i++) {
fds[i].revents = 0;
}
#ifdef _WIN32
::WSAPoll(fds.data(), fds.size(), INFTIME);
#else
::poll(fds.data(), fds.size(), INFTIME);
#endif
if (fds[0].revents != 0) {
auto socket = tcputils::tcp_accept(_listen_socket);
_sockets.emplace_back(socket);
#ifdef _WIN32
fds.push_back({socket, POLLIN});
#else
fds.push_back({.fd = socket, .events = POLLIN, .revents = 0});
#endif
}
for (size_t i = 1; i < fds.size(); i++) {
if (fds[i].revents == 0) {
continue;
}
Command command = tcputils::receive_value<Command>(fds[i].fd);
VLOG(3) << "TCPStore: recv command: " << static_cast<int>(command) << ".";
switch (command) {
case Command::ADD:
_do_add(fds[i].fd);
break;
case Command::GET:
_do_get(fds[i].fd);
break;
case Command::WAIT:
_do_wait(fds[i].fd);
break;
case Command::STOP:
_do_stop(fds[i].fd);
break;
}
}
}
}
std::unique_ptr<TCPServer> TCPServer::create(uint16_t port) {
int socket = tcputils::tcp_listen("", std::to_string(port), AF_INET);
auto server = std::make_unique<TCPServer>();
server->_master_daemon = MasterDaemon::start(socket);
return server;
}
std::unique_ptr<TCPClient> TCPClient::connect(const std::string host,
uint16_t port) {
int socket = tcputils::tcp_connect(host, std::to_string(port), AF_INET);
return std::make_unique<TCPClient>(socket);
}
void TCPClient::send_command_for_key(Command type, const std::string& key) {
tcputils::send_value<Command>(_socket, type);
if (key.empty()) {
return;
}
tcputils::send_string(_socket, key);
}
template <typename T>
void TCPClient::send_value(const T& value) {
tcputils::send_bytes<T>(_socket, &value, 1);
}
template <typename T>
T TCPClient::receive_value() {
T res;
tcputils::receive_bytes<T>(_socket, &res, 1);
return res;
}
template <typename T>
void TCPClient::send_vector(const std::vector<T>& value) {
tcputils::send_vector<T>(_socket, value);
}
template <typename T>
std::vector<T> TCPClient::receive_vector() {
return tcputils::receive_vector<T>(_socket);
}
} // namespace detail
TCPStore::TCPStore(std::string host, uint16_t port, bool is_master,
size_t num_workers, std::chrono::seconds timeout)
: Store(timeout), _is_master(is_master), _num_workers(num_workers) {
if (_is_master) {
_server = detail::TCPServer::create(port);
}
_client = detail::TCPClient::connect(host, port);
waitWorkers();
}
void TCPStore::waitWorkers() {
if (_num_workers == 0) {
return;
}
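// Barrier: every process bumps the shared init counter, then the master
// polls it until the count reaches _num_workers or the timeout expires.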
add(_init_key, 1);
if (_server) {
auto begin = std::chrono::steady_clock::now();
do {
auto value = get(_init_key);
int completed = std::stoi(std::string(value.begin(), value.end()));
VLOG(3) << completed << " workers ready, total " << _num_workers;
if (completed >= _num_workers) {
break;
}
const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
std::chrono::steady_clock::now() - begin);
std::this_thread::sleep_for(std::chrono::milliseconds(100));
if (_timeout != tcputils::kNoTimeout && elapsed > _timeout) {
PADDLE_ENFORCE_EQ(
completed, _num_workers,
platform::errors::InvalidArgument(
"TCPStore timeouted and not all workers got ready."));
}
} while (true);
}
VLOG(3) << "TCPStore initialized.";
}
int64_t TCPStore::add(const std::string& key, int64_t value) {
_client->send_command_for_key(Command::ADD, _key_prefix + key);
_client->send_value<std::int64_t>(value);
return _client->receive_value<std::int64_t>();
}
std::vector<uint8_t> TCPStore::get(const std::string& key) {
wait(key);
_client->send_command_for_key(Command::GET, _key_prefix + key);
VLOG(3) << "TCPStore get.";
return _client->receive_vector<uint8_t>();
}
void TCPStore::wait(const std::string& key) {
ReplyType reply;
do {
_client->send_command_for_key(Command::WAIT, _key_prefix + key);
reply = _client->receive_value<ReplyType>();
if (reply != ReplyType::STOP_WAIT) {
// Only back off while the key is still missing; do not delay the caller
// once the server has confirmed the key exists.
std::this_thread::sleep_for(std::chrono::milliseconds(500));
}
} while (reply != ReplyType::STOP_WAIT);
}
TCPStore::~TCPStore() {
_client->send_command_for_key(Command::STOP, "");
ReplyType ret = _client->receive_value<ReplyType>();
PADDLE_ENFORCE_EQ(ret, ReplyType::STOP_WAIT,
platform::errors::InvalidArgument(
"The reply for TCPStore destructure must be 0."));
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <iostream>
#include <memory>
#include <mutex>
#include <thread>
#include <unordered_map>
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/distributed/store/tcp_utils.h"
namespace paddle {
namespace distributed {
enum class ReplyType { WAITING, STOP_WAIT };
enum class Command { ADD, GET, WAIT, STOP };
namespace detail {
class MasterDaemon {
public:
static std::unique_ptr<MasterDaemon> start(SocketType listen_socket);
MasterDaemon() = delete;
explicit MasterDaemon(SocketType listen_socket);
~MasterDaemon();
private:
void run();
void _do_add(SocketType socket);
void _do_wait(SocketType socket);
void _do_get(SocketType socket);
void _do_stop(SocketType socket);
SocketType _listen_socket;
std::vector<SocketType> _sockets;
std::unordered_map<std::string, std::vector<uint8_t>> _store;
std::thread _background_thread{};
bool _stop = false;
};
class TCPServer {
public:
TCPServer() = default;
static std::unique_ptr<TCPServer> create(std::uint16_t port);
private:
std::unique_ptr<MasterDaemon> _master_daemon;
};
class TCPClient {
public:
explicit TCPClient(SocketType socket) : _socket{socket} {}
static std::unique_ptr<TCPClient> connect(const std::string host,
uint16_t port);
~TCPClient() { tcputils::close_socket(_socket); }
void send_command_for_key(Command type, const std::string& key);
template <typename T>
void send_value(const T& value);
template <typename T>
void send_vector(const std::vector<T>& value);
template <typename T>
std::vector<T> receive_vector();
template <typename T>
T receive_value();
private:
SocketType _socket;
};
} // namespace detail
class TCPStore : public Store {
public:
static constexpr std::uint16_t kDefaultPort = 6170;
explicit TCPStore(std::string host, uint16_t port = kDefaultPort,
bool is_master = false, size_t num_workers = 1,
std::chrono::seconds timeout = tcputils::kDefaultTimeout);
~TCPStore();
int64_t add(const std::string& key, int64_t value) override;
std::vector<uint8_t> get(const std::string& key) override;
void wait(const std::string& key) override;
private:
void waitWorkers();
std::unique_ptr<detail::TCPServer> _server;
std::unique_ptr<detail::TCPClient> _client;
const std::string _init_key = "init/";
const std::string _key_prefix = "/";
std::chrono::seconds _timeout;
bool _is_master;
int _num_workers;
};
} // namespace distributed
} // namespace paddle
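// Illustrative sketch, not part of the sources above: one way two processes
// might rendezvous through the TCPStore declared in this header. The host
// "127.0.0.1", port 6170 and num_workers = 2 are hypothetical example values.
#include "paddle/fluid/distributed/store/tcp_store.h"

void ExampleRendezvous(bool is_master) {
  using paddle::distributed::TCPStore;
  // Both processes block in the constructor until num_workers clients have
  // checked in (see TCPStore::waitWorkers in the implementation above).
  TCPStore store("127.0.0.1", /*port=*/6170, is_master, /*num_workers=*/2);
  if (is_master) {
    // add() atomically increments the 64-bit counter stored under the key.
    store.add("step", 1);
  } else {
    // get() first wait()s until the key exists, then returns its raw bytes.
    std::vector<uint8_t> value = store.get("step");
    (void)value;
  }
}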
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/store/tcp_utils.h"
#include <cerrno>
#include <cstring>
#include <thread>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace distributed {
namespace tcputils {
std::error_code socket_error() {
#ifdef _WIN32
return std::error_code{::WSAGetLastError(), std::generic_category()};
#else
return std::error_code{errno, std::generic_category()};
#endif
}
void close_socket(SocketType socket) {
#ifdef _WIN32
::closesocket(socket);
#else
::close(socket);
#endif
}
::addrinfo* get_addr_info(const std::string host, const std::string port,
int ai_flags, int family) {
::addrinfo hints{}, *res;
hints.ai_flags = ai_flags;
hints.ai_family = family;
hints.ai_socktype = SOCK_STREAM;
const char* node = host.empty() ? nullptr : host.c_str();
int n;
n = ::getaddrinfo(node, port.c_str(), &hints, &res);
const char* gai_err = ::gai_strerror(n);
const char* proto =
(family == AF_INET ? "IPv4" : family == AF_INET6 ? "IPv6" : "");
PADDLE_ENFORCE_EQ(
n, 0, platform::errors::InvalidArgument(
"%s network %s:%s cannot be obtained. Details: %s.", proto,
host, port, gai_err));
return res;
}
void free_addr_info(::addrinfo* hint) {
PADDLE_ENFORCE_NOT_NULL(
hint, platform::errors::InvalidArgument(
"The parameter for free_addr_info cannot be null."));
::freeaddrinfo(hint);
}
SocketType tcp_connect(const std::string host, const std::string port,
int family, std::chrono::seconds timeout) {
int ai_flags = AI_NUMERICSERV | AI_V4MAPPED | AI_ALL;
::addrinfo* res = get_addr_info(host, port, ai_flags, family);
SocketType sockfd = -1;
bool retry = true;
auto deadline = std::chrono::steady_clock::now() + timeout;
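// Try every address returned by the resolver; if none accepts the connection,
// sleep for kDelay and retry the whole list until the deadline passes.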
do {
for (::addrinfo* cur = res; cur != nullptr; cur = cur->ai_next) {
sockfd = ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol);
PADDLE_ENFORCE_GT(sockfd, 0, platform::errors::InvalidArgument(
"Create socket to connect %s:%s failed. "
"Details: %s. ",
host, port, socket_error().message()));
if (::connect(sockfd, cur->ai_addr, cur->ai_addrlen) == 0) {
retry = false;
break;
}
VLOG(0) << "Retry to connect to " << host << ":" << port
<< " while the server is not yet listening.";
close_socket(sockfd);
sockfd = -1;
std::this_thread::sleep_for(kDelay);
if (timeout != kNoTimeout &&
std::chrono::steady_clock::now() >= deadline) {
retry = false;
break;
}
}
if (timeout != kNoTimeout && std::chrono::steady_clock::now() >= deadline) {
retry = false;
}
} while (retry);
free_addr_info(res);
PADDLE_ENFORCE_GT(sockfd, 0,
platform::errors::InvalidArgument(
"Network %s:%s cannot be connected.", host, port));
VLOG(0) << "Successfully connected to " << host << ":" << port;
return sockfd;
}
SocketType tcp_listen(const std::string host, const std::string port,
int family) {
int ai_flags = AI_PASSIVE | AI_NUMERICSERV;
::addrinfo* res = get_addr_info(host, port, ai_flags, family);
::addrinfo* cur = res;
SocketType sockfd{};
std::string node = host.empty() ? "IP_ANY" : host;
while (cur) {
sockfd = ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol);
if (sockfd < 0) {
VLOG(0) << "Cannot create socket on " << node << ":" << port
<< ". Details: " << socket_error().message();
cur = cur->ai_next;
continue;
}
int on = 1;
#ifdef _WIN32
int ret = ::setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR,
reinterpret_cast<char*>(&on), sizeof(on));
#else
int ret = ::setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
#endif
if (ret < 0) {
VLOG(0) << "Set the address reuse option failed on the server.";
}
// Bind to the address entry that produced this socket rather than the
// head of the resolver list.
if (::bind(sockfd, cur->ai_addr, cur->ai_addrlen) == 0) {
break;
}
close_socket(sockfd);
sockfd = -1;
cur = cur->ai_next;
}
PADDLE_ENFORCE_GT(sockfd, 0,
platform::errors::InvalidArgument(
"Bind network on %s:%s failedd.", node, port));
::listen(sockfd, LISTENQ);
VLOG(0) << "The server starts to listen on " << node << ":" << port;
return sockfd;
}
SocketType tcp_accept(SocketType socket) {
::sockaddr_storage addr_s{};
::socklen_t addr_len = sizeof(addr_s);
SocketType new_socket =
::accept(socket, reinterpret_cast<::sockaddr*>(&addr_s), &addr_len);
PADDLE_ENFORCE_GT(
new_socket, 0,
platform::errors::InvalidArgument(
"The server failed to accept a new connection. Details: %s.",
socket_error().message()));
#ifndef _WIN32
::fcntl(new_socket, F_SETFD, FD_CLOEXEC);
#endif
auto value = 1;
#ifdef _WIN32
::setsockopt(new_socket, IPPROTO_TCP, TCP_NODELAY,
reinterpret_cast<const char*>(&value), sizeof(value));
#else
::setsockopt(new_socket, IPPROTO_TCP, TCP_NODELAY, &value, sizeof(value));
#endif
return new_socket;
}
void send_string(SocketType socket, const std::string& s) {
std::string::size_type size = s.size();
send_bytes<std::string::size_type>(socket, &size, 1);
send_bytes<const char>(socket, s.data(), size);
}
std::string receive_string(SocketType socket) {
std::string::size_type size;
receive_bytes<std::string::size_type>(socket, &size, 1);
std::vector<char> v(size);
receive_bytes<char>(socket, v.data(), size);
return std::string(v.data(), v.size());
}
} // namespace tcputils
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef _WIN32
#include <winsock2.h>
#include <ws2tcpip.h>
#pragma comment(lib, "Ws2_32.lib")
#else
#include <fcntl.h>
#include <netdb.h>
#include <netinet/tcp.h>
#include <poll.h>
#include <sys/socket.h>
#include <unistd.h>
#endif
#include <chrono>
#include <iostream>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
// Utility functions for TCP sockets.
namespace paddle {
namespace distributed {
#ifdef _WIN32
using SocketType = SOCKET;
#else
using SocketType = int;
#endif
namespace tcputils {
constexpr int LISTENQ = 2048;
constexpr std::chrono::seconds kDelay = std::chrono::seconds(3);
constexpr std::chrono::seconds kNoTimeout = std::chrono::seconds::zero();
constexpr std::chrono::seconds kDefaultTimeout = std::chrono::seconds(360);
std::error_code socket_error();
void close_socket(SocketType socket);
::addrinfo* get_addr_info(const std::string host, const std::string port,
int ai_flags, int family);
void free_addr_info(::addrinfo*);
SocketType tcp_connect(const std::string host, const std::string port,
int family, std::chrono::seconds timeout = kNoTimeout);
SocketType tcp_listen(const std::string host, const std::string port,
int family);
SocketType tcp_accept(SocketType socket);
void send_string(SocketType socket, const std::string& s);
std::string receive_string(SocketType socket);
template <typename T>
void send_bytes(SocketType socket, const T* buffer, size_t len) {
size_t to_send = len * sizeof(T);
if (to_send == 0) {
return;
}
auto ptr = reinterpret_cast<const char*>(buffer);
while (to_send > 0) {
auto byte_sent = ::send(socket, ptr, to_send, 0);
PADDLE_ENFORCE_GT(byte_sent, 0, platform::errors::InvalidArgument(
"TCP send error. Details: %s.",
socket_error().message()));
to_send -= byte_sent;
ptr += byte_sent;
}
}
template <typename T>
void receive_bytes(SocketType socket, T* buffer, size_t len) {
size_t to_recv = len * sizeof(T);
if (to_recv == 0) {
return;
}
auto ptr = reinterpret_cast<char*>(buffer);
while (to_recv > 0) {
auto byte_received = ::recv(socket, ptr, to_recv, 0);
PADDLE_ENFORCE_GT(byte_received, 0, platform::errors::InvalidArgument(
"TCP receive error. Details: %s.",
socket_error().message()));
to_recv -= byte_received;
ptr += byte_received;
}
}
template <typename T>
void send_vector(SocketType socket, const std::vector<T>& v) {
size_t size = v.size();
send_bytes<size_t>(socket, &size, 1);
send_bytes<T>(socket, v.data(), size);
}
template <typename T>
std::vector<T> receive_vector(SocketType socket) {
size_t size;
receive_bytes<size_t>(socket, &size, 1);
std::vector<T> res(size);
receive_bytes<T>(socket, res.data(), size);
return res;
}
template <typename T>
void send_value(SocketType socket, const T& v) {
send_bytes<T>(socket, &v, 1);
}
template <typename T>
T receive_value(SocketType socket) {
T v;
receive_bytes<T>(socket, &v, 1);
return v;
}
} // namespace tcputils
} // namespace distributed
} // namespace paddle
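// Illustrative sketch, not part of the sources above: the tcputils helpers use
// a length-prefixed wire format (element count first, then the raw payload).
// The POSIX-only socketpair() below is a hypothetical harness for exercising
// send_string/receive_string within a single process.
#include <sys/socket.h>
#include <string>
#include "paddle/fluid/distributed/store/tcp_utils.h"

void ExampleLengthPrefixedRoundTrip() {
  namespace tcputils = paddle::distributed::tcputils;
  int fds[2];
  ::socketpair(AF_UNIX, SOCK_STREAM, 0, fds);
  // send_string writes the string size as a std::string::size_type, then the
  // characters; receive_string reads them back in the same order.
  tcputils::send_string(fds[0], "hello");
  std::string received = tcputils::receive_string(fds[1]);
  // received == "hello"
  tcputils::close_socket(fds[0]);
  tcputils::close_socket(fds[1]);
}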
set(eager_deps pten pten_api hook_utils tensor_utils utils global_utils backward pten_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node)
set(eager_deps phi phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node)
set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy)
set(generated_deps dygraph_function dygraph_node)
......@@ -10,11 +10,11 @@ endif()
add_subdirectory(api)
add_subdirectory(accumulation)
cc_library(grad_node_info SRCS grad_node_info.cc DEPS pten pten_api)
cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi phi_api)
cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator)
cc_library(autograd_meta SRCS autograd_meta.cc DEPS pten pten_api)
cc_library(utils SRCS utils.cc DEPS pten pten_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils)
cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi phi_api)
cc_library(utils SRCS utils.cc DEPS phi phi_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils)
cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info)
add_subdirectory(tests)
cc_library(accumulation_node SRCS accumulation_node.cc DEPS gradient_accumulator pten pten_api grad_node_info)
cc_library(accumulation_node SRCS accumulation_node.cc DEPS gradient_accumulator phi phi_api grad_node_info)
......@@ -25,6 +25,8 @@
#include "glog/logging.h"
namespace egr {
static void CopyOrAddTensor(paddle::experimental::Tensor* tensor,
const paddle::experimental::Tensor& t) {
if (!tensor->defined() || !tensor->initialized()) {
......@@ -36,17 +38,10 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor,
}
}
namespace egr {
void GradNodeAccumulation::RetainGrad(
const std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>& hook) {
retain_grad_hook_ = hook;
}
std::vector<std::vector<paddle::experimental::Tensor>> GradNodeAccumulation::
operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {
VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation";
PADDLE_ENFORCE(grads.size() == 1,
paddle::platform::errors::Fatal(
"GradNodeAccumulation should take exactly 1 grad tensor"
......@@ -58,17 +53,18 @@ operator()(
"However received: %d in slot %d .",
grads[0].size(), 0));
// Apply Gradient Hooks
paddle::experimental::Tensor grad_out;
if (GradientHooksRegistered()) {
std::vector<std::vector<paddle::experimental::Tensor>> hooked_grads =
ApplyGradientHooks(grads);
// TODO(jiabin): It's little weird
CopyOrAddTensor(&accumulated_grad, hooked_grads[0][0]);
grad_out = hooked_grads[0][0];
} else {
CopyOrAddTensor(&accumulated_grad, grads[0][0]);
grad_out = grads[0][0];
}
if (retain_grad_hook_ != nullptr) {
retain_grad_hook_(accumulated_grad);
if (!weak_grad_.expired()) {
auto grad = weak_grad_.lock();
CopyOrAddTensor(grad.get(), grad_out);
}
// Apply Reduce Hooks
......@@ -76,17 +72,17 @@ operator()(
ApplyReduceHooks();
}
return {{accumulated_grad}};
return {{grad_out}};
}
void GradNodeAccumulation::RegisterReduceHook(
const std::function<void(void)>& hook) {
reduce_hooks_.emplace_back(hook);
std::shared_ptr<TensorVoidHook>&& hook) {
reduce_hooks_.emplace_back(std::move(hook));
}
void GradNodeAccumulation::ApplyReduceHooks() {
for (auto& hook : reduce_hooks_) {
hook();
(*hook)();
}
}
} // namespace egr
......@@ -14,14 +14,19 @@
#pragma once
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/fluid/eager/hooks.h"
namespace egr {
class GradNodeAccumulation : public GradNodeBase {
public:
// Constructor: configure fwd input tensors to grad node
GradNodeAccumulation() : GradNodeBase(1, 1) { SetDefaultGradInOutMeta(); }
explicit GradNodeAccumulation(AutogradMeta* meta) : GradNodeBase(1, 1) {
weak_grad_ = meta->WeakGrad();
SetDefaultGradInOutMeta();
}
~GradNodeAccumulation() override = default;
......@@ -30,15 +35,12 @@ class GradNodeAccumulation : public GradNodeBase {
const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
override;
void RetainGrad(const std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>& hook);
paddle::experimental::Tensor* Grad() { return &accumulated_grad; }
std::string name() { return "GradNodeAccumulation"; }
/**
* Register ReduceHook
* **/
void RegisterReduceHook(const std::function<void(void)>& hook);
void RegisterReduceHook(std::shared_ptr<TensorVoidHook>&& hook);
/**
* Apply ReduceHook here
......@@ -47,13 +49,13 @@ class GradNodeAccumulation : public GradNodeBase {
void ApplyReduceHooks();
private:
paddle::experimental::Tensor accumulated_grad;
std::weak_ptr<paddle::experimental::Tensor> weak_grad_;
std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>
retain_grad_hook_;
std::vector<std::function<void(void)>> reduce_hooks_;
std::vector<std::shared_ptr<TensorVoidHook>> reduce_hooks_;
};
} // namespace egr
cc_library(scale_node SRCS scale_node.cc DEPS global_utils pten pten_api grad_node_info)
cc_library(scale_node SRCS scale_node.cc DEPS global_utils phi phi_api grad_node_info)
if(NOT ON_INFER)
cc_library(final_dygraph_node SRCS nodes.cc DEPS ${eager_deps})
......
......@@ -33,36 +33,36 @@ static void ScaleDeviceDispatch(const phi::DenseTensor& dense_tensor,
phi::DenseTensor* dense_out) {
switch (dense_tensor.dtype()) {
case phi::DataType::FLOAT64: {
phi::ScaleKernel<double, typename paddle::framework::ConvertToPtenContext<
phi::ScaleKernel<double, typename paddle::framework::ConvertToPhiContext<
DeviceContext>::TYPE>(
static_cast<const typename paddle::framework::ConvertToPtenContext<
static_cast<const typename paddle::framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
dense_tensor /* tensor */, scale /* scale */, bias /* bias */,
bias_after_scale /* bias_after_scale */, dense_out /* out tensor */);
break;
}
case phi::DataType::FLOAT32: {
phi::ScaleKernel<float, typename paddle::framework::ConvertToPtenContext<
phi::ScaleKernel<float, typename paddle::framework::ConvertToPhiContext<
DeviceContext>::TYPE>(
static_cast<const typename paddle::framework::ConvertToPtenContext<
static_cast<const typename paddle::framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
dense_tensor /* tensor */, scale /* scale */, bias /* bias */,
bias_after_scale /* bias_after_scale */, dense_out /* out tensor */);
break;
}
case phi::DataType::INT64: {
phi::ScaleKernel<int64_t, typename paddle::framework::
ConvertToPtenContext<DeviceContext>::TYPE>(
static_cast<const typename paddle::framework::ConvertToPtenContext<
phi::ScaleKernel<int64_t, typename paddle::framework::ConvertToPhiContext<
DeviceContext>::TYPE>(
static_cast<const typename paddle::framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
dense_tensor /* tensor */, scale /* scale */, bias /* bias */,
bias_after_scale /* bias_after_scale */, dense_out /* out tensor */);
break;
}
case phi::DataType::INT32: {
phi::ScaleKernel<int32_t, typename paddle::framework::
ConvertToPtenContext<DeviceContext>::TYPE>(
static_cast<const typename paddle::framework::ConvertToPtenContext<
phi::ScaleKernel<int32_t, typename paddle::framework::ConvertToPhiContext<
DeviceContext>::TYPE>(
static_cast<const typename paddle::framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
dense_tensor /* tensor */, scale /* scale */, bias /* bias */,
bias_after_scale /* bias_after_scale */, dense_out /* out tensor */);
......
cc_library(eager_scale SRCS scale.cc DEPS pten_api pten autograd_meta scale_node)
cc_library(eager_scale SRCS scale.cc DEPS phi_api phi autograd_meta scale_node)
if(NOT ON_INFER)
cc_library(final_dygraph_function SRCS dygraph_functions.cc DEPS ${eager_deps})
......
cc_library(tensor_utils SRCS tensor_utils.cc DEPS pten pten_api autograd_meta grad_node_info accumulation_node)
cc_library(hook_utils SRCS hook_utils.cc DEPS pten tensor_utils autograd_meta grad_node_info utils accumulation_node)
cc_library(tensor_utils SRCS tensor_utils.cc DEPS phi phi_api autograd_meta grad_node_info accumulation_node)
cc_library(hook_utils SRCS hook_utils.cc DEPS phi tensor_utils autograd_meta grad_node_info utils accumulation_node)
cc_library(global_utils SRCS global_utils.cc DEPS place tracer)
......@@ -22,19 +22,19 @@
namespace egr {
namespace egr_utils_api {
void RegisterGradientHookForTensor(
int64_t RegisterGradientHookForTensor(
const paddle::experimental::Tensor& tensor,
std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>& hook) {
std::shared_ptr<egr::TensorHook>&& hook) {
// Find grad_node and out_rank from AutogradMeta
std::shared_ptr<GradNodeBase> grad_node = EagerUtils::grad_node(tensor);
auto rank_info = EagerUtils::unsafe_autograd_meta(tensor)->OutRankInfo();
grad_node->RegisterGradientHook(rank_info.first, rank_info.second, hook);
return grad_node->RegisterGradientHook(rank_info.first, rank_info.second,
std::move(hook));
}
void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor,
const std::function<void(void)>& hook) {
std::shared_ptr<egr::TensorVoidHook>&& hook) {
if (IsLeafTensor(tensor)) {
VLOG(6) << "Register ReduceHook for leaf tensor";
std::shared_ptr<GradNodeBase> grad_node = EagerUtils::grad_node(tensor);
......@@ -45,59 +45,56 @@ void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor,
"with type: GradNodeAccumulation"));
auto accumulation_grad_node =
std::dynamic_pointer_cast<GradNodeAccumulation>(grad_node);
accumulation_grad_node->RegisterReduceHook(hook);
accumulation_grad_node->RegisterReduceHook(std::move(hook));
} else {
PADDLE_THROW(paddle::platform::errors::Fatal(
"Only can register reduce hook for leaf Tensor."));
}
}
void RetainGradForTensor(const paddle::experimental::Tensor& tensor) {
// TODO(jiabin): Support More Tensor type here
static void RetainGradForRegularNode(
const paddle::experimental::Tensor& tensor) {
AutogradMeta* meta = EagerUtils::unsafe_autograd_meta(tensor);
if (meta->RetainGrads()) {
return;
} else {
meta->SetRetainGrads(true);
}
std::weak_ptr<paddle::experimental::Tensor> weak_grad_tensor =
meta->WeakGrad();
// Define Hook
std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>
hook = [weak_grad_tensor](const paddle::experimental::Tensor& t) {
if (!weak_grad_tensor.expired()) {
auto grad_tensor = weak_grad_tensor.lock();
if (t.defined()) {
VLOG(7) << "Set impl for RetainGrad Hook for tensor: " << t.name();
// Simply Copy impl() to grad_tensor
grad_tensor->set_impl(t.impl());
return *grad_tensor.get();
} else {
PADDLE_THROW(paddle::platform::errors::Fatal(
"Detected uninitialized variable, causing segmentation "
"fault "
"inside the hook."
"Tensor has to be initialized while we need to set it."
"please check tensor initialization status."));
}
} else {
VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook";
return paddle::experimental::Tensor();
}
};
auto hook = [weak_grad_tensor](const paddle::experimental::Tensor& t) {
if (!weak_grad_tensor.expired()) {
auto grad_tensor = weak_grad_tensor.lock();
if (t.defined()) {
VLOG(7) << "Set impl for RetainGrad Hook for tensor: " << t.name();
// Simply Copy impl() to grad_tensor
grad_tensor->set_impl(t.impl());
return *grad_tensor.get();
} else {
VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook";
return paddle::experimental::Tensor();
}
} else {
VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook";
return paddle::experimental::Tensor();
}
};
if (IsLeafTensor(tensor)) {
// Add RetainGrad as PostHook to AccumulationNode
std::shared_ptr<GradNodeBase> grad_node = EagerUtils::grad_node(tensor);
PADDLE_ENFORCE(
grad_node.get() != nullptr,
paddle::platform::errors::Fatal("Detected NULL grad_node"
"Leaf tensor should have had grad_node "
"with type: GradNodeAccumulation"));
auto accumulation_grad_node =
std::dynamic_pointer_cast<GradNodeAccumulation>(grad_node);
accumulation_grad_node->RetainGrad(hook);
// Append to GradientHooks
RegisterGradientHookForTensor(tensor,
std::make_shared<egr::CppTensorHook>(hook));
}
void RetainGradForTensor(const paddle::experimental::Tensor& tensor) {
if (IsLeafTensor(tensor)) {
// Leaf tensor's grad will always be retained
// Refer to implementation of AccumulationNode for more details
return;
} else {
// Append to GradientHooks
RegisterGradientHookForTensor(tensor, hook);
RetainGradForRegularNode(tensor);
}
}
......
......@@ -16,17 +16,17 @@
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/fluid/eager/hooks.h"
#include "paddle/phi/api/all.h"
namespace egr {
namespace egr_utils_api {
void RegisterGradientHookForTensor(
int64_t RegisterGradientHookForTensor(
const paddle::experimental::Tensor& tensor,
std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>& hook);
std::shared_ptr<egr::TensorHook>&& hook);
void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor,
const std::function<void(void)>& hook);
std::shared_ptr<egr::TensorVoidHook>&& hook);
void RetainGradForTensor(const paddle::experimental::Tensor& tensor);
} // namespace egr_utils_api
......
......@@ -22,7 +22,7 @@
#include "paddle/phi/api/all.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/pten_utils.h"
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/framework/variable.h"
namespace egr {
......@@ -43,11 +43,11 @@ paddle::experimental::Tensor CreateTensorWithValue(
bool is_leaf) {
paddle::experimental::Tensor out = paddle::experimental::full(
phi::vectorize(ddim), paddle::experimental::Scalar(value), dtype,
phi::TransToPtenBackend(place));
phi::TransToPhiBackend(place));
auto meta = EagerUtils::autograd_meta(&out);
if (is_leaf) {
auto accumulation_node = std::make_shared<GradNodeAccumulation>();
auto accumulation_node = std::make_shared<GradNodeAccumulation>(meta);
meta->SetGradNode(accumulation_node);
meta->SetStopGradient(false);
}
......
......@@ -27,7 +27,7 @@
#include "paddle/fluid/pybind/pybind.h"
#include "paddle/fluid/string/string_helper.h"
// pten
// phi
#include "paddle/phi/kernels/declarations.h"
#define NUM_CREATED_DUP_INPUTS 4
......@@ -544,7 +544,7 @@ static bool CheckOpProto(proto::OpProto* op_proto) {
// since only OperatorWithKernel can run in dygraph mode.
auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels();
if (!all_kernels.count(op_type) &&
!phi::KernelFactory::Instance().HasCompatiblePtenKernel(op_type)) {
!phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_type)) {
return false;
}
......@@ -554,6 +554,21 @@ static bool CheckOpProto(proto::OpProto* op_proto) {
return true;
}
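// An output whose name is an input name plus the suffix "Out" (for example
// input "X" and output "XOut") is treated as writing in place to that input,
// so the generated code reuses the input's variable for it.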
static bool BeSameAsInput(const std::string& output_name,
const std::set<std::string>& input_names) {
if (output_name.size() < 4) {
return false;
}
if (output_name.substr(output_name.size() - 3, 3) == "Out") {
if (input_names.count(output_name.substr(0, output_name.size() - 3))) {
return true;
}
}
return false;
}
/* --------------------------------------- */
/* --------- Preprocess Ins/Outs --------- */
/* --------------------------------------- */
......@@ -1016,33 +1031,20 @@ static std::string GenerateGradNodeCreationContent(
const std::string& output_name = output.name();
const std::string& output_autograd_name = "p_autograd_" + output_name;
// Skip Intermediate Tensor
if (output.duplicable()) {
const char* GET_MULTI_AUTOGRAD_META_TEMPLATE =
" std::vector<egr::AutogradMeta*> %s = "
"egr::EagerUtils::autograd_meta(&%s);\n";
get_autograd_meta_str += paddle::string::Sprintf(
GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
if (op_passing_outs_map[op_type].count(output_name)) {
const std::string output_var_args_name = output_name + "Var";
const char* FWD_OUT_SYNC_BACK_TEMPLATE =
" egr::EagerUtils::OverwriteOutputs(%s, %s);\n";
get_autograd_meta_str += paddle::string::Sprintf(
FWD_OUT_SYNC_BACK_TEMPLATE, output_name, output_var_args_name);
}
} else {
const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
" egr::AutogradMeta* %s = "
"egr::EagerUtils::autograd_meta(&%s);\n";
get_autograd_meta_str += paddle::string::Sprintf(
GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
if (op_passing_outs_map[op_type].count(output_name)) {
const std::string output_var_args_name = output_name + "Var";
const char* FWD_OUT_SYNC_BACK_TEMPLATE =
" egr::EagerUtils::OverwriteOutputs(%s, %s);\n";
get_autograd_meta_str += paddle::string::Sprintf(
FWD_OUT_SYNC_BACK_TEMPLATE, output_name, output_var_args_name);
}
}
}
VLOG(6) << "Generated outputs autograd_meta";
......@@ -1145,6 +1147,8 @@ static std::string GenerateGradNodeCreationContent(
const std::string& output_autograd_name = "p_autograd_" + output_name;
size_t output_position = fwd_outputs_name_pos_map.at(output_name);
// Intermediate Tensor does not require SetHistory, nor RetainGrad
if (output.duplicable()) {
pass_stop_gradient_args += ", &" + output_autograd_name;
const char* SET_OUT_RANK_TEMPLATE =
......@@ -1180,11 +1184,13 @@ static std::string GenerateGradNodeCreationContent(
SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position);
}
VLOG(6) << "Generated Call RetainGradForTensor";
const char* RETAIN_GRAD_TEMPLATE =
" egr::EagerUtils::CheckAndRetainGrad(%s);\n";
grad_node_creation_str +=
paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name);
if (!output.intermediate()) {
VLOG(6) << "Generated Call RetainGradForTensor";
const char* RETAIN_GRAD_TEMPLATE =
" egr::EagerUtils::CheckAndRetainGrad(%s);\n";
grad_node_creation_str +=
paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name);
}
}
VLOG(6) << "Generated SetGradIn/OutMeta";
......@@ -1324,19 +1330,21 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
generated_function_body += "\n";
// Handle Dispensable Inputs
std::set<std::string> input_names;
for (const proto::OpProto::Var& input : in_vars) {
const std::string& input_name = input.name();
input_names.insert(input_name);
if (input.dispensable()) {
if (input.duplicable()) {
const char* FWD_INS_CONTENT_TEMPLATE =
" if(%s.size() > 0) "
"ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s)\n;";
"ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s);\n";
generated_function_body += paddle::string::Sprintf(
FWD_INS_CONTENT_TEMPLATE, input_name, input_name, input_name);
} else {
const char* FWD_INS_CONTENT_TEMPLATE =
" if(%s.initialized()) "
"ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s)\n;";
"ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s);\n";
generated_function_body += paddle::string::Sprintf(
FWD_INS_CONTENT_TEMPLATE, input_name, input_name, input_name);
}
......@@ -1372,11 +1380,21 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
core_ops_args_type_info[op_type].push_back("tensor");
}
const char* FWD_OUTS_CONTENT_TEMPLATE =
"{ \"%s\", egr::EagerUtils::TrySyncToVars(%s) },";
outs_contents_str += paddle::string::Sprintf(
FWD_OUTS_CONTENT_TEMPLATE, output_name, output_var_name);
if (BeSameAsInput(output_name, input_names)) {
if (!output.dispensable()) {
std::string input_name =
output_name.substr(0, output_name.size() - 3);
const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", ins[\"%s\"] },";
outs_contents_str += paddle::string::Sprintf(
FWD_OUTS_CONTENT_TEMPLATE, output_name, input_name);
}
} else {
const char* FWD_OUTS_CONTENT_TEMPLATE =
"{ \"%s\", egr::EagerUtils::TrySyncToVars(%s) },";
outs_contents_str += paddle::string::Sprintf(
FWD_OUTS_CONTENT_TEMPLATE, output_name, output_var_name);
}
core_ops_args_info[op_type].push_back(output_var_name);
} else {
......@@ -1415,6 +1433,23 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
generated_function_body += outs_map_str;
generated_function_body += "\n";
for (const proto::OpProto::Var& output : out_vars) {
const std::string& output_name = output.name();
if (op_passing_outs_map[op_type].count(output_name)) {
if (BeSameAsInput(output_name, input_names)) {
if (output.dispensable()) {
std::string input_name =
output_name.substr(0, output_name.size() - 3);
const char* FWD_OUTS_CONTENT_TEMPLATE =
" if (ins.count(\"%s\")) outs[\"%s\"] = ins[\"%s\"];\n";
generated_function_body += paddle::string::Sprintf(
FWD_OUTS_CONTENT_TEMPLATE, input_name, output_name, input_name);
}
}
}
}
generated_function_body += "\n";
VLOG(6) << "Generated Outs Map";
// [Generation] Get Attrs
......@@ -1448,33 +1483,61 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
std::string output_varname = LegalizeVariableName(output_name);
if (output.duplicable()) {
const char* FWD_OUT_TENSORS_TEMPLATE =
" std::vector<paddle::experimental::Tensor> %s = "
"egr::EagerUtils::GetOutputs(outs[\"%s\"]);\n";
out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE,
output_varname, output_name);
if (op_passing_outs_map[op_type].count(output_name)) {
if (output.dispensable()) {
const char* FWD_OUT_TENSORS_TEMPLATE =
" std::vector<paddle::experimental::Tensor> %s;\n"
" if (outs.count(\"%s\")) "
"egr::EagerUtils::GetOutputs(outs[\"%s\"], %s);\n"
" egr::EagerUtils::Output2Result(%s, &%s);\n";
out_tensor_str = paddle::string::Sprintf(
FWD_OUT_TENSORS_TEMPLATE, output_varname, output_name,
output_name, output_var_args_name, output_var_args_name,
output_varname);
} else {
const char* FWD_OUT_TENSORS_TEMPLATE =
" std::vector<paddle::experimental::Tensor> %s;\n"
" egr::EagerUtils::GetOutputs(outs[\"%s\"], %s);\n"
" egr::EagerUtils::Output2Result(%s, &%s);\n";
out_tensor_str = paddle::string::Sprintf(
FWD_OUT_TENSORS_TEMPLATE, output_varname, output_name,
output_var_args_name, output_var_args_name, output_varname);
}
} else {
const char* FWD_OUT_TENSORS_TEMPLATE =
" std::vector<paddle::experimental::Tensor> %s;\n"
" egr::EagerUtils::GetOutputs(outs[\"%s\"], &%s);\n";
out_tensor_str =
paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE, output_varname,
output_name, output_varname);
}
return_types[return_position] =
"std::vector<paddle::experimental::Tensor>";
if (op_passing_outs_map[op_type].count(output_name) &&
bwd_info.GenerateForwardOnly()) {
const char* FWD_OUT_SYNC_BACK_TEMPLATE =
" egr::EagerUtils::OverwriteOutputs(outs[\"%s\"], %s);\n";
out_tensor_str += paddle::string::Sprintf(
FWD_OUT_SYNC_BACK_TEMPLATE, output_name, output_var_args_name);
}
} else {
const char* FWD_OUT_TENSOR_TEMPLATE =
" paddle::experimental::Tensor %s = "
"egr::EagerUtils::GetOutput(outs[\"%s\"][0]);\n";
out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE,
output_varname, output_name);
if (op_passing_outs_map[op_type].count(output_name) &&
bwd_info.GenerateForwardOnly()) {
const char* FWD_OUT_SYNC_BACK_TEMPLATE =
" egr::EagerUtils::OverwriteOutputs(outs[\"%s\"][0], %s);\n";
out_tensor_str += paddle::string::Sprintf(
FWD_OUT_SYNC_BACK_TEMPLATE, output_name, output_var_args_name);
if (op_passing_outs_map[op_type].count(output_name)) {
if (output.dispensable()) {
const char* FWD_OUT_TENSOR_TEMPLATE =
" if (outs.count(\"%s\")) "
"egr::EagerUtils::GetOutput(outs[\"%s\"][0], %s);\n"
" paddle::experimental::Tensor& %s = *%s;\n";
out_tensor_str = paddle::string::Sprintf(
FWD_OUT_TENSOR_TEMPLATE, output_name, output_name,
output_var_args_name, output_varname, output_var_args_name);
} else {
const char* FWD_OUT_TENSOR_TEMPLATE =
" egr::EagerUtils::GetOutput(outs[\"%s\"][0], %s);\n"
" paddle::experimental::Tensor& %s = *%s;\n";
out_tensor_str = paddle::string::Sprintf(
FWD_OUT_TENSOR_TEMPLATE, output_name, output_var_args_name,
output_varname, output_var_args_name);
}
} else {
const char* FWD_OUT_TENSOR_TEMPLATE =
" paddle::experimental::Tensor %s;\n"
" egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n";
out_tensor_str =
paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname,
output_name, output_varname);
}
return_types[return_position] = "paddle::experimental::Tensor";
}
......@@ -1494,6 +1557,7 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
GenerateGradNodeCreationContent(fwd_info, bwd_info);
generated_function_body += grad_node_creation_body_str;
generated_function_body += "\n";
// [Generation] Call RetainGradForTensor
VLOG(6) << "Generated GradNode Creation codes";
}
......@@ -1588,12 +1652,25 @@ static std::string GenerateSingleOpBase(
const std::string& attrs_name = "attrs_map" + std::to_string(*outs_size);
// [Generation] Get Ins Map
std::unordered_set<std::string> dispensable_input_name_set;
for (const auto& in : in_vars) {
if (in.dispensable()) dispensable_input_name_set.insert(in.name());
}
std::unordered_set<std::string> duplicable_input_name_set;
for (const auto& in : in_vars) {
if (in.duplicable()) duplicable_input_name_set.insert(in.name());
}
std::string ins_contents_str = "";
for (auto iter : grad_ins) {
const std::string& grad_input_name = iter.first;
if (grad_ins_fwd_slotname_map.count(grad_input_name)) {
// Fwd Tensor
const std::string& fwd_name =
grad_ins_fwd_slotname_map.at(grad_input_name);
if (dispensable_input_name_set.count(fwd_name)) {
continue;
}
std::string struct_fwd_input_name =
grad_ins_fwd_slotname_map.at(grad_input_name) + "_";
const char* GRAD_INS_FWD_CONTENT_TEMPLATE =
......@@ -1634,14 +1711,41 @@ static std::string GenerateSingleOpBase(
paddle::string::Sprintf(BWD_INS_MAP_TEMPLATE, ins_name, ins_contents_str);
generated_grad_function_body += ins_map_str;
VLOG(6) << "Generated Ins Map";
for (auto iter : grad_ins) {
const std::string& grad_input_name = iter.first;
// [Generation] Get Outs Map
std::unordered_set<std::string> duplicable_input_name_set;
for (const auto& in : in_vars) {
if (in.duplicable()) duplicable_input_name_set.insert(in.name());
if (grad_ins_fwd_slotname_map.count(grad_input_name)) {
// Fwd Tensor
const std::string& fwd_name =
grad_ins_fwd_slotname_map.at(grad_input_name);
if (dispensable_input_name_set.count(fwd_name)) {
std::string struct_fwd_input_name =
grad_ins_fwd_slotname_map.at(grad_input_name) + "_";
if (duplicable_input_name_set.count(fwd_name)) {
const char* DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE =
" if(this->%s.size() > 0) %s[\"%s\"] = "
"egr::EagerUtils::TrySyncToVars(egr::EagerUtils::"
"RecoverTensorWrapper(&this->%s, nullptr));\n";
generated_grad_function_body += paddle::string::Sprintf(
DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, struct_fwd_input_name,
ins_name, grad_input_name, struct_fwd_input_name);
} else {
const char* DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE =
" auto %s = egr::EagerUtils::RecoverTensorWrapper(&this->%s, "
"nullptr);\n if(%s.initialized()) %s[\"%s\"] = "
"egr::EagerUtils::TrySyncToVars(%s);\n";
generated_grad_function_body += paddle::string::Sprintf(
DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, grad_input_name,
struct_fwd_input_name, grad_input_name, ins_name, grad_input_name,
grad_input_name);
}
}
}
}
VLOG(6) << "Generated Ins Map";
// [Generation] Get Outs Map
std::string outs_contents_str = "";
for (auto iter : grad_outs) {
const std::string& grad_output_name = iter.first;
......@@ -1936,12 +2040,13 @@ static std::string GenerateGradNodeCCContents(
const char* BWD_RETURN_TEMPLATE =
" std::vector<std::vector<paddle::experimental::Tensor>> hooked_grads = "
"egr::GradNodeBase::ApplyGradientHooks(grads);\n"
"GradNode%s::ApplyGradientHooks(grads);\n"
" std::vector<std::vector<paddle::experimental::Tensor>> outputs(%d);\n"
" %s\n"
" return outputs;\n";
generated_grad_function_body = paddle::string::Sprintf(
BWD_RETURN_TEMPLATE, in_vars.size(), generated_grad_function_body);
generated_grad_function_body =
paddle::string::Sprintf(BWD_RETURN_TEMPLATE, fwd_op_type, in_vars.size(),
generated_grad_function_body);
// [Generation] Get Full Grad Function
const char* GRAD_FUNCTION_TEMPLATE =
......@@ -1987,6 +2092,7 @@ static std::string GenerateGradNodeHeaderContents(
"%s\n"
" // SetAttrMap\n"
"%s\n"
" std::string name() { return \"GradNode%s\"; }\n"
"\n"
" private:\n"
" // TensorWrappers\n"
......@@ -2085,8 +2191,8 @@ static std::string GenerateGradNodeHeaderContents(
std::string grad_node_str = paddle::string::Sprintf(
GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type,
set_tensor_wrappers_str, set_attr_map_str, tensor_wrapper_members_str,
attr_members_str);
set_tensor_wrappers_str, set_attr_map_str, op_type,
tensor_wrapper_members_str, attr_members_str);
return grad_node_str;
}
......
......@@ -127,6 +127,40 @@ def ReadBwdFile(filepath):
######################
### Yaml Parsers ###
######################
def IntermediateValidationCheck(intermediate_outputs, forward_returns_list):
# intermediate_outputs : [name0, name1, ...]
# forward_returns_list : [[ret_name, type, orig_pos], ...]
"""
Check whether intermediate_outputs are positioned
at the very end of forward_returns_list
"""
intermediate_positions = range(
len(forward_returns_list) - len(intermediate_outputs),
len(forward_returns_list))
for ret_name, _, pos in forward_returns_list:
if ret_name in intermediate_outputs:
assert pos in intermediate_positions
def ParseDispensable(string):
# string: "X, Y"
return [v.strip() for v in string.split(",")]
def ParseIntermediate(string):
return [v.strip() for v in string.split(",")]
def ParseNoNeedBuffer(string):
# string: "x, y"
no_need_buffer_set = set()
for name in string.split(","):
no_need_buffer_set.add(name.strip())
return no_need_buffer_set
def ParseYamlArgs(string):
# Example: const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y
......@@ -397,7 +431,7 @@ def SlotNameMatching(backward_inputs_list, backward_returns_list,
def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
backward_attrs_list):
backward_attrs_list, no_need_buffer_set):
# Inputs:
# fwd_api_name = ""
# backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...}
......@@ -410,15 +444,20 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
set_tensor_wrapper_methods_str = ""
tensor_wrapper_members_str = ""
for tname, (ttype, is_fwd_input, _) in backward_fwd_input_map.items():
if tname in no_need_buffer_set:
no_need_buffer = "true"
else:
no_need_buffer = "false"
tensor_wrapper_name = GetSavedName(tname)
if IsPlainTensorType(ttype):
SET_PLAIN_TENSOR_WRAPPER_TEMPLATE = """
void SetTensorWrapper{}(const paddle::experimental::Tensor& {}, bool full_reserved) {{
{} = egr::TensorWrapper({}, full_reserved);
{} = egr::TensorWrapper({}, full_reserved, {});
}}
"""
set_tensor_wrapper_methods_str += SET_PLAIN_TENSOR_WRAPPER_TEMPLATE.format(
tname, tname, tensor_wrapper_name, tname)
tname, tname, tensor_wrapper_name, tname, no_need_buffer)
PLAIN_TENSOR_MEMBER_TEMPLATE = """
egr::TensorWrapper {};
......@@ -430,12 +469,12 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """
void SetTensorWrapper{}(const std::vector<paddle::experimental::Tensor>& {}, bool full_reserved) {{
for(const auto& eager_tensor : {}) {{
{}.emplace_back( egr::TensorWrapper(eager_tensor, full_reserved) );
{}.emplace_back( egr::TensorWrapper(eager_tensor, full_reserved, {}) );
}};
}}
"""
set_tensor_wrapper_methods_str += SET_VECTOR_TENSOR_WRAPPER_TEMPLATE.format(
tname, tname, tname, tensor_wrapper_name)
tname, tname, tname, tensor_wrapper_name, no_need_buffer)
VECTOR_TENSOR_MEMBER_TEMPLATE = """
std::vector<egr::TensorWrapper> {};
......@@ -562,11 +601,11 @@ std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(const std:
return node_definition_str
def GenerateNodeCreationCodes(fwd_api_name, bwd_api_name,
forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list):
def GenerateNodeCreationCodes(
fwd_api_name, bwd_api_name, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list, optional_inputs):
# fwd_api_name = ""
# forward_inputs_position_map = { "name" : [type, fwd_position] }
# forward_outputs_position_map = { "name" : [type, fwd_position] }
......@@ -640,10 +679,17 @@ def GenerateNodeCreationCodes(fwd_api_name, bwd_api_name,
# SetTensorWrappers
set_tensor_wrappers_list = []
for name, (_, is_fwd_input, _) in backward_fwd_input_map.items():
is_optional = (name in optional_inputs)
if is_fwd_input:
set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);"
if is_optional:
set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);"
else:
set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);"
else:
set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, false);"
if is_optional:
set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, false);"
else:
set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, false);"
set_tensor_wrappers_list.append(set_tensor_wrappers)
set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list)
......@@ -732,7 +778,8 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list):
backward_grad_output_map, backward_attrs_list,
optional_inputs, intermediate_outputs):
# fwd_api_name = ""
# forward_inputs_position_map = { "name" : [type, fwd_position] }
# forward_outputs_position_map = { "name" : [type, fwd_position] }
......@@ -741,6 +788,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
# backward_grad_input_map = { "name" : [type, fwd_position, orig_position] ...}
# backward_grad_output_map = { "name" : [type, fwd_position, orig_position] ...}
# backward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
# optional_inputs = ["name0", ...]
# Get Function Args
num_inputs = len(forward_attrs_list) + len(forward_inputs_position_map.keys(
......@@ -750,17 +798,18 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
inputs_call_list = ["" for i in range(num_inputs)]
for name, (ttype, pos) in forward_inputs_position_map.items():
inputs_call_list[pos] = f"{name}"
is_optional = (name in optional_inputs)
if IsPlainTensorType(ttype):
inputs_args_definition_list[
pos] = f"const paddle::experimental::Tensor& {name}"
inputs_args_declaration_list[
pos] = f"const paddle::experimental::Tensor& {name}"
if is_optional:
arg_str = f"const paddle::optional<paddle::experimental::Tensor>& {name}"
else:
arg_str = f"const paddle::experimental::Tensor& {name}"
else:
assert IsVectorTensorType(ttype)
inputs_args_definition_list[
pos] = f"const std::vector<paddle::experimental::Tensor>& {name}"
inputs_args_declaration_list[
pos] = f"const std::vector<paddle::experimental::Tensor>& {name}"
arg_str = f"const std::vector<paddle::experimental::Tensor>& {name}"
inputs_args_definition_list[pos] = arg_str
inputs_args_declaration_list[pos] = arg_str
for name, atype, default_val, pos in forward_attrs_list:
inputs_call_list[pos] = name
......@@ -776,13 +825,20 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
inputs_call_args_str = ", ".join(inputs_call_list)
# Forward Full Logic
forward_call_str = f"auto api_result = paddle::experimental::{fwd_api_name}({inputs_call_args_str});"
if len(intermediate_outputs) == 0:
function_name = fwd_api_name
else:
function_name = fwd_api_name + "_intermediate"
forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});"
# Get return type list & outputs
num_outputs = len(forward_outputs_position_map.keys())
num_outputs = len(forward_outputs_position_map.keys()) - len(
intermediate_outputs)
returns_type_list = ["" for i in range(num_outputs)]
returns_list = ["" for i in range(num_outputs)]
for name, (rtype, pos) in forward_outputs_position_map.items():
if name in intermediate_outputs:
continue
if num_outputs == 1:
returns_list[0] = f"api_result"
else:
......@@ -808,7 +864,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
fwd_api_name, bwd_api_name, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list)
backward_grad_output_map, backward_attrs_list, optional_inputs)
FORWARD_FUNCTION_TEMPLATE = """
{} {}({}) {{
......@@ -997,6 +1053,10 @@ if __name__ == "__main__":
assert 'output' in fwd_api.keys()
assert 'backward' in fwd_api.keys()
no_need_buffer_set = set()
if 'no_need_buffer' in fwd_api.keys():
no_need_buffer_set = ParseNoNeedBuffer(fwd_api['no_need_buffer'])
fwd_api_name = fwd_api['api']
fwd_args_str = fwd_api['args']
fwd_returns_str = fwd_api['output']
......@@ -1008,6 +1068,12 @@ if __name__ == "__main__":
assert 'args' in bwd_api.keys()
assert 'output' in bwd_api.keys()
assert 'forward' in bwd_api.keys()
# Parse Dispensable Inputs
optional_inputs = []
if 'optional' in fwd_api.keys():
optional_inputs = ParseDispensable(fwd_api['optional'])
bwd_forward_str = bwd_api['forward']
bwd_args_str = bwd_api['args']
bwd_returns_str = bwd_api['output']
......@@ -1019,6 +1085,12 @@ if __name__ == "__main__":
print("Prased Forward Attrs List: ", forward_attrs_list)
print("Parsed Forward Returns List: ", forward_returns_list)
intermediate_outputs = []
if 'intermediate' in fwd_api.keys():
intermediate_outputs = ParseIntermediate(fwd_api['intermediate'])
IntermediateValidationCheck(intermediate_outputs, forward_returns_list)
# Collect Original Forward Inputs/Outputs and then perform validation checks
orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward(
fwd_args_str, fwd_returns_str)
......@@ -1062,7 +1134,8 @@ if __name__ == "__main__":
# Node Declaration Generation
node_declaration_str += GenerateNodeDeclaration(
fwd_api_name, backward_fwd_input_map, backward_attrs_list)
fwd_api_name, backward_fwd_input_map, backward_attrs_list,
no_need_buffer_set)
print("Generated Node Declaration: ", node_declaration_str)
node_definition_str += GenerateNodeDefinition(
......@@ -1076,7 +1149,8 @@ if __name__ == "__main__":
fwd_api_name, bwd_api_name, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list)
backward_grad_output_map, backward_attrs_list, optional_inputs,
intermediate_outputs)
print("Generated Forward Definition: ", forward_definition_str)
print("Generated Forward Declaration: ", forward_declaration_str)
forward_definition_str += definition_declaration_pair[0]
......
......@@ -14,7 +14,7 @@
import os
import argparse
from eager_gen import ReadFwdFile, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap
from eager_gen import ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap
atype_to_parsing_function = {
"bool": "CastPyArg2Boolean",
......@@ -70,10 +70,12 @@ def FindParsingFunctionFromAttributeType(atype):
def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map,
forward_attrs_list, forward_outputs_position_map):
forward_attrs_list, forward_outputs_position_map,
optional_inputs):
# forward_inputs_position_map = { "name" : [type, fwd_position] }
# forward_outputs_position_map = { "name" : [type, fwd_position] }
# forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
# optional_inputs = [name0, ...]
# Get EagerTensor from args
# Get dygraph function call args
......@@ -82,7 +84,14 @@ def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map,
dygraph_function_call_list = ["" for i in range(num_args)]
get_eager_tensor_str = ""
for name, (ttype, pos) in forward_inputs_position_map.items():
get_eager_tensor_str += f" auto& {name} = GetTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n"
is_optional = (name in optional_inputs)
if IsVectorTensorType(ttype):
get_eager_tensor_str += f" auto {name} = GetTensorListFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n"
else:
if is_optional:
get_eager_tensor_str += f" auto {name} = GetOptionalTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n"
else:
get_eager_tensor_str += f" auto {name} = GetTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n"
dygraph_function_call_list[pos] = f"{name}"
parse_attributes_str = ""
......@@ -134,7 +143,7 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj
fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str,
GetForwardFunctionName(fwd_api_name), dygraph_function_call_str)
python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void))eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}},\n"
python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void))eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n"
return python_c_function_str, python_c_function_reg_str
......@@ -188,7 +197,7 @@ static PyObject * eager_get_final_state_core_ops_returns_info(PyObject *self) {
"""
core_ops_infos_registry = """
{\"get_final_state_core_ops_args_info\",
,{\"get_final_state_core_ops_args_info\",
(PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_info, METH_NOARGS,
\"C++ interface function for eager_get_final_state_core_ops_args_info.\"},
{\"get_final_state_core_ops_args_type_info\",
......@@ -267,6 +276,11 @@ if __name__ == "__main__":
fwd_args_str = fwd_api['args']
fwd_returns_str = fwd_api['output']
# Parse Dispensable Inputs
optional_inputs = []
if 'optional' in fwd_api.keys():
optional_inputs = ParseDispensable(fwd_api['optional'])
# Collect Original Forward Inputs/Outputs and then perform validation checks
forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward(
fwd_args_str, fwd_returns_str)
......@@ -283,7 +297,7 @@ if __name__ == "__main__":
python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction(
fwd_api_name, forward_inputs_position_map, forward_attrs_list,
forward_outputs_position_map)
forward_outputs_position_map, optional_inputs)
python_c_function_list.append(python_c_function_str)
python_c_function_reg_list.append(python_c_function_reg_str)
print("Generated Python-C Function: ", python_c_function_str)
......
......@@ -97,6 +97,7 @@ class AutogradMeta : public AbstractAutogradMeta {
"Should Not set NULL as GradNode pointer, since "
"our default Edge and autogradMeta has nullptr for "
"grad node. Set Nullptr will lead error."));
grad_node_ = grad_node;
}
......@@ -127,6 +128,12 @@ class AutogradMeta : public AbstractAutogradMeta {
stop_gradient_ = static_cast<int>(stop_gradient);
}
void WeakSetStopGradient(bool stop_gradient) {
if (stop_gradient_ == -1) {
stop_gradient_ = static_cast<int>(stop_gradient);
}
}
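// Illustrative caller-side sketch (added for exposition; it assumes the
// stop_gradient_ member defaults to -1, meaning "not set yet", as the check
// above implies). WeakSetStopGradient only fills in an unset flag, while
// SetStopGradient always overwrites:
#include "paddle/fluid/eager/autograd_meta.h"

void StopGradientSketch() {
  egr::AutogradMeta meta;          // flag assumed to start at -1 (unset)
  meta.WeakSetStopGradient(true);  // takes effect, since the flag was unset
  meta.SetStopGradient(false);     // an explicit set always wins
  meta.WeakSetStopGradient(true);  // no-op: the flag already holds 0
}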
bool Persistable() const { return persistable_; }
void SetPersistable(bool persistable) { persistable_ = persistable; }
......
......@@ -14,10 +14,10 @@
#pragma once
// framework deps
#include "paddle/fluid/framework/pten_utils.h"
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
// pten deps
// Phi deps
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/api/lib/api_declare.h"
#include "paddle/phi/api/lib/utils/tensor_utils.h"
......@@ -31,7 +31,7 @@
* provide variable in
* paddle::framework::ExecutionContext to support it. We should remove this as
* soon as we finish our latest
* Pten Lib, and use paddle::experimental::Tensor instead.
* Phi Lib, and use paddle::experimental::Tensor instead.
*
* Note: Keep this class as clean as possible.
* This class should only support method declared in
......
......@@ -53,7 +53,7 @@ void GradNodeBase::AddEdges(std::vector<AutogradMeta*>* metas, size_t slot_id) {
adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
meta->OutRankInfo());
} else {
meta->SetGradNode(std::make_shared<egr::GradNodeAccumulation>());
meta->SetGradNode(std::make_shared<egr::GradNodeAccumulation>(meta));
adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
meta->OutRankInfo());
}
......@@ -69,13 +69,16 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) {
"adj_edges is designed to has the same size of grad "
"inputs's slot num."));
if (meta && !meta->StopGradient()) {
VLOG(6) << "Add Edges for slot: " << slot_id;
auto node = meta->GetMutableGradNode();
if (node) {
VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from "
<< this->name() << " to " << meta->GetMutableGradNode()->name();
adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
meta->OutRankInfo());
} else {
meta->SetGradNode(std::make_shared<egr::GradNodeAccumulation>());
meta->SetGradNode(std::make_shared<egr::GradNodeAccumulation>(meta));
VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from "
<< this->name() << " to " << meta->GetMutableGradNode()->name();
adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
meta->OutRankInfo());
}
......@@ -207,22 +210,22 @@ const std::vector<std::vector<Edge>>& GradNodeBase::GetEdges() const {
return adj_edges_;
}
void GradNodeBase::RegisterGradientHook(
size_t slot_id, size_t rank,
const std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>& hook) {
gradient_hooks_.emplace_back(std::make_tuple(slot_id, rank, hook));
int64_t GradNodeBase::RegisterGradientHook(
size_t slot_id, size_t rank, std::shared_ptr<egr::TensorHook>&& hook) {
gradient_hooks_.emplace(next_hook_id_,
std::make_tuple(slot_id, rank, std::move(hook)));
return next_hook_id_++;
}
std::vector<std::vector<paddle::experimental::Tensor>>
GradNodeBase::ApplyGradientHooks(
const std::vector<std::vector<paddle::experimental::Tensor>>& tensors) {
std::vector<std::vector<paddle::experimental::Tensor>> outs(tensors.size());
for (auto& tuple : gradient_hooks_) {
size_t slot_id = std::get<0>(tuple);
size_t rank = std::get<1>(tuple);
std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>& hook = std::get<2>(tuple);
for (auto& hook_pair : gradient_hooks_) {
size_t slot_id = std::get<0>(hook_pair.second);
size_t rank = std::get<1>(hook_pair.second);
auto hook = std::get<2>(hook_pair.second);
PADDLE_ENFORCE(slot_id < tensors.size(),
paddle::platform::errors::Fatal(
......@@ -239,12 +242,11 @@ GradNodeBase::ApplyGradientHooks(
slot_out.resize(tensors[slot_id].size());
paddle::experimental::Tensor& out = slot_out[rank];
if (!out.defined() || !out.initialized()) {
VLOG(8) << "Run Hook for tensor: " << tensors[slot_id][rank].name();
out = hook(tensors[slot_id][rank]);
out = (*hook)(tensors[slot_id][rank]);
} else {
// If more than one hook is registered, the input to the next hook func
// should be the output of the previous hook
out = hook(out);
out = (*hook)(out);
}
}
......
......@@ -15,6 +15,7 @@
#pragma once
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/eager/hooks.h"
#include "paddle/phi/api/all.h"
namespace egr {
......@@ -135,18 +136,30 @@ class GradNodeBase {
/**
* Register GradientHook
* **/
void RegisterGradientHook(size_t slot_id, size_t rank,
const std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>& hook);
int64_t RegisterGradientHook(size_t slot_id, size_t rank,
std::shared_ptr<egr::TensorHook>&& hook);
/**
* Remove GradientHook
* **/
bool RemoveGradientHook(const int64_t& hook_id) {
auto remove_cnt = gradient_hooks_.erase(hook_id);
if (remove_cnt == 0) {
return false;
}
return true;
}
/**
* Apply GradientHook
* **/
inline bool GradientHooksRegistered() { return gradient_hooks_.size() != 0; }
inline bool GradientHooksRegistered() { return !gradient_hooks_.empty(); }
std::vector<std::vector<paddle::experimental::Tensor>> ApplyGradientHooks(
const std::vector<std::vector<paddle::experimental::Tensor>>& tensors);
virtual std::string name() { return "GradNodeBase"; }
private:
// TODO(jiabin): Use SmallVector instead after merge PR from develop
......@@ -164,12 +177,14 @@ class GradNodeBase {
// Gradient Hooks
// Customer may register a list of hooks which will be called in order during
// backward
// Each entry consists of one pair of <out_rank, std::function>
std::vector<std::tuple<
/* slot id */ size_t, /* rank */ size_t,
/* hook */ std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>>>
// Each entry consists of one pair of
// <hook_id, <out_rank, std::shared_ptr<TensorHook>>>
std::map<int64_t, std::tuple<
/* slot id */ size_t, /* rank */ size_t,
/* hook */ std::shared_ptr<TensorHook>>>
gradient_hooks_;
int64_t next_hook_id_{0};
};
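// Illustrative sketch of the new hook bookkeeping (added for exposition, not
// part of this commit): RegisterGradientHook now returns an id keyed into the
// gradient_hooks_ map above, and RemoveGradientHook erases that entry. `node`
// is assumed to be any GradNodeBase-derived instance already wired into the
// graph.
#include <memory>
#include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/fluid/eager/hooks.h"

void HookRoundTrip(const std::shared_ptr<egr::GradNodeBase>& node) {
  // Wrap a plain lambda in the polymorphic hook type introduced in hooks.h.
  auto hook = std::make_shared<egr::CppTensorHook>(
      [](const paddle::experimental::Tensor& t) { return t; });
  // Registration hands back the id generated from next_hook_id_.
  int64_t id =
      node->RegisterGradientHook(/*slot_id=*/0, /*rank=*/0, std::move(hook));
  // The id can later be used to drop the hook again; returns false if absent.
  node->RemoveGradientHook(id);
}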
class Edge {
......
......@@ -12,17 +12,52 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/cross_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
cross, ops::CrossKernel<paddle::platform::CUDADeviceContext, float>,
ops::CrossKernel<paddle::platform::CUDADeviceContext, double>,
ops::CrossKernel<paddle::platform::CUDADeviceContext, int>,
ops::CrossKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
cross_grad,
ops::CrossGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::CrossGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::CrossGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::CrossGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
#pragma once
#include <functional>
#include <memory>
#include <utility>
#include <vector>
#include "paddle/phi/api/include/tensor.h"
namespace egr {
class TensorHook {
public:
virtual ~TensorHook() = default;
virtual paddle::experimental::Tensor operator()(
const paddle::experimental::Tensor& var) = 0;
};
class TensorVoidHook {
public:
virtual ~TensorVoidHook() = default;
virtual void operator()() = 0;
};
class CppTensorHook : public TensorHook {
public:
explicit CppTensorHook(std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>&& fn)
: fn_(std::move(fn)) {}
paddle::experimental::Tensor operator()(
const paddle::experimental::Tensor& var) override {
return fn_(var);
}
private:
std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>
fn_;
};
class CppTensorVoidHook : public TensorVoidHook {
public:
explicit CppTensorVoidHook(std::function<void()>&& fn) : fn_(std::move(fn)) {}
void operator()() override { return fn_(); }
private:
std::function<void()> fn_;
};
} // namespace egr
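// Illustrative sketch (added for exposition): the test updates later in this
// commit register these wrapper types on tensors through the egr_utils_api
// helpers. The snippet below mirrors that pattern; `t` is an assumed
// caller-owned tensor.
#include <memory>
#include "paddle/fluid/eager/api/utils/hook_utils.h"
#include "paddle/fluid/eager/hooks.h"

namespace egr {
void AttachHooks(paddle::experimental::Tensor& t) {
  // Gradient hook: receives the incoming grad and returns the grad to pass on.
  egr_utils_api::RegisterGradientHookForTensor(
      t, std::make_shared<CppTensorHook>(
             [](const paddle::experimental::Tensor& grad) { return grad; }));
  // Reduce (void) hook: runs after accumulation and takes no arguments.
  egr_utils_api::RegisterReduceHookForTensor(
      t, std::make_shared<CppTensorVoidHook>([]() {}));
}
}  // namespace egr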
......@@ -34,7 +34,8 @@ class TensorWrapper {
public:
TensorWrapper() = default;
explicit TensorWrapper(const paddle::experimental::Tensor& tensor,
bool full_reserved = false) {
bool full_reserved = false,
bool no_need_buffer = false) {
/**
* Normally, we should fully reserve all non-output or non-leaf fwd tensors
* here. And for fwd output tensor, we should not reserve its autogradmeta,
......@@ -48,16 +49,30 @@ class TensorWrapper {
}
// shallow copy tensor_impl here
intermidiate_tensor_.set_impl(tensor.impl());
if (no_need_buffer) {
if (phi::DenseTensor::classof(tensor.impl().get())) {
// Only Copy Meta
phi::DenseTensor* dense_tensor =
static_cast<phi::DenseTensor*>(tensor.impl().get());
auto tw_dense_tensor = std::make_shared<phi::DenseTensor>();
tw_dense_tensor->set_meta(dense_tensor->meta());
intermidiate_tensor_.set_impl(tw_dense_tensor);
} else {
PADDLE_THROW(paddle::platform::errors::Fatal(
"Unrecognized tensor type for no_need_buffer feature"));
}
} else {
intermidiate_tensor_.set_impl(tensor.impl());
}
intermidiate_tensor_.set_name(tensor.name() + "@Saved");
PADDLE_ENFORCE_NOT_NULL(
EagerUtils::unsafe_autograd_meta(tensor),
paddle::platform::errors::Fatal(
"Full reserved Tensor should not have null autograd meta, since "
"tensor_wrapper is used to build backward info. There is no way "
"for us to build it with null autograd_meta."));
// copy output_rank
out_rank_info_ = EagerUtils::OutRankInfo(tensor);
// If an output is marked "intermediate", we won't create
// autograd_meta for it.
// In that case, simply skip OutRankInfo Copy
if (EagerUtils::nullable_autograd_meta(tensor)) {
out_rank_info_ = EagerUtils::OutRankInfo(tensor);
}
}
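// Illustrative sketch (added for exposition; `fwd_input` is an assumed
// forward-pass tensor that already carries autograd meta). With no_need_buffer
// set, the wrapper above keeps only the DenseTensor meta and drops the actual
// allocation, which is what backward kernels that never read the buffer rely on.
#include "paddle/fluid/eager/tensor_wrapper.h"

void SaveForBackwardSketch(const paddle::experimental::Tensor& fwd_input) {
  // Keep autograd/rank info but only the shape/dtype meta of the buffer.
  egr::TensorWrapper wrapped(fwd_input, /*full_reserved=*/false,
                             /*no_need_buffer=*/true);
  (void)wrapped;  // later handed back to the grad node via recover(...)
}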
paddle::experimental::Tensor recover(
......
......@@ -17,11 +17,14 @@
#include "gtest/gtest.h"
#include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/eager/api/utils/hook_utils.h"
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/fluid/eager/grad_tensor_holder.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/eager/hooks.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/core/kernel_registry.h"
// TODO(jiabin): remove nolint here!!!
......@@ -37,7 +40,7 @@ TEST(AccumulationNode, Tensor) {
.get(),
meta);
dt0->mutable_data<paddle::platform::float16>(
paddle::platform::CPUPlace())[0] = 10.0;
paddle::platform::CPUPlace())[0] = paddle::platform::float16(10.0f);
paddle::experimental::Tensor et0 = paddle::experimental::Tensor(dt0);
std::shared_ptr<phi::DenseTensor> dt1 = std::make_shared<phi::DenseTensor>(
......@@ -47,84 +50,102 @@ TEST(AccumulationNode, Tensor) {
meta);
dt1->mutable_data<paddle::platform::float16>(
paddle::platform::CPUPlace())[0] = 20.0;
paddle::platform::CPUPlace())[0] = paddle::platform::float16(20.0f);
paddle::experimental::Tensor et1 = paddle::experimental::Tensor(dt1);
std::shared_ptr<phi::DenseTensor> input_dt =
std::make_shared<phi::DenseTensor>(
std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace())
.get(),
meta);
paddle::experimental::Tensor input_et =
paddle::experimental::Tensor(input_dt);
auto grad_meta = EagerUtils::autograd_meta(&input_et);
// Initialize Grad Tensor
std::shared_ptr<phi::DenseTensor> grad_dt =
std::make_shared<phi::DenseTensor>(
std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace())
.get(),
meta);
paddle::experimental::Tensor grad_et = paddle::experimental::Tensor(grad_dt);
grad_dt->mutable_data<paddle::platform::float16>(
paddle::platform::CPUPlace())[0] = paddle::platform::float16(0.0f);
grad_meta->MutableGrad()->set_impl(grad_dt);
// AccumulationNode
GradNodeAccumulation node = GradNodeAccumulation();
// Hook, RetainGrad
std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>
hook = [&grad_et](const paddle::experimental::Tensor& t) {
grad_et.set_impl(t.impl());
return grad_et;
};
node.RetainGrad(hook);
auto node = std::make_shared<GradNodeAccumulation>(grad_meta);
grad_meta->SetGradNode(node);
grad_meta->SetStopGradient(false);
// operator()
paddle::experimental::Tensor ret_et0 = node({{et0}})[0][0];
paddle::experimental::Tensor ret_et0 = node->operator()({{et0}})[0][0];
auto* ret_et0_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(ret_et0.impl())
->data<paddle::platform::float16>();
CHECK_EQ(ret_et0_ptr[0], paddle::platform::float16(10.0f));
paddle::experimental::Tensor ret_et1 = node({{et1}})[0][0];
paddle::experimental::Tensor ret_et1 = node->operator()({{et1}})[0][0];
auto* ret_et1_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(ret_et1.impl())
->data<paddle::platform::float16>();
CHECK_EQ(ret_et1_ptr[0], paddle::platform::float16(30.0f));
CHECK_EQ(ret_et1_ptr[0], paddle::platform::float16(20.0f));
// Retain Grad
auto* ret_grad_et_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(grad_et.impl())
->data<paddle::platform::float16>();
CHECK_EQ(ret_grad_et_ptr[0], paddle::platform::float16(30.0f));
// Check Retain Grad
CHECK_EQ(std::dynamic_pointer_cast<phi::DenseTensor>(et0.impl())
->data<paddle::platform::float16>()[0],
paddle::platform::float16(10.0f));
paddle::experimental::Tensor* grad = EagerUtils::mutable_grad(input_et);
auto* grad_ptr = std::dynamic_pointer_cast<phi::DenseTensor>(grad->impl())
->data<paddle::platform::float16>();
CHECK_EQ(grad_ptr[0], paddle::platform::float16(30.0f));
// Reduce Hook case 1: Call RegisterReduceHook and run operator()
VLOG(6) << "Test Reduce Hook";
CHECK_EQ(std::dynamic_pointer_cast<phi::DenseTensor>(et0.impl())
->data<paddle::platform::float16>()[0],
paddle::platform::float16(10.0f));
auto reduce_hook_1 = [&](void) -> void {
auto* grad_et_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(grad_et.impl())
->data<paddle::platform::float16>();
grad_et_ptr[0] = 36.0;
auto* input_et_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(input_et.impl())
->mutable_data<paddle::platform::float16>(
paddle::platform::CPUPlace());
input_et_ptr[0] = 36.0;
VLOG(6) << "Running Reduce Hook";
};
node.RegisterReduceHook(reduce_hook_1);
node->RegisterReduceHook(
std::make_shared<egr::CppTensorVoidHook>(reduce_hook_1));
// operator()
paddle::experimental::Tensor _ret = node({{et0}})[0][0];
paddle::experimental::Tensor _ret = node->operator()({{et0}})[0][0];
// Check operator() result, should be 10.0
auto* _ret_ptr = std::dynamic_pointer_cast<phi::DenseTensor>(_ret.impl())
->data<paddle::platform::float16>();
CHECK_EQ(_ret_ptr[0], paddle::platform::float16(36.0f));
CHECK_EQ(_ret_ptr[0], paddle::platform::float16(10.0f));
// Check Retain Grad, should be 36.0
auto* _ret_grad_et_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(grad_et.impl())
auto* _ret_input_et_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(input_et.impl())
->data<paddle::platform::float16>();
CHECK_EQ(_ret_grad_et_ptr[0], paddle::platform::float16(36.0f));
CHECK_EQ(_ret_input_et_ptr[0], paddle::platform::float16(36.0f));
// Reduce Hook case 2: Call RegisterReduceHook and ApplyReduceHooks directly
VLOG(6) << "Test Reduce Hook";
auto reduce_hook_2 = [&](void) -> void {
auto* ret_et0_ptr = std::dynamic_pointer_cast<phi::DenseTensor>(et0.impl())
->data<paddle::platform::float16>();
->mutable_data<paddle::platform::float16>(
paddle::platform::CPUPlace());
ret_et0_ptr[0] = 100.0; // set to 100.0
VLOG(6) << "Running Reduce Hook";
};
node.RegisterReduceHook(reduce_hook_2);
node.ApplyReduceHooks();
node->RegisterReduceHook(
std::make_shared<egr::CppTensorVoidHook>(reduce_hook_2));
node->ApplyReduceHooks();
// Check ApplyReduceHooks result
CHECK_EQ(std::dynamic_pointer_cast<phi::DenseTensor>(et0.impl())
......
......@@ -17,6 +17,7 @@
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/fluid/eager/hooks.h"
#include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h"
#include "paddle/phi/api/lib/utils/allocator.h"
......@@ -32,7 +33,7 @@ TEST(GradNodeInfo, GradSlotMeta) {
CHECK_EQ(grad_slot.Size(), 2);
}
TEST(GradNodeInfo, GradNodeBase) {
void TestGradNodeBase(bool is_remove_gradient_hook) {
VLOG(6) << "Construct Grad Node";
auto grad_test_node0 = std::make_shared<eager_test::GradTestNode>(
/* val */ 5.0, /* in_num */ 2, /* out_num */ 2);
......@@ -112,13 +113,25 @@ TEST(GradNodeInfo, GradNodeBase) {
VLOG(6) << "Running Gradient Hook";
return res;
};
grad_test_node0->RegisterGradientHook(0, 0, gradient_hook);
// 5 + 6
int64_t hook_id = grad_test_node0->RegisterGradientHook(
0, 0, std::make_shared<egr::CppTensorHook>(gradient_hook));
if (is_remove_gradient_hook) {
// Remove GradientHook
grad_test_node0->RemoveGradientHook(hook_id);
}
// Check results
auto grad_hook_res = grad_test_node0->ApplyGradientHooks(grads);
CHECK_EQ(
std::dynamic_pointer_cast<phi::DenseTensor>(grad_hook_res[0][0].impl())
->data<float>()[0],
11.0);
is_remove_gradient_hook ? 5.0 : 11.0);
}
TEST(GradNodeInfo, GradNodeBase) {
TestGradNodeBase(true);
TestGradNodeBase(false);
}
TEST(GradNodeInfo, Edge) {
......
......@@ -59,22 +59,18 @@ TEST(Backward, SingleNodeEmptyGrad) {
auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta->SetStopGradient(false);
AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor);
// Connect Tensor and AccumulationNode via AutoGradMeta
auto acc_node_ptr = std::make_shared<egr::GradNodeAccumulation>();
auto acc_node_ptr =
std::make_shared<egr::GradNodeAccumulation>(auto_grad_meta1);
AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor);
auto_grad_meta1->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta1->SetStopGradient(false);
egr_utils_api::RetainGradForTensor(leaf_tensor);
// Connect Node0 -> AccumulationNode via Edge
auto meta = egr::AutogradMeta();
meta.SetStopGradient(false);
meta.SetSingleOutRankWithSlot(0, 0);
meta.SetGradNode(acc_node_ptr);
std::vector<egr::AutogradMeta*> res = {&meta};
std::vector<egr::AutogradMeta*> res = {auto_grad_meta1};
node0_ptr->AddEdges(&res, 0);
}
std::vector<paddle::experimental::Tensor> outs = {target_tensor};
......@@ -123,22 +119,17 @@ TEST(Backward, SingleNodeCustomGrad) {
std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta->SetStopGradient(false);
// Connect Tensor and AccumulationNode via AutoGradMeta
auto acc_node_ptr = std::make_shared<egr::GradNodeAccumulation>();
AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor);
// Connect Tensor and AccumulationNode via AutoGradMeta
auto acc_node_ptr =
std::make_shared<egr::GradNodeAccumulation>(auto_grad_meta1);
auto_grad_meta1->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
egr_utils_api::RetainGradForTensor(leaf_tensor);
// Connect Node0 -> AccumulationNode via Edge
auto meta = egr::AutogradMeta();
meta.SetStopGradient(false);
meta.SetSingleOutRankWithSlot(0, 0);
meta.SetGradNode(acc_node_ptr);
std::vector<egr::AutogradMeta*> res = {&meta};
auto_grad_meta1->SetStopGradient(false);
std::vector<egr::AutogradMeta*> res = {auto_grad_meta1};
node0_ptr->AddEdges(&res, 0);
}
......@@ -201,22 +192,17 @@ TEST(Backward, LinearNodes) {
std::vector<egr::AutogradMeta*> res0 = {&meta0};
node0_ptr->AddEdges(&res0, 0);
AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor);
// Connect Tensor and AccumulationNode via AutoGradMeta
auto acc_node_ptr = std::make_shared<egr::GradNodeAccumulation>();
auto acc_node_ptr =
std::make_shared<egr::GradNodeAccumulation>(auto_grad_meta1);
AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor);
auto_grad_meta1->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
egr_utils_api::RetainGradForTensor(leaf_tensor);
// Connect Node1 -> AccumulationNode via Edge
auto meta1 = egr::AutogradMeta();
meta1.SetStopGradient(false);
meta1.SetSingleOutRankWithSlot(0, 0);
meta1.SetGradNode(acc_node_ptr);
std::vector<egr::AutogradMeta*> res1 = {&meta1};
auto_grad_meta1->SetStopGradient(false);
std::vector<egr::AutogradMeta*> res1 = {auto_grad_meta1};
node1_ptr->AddEdges(&res1, 0);
}
......@@ -311,22 +297,17 @@ TEST(Backward, WithAccumulation) {
std::vector<egr::AutogradMeta*> res1 = {&meta1};
node1_ptr->AddEdges(&res1, 0);
AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor);
// Connect Tensor and AccumulationNode via AutoGradMeta
auto acc_node_ptr = std::make_shared<egr::GradNodeAccumulation>();
auto acc_node_ptr =
std::make_shared<egr::GradNodeAccumulation>(auto_grad_meta2);
AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor);
auto_grad_meta2->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
auto_grad_meta2->SetSingleOutRankWithSlot(0, 0);
egr_utils_api::RetainGradForTensor(leaf_tensor);
// Connect Node2 -> AccumulationNode via Edge
auto meta2 = egr::AutogradMeta();
meta2.SetStopGradient(false);
meta2.SetSingleOutRankWithSlot(0, 0);
meta2.SetGradNode(acc_node_ptr);
std::vector<egr::AutogradMeta*> res2 = {&meta2};
auto_grad_meta2->SetStopGradient(false);
std::vector<egr::AutogradMeta*> res2 = {auto_grad_meta2};
node2_ptr->AddEdges(&res2, 0);
}
......
......@@ -46,34 +46,26 @@ TEST(CrossBatchAccumulation, SingleScaleNode) {
paddle::experimental::Tensor& target_tensor = target_tensors[0];
paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor();
{
auto scale_node_ptr = std::make_shared<GradNodeScale>(1, 1);
scale_node_ptr->SetAttributes_scale(5.0 /*scale*/);
scale_node_ptr->SetDefaultGradInOutMeta();
auto acc_node_ptr = std::make_shared<GradNodeAccumulation>();
AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&target_tensor);
auto_grad_meta->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(scale_node_ptr));
auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta->SetStopGradient(false);
egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0
auto meta = AutogradMeta();
meta.SetSingleOutRankWithSlot(0, 0);
meta.SetStopGradient(false);
meta.SetGradNode(acc_node_ptr);
std::vector<egr::AutogradMeta*> res = {&meta};
scale_node_ptr->AddEdges(&res, 0);
AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor);
auto_grad_meta1->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
egr_utils_api::RetainGradForTensor(leaf_tensor);
}
auto scale_node_ptr = std::make_shared<GradNodeScale>(1, 1);
scale_node_ptr->SetAttributes_scale(5.0 /*scale*/);
scale_node_ptr->SetDefaultGradInOutMeta();
AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&target_tensor);
auto_grad_meta->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(scale_node_ptr));
auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta->SetStopGradient(false);
egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0
AutogradMeta* meta = EagerUtils::autograd_meta(&leaf_tensor);
auto acc_node_ptr = std::make_shared<GradNodeAccumulation>(meta);
meta->SetStopGradient(false);
meta->SetSingleOutRankWithSlot(0, 0);
meta->SetGradNode(acc_node_ptr);
std::vector<egr::AutogradMeta*> res = {meta};
scale_node_ptr->AddEdges(&res, 0);
RunBackward(target_tensors, {});
......
......@@ -159,7 +159,7 @@ TEST(EagerUtils, PassStopGradient) {
CHECK(auto_grad0->StopGradient() == false);
egr::EagerUtils::PassStopGradient(true, auto_grad0.get(), auto_grad1.get(),
auto_grad2.get(), auto_grad3.get());
CHECK(auto_grad0->StopGradient() == true);
CHECK(auto_grad0->StopGradient() == false);
CHECK(auto_grad1->StopGradient() == true);
CHECK(auto_grad2->StopGradient() == true);
CHECK(auto_grad3->StopGradient() == true);
......
......@@ -27,6 +27,7 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/fluid/eager/hooks.h"
#include "paddle/fluid/eager/tests/test_utils.h"
namespace egr {
......@@ -221,10 +222,6 @@ TEST(FwdBwdJoint, GradientHook) {
phi::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/);
egr_utils_api::RetainGradForTensor(tensor);
std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>
hook = &hook_function;
// 3. Run Forward
// Run Forward Node 0
float scale0 = 2.0;
......@@ -232,24 +229,27 @@ TEST(FwdBwdJoint, GradientHook) {
paddle::experimental::Tensor out0 =
egr::scale(tensor, scale0, bias0, true /*bias_after_scale*/,
true /*trace_backward*/);
egr_utils_api::RetainGradForTensor(out0); // hook: +5
egr_utils_api::RegisterGradientHookForTensor(out0, hook); // hook: +5
egr_utils_api::RetainGradForTensor(out0); // hook: +5
egr_utils_api::RegisterGradientHookForTensor(
out0, std::make_shared<egr::CppTensorHook>(hook_function)); // hook: +5
// Run Forward Node 1
float scale1 = 5.0;
float bias1 = 10.0;
paddle::experimental::Tensor out1 = egr::scale(
out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/);
egr_utils_api::RetainGradForTensor(out1); // hook: +5
egr_utils_api::RegisterGradientHookForTensor(out1, hook); // hook: +5
egr_utils_api::RetainGradForTensor(out1); // hook: +5
egr_utils_api::RegisterGradientHookForTensor(
out1, std::make_shared<egr::CppTensorHook>(hook_function)); // hook: +5
// Run Forward Node 2
float scale2 = 10.0;
float bias2 = 20.0;
paddle::experimental::Tensor out2 = egr::scale(
out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/);
egr_utils_api::RetainGradForTensor(out2); // hook: +5
egr_utils_api::RegisterGradientHookForTensor(out2, hook); // hook: +5
egr_utils_api::RetainGradForTensor(out2); // hook: +5
egr_utils_api::RegisterGradientHookForTensor(
out2, std::make_shared<egr::CppTensorHook>(hook_function)); // hook: +5
// 4. Run Backward
std::vector<paddle::experimental::Tensor> outs = {out1, out2};
......
......@@ -28,6 +28,7 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/fluid/eager/hooks.h"
#include "paddle/fluid/eager/tests/test_utils.h"
namespace egr {
......@@ -79,16 +80,10 @@ TEST(RetainGrad, HookBeforeRetainGrad) {
// Set grad in/out meta for node0
scale_node_ptr->SetDefaultGradInOutMeta();
// Create AccumulationNode
auto acc_node_ptr = std::make_shared<GradNodeAccumulation>();
// Connect Input Tensor and ScaleNode via AutoGradMeta
// Apply RetainGrad
{
// ScaleNode Hook: +3
std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>
hook = &hook_function;
auto auto_grad_meta = std::make_shared<AutogradMeta>();
auto_grad_meta->SetGradNode(
......@@ -99,38 +94,36 @@ TEST(RetainGrad, HookBeforeRetainGrad) {
std::dynamic_pointer_cast<paddle::experimental::AbstractAutogradMeta>(
auto_grad_meta));
egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook);
egr_utils_api::RegisterGradientHookForTensor(
target_tensor, std::make_shared<egr::CppTensorHook>(hook_function));
egr_utils_api::RetainGradForTensor(
target_tensor); // result: 1.0 + 3.0 = 4.0
egr_utils_api::RetainGradForTensor(
target_tensor); // result: 1.0 + 3.0 = 4.0
}
// Connect ScaleNode -> AccumulationNode via Edge
{
auto meta = AutogradMeta();
meta.SetStopGradient(false);
meta.SetSingleOutRankWithSlot(0, 0);
meta.SetGradNode(acc_node_ptr);
std::vector<egr::AutogradMeta*> res = {&meta};
scale_node_ptr->AddEdges(&res, 0);
}
// Retain Grad for leaf tensor1
paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor();
{
// AccumulationNode Hook: +3
std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>
hook = &hook_function;
auto auto_grad_meta = std::make_shared<AutogradMeta>();
auto_grad_meta->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
auto acc_node_ptr =
std::make_shared<GradNodeAccumulation>(auto_grad_meta.get());
auto_grad_meta->SetStopGradient(false);
auto_grad_meta->SetGradNode(acc_node_ptr);
auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
std::vector<egr::AutogradMeta*> res = {auto_grad_meta.get()};
scale_node_ptr->AddEdges(&res, 0);
leaf_tensor.set_autograd_meta(
std::dynamic_pointer_cast<paddle::experimental::AbstractAutogradMeta>(
auto_grad_meta));
egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook);
egr_utils_api::RegisterGradientHookForTensor(
leaf_tensor, std::make_shared<egr::CppTensorHook>(hook_function));
egr_utils_api::RetainGradForTensor(
leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0
}
......@@ -160,16 +153,11 @@ TEST(RetainGrad, HookAfterRetainGrad) {
scale_node_ptr->SetAttributes_scale(5.0 /*scale*/);
// Set grad in/out meta for node0
scale_node_ptr->SetDefaultGradInOutMeta();
// Create AccumulationNode
auto acc_node_ptr = std::make_shared<GradNodeAccumulation>();
// Connect Input Tensor and ScaleNode via AutoGradMeta
// Apply RetainGrad
{
// ScaleNode Hook: +3
std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>
hook = &hook_function;
auto auto_grad_meta = std::make_shared<AutogradMeta>();
auto_grad_meta->SetGradNode(
......@@ -181,40 +169,30 @@ TEST(RetainGrad, HookAfterRetainGrad) {
auto_grad_meta));
egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0
egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook);
}
// Connect ScaleNode -> AccumulationNode via Edge
{
auto meta = AutogradMeta();
meta.SetStopGradient(false);
meta.SetSingleOutRankWithSlot(0, 0);
meta.SetGradNode(acc_node_ptr);
std::vector<egr::AutogradMeta*> res = {&meta};
scale_node_ptr->AddEdges(&res, 0);
egr_utils_api::RegisterGradientHookForTensor(
target_tensor, std::make_shared<egr::CppTensorHook>(hook_function));
}
// Retain Grad for leaf tensor1
paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor();
{
// AccumulationNode Hook: +3
std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>
hook = &hook_function;
auto auto_grad_meta = std::make_shared<AutogradMeta>();
auto_grad_meta->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
auto acc_node_ptr =
std::make_shared<GradNodeAccumulation>(auto_grad_meta.get());
auto_grad_meta->SetGradNode(acc_node_ptr);
auto_grad_meta->SetStopGradient(false);
std::vector<egr::AutogradMeta*> res = {auto_grad_meta.get()};
scale_node_ptr->AddEdges(&res, 0);
auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
leaf_tensor.set_autograd_meta(
std::dynamic_pointer_cast<paddle::experimental::AbstractAutogradMeta>(
auto_grad_meta));
egr_utils_api::RetainGradForTensor(
leaf_tensor); // RetainGrad for leaf tensor gets
// postponed, result: 4.0*5.0 + 3.0 =
// 23.0
egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook);
egr_utils_api::RegisterGradientHookForTensor(
leaf_tensor, std::make_shared<egr::CppTensorHook>(hook_function));
}
RunBackward(target_tensors, {});
......
......@@ -24,6 +24,7 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
#include "paddle/fluid/eager/hooks.h"
#include "paddle/phi/core/kernel_registry.h"
namespace egr {
......@@ -54,7 +55,7 @@ paddle::experimental::Tensor hook_function(
return ret;
}
TEST(Hook_intermidiate, Sigmoid) {
void test_sigmoid(bool is_remove_gradient_hook) {
// Prepare Device Contexts
VLOG(6) << "Init Env";
eager_test::InitEnv(paddle::platform::CPUPlace());
......@@ -67,11 +68,6 @@ TEST(Hook_intermidiate, Sigmoid) {
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 0.0, true);
VLOG(6) << "Make Hook function";
std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>
hook = &hook_function;
VLOG(6) << "Make ReduceHook function";
auto reduce_hook = [&](void) -> void {
auto* t_ptr = std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl())
......@@ -85,10 +81,12 @@ TEST(Hook_intermidiate, Sigmoid) {
egr_utils_api::RetainGradForTensor(tensor);
VLOG(6) << "Register GradientHook for Tensor";
egr_utils_api::RegisterGradientHookForTensor(tensor, hook);
int64_t hook_id = egr_utils_api::RegisterGradientHookForTensor(
tensor, std::make_shared<CppTensorHook>(hook_function));
VLOG(6) << "Register ReduceHook for Tensor";
egr_utils_api::RegisterReduceHookForTensor(tensor, reduce_hook);
egr_utils_api::RegisterReduceHookForTensor(
tensor, std::make_shared<CppTensorVoidHook>(reduce_hook));
VLOG(6) << "Runing Forward";
auto output_tensor = sigmoid_dygraph_function(tensor, {});
......@@ -98,11 +96,17 @@ TEST(Hook_intermidiate, Sigmoid) {
std::vector<paddle::experimental::Tensor> target_tensors = {output_tensor};
if (is_remove_gradient_hook) {
std::shared_ptr<GradNodeBase> grad_node_tmp = EagerUtils::grad_node(tensor);
grad_node_tmp->RemoveGradientHook(hook_id);
}
VLOG(6) << "Runing Backward";
RunBackward(target_tensors, {});
VLOG(6) << "Finish Backward";
eager_test::CompareGradTensorWithValue<float>(tensor, 0.25 + 3);
eager_test::CompareGradTensorWithValue<float>(
tensor, is_remove_gradient_hook ? 0.25 : 0.25 + 3.0);
VLOG(6) << "Checking ReduceHook results";
for (int i = 0; i < tensor.numel(); i++) {
......@@ -113,7 +117,7 @@ TEST(Hook_intermidiate, Sigmoid) {
VLOG(6) << "After Tests";
}
TEST(Hook_intermidiate, ElementwiseAdd) {
void test_elementwiseAdd(bool is_remove_gradient_hook) {
// Prepare Device Contexts
eager_test::InitEnv(paddle::platform::CPUPlace());
......@@ -132,11 +136,7 @@ TEST(Hook_intermidiate, ElementwiseAdd) {
ddimY, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 2.0, true);
std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>
hook = &hook_function;
auto reduce_hook = [&](void) -> void {
auto reduce_hook = [&]() -> void {
auto* t_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(Y.impl())->data<float>();
for (int i = 0; i < Y.numel(); i++) {
......@@ -145,18 +145,26 @@ TEST(Hook_intermidiate, ElementwiseAdd) {
};
egr_utils_api::RetainGradForTensor(Y);
egr_utils_api::RegisterGradientHookForTensor(Y, hook);
egr_utils_api::RegisterReduceHookForTensor(Y, reduce_hook);
int64_t hook_id = egr_utils_api::RegisterGradientHookForTensor(
Y, std::make_shared<CppTensorHook>(hook_function));
egr_utils_api::RegisterReduceHookForTensor(
Y, std::make_shared<CppTensorVoidHook>(reduce_hook));
auto output_tensor = elementwise_add_dygraph_function(X, Y, {});
eager_test::CompareTensorWithValue<float>(output_tensor, 5);
std::vector<paddle::experimental::Tensor> target_tensors = {output_tensor};
if (is_remove_gradient_hook) {
std::shared_ptr<GradNodeBase> grad_node_tmp = EagerUtils::grad_node(Y);
grad_node_tmp->RemoveGradientHook(hook_id);
}
RunBackward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(X, 1.0);
eager_test::CompareGradTensorWithValue<float>(Y, 4.0);
eager_test::CompareGradTensorWithValue<float>(
Y, is_remove_gradient_hook ? 1.0 : 1.0 + 3.0);
// Checking ReduceHook results
for (int i = 0; i < Y.numel(); i++) {
......@@ -166,7 +174,7 @@ TEST(Hook_intermidiate, ElementwiseAdd) {
}
}
TEST(Hook_intermidiate, Matmul_v2) {
void test_matmul(bool is_remove_gradient_hook) {
// Prepare Device Contexts
eager_test::InitEnv(paddle::platform::CPUPlace());
......@@ -185,10 +193,6 @@ TEST(Hook_intermidiate, Matmul_v2) {
ddimY, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 2.0, true);
std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>
hook = &hook_function;
auto reduce_hook = [&](void) -> void {
auto* t_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(Y.impl())->data<float>();
......@@ -198,19 +202,27 @@ TEST(Hook_intermidiate, Matmul_v2) {
};
egr_utils_api::RetainGradForTensor(Y);
egr_utils_api::RegisterGradientHookForTensor(Y, hook);
egr_utils_api::RegisterReduceHookForTensor(Y, reduce_hook);
int64_t hook_id = egr_utils_api::RegisterGradientHookForTensor(
Y, std::make_shared<CppTensorHook>(hook_function));
egr_utils_api::RegisterReduceHookForTensor(
Y, std::make_shared<CppTensorVoidHook>(reduce_hook));
auto output_tensor = matmul_v2_dygraph_function(
X, Y, {{"trans_x", false}, {"trans_y", false}});
eager_test::CompareTensorWithValue<float>(output_tensor, 96);
std::vector<paddle::experimental::Tensor> target_tensors = {output_tensor};
if (is_remove_gradient_hook) {
std::shared_ptr<GradNodeBase> grad_node_tmp = EagerUtils::grad_node(Y);
grad_node_tmp->RemoveGradientHook(hook_id);
}
RunBackward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(X, 2.0 * 20);
eager_test::CompareGradTensorWithValue<float>(Y, 3.0 * 4 + 3);
eager_test::CompareGradTensorWithValue<float>(
Y, is_remove_gradient_hook ? 3.0 * 4 : 3.0 * 4 + 3);
// Checking ReduceHook results
for (int i = 0; i < Y.numel(); i++) {
......@@ -219,6 +231,22 @@ TEST(Hook_intermidiate, Matmul_v2) {
static_cast<float>(100.0f));
}
}
TEST(Hook_intermidiate, Sigmoid) {
// True or false represents whether to call RemoveGradientHook
test_sigmoid(true);
test_sigmoid(false);
}
TEST(Hook_intermidiate, ElementwiseAdd) {
test_elementwiseAdd(true);
test_elementwiseAdd(false);
}
TEST(Hook_intermidiate, Matmul_v2) {
test_matmul(true);
test_matmul(false);
}
} // namespace egr
USE_OP(sigmoid);
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/eager/api/utils/hook_utils.h"
#include "paddle/fluid/eager/tensor_wrapper.h"
......@@ -21,9 +22,8 @@
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/pten_utils.h"
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/framework/variable.h"
PADDLE_DEFINE_EXPORTED_bool(retain_grad_for_all_tensor, true,
......@@ -109,6 +109,16 @@ std::shared_ptr<GradNodeBase> EagerUtils::grad_node(
}
}
paddle::experimental::Tensor* EagerUtils::mutable_grad(
const paddle::experimental::Tensor& target) {
auto* meta = nullable_autograd_meta(target);
if (meta) {
return meta->MutableGrad();
} else {
return nullptr;
}
}
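// Illustrative caller-side sketch (added for exposition): mutable_grad returns
// nullptr for tensors that carry no autograd meta, so callers are expected to
// check before dereferencing, as the updated accumulation-node test does.
bool HasInitializedGrad(const paddle::experimental::Tensor& t) {
  paddle::experimental::Tensor* grad = EagerUtils::mutable_grad(t);
  return grad != nullptr && grad->initialized();
}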
void EagerUtils::SetHistory(std::vector<AutogradMeta*>* autograd_metas,
const std::shared_ptr<GradNodeBase>& grad_node) {
for (const auto& autograd_meta : *autograd_metas) {
......@@ -220,53 +230,62 @@ paddle::experimental::Tensor EagerUtils::GetOutput(
return paddle::experimental::Tensor(out->GetTensorBase(), out->name());
}
void EagerUtils::OverwriteOutputs(const std::shared_ptr<EagerVariable>& out,
paddle::experimental::Tensor* tensor) {
void EagerUtils::GetOutput(const std::shared_ptr<EagerVariable>& out,
paddle::experimental::Tensor* out_var) {
PADDLE_ENFORCE_NOT_NULL(
tensor, paddle::platform::errors::Fatal(
"Tensor is null and cannot be copied. "
"We are tring to OverwriteOutput from its "
"shared_ptr, this error may indicate some outputs "
"are nullptr"));
tensor->set_impl(out->GetTensorBase());
out_var, paddle::platform::errors::Fatal(
"Tensor is null and cannot be copied. "
"We are tring to OverwriteOutput from its "
"shared_ptr, this error may indicate some outputs "
"are nullptr"));
out_var->set_impl(out->GetTensorBase());
}
void EagerUtils::OverwriteOutputs(
void EagerUtils::GetOutputs(
const std::vector<std::shared_ptr<EagerVariable>>& outs,
const std::vector<paddle::experimental::Tensor*>& tensors) {
PADDLE_ENFORCE_EQ(
outs.size(), tensors.size(),
paddle::platform::errors::Fatal(
"We are tring to OverwriteOutputs which passed in and it expected "
"elements num of outs and origin outputs are equal, but we got outs "
"size of: %d, and tensors passed in size is: %d",
outs.size(), tensors.size()));
std::vector<paddle::experimental::Tensor>* result) {
for (size_t i = 0; i < outs.size(); i++) {
OverwriteOutputs(outs[i], tensors[i]);
result->emplace_back(outs[i]->GetTensorBase());
}
}
void EagerUtils::OverwriteOutputs(const paddle::experimental::Tensor& out,
paddle::experimental::Tensor* tensor) {
PADDLE_ENFORCE_NOT_NULL(
tensor, paddle::platform::errors::Fatal(
"Tensor is null and cannot be copied. "
"We are tring to OverwriteOutput from its "
"shared_ptr, this error may indicate some outputs "
"are nullptr"));
*tensor = out;
}
void EagerUtils::OverwriteOutputs(
const std::vector<paddle::experimental::Tensor>& outs,
const std::vector<paddle::experimental::Tensor*>& tensors) {
void EagerUtils::GetOutputs(
const std::vector<std::shared_ptr<EagerVariable>>& outs,
const std::vector<paddle::experimental::Tensor*>& out_var) {
for (size_t i = 0; i < outs.size(); i++) {
PADDLE_ENFORCE_NOT_NULL(
tensors[i], paddle::platform::errors::Fatal(
out_var[i], paddle::platform::errors::Fatal(
"Tensor is null and cannot be copied. "
"We are tring to OverwriteOutput from its "
"shared_ptr, this error may indicate some outputs "
"are nullptr"));
*tensors[i] = outs[i];
out_var[i]->set_impl(outs[i]->GetTensorBase());
}
}
void EagerUtils::GetOutputs(const std::shared_ptr<EagerVariable>& out,
std::vector<paddle::experimental::Tensor>* result) {
result->emplace_back(out->GetTensorBase());
}
void EagerUtils::GetOutputs(
const std::shared_ptr<EagerVariable>& out,
const std::vector<paddle::experimental::Tensor*>& out_var) {
PADDLE_ENFORCE_NOT_NULL(
out_var[0], paddle::platform::errors::Fatal(
"Tensor is null and cannot be copied. "
"We are tring to OverwriteOutput from its "
"shared_ptr, this error may indicate some outputs "
"are nullptr"));
out_var[0]->set_impl(out->GetTensorBase());
}
void EagerUtils::Output2Result(
const std::vector<paddle::experimental::Tensor*>& out_var,
std::vector<paddle::experimental::Tensor>* result) {
result->reserve(out_var.size());
for (size_t i = 0; i < out_var.size(); i++) {
result->emplace_back(*out_var[i]);
}
}
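// Illustrative sketch of the renamed helpers (added for exposition; `outs` is
// assumed to come from a generated forward function). GetOutputs copies each
// EagerVariable's TensorBase into a value vector, while Output2Result above
// collects caller-provided output pointers into one.
std::vector<paddle::experimental::Tensor> CollectOutputsSketch(
    const std::vector<std::shared_ptr<EagerVariable>>& outs) {
  std::vector<paddle::experimental::Tensor> result;
  EagerUtils::GetOutputs(outs, &result);
  return result;
}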
......@@ -333,7 +352,8 @@ std::shared_ptr<egr::GradNodeBase> EagerUtils::GetGradAccumulationNode(
} else {
if (!autograd_ptr->StopGradient()) {
VLOG(6) << "Add GradNodeAccumulation for tensor: " << tensor.name();
autograd_ptr->SetGradNode(std::make_shared<egr::GradNodeAccumulation>());
autograd_ptr->SetGradNode(
std::make_shared<egr::GradNodeAccumulation>(autograd_ptr));
return autograd_ptr->GetMutableGradNode();
} else {
return nullptr;
......
......@@ -77,7 +77,7 @@ class PassStopGradientIter : public IterHelper<AutogradMeta*> {
VLOG(2) << "Tensor is NULL";
return;
}
element->SetStopGradient(stop_gradient_);
element->WeakSetStopGradient(stop_gradient_);
}
bool stop_gradient_ = true;
......@@ -102,6 +102,8 @@ class EagerUtils {
static std::shared_ptr<GradNodeBase> grad_node(
const paddle::experimental::Tensor& target);
static paddle::experimental::Tensor* mutable_grad(
const paddle::experimental::Tensor& target);
// SetHistory is used to set backward info during the forward process; it will
// set the forward var's autograd meta's grad node as the current backward node.
......@@ -173,17 +175,24 @@ class EagerUtils {
const std::vector<std::shared_ptr<EagerVariable>>& outs);
static paddle::experimental::Tensor GetOutput(
const std::shared_ptr<EagerVariable>& out);
// Sync Back to origin output Tensor
static void OverwriteOutputs(const std::shared_ptr<EagerVariable>& out,
paddle::experimental::Tensor* tensor);
static void OverwriteOutputs(const paddle::experimental::Tensor& out,
paddle::experimental::Tensor* tensor);
static void OverwriteOutputs(
static void GetOutput(const std::shared_ptr<EagerVariable>& out,
paddle::experimental::Tensor* out_var);
static void GetOutputs(
const std::vector<std::shared_ptr<EagerVariable>>& outs,
const std::vector<paddle::experimental::Tensor*>& tensors);
static void OverwriteOutputs(
const std::vector<paddle::experimental::Tensor>& outs,
const std::vector<paddle::experimental::Tensor*>& tensors);
std::vector<paddle::experimental::Tensor>* result);
static void GetOutputs(
const std::vector<std::shared_ptr<EagerVariable>>& outs,
const std::vector<paddle::experimental::Tensor*>& out_var);
static void GetOutputs(const std::shared_ptr<EagerVariable>& out,
std::vector<paddle::experimental::Tensor>* result);
static void GetOutputs(
const std::shared_ptr<EagerVariable>& out,
const std::vector<paddle::experimental::Tensor*>& out_var);
static void Output2Result(
const std::vector<paddle::experimental::Tensor*>& out_var,
std::vector<paddle::experimental::Tensor>* result);
// end Intermediate needed
static void CheckAndRetainGrad(const paddle::experimental::Tensor& tensor);
......
......@@ -193,19 +193,19 @@ cc_library(unused_var_check SRCS unused_var_check.cc DEPS glog no_need_buffer_va
cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place)
IF(WITH_XPU)
cc_library(pten_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows_utils place pten var_type_traits pten_api_utils op_info xpu_op_list)
cc_library(phi_utils SRCS phi_utils.cc DEPS lod_tensor selected_rows_utils place phi var_type_traits phi_api_utils op_info xpu_op_list)
ELSE()
cc_library(pten_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows_utils place pten var_type_traits pten_api_utils op_info)
cc_library(phi_utils SRCS phi_utils.cc DEPS lod_tensor selected_rows_utils place phi var_type_traits phi_api_utils op_info)
ENDIF()
IF(WITH_XPU)
cc_library(operator SRCS operator.cc DEPS xpu_op_list op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils
pten pten_utils kernel_factory infershape_utils op_utils)
phi phi_utils kernel_factory infershape_utils op_utils)
ELSE()
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils
pten pten_utils kernel_factory infershape_utils op_utils)
phi phi_utils kernel_factory infershape_utils op_utils)
ENDIF()
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
......@@ -412,7 +412,7 @@ cc_library(save_load_util SRCS save_load_util.cc DEPS tensor scope layer)
cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer)
cc_library(generator SRCS generator.cc DEPS enforce place)
cc_library(infershape_utils SRCS infershape_utils.cc DEPS lod_tensor selected_rows_utils attribute place pten var_type_traits pten pten_api_utils op_info shape_inference)
cc_library(infershape_utils SRCS infershape_utils.cc DEPS lod_tensor selected_rows_utils attribute place phi var_type_traits phi phi_api_utils op_info shape_inference)
cc_test(infershape_utils_test SRCS infershape_utils_test.cc DEPS infershape_utils infermeta_utils meta_tensor)
# Get the current working branch
......@@ -436,9 +436,8 @@ message(STATUS "branch: ${PADDLE_BRANCH}")
configure_file(commit.h.in commit.h)
cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_meta_info pten_api)
cc_library(custom_kernel SRCS custom_kernel.cc DEPS
tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_kernel_info pten_api)
cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper phi_tensor op_meta_info phi_api)
cc_library(custom_kernel SRCS custom_kernel.cc DEPS op_registry phi_custom_kernel phi_tensor_raw)
#cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} )
#cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
......@@ -451,7 +450,7 @@ if(WITH_TESTING AND TEST selected_rows_utils_test)
endif()
cc_test(scope_guard_test SRCS scope_guard_test.cc)
cc_test(pten_utils_test SRCS pten_utils_test.cc DEPS pten_utils)
cc_test(phi_utils_test SRCS phi_utils_test.cc DEPS phi_utils)
if(WITH_GPU OR WITH_ROCM)
cc_library(fluid_convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info)
......@@ -459,4 +458,3 @@ else()
cc_library(fluid_convert_utils SRCS convert_utils.cc DEPS data_type place)
endif()
cc_test(convert_utils_test SRCS convert_utils_test.cc DEPS fluid_convert_utils)
cc_test(custom_kernel_test SRCS custom_kernel_test.cc DEPS custom_kernel pten_tensor)
......@@ -33,7 +33,7 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/pybind/pybind.h"
// pten
// phi
#include "paddle/phi/kernels/declarations.h"
namespace paddle {
......
......@@ -18,7 +18,7 @@ limitations under the License. */
namespace paddle {
namespace framework {
paddle::experimental::DataType TransToPtenDataType(
paddle::experimental::DataType TransToPhiDataType(
const paddle::framework::proto::VarType::Type& dtype) {
// Set the order of case branches according to the frequency with
// the data type is used
......
......@@ -32,7 +32,7 @@ namespace framework {
using DataType = paddle::experimental::DataType;
using DataLayout = paddle::experimental::DataLayout;
DataType TransToPtenDataType(
DataType TransToPhiDataType(
const paddle::framework::proto::VarType::Type& dtype);
paddle::framework::proto::VarType::Type TransToProtoVarType(
......
......@@ -43,35 +43,35 @@ TEST(ConvertUtils, DataType) {
CHECK(paddle::framework::TransToProtoVarType(paddle::DataType::FLOAT16) ==
paddle::framework::proto::VarType::FP16);
// proto -> enum
CHECK(paddle::framework::TransToPtenDataType(
CHECK(paddle::framework::TransToPhiDataType(
paddle::framework::proto::VarType::FP64) ==
paddle::DataType::FLOAT64);
CHECK(paddle::framework::TransToPtenDataType(
CHECK(paddle::framework::TransToPhiDataType(
paddle::framework::proto::VarType::FP32) ==
paddle::DataType::FLOAT32);
CHECK(paddle::framework::TransToPtenDataType(
CHECK(paddle::framework::TransToPhiDataType(
paddle::framework::proto::VarType::INT64) ==
paddle::DataType::INT64);
CHECK(paddle::framework::TransToPtenDataType(
CHECK(paddle::framework::TransToPhiDataType(
paddle::framework::proto::VarType::INT32) ==
paddle::DataType::INT32);
CHECK(paddle::framework::TransToPtenDataType(
CHECK(paddle::framework::TransToPhiDataType(
paddle::framework::proto::VarType::INT8) == paddle::DataType::INT8);
CHECK(paddle::framework::TransToPtenDataType(
CHECK(paddle::framework::TransToPhiDataType(
paddle::framework::proto::VarType::UINT8) ==
paddle::DataType::UINT8);
CHECK(paddle::framework::TransToPtenDataType(
CHECK(paddle::framework::TransToPhiDataType(
paddle::framework::proto::VarType::INT16) ==
paddle::DataType::INT16);
CHECK(paddle::framework::TransToPtenDataType(
CHECK(paddle::framework::TransToPhiDataType(
paddle::framework::proto::VarType::BOOL) == paddle::DataType::BOOL);
CHECK(paddle::framework::TransToPtenDataType(
CHECK(paddle::framework::TransToPhiDataType(
paddle::framework::proto::VarType::COMPLEX64) ==
paddle::DataType::COMPLEX64);
CHECK(paddle::framework::TransToPtenDataType(
CHECK(paddle::framework::TransToPhiDataType(
paddle::framework::proto::VarType::COMPLEX128) ==
paddle::DataType::COMPLEX128);
CHECK(paddle::framework::TransToPtenDataType(
CHECK(paddle::framework::TransToPhiDataType(
paddle::framework::proto::VarType::FP16) ==
paddle::DataType::FLOAT16);
}
......
......@@ -18,355 +18,24 @@ limitations under the License. */
#endif
#include "paddle/fluid/framework/custom_kernel.h"
#include <dirent.h>
#include <algorithm>
#include <regex>
#include "paddle/fluid/framework/op_kernel_info_helper.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/api/ext/op_kernel_info.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/kernel_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/custom_kernel.h"
namespace paddle {
namespace framework {
// set phi::Kernel args_def_ from op_kernel_info
// because we can not set directly to phi::Kernel without exposing
// phi::KernelArgsDef when parsing custom user function
static void ParseArgs(const OpKernelInfo& op_kernel_info,
phi::KernelArgsDef* args_def) {
auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info);
auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info);
auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info);
for (auto& input : input_defs) {
auto type_index =
input.is_vector
? std::type_index(typeid(const std::vector<phi::DenseTensor>&))
: std::type_index(typeid(const phi::DenseTensor&));
args_def->AppendInput(input.backend, input.layout, input.dtype, type_index);
}
for (auto& output : output_defs) {
auto type_index =
output.is_vector
? std::type_index(typeid(const std::vector<phi::DenseTensor>&))
: std::type_index(typeid(const phi::DenseTensor&));
args_def->AppendOutput(output.backend, output.layout, output.dtype,
type_index);
}
for (auto& attr : attribute_defs) {
args_def->AppendAttribute(attr.type_index);
}
}
// custom pten kernel call function define
static void RunKernelFunc(phi::KernelContext* ctx,
const OpKernelInfo& op_kernel_info) {
VLOG(3) << "[CUSTOM KERNEL] RunKernelFunc begin...";
// input and output size is not params' num
// but actual Tensors' size
size_t input_size = ctx->InputsSize();
size_t output_size = ctx->OutputsSize();
size_t attr_size = ctx->AttrsSize();
// parameters' num of unified user kernel function
auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info);
auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info);
auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info);
PADDLE_ENFORCE_GE(input_size, input_defs.size(),
platform::errors::InvalidArgument(
"the size of ctx inputs size (%d) must be larger than "
"the size of kernel input_defs (%d).",
input_size, input_defs.size()));
PADDLE_ENFORCE_GE(output_size, output_defs.size(),
platform::errors::InvalidArgument(
"the size of ctx outputs size (%d) must be larger than "
"the size of kernel output_defs (%d).",
output_size, output_defs.size()));
PADDLE_ENFORCE_EQ(attr_size, attribute_defs.size(),
platform::errors::InvalidArgument(
"the size of ctx attribute size (%d) must be equal to "
"to the size of kernel attribute_defs (%d).",
attr_size, attribute_defs.size()));
VLOG(3) << "[CUSTOM KERNEL] Input num: " << input_defs.size()
<< "[tensor size:" << input_size << "]"
<< " Attribute num: " << attribute_defs.size()
<< " Output num: " << output_defs.size()
<< "[tensor size:" << output_size << "].";
// Inputs mapping
std::vector<paddle::experimental::Tensor> custom_ins;
std::vector<std::vector<paddle::experimental::Tensor>> custom_vec_ins;
for (size_t in_idx = 0; in_idx < input_defs.size(); ++in_idx) {
VLOG(3) << "Mapping Input[" << in_idx << "]";
const std::pair<int, int> range = ctx->InputRangeAt(in_idx);
// is_vector tells if this Input is Tensor or std::vector<Tensor>
if (!input_defs.at(in_idx).is_vector) {
paddle::experimental::Tensor custom_t;
auto& ctx_tensor = ctx->InputAt<phi::DenseTensor>(range.first);
custom_t.set_impl(std::make_shared<phi::DenseTensor>(ctx_tensor));
custom_ins.emplace_back(custom_t);
} else {
std::vector<paddle::experimental::Tensor> custom_vec_in;
auto ctx_tensor_vec =
ctx->MoveInputsBetween<phi::DenseTensor>(range.first, range.second);
for (auto& ctx_tensor : ctx_tensor_vec) {
paddle::experimental::Tensor custom_t;
custom_t.set_impl(std::make_shared<phi::DenseTensor>(ctx_tensor));
custom_vec_in.emplace_back(custom_t);
}
custom_vec_ins.emplace_back(custom_vec_in);
}
VLOG(3) << "Mapped Input[" << in_idx << "] with range[" << range.first
<< "," << range.second << ").";
}
// Attributes mapping
std::vector<paddle::any> custom_attrs;
for (size_t attr_idx = 0; attr_idx < attribute_defs.size(); ++attr_idx) {
VLOG(3) << "Mapping Attribute[" << attr_idx << "]";
if (attribute_defs[attr_idx].type_index == std::type_index(typeid(bool))) {
bool arg = ctx->AttrAt<bool>(attr_idx);
custom_attrs.emplace_back(arg);
} else if (attribute_defs[attr_idx].type_index ==
std::type_index(typeid(int))) {
int arg = ctx->AttrAt<int>(attr_idx);
custom_attrs.emplace_back(arg);
} else if (attribute_defs[attr_idx].type_index ==
std::type_index(typeid(float))) {
float arg = ctx->AttrAt<float>(attr_idx);
custom_attrs.emplace_back(arg);
} else if (attribute_defs[attr_idx].type_index ==
std::type_index(typeid(double))) {
double arg = ctx->AttrAt<double>(attr_idx);
custom_attrs.emplace_back(arg);
} else if (attribute_defs[attr_idx].type_index ==
std::type_index(typeid(int64_t))) {
int64_t arg = ctx->AttrAt<int64_t>(attr_idx);
custom_attrs.emplace_back(arg);
} else if (attribute_defs[attr_idx].type_index ==
std::type_index(typeid(phi::dtype::float16))) {
phi::dtype::float16 arg = ctx->AttrAt<phi::dtype::float16>(attr_idx);
custom_attrs.emplace_back(arg);
} else if (attribute_defs[attr_idx].type_index ==
std::type_index(typeid(DataType))) {
DataType arg = ctx->AttrAt<DataType>(attr_idx);
custom_attrs.emplace_back(arg);
} else if (attribute_defs[attr_idx].type_index ==
std::type_index(typeid(const Scalar&))) {
const Scalar& arg = ctx->AttrAt<const Scalar&>(attr_idx);
custom_attrs.emplace_back(arg);
} else if (attribute_defs[attr_idx].type_index ==
std::type_index(typeid(const std::vector<int64_t>&))) {
const std::vector<int64_t>& arg =
ctx->AttrAt<const std::vector<int64_t>&>(attr_idx);
custom_attrs.emplace_back(arg);
} else if (attribute_defs[attr_idx].type_index ==
std::type_index(typeid(const ScalarArray&))) {
const ScalarArray& arg = ctx->AttrAt<const ScalarArray&>(attr_idx);
custom_attrs.emplace_back(arg);
} else if (attribute_defs[attr_idx].type_index ==
std::type_index(typeid(const std::vector<int>&))) {
const std::vector<int>& arg =
ctx->AttrAt<const std::vector<int>&>(attr_idx);
custom_attrs.emplace_back(arg);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported attribute attribute_defs[%d].type_index", attr_idx));
}
VLOG(3) << "Mapped Attribute[" << attr_idx << "]";
}
// Outputs mapping
std::vector<paddle::experimental::Tensor*> custom_outs;
std::vector<std::vector<paddle::experimental::Tensor*>> custom_vec_outs;
std::vector<std::shared_ptr<phi::DenseTensor>> custom_outs_ptr;
std::vector<std::vector<std::shared_ptr<phi::DenseTensor>>>
custom_vec_outs_ptr;
for (size_t out_idx = 0; out_idx < output_defs.size(); ++out_idx) {
VLOG(3) << "Mapping Output[" << out_idx << "]";
const std::pair<int, int> range = ctx->OutputRangeAt(out_idx);
// is_vector tells if this Output is Tensor or std::vector<Tensor>
if (!output_defs.at(out_idx).is_vector) {
auto* ctx_tensor = ctx->MutableOutputAt<phi::DenseTensor>(range.first);
auto* custom_t = new paddle::experimental::Tensor();
auto custom_t_ptr = std::make_shared<phi::DenseTensor>(*ctx_tensor);
custom_t->set_impl(custom_t_ptr);
custom_outs.emplace_back(custom_t);
custom_outs_ptr.emplace_back(custom_t_ptr);
} else {
std::vector<paddle::experimental::Tensor*> custom_vec_out;
std::vector<std::shared_ptr<phi::DenseTensor>> custom_vec_out_ptr;
auto ctx_tensor_vec = ctx->MutableOutputBetween<phi::DenseTensor>(
range.first, range.second);
for (auto ctx_tensor : ctx_tensor_vec) {
auto* custom_t = new paddle::experimental::Tensor();
auto custom_t_ptr = std::make_shared<phi::DenseTensor>(*ctx_tensor);
custom_t->set_impl(custom_t_ptr);
custom_vec_out.emplace_back(custom_t);
custom_vec_out_ptr.emplace_back(custom_t_ptr);
}
custom_vec_outs.emplace_back(custom_vec_out);
custom_vec_outs_ptr.emplace_back(custom_vec_out_ptr);
}
VLOG(3) << "Mapped Output[" << out_idx << "] with range[" << range.first
<< "," << range.second << ").";
}
// DeviceContext
// In phi, the concrete XXContext used as the first parameter is decided at
// registration time through a template parameter. Custom kernel functions,
// however, take a unified DeviceContext as the first parameter of
// user_kernel_fn, so we use the backend recorded in OpKernelInfo to decide
// which XXContext to draw from. For now this temporary, simplified
// DeviceContext only carries the necessary info (such as the stream of an
// NPUContext); more work is needed once phi::DeviceContext is exposed
// externally.
DeviceContext dev_ctx;
auto& backend = OpKernelInfoHelper::GetBackend(op_kernel_info);
if (backend == phi::Backend::CPU) {
// do nothing
} else {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
size_t device_type_id_ = static_cast<size_t>(backend) -
static_cast<size_t>(phi::Backend::ALL_BACKEND);
std::string device_type = phi::GetGlobalDeviceType(device_type_id_);
if (!device_type.empty()) {
auto custom_ctx =
ctx->GetDeviceContext<paddle::platform::CustomDeviceContext>();
dev_ctx.set_stream(custom_ctx.stream());
return;
}
#endif
LOG(ERROR) << "[CUSTOM KERNEL] Unsupported kernel backend: " << backend
<< " with compiled Paddle.";
return;
}
auto& user_kernel_fn = OpKernelInfoHelper::GetKernelFn(op_kernel_info);
// call user function
user_kernel_fn(dev_ctx, custom_ins, custom_vec_ins, custom_attrs,
&custom_outs, &custom_vec_outs);
VLOG(3) << "[CUSTOM KERNEL] finished call user kernel function.";
// NOTE: Map back the output tensors with stored shared_ptrs.
for (int out_idx = output_defs.size() - 1; out_idx >= 0; --out_idx) {
VLOG(3) << "Mapping Back Output[" << out_idx << "]";
const std::pair<int, int> range = ctx->OutputRangeAt(out_idx);
// is_vector tells if this Output is Tensor or std::vector<Tensor>
if (!output_defs.at(out_idx).is_vector) {
auto* ctx_tensor = ctx->MutableOutputAt<phi::DenseTensor>(range.first);
*ctx_tensor = *(custom_outs_ptr.back().get());
custom_outs_ptr.pop_back();
} else {
auto ctx_tensor_vec = ctx->MutableOutputBetween<phi::DenseTensor>(
range.first, range.second);
auto custom_vec_ptr_out = custom_vec_outs_ptr.back();
for (int idx = ctx_tensor_vec.size() - 1; idx >= 0; --idx) {
*(ctx_tensor_vec[idx]) = *(custom_vec_ptr_out.back().get());
custom_vec_ptr_out.pop_back();
}
custom_vec_outs_ptr.pop_back();
}
VLOG(3) << "Mapped Output[" << out_idx << "] with range[" << range.first
<< "," << range.second << "].";
}
// Delete the paddle::Tensor objects newed above for the outputs of the user
// kernel function call.
for (size_t i = 0; i < custom_outs.size(); ++i) {
delete custom_outs[i];
}
for (size_t i = 0; i < custom_vec_outs.size(); ++i) {
for (size_t j = 0; j < custom_vec_outs[i].size(); ++j) {
delete custom_vec_outs[i][j];
}
}
}
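// A minimal sketch, not part of this commit: a user kernel function shaped
// like the unified call in RunKernelFunc above (device context, flat inputs,
// vector inputs, attributes, flat outputs, vector outputs). The name, the
// const-reference parameters, and the empty body are illustrative assumptions;
// a real custom kernel would read attributes from the paddle::any vector and
// write results through *outputs.
static void ExampleUnifiedUserKernelFn(
    const DeviceContext& dev_ctx,
    const std::vector<paddle::experimental::Tensor>& inputs,
    const std::vector<std::vector<paddle::experimental::Tensor>>& vec_inputs,
    const std::vector<paddle::any>& attrs,
    std::vector<paddle::experimental::Tensor*>* outputs,
    std::vector<std::vector<paddle::experimental::Tensor*>>* vec_outputs) {
  // Illustrative only: no computation is performed here.
  (void)dev_ctx;
  (void)inputs;
  (void)vec_inputs;
  (void)attrs;
  (void)outputs;
  (void)vec_outputs;
}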
void RegisterKernelWithMetaInfo(
const std::vector<OpKernelInfo>& op_kernel_infos) {
for (size_t i = 0; i < op_kernel_infos.size(); ++i) {
auto& kernel_info = op_kernel_infos[i];
auto op_type = OpKernelInfoHelper::GetOpName(kernel_info);
auto kernel_key = OpKernelInfoHelper::GetKernelKey(kernel_info);
VLOG(3) << "[CUSTOM KERNEL] registering [" << op_type << "]" << kernel_key;
// 1. Check whether this kernel is valid for the specific operator
PADDLE_ENFORCE_EQ(
phi::KernelFactory::Instance().HasCompatiblePtenKernel(op_type), true,
platform::errors::InvalidArgument(
"[CUSTOM KERNEL] %s is not ready for custom kernel registering.",
op_type));
// 2. Check whether this kernel_key has already been registered
PADDLE_ENFORCE_EQ(
phi::KernelFactory::Instance().kernels()[op_type].find(kernel_key),
phi::KernelFactory::Instance().kernels()[op_type].end(),
platform::errors::InvalidArgument(
"[CUSTOM KERNEL] The operator <%s>'s kernel: %s has been "
"already existed in Paddle, please contribute PR if need "
"to optimize the kernel code. Custom kernel do NOT support "
"to replace existing kernel in Paddle.",
op_type, kernel_key));
// phi::KernelFn
phi::KernelFn kernel_fn = [kernel_info](phi::KernelContext* ctx) {
VLOG(3) << "[CUSTOM KERNEL] run custom PTEN kernel func in lambda.";
RunKernelFunc(ctx, kernel_info);
};
// variadic_kernel_fn
void* variadic_kernel_fn =
OpKernelInfoHelper::GetVariadicKernelFn(kernel_info);
phi::Kernel kernel(kernel_fn, variadic_kernel_fn);
// args info
ParseArgs(kernel_info, kernel.mutable_args_def());
// register custom kernel to phi::KernelFactory
phi::KernelFactory::Instance().kernels()[op_type][kernel_key] = kernel;
VLOG(3) << "[CUSTOM KERNEL] Successed in registering operator <" << op_type
<< ">'s kernel " << kernel_key << " to Paddle. "
<< "It will be used like native ones.";
}
}
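// Usage sketch (illustrative only, not part of this commit): after
// RegisterKernelWithMetaInfo runs, a custom kernel can be looked up through
// the same factory map written above. This assumes phi::KernelKey is the key
// type of the per-op kernel map, as implied by the find() call above.
static bool ExampleHasCustomKernel(const std::string& op_type,
                                   const phi::KernelKey& kernel_key) {
  auto& kernels = phi::KernelFactory::Instance().kernels();
  auto op_it = kernels.find(op_type);
  if (op_it == kernels.end()) {
    return false;
  }
  return op_it->second.find(kernel_key) != op_it->second.end();
}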
void RegisterKernelWithMetaInfoMap(
const paddle::OpKernelInfoMap& op_kernel_info_map) {
auto& kernel_info_map = op_kernel_info_map.GetMap();
VLOG(3) << "[CUSTOM KERNEL] size of op_kernel_info_map: "
<< kernel_info_map.size();
// pair: {op_type, OpKernelInfo}
for (auto& pair : kernel_info_map) {
VLOG(3) << "[CUSTOM KERNEL] pair first -> op name: " << pair.first;
RegisterKernelWithMetaInfo(pair.second);
}
}
void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle) {
#ifdef _LINUX
typedef OpKernelInfoMap& get_op_kernel_info_map_t();
auto* func = reinterpret_cast<get_op_kernel_info_map_t*>(
dlsym(dso_handle, "PD_GetOpKernelInfoMap"));
typedef phi::CustomKernelMap& get_custom_kernel_map_t();
auto* func = reinterpret_cast<get_custom_kernel_map_t*>(
dlsym(dso_handle, "PD_GetCustomKernelMap"));
if (func == nullptr) {
LOG(WARNING) << "Skipped lib [" << dso_lib_path << "]: fail to find "
<< "PD_GetOpKernelInfoMap symbol in this lib.";
<< "PD_GetCustomKernelMap symbol in this lib.";
return;
}
auto& op_kernel_info_map = func();
RegisterKernelWithMetaInfoMap(op_kernel_info_map);
auto& custom_kernel_map = func();
phi::RegisterCustomKernels(custom_kernel_map);
LOG(INFO) << "Successed in loading custom kernels in lib: " << dso_lib_path;
#else
VLOG(3) << "Unsupported: Custom kernel is only implemented on Linux.";
......
......@@ -14,22 +14,13 @@ limitations under the License. */
#pragma once
#include "paddle/phi/api/ext/op_kernel_info.h"
#include <string>
namespace paddle {
namespace framework {
// Load custom kernel lib and register
void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle);
// Load-custom-kernel API: registers kernels after the user's lib is compiled
void LoadOpKernelInfoAndRegister(const std::string& dso_name);
// Register-custom-kernel API: registers kernels directly
void RegisterKernelWithMetaInfoMap(
const paddle::OpKernelInfoMap& op_kernel_info_map);
// Interface for selectively registering custom kernels.
void RegisterKernelWithMetaInfo(
const std::vector<OpKernelInfo>& op_kernel_infos);
} // namespace framework
} // namespace paddle
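// Caller sketch (illustrative only, not part of this header): dso_handle is
// typically obtained with dlopen and then handed to LoadCustomKernelLib,
// which resolves the registration symbol via dlsym as shown in
// custom_kernel.cc. The library path below is a placeholder.
//
//   void* handle = dlopen("/path/to/libmy_custom_kernels.so", RTLD_NOW);
//   if (handle != nullptr) {
//     paddle::framework::LoadCustomKernelLib(
//         "/path/to/libmy_custom_kernels.so", handle);
//   }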
......@@ -30,7 +30,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_meta_info_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/pten_utils.h"
#include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/string/string_helper.h"
......@@ -779,13 +779,13 @@ void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos,
for (size_t i = 0; i < ctx->InputSize(in_name); ++i) {
auto dtype = ctx->GetInputDataType(in_name, i);
vec_custom_dtype.emplace_back(
paddle::framework::TransToPtenDataType(dtype));
paddle::framework::TransToPhiDataType(dtype));
}
vec_input_dtypes.emplace_back(vec_custom_dtype);
} else {
auto dtype = ctx->GetInputDataType(in_name);
input_dtypes.emplace_back(
paddle::framework::TransToPtenDataType(dtype));
paddle::framework::TransToPhiDataType(dtype));
}
}
......
......@@ -23,7 +23,7 @@ limitations under the License. */
#include "paddle/fluid/platform/init.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/framework/pten_utils.h"
#include "paddle/fluid/framework/phi_utils.h"
namespace paddle {
namespace framework {
......
......@@ -28,7 +28,7 @@ TEST(DataType, float16) {
Tensor tensor;
CPUPlace cpu;
tensor.mutable_data(cpu, f::TransToPtenDataType(dtype));
tensor.mutable_data(cpu, f::TransToPhiDataType(dtype));
// test fp16 tensor
EXPECT_EQ(f::TransToProtoVarType(tensor.dtype()),
......@@ -51,7 +51,7 @@ TEST(DataType, bfloat16) {
Tensor tensor;
CPUPlace cpu;
tensor.mutable_data(cpu, f::TransToPtenDataType(dtype));
tensor.mutable_data(cpu, f::TransToPhiDataType(dtype));
// test bf16 tensor
EXPECT_EQ(f::TransToProtoVarType(tensor.dtype()),
......
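// A small check in the same spirit as the tests above (illustrative only, not
// part of this commit): TransToPhiDataType is assumed to map the proto FP32
// var type to phi::DataType::FLOAT32, with f aliasing paddle::framework as in
// the test file.
//
//   EXPECT_EQ(f::TransToPhiDataType(f::proto::VarType::FP32),
//             phi::DataType::FLOAT32);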
(Diffs for the remaining changed files are collapsed.)
File mode changed from 100755 to 100644