Unverified commit 3cca89e7, authored by 石晓伟, committed by GitHub

Merge the develop branch (#39362)

Parent commit: d89f246c
......@@ -4,10 +4,13 @@ paddle/fluid/API_DEV.spec
paddle/fluid/API_PR.spec
paddle/fluid/op_use_default_grad_maker_DEV.spec
paddle/fluid/op_use_default_grad_maker_PR.spec
paddle/pten/api/*/api.*
paddle/pten/api/*/backward*
paddle/pten/api/include/api.h
paddle/pten/api/lib/api.cc
paddle/pten/api/backward/backward_api.h
paddle/pten/api/lib/backward_api.cc
paddle/pten/include/*
paddle/pten/extension.h
paddle/fluid/eager/api/generated/*
*.DS_Store
*.vs
......
......@@ -43,6 +43,7 @@ option(WITH_ONEMKL "Compile PaddlePaddle with oneMKL" OFF)
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
option(WITH_XPU_KP "Compile PaddlePaddle with BAIDU XPU compiler" OFF)
option(WITH_MLU "Compile PaddlePaddle with CAMBRICON MLU" OFF)
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
......@@ -59,6 +60,9 @@ include(generic) # simplify cmake module
if (WITH_GPU AND WITH_XPU)
message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
endif()
if (WITH_GPU AND WITH_XPU_KP)
message(FATAL_ERROR "Error when compile GPU and XPU2 at the same time")
endif()
if (WITH_GPU AND WITH_ASCEND)
message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time")
endif()
......@@ -226,6 +230,7 @@ option(WITH_INFRT "Compile PaddlePaddle with INFRT" OFF)
option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON)
option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON)
option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF)
option(WITH_CNCL "Compile PaddlePaddle with CNCL support" OFF)
option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON)
option(WITH_ARM "Compile PaddlePaddle with arm support" OFF)
option(WITH_SW "Compile PaddlePaddle with sw support" OFF)
......@@ -273,6 +278,14 @@ if (NOT WITH_GPU AND WITH_NCCL)
"Disable NCCL when compiling without GPU" FORCE)
endif()
# force WITH_XPU on when WITH_XPU_KP
if (WITH_XPU_KP AND NOT WITH_XPU)
MESSAGE(WARNING
"Enable WITH_XPU when compiling with WITH_XPU_KP. Force WITH_XPU=ON.")
set(WITH_XPU ON CACHE STRING
"Enable WITH_XPU when compiling with WITH_XPU_KP" FORCE)
endif()
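# A minimal illustration (the status message is ours, not part of this change): passing
# -DWITH_XPU_KP=ON at configure time is enough, since the block above forces WITH_XPU=ON.
if(WITH_XPU_KP)
  message(STATUS "WITH_XPU_KP implies WITH_XPU=${WITH_XPU}")
endif()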
if (NOT WITH_XPU AND WITH_XPU_BKCL)
MESSAGE(WARNING
"Disable BKCL when compiling without XPU. Force WITH_XPU_BKCL=OFF.")
......@@ -280,6 +293,13 @@ if (NOT WITH_XPU AND WITH_XPU_BKCL)
"Disable BKCL when compiling without XPU" FORCE)
endif()
if (NOT WITH_MLU AND WITH_CNCL)
MESSAGE(WARNING
"Disable CNCL when compiling without MLU. Force WITH_MLU=OFF.")
set(WITH_MLU OFF CACHE STRING
"Disable CNCL when compiling without MLU" FORCE)
endif()
if(WITH_NCCL)
add_definitions("-DPADDLE_WITH_NCCL")
include(nccl)
......@@ -317,6 +337,10 @@ if(WITH_ROCM)
include(miopen) # set miopen libraries, must before configure
endif(WITH_ROCM)
if(WITH_XPU_KP)
include(xpu_kp)
endif()
if (NOT WITH_ROCM AND WITH_RCCL)
MESSAGE(WARNING
"Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.")
......
......@@ -99,6 +99,11 @@ if(WITH_XPU)
add_definitions(-DPADDLE_WITH_XPU)
endif()
if(WITH_XPU_KP)
message(STATUS "Compile with XPU_KP!")
add_definitions(-DPADDLE_WITH_XPU_KP)
endif()
if(WITH_IPU)
message(STATUS "Compile with IPU!")
add_definitions(-DPADDLE_WITH_IPU)
......
......@@ -654,6 +654,81 @@ function(hip_test TARGET_NAME)
endif()
endfunction(hip_test)
function(xpu_library TARGET_NAME)
if (WITH_XPU_KP)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(xpu_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if(xpu_library_SRCS)
if (xpu_library_SHARED OR xpu_library_shared) # build *.so
message(FATAL_ERROR "XPU kernel currently does not support dynamic links")
else()
xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS ${xpu_library_DEPS})
find_fluid_modules(${TARGET_NAME})
endif()
if (xpu_library_DEPS)
add_dependencies(${TARGET_NAME} ${xpu_library_DEPS})
target_link_libraries(${TARGET_NAME} ${xpu_library_DEPS})
endif()
# cpplint code style
foreach(source_file ${xpu_library_SRCS})
string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
list(APPEND xpu_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
else(xpu_library_SRCS)
if (xpu_library_DEPS)
list(REMOVE_DUPLICATES xpu_library_DEPS)
generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR "generic.cmake:xpu_library")
target_link_libraries(${TARGET_NAME} ${xpu_library_DEPS})
add_dependencies(${TARGET_NAME} ${xpu_library_DEPS})
else()
message(FATAL "Please specify source file or library in xpu_library.")
endif()
endif(xpu_library_SRCS)
endif()
endfunction(xpu_library)
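# Usage sketch for the helper above (target and file names are hypothetical). Shared
# builds are rejected by the FATAL_ERROR branch, so only STATIC makes sense here.
xpu_library(elementwise_add_xpu_kernel STATIC
  SRCS elementwise_add_op.cc elementwise_add_op.xpu
  DEPS op_registry device_context)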
function(xpu_binary TARGET_NAME)
if (WITH_XPU_KP)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(xpu_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${xpu_binary_SRCS})
if(xpu_binary_DEPS)
target_link_libraries(${TARGET_NAME} ${xpu_binary_DEPS})
add_dependencies(${TARGET_NAME} ${xpu_binary_DEPS})
common_link(${TARGET_NAME})
endif()
endif()
endfunction(xpu_binary)
function(xpu_test TARGET_NAME)
# The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation
# and execution of tests in CI. `CI_SKIP_CPP_TEST` is set to ON when no files
# other than *.py are modified.
if (WITH_XPU_KP AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(xpu_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${xpu_test_SRCS})
# "-pthread -ldl -lrt" is defined in CMAKE_CXX_LINK_EXECUTABLE
target_link_options(${TARGET_NAME} PRIVATE -pthread -ldl -lrt)
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${xpu_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules})
add_dependencies(${TARGET_NAME} ${xpu_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
common_link(${TARGET_NAME})
add_test(${TARGET_NAME} ${TARGET_NAME})
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
endif()
endfunction(xpu_test)
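# Usage sketch (hypothetical names): built and registered with ctest only when
# WITH_XPU_KP and WITH_TESTING are ON and CI_SKIP_CPP_TEST is not "ON".
xpu_test(elementwise_add_xpu_kernel_test SRCS elementwise_add_xpu_test.cc DEPS elementwise_add_xpu_kernel)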
function(go_library TARGET_NAME)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
......
......@@ -19,4 +19,11 @@ set(CNRT_LIB ${NEUWARE_LIB_DIR}/libcnrt.so)
set(CNDRV_LIB ${NEUWARE_LIB_DIR}/libcndrv.so)
generate_dummy_static_lib(LIB_NAME "neuware_lib" GENERATOR "neuware.cmake")
TARGET_LINK_LIBRARIES(neuware_lib ${CNNL_LIB} ${CNRT_LIB} ${CNDRV_LIB})
if(WITH_CNCL)
MESSAGE(STATUS "Compile with CNCL!")
ADD_DEFINITIONS(-DPADDLE_WITH_CNCL)
set(CNCL_LIB ${NEUWARE_LIB_DIR}/libcncl.so)
TARGET_LINK_LIBRARIES(neuware_lib ${CNCL_LIB} ${CNNL_LIB} ${CNRT_LIB} ${CNDRV_LIB})
else()
TARGET_LINK_LIBRARIES(neuware_lib ${CNNL_LIB} ${CNRT_LIB} ${CNDRV_LIB})
endif()
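# Hypothetical consumer: any target that links neuware_lib (an MLU op library is shown
# here as an example) now also pulls in libcncl.so when WITH_CNCL is ON.
target_link_libraries(my_mlu_collective_op neuware_lib)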
......@@ -34,6 +34,7 @@ function(op_library TARGET)
set(cu_cc_srcs)
set(hip_cc_srcs)
set(xpu_cc_srcs)
set(xpu_kp_cc_srcs)
set(npu_cc_srcs)
set(mlu_cc_srcs)
set(cudnn_cu_cc_srcs)
......@@ -120,6 +121,11 @@ function(op_library TARGET)
list(APPEND xpu_cc_srcs ${XPU_FILE}.cc)
endif()
endif()
if(WITH_XPU_KP)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.xpu)
list(APPEND xpu_kp_cc_srcs ${TARGET}.xpu)
endif()
endif()
if(WITH_ASCEND_CL)
string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc)
......@@ -154,6 +160,8 @@ function(op_library TARGET)
list(APPEND mkldnn_cc_srcs ${src})
elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
list(APPEND xpu_cc_srcs ${src})
elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.xpu$")
list(APPEND xpu_kp_cc_srcs ${src})
elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
list(APPEND npu_cc_srcs ${src})
elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$")
......@@ -161,11 +169,13 @@ function(op_library TARGET)
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu or .xpu")
endif()
endforeach()
endif()
list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
list(LENGTH xpu_kp_cc_srcs xpu_kp_cc_srcs_len)
list(LENGTH cc_srcs cc_srcs_len)
if (${cc_srcs_len} EQUAL 0)
message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
......@@ -231,6 +241,8 @@ function(op_library TARGET)
list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu")
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
elseif (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0)
xpu_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu_kp_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps})
else()
# Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
if(WITH_UNITY_BUILD AND op_library_UNITY)
......@@ -359,6 +371,11 @@ function(op_library TARGET)
endif()
endif()
# pybind USE_OP_DEVICE_KERNEL for XPU KP
if (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, KP);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for NPU
if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
foreach(npu_src ${npu_cc_srcs})
......@@ -438,7 +455,6 @@ function(op_library TARGET)
endif()
endfunction()
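# Illustration of the new .xpu handling (hypothetical operator): with WITH_XPU_KP=ON and
# elementwise_add_op.xpu present next to elementwise_add_op.cc, the call below builds the
# target via xpu_library() and appends
#   USE_OP_DEVICE_KERNEL(elementwise_add_op, KP);
# to the generated pybind file.
op_library(elementwise_add_op)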
function(register_operators)
set(options "")
set(oneValueArgs "")
......
......@@ -88,11 +88,12 @@ function(kernel_library TARGET)
set(cpu_srcs)
set(gpu_srcs)
set(xpu_srcs)
set(selected_rows_srcs)
# parse and save the dependent kernel targets
set(all_srcs)
set(kernel_deps)
set(oneValueArgs "")
set(oneValueArgs SUB_DIR)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(kernel_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
......@@ -106,6 +107,9 @@ function(kernel_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc)
list(APPEND selected_rows_srcs ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc)
endif()
if (WITH_GPU OR WITH_ROCM)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
......@@ -131,8 +135,17 @@ function(kernel_library TARGET)
foreach(src ${all_srcs})
file(READ ${src} target_content)
string(REGEX MATCHALL "#include \"paddle\/pten\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
if ("${kernel_library_SUB_DIR}" STREQUAL "")
string(REGEX MATCHALL "#include \"paddle\/pten\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
else()
string(REGEX MATCHALL "#include \"paddle\/pten\/kernels\/${kernel_library_SUB_DIR}\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
endif()
foreach(include_kernel ${include_kernels})
if ("${kernel_library_SUB_DIR}" STREQUAL "")
string(REGEX REPLACE "#include \"paddle\/pten\/kernels\/" "" kernel_name ${include_kernel})
else()
string(REGEX REPLACE "#include \"paddle\/pten\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel})
endif()
string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
list(APPEND kernel_deps ${kernel_name})
endforeach()
......@@ -144,27 +157,30 @@ function(kernel_library TARGET)
list(LENGTH cpu_srcs cpu_srcs_len)
list(LENGTH gpu_srcs gpu_srcs_len)
list(LENGTH xpu_srcs xpu_srcs_len)
list(LENGTH selected_rows_srcs selected_rows_srcs_len)
# Build Target according to different src organization
if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
${xpu_srcs_len} GREATER 0) AND ${common_srcs_len} GREATER 0)
# If the common_srcs depends on specific device srcs, build target using this rule.
${xpu_srcs_len} GREATER 0) AND (${common_srcs_len} GREATER 0 OR
${selected_rows_srcs_len} GREATER 0))
# If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule.
if (WITH_GPU)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
nv_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
nv_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
endif()
elseif (WITH_ROCM)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
hip_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
endif()
else()
if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
cc_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
cc_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
endif()
endif()
# If there are only specific device srcs, build target using this rule.
elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
if (WITH_GPU)
if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
......@@ -179,25 +195,42 @@ function(kernel_library TARGET)
cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
endif()
else()
if (${common_srcs_len} EQUAL 0)
message(FATAL_ERROR "Cannot find any implementation for ${TARGET}")
# If the selected_rows_srcs depends on common_srcs, build target using this rule.
elseif (${common_srcs_len} GREATER 0 AND ${selected_rows_srcs_len} GREATER 0)
if (WITH_GPU)
nv_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
elseif (WITH_ROCM)
hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
else()
# If the kernel has a device independent public implementation,
# we will use this implementation and will not adopt the implementation
# under specific devices
if (WITH_GPU)
nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_ROCM)
hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
else()
cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
endif()
cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
endif()
# If there are only common_srcs or selected_rows_srcs, build target using below rules.
elseif (${common_srcs_len} GREATER 0)
if (WITH_GPU)
nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_ROCM)
hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
else()
cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
elseif (${selected_rows_srcs_len} GREATER 0)
if (WITH_GPU)
nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_ROCM)
hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
else()
cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
else()
message(FATAL_ERROR "Cannot find any implementation for ${TARGET}")
endif()
if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR
${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR
${selected_rows_srcs_len} GREATER 0)
# append target into PTEN_KERNELS property
get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
set(pten_kernels ${pten_kernels} ${TARGET})
......@@ -219,11 +252,14 @@ function(kernel_library TARGET)
if (${xpu_srcs_len} GREATER 0)
kernel_declare(${xpu_srcs})
endif()
if (${selected_rows_srcs_len} GREATER 0)
kernel_declare(${selected_rows_srcs})
endif()
endfunction()
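# Usage sketch of the selected_rows/SUB_DIR support (hypothetical kernel and deps): with
#   scale_kernel.cc, cpu/scale_kernel.cc, gpu/scale_kernel.cu, selected_rows/scale_kernel.cc
# in the current source dir, the call below builds scale_kernel_part from the device
# sources and links the common + selected_rows sources into the final scale_kernel target.
kernel_library(scale_kernel SUB_DIR selected_rows DEPS dense_tensor kernel_context)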
function(register_kernels)
set(options "")
set(oneValueArgs "")
set(oneValueArgs SUB_DIR)
set(multiValueArgs EXCLUDES DEPS)
cmake_parse_arguments(register_kernels "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
......@@ -236,9 +272,9 @@ function(register_kernels)
list(FIND register_kernels_EXCLUDES ${target} _index)
if (${_index} EQUAL -1)
if (${register_kernels_DEPS_len} GREATER 0)
kernel_library(${target} DEPS ${register_kernels_DEPS})
kernel_library(${target} DEPS ${register_kernels_DEPS} SUB_DIR ${register_kernels_SUB_DIR})
else()
kernel_library(${target})
kernel_library(${target} SUB_DIR ${register_kernels_SUB_DIR})
endif()
endif()
endforeach()
......@@ -246,9 +282,9 @@ endfunction()
function(append_op_util_declare TARGET)
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET} target_content)
string(REGEX MATCH "(PT_REGISTER_API_NAME|PT_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}")
string(REGEX MATCH "(PT_REGISTER_BASE_KERNEL_NAME|PT_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}")
string(REPLACE "PT_REGISTER_ARG_MAPPING_FN" "PT_DECLARE_ARG_MAPPING_FN" util_declare "${util_registrar}")
string(REPLACE "PT_REGISTER_API_NAME" "PT_REGISTER_API_NAME" util_declare "${util_declare}")
string(REPLACE "PT_REGISTER_BASE_KERNEL_NAME" "PT_DECLARE_BASE_KERNEL_NAME" util_declare "${util_declare}")
string(APPEND util_declare ");")
file(APPEND ${op_utils_header} "${util_declare}")
endfunction()
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT WITH_XPU_KP)
return()
endif()
if(NOT XPU_TOOLCHAIN)
set(XPU_TOOLCHAIN /workspace/paddle/xpu-demo/XTDK)
get_filename_component(XPU_TOOLCHAIN ${XPU_TOOLCHAIN} REALPATH)
endif()
if(NOT IS_DIRECTORY ${XPU_TOOLCHAIN})
message(FATAL_ERROR "Directory ${XPU_TOOLCHAIN} not found!")
endif()
message(STATUS "Build with XPU_TOOLCHAIN=" ${XPU_TOOLCHAIN})
set(XPU_CLANG ${XPU_TOOLCHAIN}/bin/clang++)
message(STATUS "Build with XPU_CLANG=" ${XPU_CLANG})
# The host sysroot of XPU compiler is gcc-8.2
if(NOT HOST_SYSROOT)
set(HOST_SYSROOT /opt/compiler/gcc-8.2)
endif()
if(NOT IS_DIRECTORY ${HOST_SYSROOT})
message(FATAL_ERROR "Directory ${HOST_SYSROOT} not found!")
endif()
if(NOT API_ARCH)
set(API_ARCH x86_64-baidu-linux-gnu)
endif()
if(API_ARCH MATCHES "x86_64")
if(EXISTS ${HOST_SYSROOT}/bin/g++)
set(HOST_CXX ${HOST_SYSROOT}/bin/g++)
set(HOST_AR ${HOST_SYSROOT}/bin/ar)
else()
set(HOST_CXX /usr/bin/g++)
set(HOST_AR /usr/bin/ar)
endif()
else()
set(HOST_CXX ${CMAKE_CXX_COMPILER})
set(HOST_AR ${CMAKE_AR})
endif()
set(TOOLCHAIN_ARGS )
if(OPT_LEVEL)
set(OPT_LEVEL ${OPT_LEVEL})
else()
set(OPT_LEVEL "-O3")
endif()
message(STATUS "Build with API_ARCH=" ${API_ARCH})
message(STATUS "Build with TOOLCHAIN_ARGS=" ${TOOLCHAIN_ARGS})
message(STATUS "Build with HOST_SYSROOT=" ${HOST_SYSROOT})
message(STATUS "Build with HOST_CXX=" ${HOST_CXX})
message(STATUS "Build with HOST_AR=" ${HOST_AR})
macro(compile_kernel COMPILE_ARGS)
set(options "")
set(oneValueArgs "")
set(multiValueArgs KERNEL DIRPATH XNAME DEVICE HOST XPU DEPENDS)
cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(kernel_path ${xpu_add_library_DIRPATH})
set(kernel_name ${xpu_add_library_XNAME})
set(device_o_extra_flags ${xpu_add_library_DEVICE})
set(host_o_extra_flags ${xpu_add_library_HOST})
set(xpu_1_or_2 ${xpu_add_library_XPU})
set(cc_depends ${xpu_add_library_DEPENDS})
set(kernel_target ${kernel_name}_kernel)
add_custom_target(${kernel_target}
WORKING_DIRECTORY
${CMAKE_CURRENT_BINARY_DIR}
DEPENDS
kernel_build/${kernel_name}.host.o
kernel_build/${kernel_name}.bin.o
COMMENT
${kernel_target}
VERBATIM
)
if(cc_depends)
add_dependencies(${kernel_target} ${xpu_add_library_DEPENDS})
endif()
set(arg_device_o_extra_flags ${device_o_extra_flags})
separate_arguments(arg_device_o_extra_flags)
set(arg_host_o_extra_flags ${host_o_extra_flags})
separate_arguments(arg_host_o_extra_flags)
set(XTDK_DIR ${XPU_TOOLCHAIN})
set(CXX_DIR ${HOST_SYSROOT})
set(XPU_CXX_FLAGS -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -DNDEBUG )
#include path
get_property(dirs DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES)
set(XPU_CXX_INCLUDES "")
foreach(dir IN LISTS dirs)
list(APPEND XPU_CXX_INCLUDES "-I${dir}")
endforeach()
string(REPLACE ";" " " XPU_CXX_INCLUDES "${XPU_CXX_INCLUDES}" )
separate_arguments(XPU_CXX_INCLUDES UNIX_COMMAND "${XPU_CXX_INCLUDES}")
#related flags
get_directory_property( DirDefs DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS )
set(XPU_CXX_DEFINES "")
foreach(def IN LISTS DirDefs)
list(APPEND XPU_CXX_DEFINES "-D${def}")
endforeach()
string(REPLACE ";" " " XPU_CXX_DEFINES "${XPU_CXX_DEFINES}" )
separate_arguments(XPU_CXX_DEFINES UNIX_COMMAND "${XPU_CXX_DEFINES}")
add_custom_command(
OUTPUT
kernel_build/${kernel_name}.bin.o
COMMAND
${CMAKE_COMMAND} -E make_directory kernel_build
COMMAND
${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES}
-I. -o kernel_build/${kernel_name}.bin.o.sec ${kernel_path}/${kernel_name}.xpu
--xpu-device-only -c -v
COMMAND
${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR}
WORKING_DIRECTORY
${CMAKE_CURRENT_BINARY_DIR}
DEPENDS
${xpu_add_library_DEPENDS}
COMMENT
kernel_build/${kernel_name}.bin.o
VERBATIM
)
list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.bin.o)
add_custom_command(
OUTPUT
kernel_build/${kernel_name}.host.o
COMMAND
${CMAKE_COMMAND} -E make_directory kernel_build
COMMAND
${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES}
-I. -o kernel_build/${kernel_name}.host.o ${kernel_path}/${kernel_name}.xpu
--xpu-host-only -c -v
WORKING_DIRECTORY
${CMAKE_CURRENT_BINARY_DIR}
DEPENDS
${xpu_add_library_DEPENDS}
COMMENT
kernel_build/${kernel_name}.host.o
VERBATIM
)
list(APPEND xpu_kernel_depends kernel_build/${kernel_name}.host.o)
endmacro()
###############################################################################
# XPU_ADD_LIBRARY
###############################################################################
macro(xpu_add_library TARGET_NAME)
# Separate the sources from the options
set(options "")
set(oneValueArgs "")
set(multiValueArgs STATIC DEPENDS)
cmake_parse_arguments(xpu_add_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(xpu_srcs ${xpu_add_library_STATIC})
set(xpu_target ${TARGET_NAME})
set(cc_srcs_depends ${xpu_add_library_DEPENDS})
file(GLOB_RECURSE xpu_srcs_lists ${xpu_srcs})
list(LENGTH xpu_srcs_lists xpu_srcs_lists_num)
set(XPU1_DEVICE_O_EXTRA_FLAGS " ")
set(XPU1_HOST_O_EXTRA_FLAGS " ")
# Distinguish .xpu file from other files
foreach(cur_xpu_src IN LISTS xpu_srcs_lists)
get_filename_component(language_type_name ${cur_xpu_src} EXT)
if(${language_type_name} STREQUAL ".xpu")
list(APPEND xpu_kernel_lists ${cur_xpu_src})
else()
list(APPEND cc_kernel_lists ${cur_xpu_src})
endif()
endforeach()
# Ensure that there is only one xpu kernel
list(LENGTH xpu_kernel_lists xpu_kernel_lists_num)
list(LENGTH cc_srcs_depends cc_srcs_depends_num)
if(${xpu_kernel_lists_num})
foreach(xpu_kernel IN LISTS xpu_kernel_lists)
get_filename_component(kernel_name ${xpu_kernel} NAME_WE)
get_filename_component(kernel_dir ${xpu_kernel} DIRECTORY)
set(kernel_rules ${kernel_dir}/${kernel_name}.rules)
set(kernel_name ${kernel_name})
compile_kernel( KERNEL ${xpu_kernel} DIRPATH ${kernel_dir} XNAME ${kernel_name} DEVICE ${XPU1_DEVICE_O_EXTRA_FLAGS} HOST ${XPU1_HOST_O_EXTRA_FLAGS} XPU "xpu2" DEPENDS ${cc_srcs_depends})
endforeach()
add_custom_target(${xpu_target}_src ALL
WORKING_DIRECTORY
${CMAKE_CURRENT_BINARY_DIR}
DEPENDS
${xpu_kernel_depends}
${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
COMMENT
${xpu_target}_src
VERBATIM
)
add_custom_command(
OUTPUT
${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
COMMAND
${HOST_AR} rcs ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a ${xpu_kernel_depends}
WORKING_DIRECTORY
${CMAKE_CURRENT_BINARY_DIR}
DEPENDS
${xpu_kernel_depends}
COMMENT
${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a
VERBATIM
)
add_library(${xpu_target} STATIC ${cc_kernel_lists})
add_dependencies(${xpu_target} ${xpu_target}_src)
target_link_libraries(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/lib${xpu_target}_xpu.a)
else()
add_library(${xpu_target} STATIC ${cc_kernel_lists})
endif()
endmacro()
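# Direct usage sketch (hypothetical names); xpu_library() above calls this macro the same
# way. compile_kernel() builds scale_op.xpu twice (device .bin.o and host .host.o), the
# objects are archived into libscale_op_xpu_xpu.a, and the remaining .cc sources form the
# STATIC target that links against that archive.
xpu_add_library(scale_op_xpu STATIC scale_op.cc scale_op.xpu DEPENDS op_registry)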
......@@ -33,7 +33,7 @@ namespace distributed {
template <typename T>
inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
GetBlas() {
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
return paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
T>(cpu_ctx);
}
......
......@@ -213,6 +213,7 @@ int32_t BrpcPsClient::initialize() {
auto &profiler = CostProfiler::instance();
profiler.register_profiler("pserver_client_pull_dense");
profiler.register_profiler("pserver_client_pull_sparse");
profiler.register_profiler("pserver_client_pull_sparse_param");
profiler.register_profiler("pserver_client_pull_sparse_local");
profiler.register_profiler("pserver_client_push_sparse");
profiler.register_profiler("pserver_client_push_sparse_parse");
......@@ -543,6 +544,7 @@ std::future<int32_t> BrpcPsClient::pull_geo_param(size_t table_id,
return fut;
}
// for GEO
std::future<int32_t> BrpcPsClient::push_sparse_param(
size_t table_id, const uint64_t *keys, const float **update_values,
size_t num, void *done) {
......@@ -558,18 +560,8 @@ std::future<int32_t> BrpcPsClient::push_sparse_param(
ids.resize(request_call_num);
value_ptrs.resize(request_call_num);
const auto &server_param = _config.server_param().downpour_server_param();
uint64_t shard_num = FLAGS_pserver_sparse_table_shard_num;
for (int i = 0; i < server_param.downpour_table_param_size(); ++i) {
const auto &table_param = server_param.downpour_table_param(i);
if (table_param.table_id() == table_id) {
shard_num = table_param.shard_num();
break;
}
}
for (size_t i = 0; i < num; ++i) {
size_t pserver_idx = get_sparse_shard(shard_num, request_call_num, keys[i]);
size_t pserver_idx = keys[i] % request_call_num;
ids[pserver_idx].push_back(keys[i]);
value_ptrs[pserver_idx].push_back(update_values[i]);
}
......@@ -1003,6 +995,120 @@ std::future<int32_t> BrpcPsClient::pull_sparse(float **select_values,
return fut;
}
// for GEO
std::future<int32_t> BrpcPsClient::pull_sparse_param(float **select_values,
size_t table_id,
const uint64_t *keys,
size_t num,
bool is_training) {
auto timer = std::make_shared<CostTimer>("pserver_client_pull_sparse_param");
size_t request_call_num = _server_channels.size();
auto shard_sorted_kvs = std::make_shared<
std::vector<std::vector<std::pair<uint64_t, float *>>>>();
shard_sorted_kvs->resize(request_call_num);
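// Bucket each key and its output pointer by key % request_call_num, so every pserver
// shard receives its own (later sorted) request batch.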
for (size_t i = 0; i < num; ++i) {
size_t shard_id = keys[i] % request_call_num;
shard_sorted_kvs->at(shard_id).push_back({keys[i], select_values[i]});
}
auto *accessor = table_accessor(table_id);
size_t value_size = accessor->select_size();
DownpourBrpcClosure *closure = new DownpourBrpcClosure(
request_call_num, [shard_sorted_kvs, value_size](void *done) {
int ret = 0;
auto *closure = reinterpret_cast<DownpourBrpcClosure *>(done);
for (size_t i = 0; i < shard_sorted_kvs->size(); ++i) {
if (closure->check_response(i, PS_PULL_SPARSE_TABLE) != 0) {
ret = -1;
break;
}
auto &request_kvs = shard_sorted_kvs->at(i);
auto &res_io_buffer = closure->cntl(i)->response_attachment();
butil::IOBufBytesIterator io_buffer_itr(res_io_buffer);
uint64_t last_key = UINT64_MAX;
float *last_value_data = NULL;
// can remove sort&unique
for (size_t kv_idx = 0; kv_idx < request_kvs.size(); ++kv_idx) {
auto *kv_pair = &(request_kvs[kv_idx]);
if (kv_pair->first == last_key) {
memcpy(reinterpret_cast<void *>(kv_pair->second),
reinterpret_cast<void *>(last_value_data), value_size);
} else {
last_key = kv_pair->first;
last_value_data = kv_pair->second;
if (value_size !=
io_buffer_itr.copy_and_forward(
reinterpret_cast<void *>(last_value_data), value_size)) {
LOG(WARNING) << "res data is lack or not in format";
ret = -1;
break;
}
}
}
}
closure->set_promise_value(ret);
});
closure->add_timer(timer);
auto promise = std::make_shared<std::promise<int32_t>>();
closure->add_promise(promise);
std::future<int> fut = promise->get_future();
for (size_t i = 0; i < request_call_num; ++i) {
auto &sorted_kvs = shard_sorted_kvs->at(i);
std::sort(sorted_kvs.begin(), sorted_kvs.end(),
[](const std::pair<uint64_t, float *> &k1,
const std::pair<uint64_t, float *> &k2) {
return k1.first < k2.first;
});
uint64_t last_key = UINT64_MAX;
uint32_t kv_request_count = 0;
size_t sorted_kv_size = sorted_kvs.size();
auto &request_buffer = closure->cntl(i)->request_attachment();
request_buffer.append(reinterpret_cast<void *>(&is_training), sizeof(bool));
std::vector<uint32_t> keys_counter;
keys_counter.reserve(sorted_kv_size);
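// Per-shard request layout: a bool is_training flag, then each unique key as a uint64,
// followed by one uint32 repeat count per unique key; the number of unique keys travels
// separately in the request params as kv_request_count.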
for (size_t kv_idx = 0; kv_idx < sorted_kv_size; ++kv_idx) {
++kv_request_count;
uint32_t keys = 1;
last_key = sorted_kvs[kv_idx].first;
request_buffer.append(reinterpret_cast<void *>(&last_key),
sizeof(uint64_t));
while (kv_idx < sorted_kv_size - 1 &&
last_key == sorted_kvs[kv_idx + 1].first) {
++kv_idx;
++keys;
}
keys_counter.push_back(keys);
}
request_buffer.append(reinterpret_cast<void *>(keys_counter.data()),
sizeof(uint32_t) * keys_counter.size());
if (kv_request_count == 0) {
closure->Run();
} else {
closure->request(i)->set_cmd_id(PS_PULL_SPARSE_TABLE);
closure->request(i)->set_table_id(table_id);
closure->request(i)->set_client_id(_client_id);
closure->request(i)->add_params((char *)&kv_request_count, // NOLINT
sizeof(uint32_t));
PsService_Stub rpc_stub(get_cmd_channel(i));
closure->cntl(i)->set_log_id(butil::gettimeofday_ms());
rpc_stub.service(closure->cntl(i), closure->request(i),
closure->response(i), closure);
}
}
return fut;
}
std::future<int32_t> BrpcPsClient::send_client2client_msg(
int msg_type, int to_client_id, const std::string &msg) {
auto promise = std::make_shared<std::promise<int32_t>>();
......@@ -1067,12 +1173,14 @@ int32_t BrpcPsClient::recv_and_save_table(const uint64_t table_id,
std::string var_name = "";
int64_t var_num = 0;
int64_t var_shape = 0;
std::string table_class;
const auto &worker_param = _config.worker_param().downpour_worker_param();
for (size_t i = 0; i < worker_param.downpour_table_param_size(); ++i) {
if (worker_param.downpour_table_param(i).table_id() == table_id) {
var_name = worker_param.downpour_table_param(i).common().table_name();
var_num = worker_param.downpour_table_param(i).common().table_num();
var_shape = worker_param.downpour_table_param(i).common().table_dim();
table_class = worker_param.downpour_table_param(i).table_class();
break;
}
}
......@@ -1094,9 +1202,19 @@ int32_t BrpcPsClient::recv_and_save_table(const uint64_t table_id,
save_vec.push_back(save_huge_vec.data() + i * var_shape);
}
auto status = pull_sparse(reinterpret_cast<float **>(save_vec.data()),
table_id, save_key.data(), save_key.size(), true);
status.wait();
VLOG(2) << "recv_and_save_table: table_class: " << table_class;
// TODO(zhaocaibei123): new GeoBrpcPSClient, move this to its
// recv_and_save_table
if (table_class == "MemorySparseGeoTable") {
auto status =
pull_sparse_param(reinterpret_cast<float **>(save_vec.data()), table_id,
save_key.data(), save_key.size(), true);
status.wait();
} else {
auto status = pull_sparse(reinterpret_cast<float **>(save_vec.data()),
table_id, save_key.data(), save_key.size(), true);
status.wait();
}
// create lod tensor
std::shared_ptr<framework::Scope> scope;
......
......@@ -194,6 +194,10 @@ class BrpcPsClient : public PSClient {
size_t table_id,
const uint64_t *keys, size_t num,
bool is_training);
virtual std::future<int32_t> pull_sparse_param(float **select_values,
size_t table_id,
const uint64_t *keys,
size_t num, bool is_training);
virtual std::future<int32_t> print_table_stat(uint32_t table_id);
......
......@@ -354,7 +354,7 @@ void Communicator::RpcRecvSparse(const std::string &varname, int table_id,
bool training = true;
auto status = _worker_ptr->pull_sparse(
auto status = _worker_ptr->pull_sparse_param(
(float **)push_g_vec.data(), table_id, // NOLINT
sparse_push_keys.data(), sparse_push_keys.size(), training);
status.wait();
......@@ -1029,7 +1029,7 @@ void GeoCommunicator::Send(const std::vector<std::string> &var_names,
auto &sparse_ids_set = iter.second;
auto sparse_ids_vec = std::make_shared<std::vector<int64_t>>();
sparse_ids_vec->assign(sparse_ids_set.begin(), sparse_ids_set.end());
sparse_id_queues_.at(key)->Push(sparse_ids_vec);
sparse_id_queues_.at(key)->Put(sparse_ids_vec);
VLOG(3) << "push " << sparse_ids_vec->size() << " ids to " << key
<< "'s queue";
}
......@@ -1051,7 +1051,10 @@ void GeoCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx,
for (auto &iter : send_varname_to_ctx_) {
auto &ctx = iter.second;
if (!ctx.is_sparse) continue;
if (!ctx.is_sparse) {
parallel_task_nums_ += 1;
continue;
}
auto &varnames = ctx.origin_varnames;
PADDLE_ENFORCE_EQ(
varnames.size(), 1,
......@@ -1060,12 +1063,11 @@ void GeoCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx,
for (auto &splited_var : ctx.splited_varnames) {
parallel_task_nums_ += 1;
sparse_id_queues_.insert(
std::pair<std::string, std::shared_ptr<BlockingQueue<
std::shared_ptr<std::vector<int64_t>>>>>(
std::pair<std::string, paddle::framework::Channel<
std::shared_ptr<std::vector<int64_t>>>>(
splited_var,
std::make_shared<
BlockingQueue<std::shared_ptr<std::vector<int64_t>>>>(
send_queue_size_)));
paddle::framework::MakeChannel<
std::shared_ptr<std::vector<int64_t>>>(send_queue_size_)));
}
}
......@@ -1153,7 +1155,7 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) {
auto &t_latest = var_latest->Get<framework::LoDTensor>();
auto t_timestamp = var_timestamp->GetMutable<framework::LoDTensor>();
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<framework::LoDTensor>();
t_delta->mutable_data<float>(t_latest.dims(), cpu_ctx.GetPlace());
......@@ -1183,7 +1185,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
RpcRecvDense(varnames, table_id, pserver_scope_.get());
// 2.1 pserver - old => delta; 2.2 latest + old => latest 2.3 old => pserver
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
for (auto &varname : varnames) {
auto *var_latest = recv_scope_->FindVar(varname);
auto t_latest = var_latest->GetMutable<framework::LoDTensor>();
......@@ -1242,8 +1244,8 @@ std::vector<int64_t> GeoCommunicator::MergeSparseIds(
VLOG(3) << "Merge Number of " << send_varname << " = " << merge_num;
if (sparse_id_queues_.at(send_varname)->Size() > 0) {
wait_times = 0;
std::shared_ptr<std::vector<int64_t>> pop_ids =
sparse_id_queues_.at(send_varname)->Pop();
std::shared_ptr<std::vector<int64_t>> pop_ids = nullptr;
sparse_id_queues_.at(send_varname)->Get(pop_ids);
for (size_t j = 0; j < pop_ids->size(); j++) {
sparse_ids.insert(pop_ids->at(j));
}
......@@ -1268,6 +1270,9 @@ void GeoCommunicator::SendSparse(const std::string &varname,
std::vector<int64_t> &sparse_ids, int table_id,
int ep_idx) {
platform::RecordEvent record_event("GeoCommunicator->SendSparse");
if (sparse_ids.size() == 0) {
return;
}
std::string param_name = SplitedGradToParam(varname);
VLOG(1) << "In GeoCommunicator::SendSparse(" << varname << " " << param_name
<< ", ids.size = " << sparse_ids.size() << ", table_id: " << table_id
......@@ -1287,7 +1292,7 @@ void GeoCommunicator::SendSparse(const std::string &varname,
auto *t_old = var_old->GetMutable<framework::LoDTensor>();
auto dims1 = t_latest.dims()[1];
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<pten::SelectedRows>();
......@@ -1313,6 +1318,10 @@ void GeoCommunicator::SendSparse(const std::string &varname,
t_value + j * dims1,
t_old->data<float>() + sparse_ids[j] * dims1);
push_g_vec.push_back(t_value + j * dims1);
VLOG(5) << "DEBUG GeoCommunicator::SendSparse send sparse key "
<< sparse_ids[j] << " value[0] " << push_g_vec[j][0]
<< " value[-1] " << push_g_vec[j][dims1 - 1];
}
++_async_call_num;
......@@ -1361,12 +1370,15 @@ void GeoCommunicator::RecvSparse(const std::string &varname, int table_id,
std::vector<float> v_delta;
v_delta.resize(numel);
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
auto blas =
paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
cpu_ctx);
for (auto j = 0; j < static_cast<int>(keys.size()); ++j) {
VLOG(5) << "DEBUG GeoCommunicator::RecvSparse recv sparse key" << keys[j]
<< "value[0] " << values[j * dims1] << " value[-1] "
<< values[j * dims1 + dims1 - 1];
float *latest_data = t_latest->data<float>() + keys[j] * dims1;
float *old_data = t_old->data<float>() + keys[j] * dims1;
// pserver - old => delta
......
......@@ -30,6 +30,7 @@ limitations under the License. */
#include "gflags/gflags.h"
#include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h"
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/framework/variable_helper.h"
......@@ -178,7 +179,7 @@ inline void MergeVars(const std::string &var_name,
}
// set output tensor to 0.
auto cpu_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext cpu_ctx;
paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext, T>
constant_functor;
constant_functor(cpu_ctx, out_t, static_cast<T>(0));
......@@ -203,7 +204,7 @@ inline void MergeVars(const std::string &var_name,
for (auto &var : vars) {
inputs.push_back(&var->Get<pten::SelectedRows>());
}
auto dev_ctx = paddle::platform::CPUDeviceContext();
paddle::platform::CPUDeviceContext dev_ctx;
if (merge_add) {
paddle::operators::math::scatter::MergeAdd<
paddle::platform::CPUDeviceContext, T>
......@@ -626,9 +627,8 @@ class GeoCommunicator : public AsyncCommunicator {
// parameter on pserver
std::shared_ptr<Scope> pserver_scope_;
std::unordered_map<
std::string,
std::shared_ptr<BlockingQueue<std::shared_ptr<std::vector<int64_t>>>>>
std::unordered_map<std::string, paddle::framework::Channel<
std::shared_ptr<std::vector<int64_t>>>>
sparse_id_queues_;
};
......
......@@ -128,6 +128,17 @@ class PSClient {
const uint64_t *keys, size_t num,
bool is_training) = 0;
virtual std::future<int32_t> pull_sparse_param(float **select_values,
size_t table_id,
const uint64_t *keys,
size_t num, bool is_training) {
VLOG(0) << "Did not implement";
std::promise<int32_t> promise;
std::future<int> fut = promise.get_future();
promise.set_value(-1);
return fut;
}
virtual ::std::future<int32_t> pull_sparse_ptr(char **select_values,
size_t table_id,
const uint64_t *keys,
......
......@@ -47,6 +47,9 @@ cc_library(sparse_sgd_rule SRCS sparse_sgd_rule.cc DEPS ${TABLE_DEPS} ps_framewo
cc_library(ctr_accessor SRCS ctr_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_proto ${TABLE_DEPS} fs afs_wrapper ctr_accessor common_table)
cc_library(table SRCS table.cc DEPS memory_sparse_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost)
set_source_files_properties(memory_sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_library(memory_sparse_geo_table SRCS memory_sparse_geo_table.cc DEPS ps_framework_proto ${TABLE_DEPS} common_table)
cc_library(table SRCS table.cc DEPS memory_sparse_table memory_sparse_geo_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost)
target_link_libraries(table -fopenmp)
......@@ -15,13 +15,9 @@
#pragma once
#include <ThreadPool.h>
#include <functional>
#include <future> // NOLINT
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
namespace paddle {
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h"
namespace paddle {
namespace distributed {
int32_t MemorySparseGeoTable::push_sparse_param(const uint64_t* keys,
const float* values,
size_t num) {
VLOG(5) << "DEBUG MemorySparseGeoTable::push_sparse_param begin "
"push_sparse_param "
<< num;
auto shard_num = _task_pool_size;
std::vector<std::vector<uint64_t>> offset_bucket;
offset_bucket.resize(shard_num);
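// Group the incoming key indices into per-shard buckets (key % shard_num); each shard's
// single-threaded task pool then copies its slice of values into the local shard.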
for (int x = 0; x < num; ++x) {
auto y = keys[x] % shard_num;
offset_bucket[y].push_back(x);
if (x < 10) {
VLOG(5) << "DEBUG MemorySparseGeoTable::push_sparse_param key: "
<< keys[x] << " shard: " << y;
}
}
std::vector<std::future<int>> tasks(shard_num);
for (int shard_id = 0; shard_id < shard_num; ++shard_id) {
tasks[shard_id] = _shards_task_pool[shard_id]->enqueue(
[this, shard_id, &keys, &offset_bucket, &values]() -> int {
auto& local_shard = _local_shards[shard_id];
auto& offsets = offset_bucket[shard_id];
for (int i = 0; i < offsets.size(); ++i) {
auto offset = offsets[i];
auto id = keys[offset];
auto& feature_value = local_shard[id];
feature_value.resize(_dim);
std::copy_n(values + _dim * offset, _dim, feature_value.data());
if (i < 10) {
VLOG(5) << "MemorySparseGeoTable::push_sparse_param "
"push_sparse_param key "
<< id << " value[0]: " << (values + _dim * offset)[0]
<< " data: " << feature_value.data()[0]
<< " value[-1]: " << (values + _dim * offset)[_dim - 1]
<< " data: " << feature_value.data()[_dim - 1];
}
}
return 0;
});
}
for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) {
tasks[shard_id].wait();
}
return 0;
}
int32_t MemorySparseGeoTable::pull_geo_param(const uint32_t trainer_id,
std::vector<float>* values,
std::vector<uint64_t>* ids) {
_geo_recorder->GetAndClear(trainer_id, ids);
VLOG(5)
<< "DEBUG MemorySparseGeoTable::pull_geo_param pull_geo_param trainer_id "
<< trainer_id << " id_num: " << ids->size();
std::vector<uint32_t> frequencies;
frequencies.resize(ids->size(), 1);
auto pull_value = PullSparseValue(ids->size(), _dim);
pull_value.is_training_ = true;
pull_value.feasigns_ = ids->data();
pull_value.frequencies_ = frequencies.data();
values->resize(ids->size() * _dim);
pull_sparse(values->data(), pull_value);
return 0;
}
int32_t MemorySparseGeoTable::push_sparse(const uint64_t* keys,
const float* values, size_t num) {
VLOG(5) << "DEBUG MemorySparseGeoTable::push_sparse keys[0]" << keys[0]
<< " key_num: " << num;
std::vector<uint64_t> ids;
ids.resize(num);
std::copy_n(keys, num, ids.begin());
_geo_recorder->Update(ids);
_push_sparse(keys, values, num);
return 0;
}
int32_t MemorySparseGeoTable::initialize() {
if (!_geo_recorder) {
auto trainers = _config.common().trainer_num();
_geo_recorder = std::make_shared<GeoRecorder>(trainers);
}
_dim = _config.common().dims()[0];
_shards_task_pool.resize(_task_pool_size);
for (int i = 0; i < _shards_task_pool.size(); ++i) {
_shards_task_pool[i].reset(new ::ThreadPool(1));
}
_local_shards.reset(new shard_type[_task_pool_size]);
return 0;
}
int32_t MemorySparseGeoTable::pull_sparse(float* pull_values,
const PullSparseValue& pull_value) {
auto shard_num = _task_pool_size;
std::vector<std::future<int>> tasks(shard_num);
std::vector<std::vector<std::pair<uint64_t, int>>> task_keys(shard_num);
size_t num = pull_value.numel_;
for (size_t i = 0; i < num; ++i) {
int shard_id = pull_value.feasigns_[i] % shard_num;
task_keys[shard_id].push_back({pull_value.feasigns_[i], i});
}
for (int shard_id = 0; shard_id < shard_num; ++shard_id) {
tasks[shard_id] = _shards_task_pool[shard_id]->enqueue(
[this, shard_id, &task_keys, pull_values]() -> int {
auto& local_shard = _local_shards[shard_id];
auto& keys = task_keys[shard_id];
for (size_t i = 0; i < keys.size(); i++) {
uint64_t key = keys[i].first;
auto offset = keys[i].second;
float* select_data = pull_values + _dim * offset;
auto itr = local_shard.find(key);
if (itr == local_shard.end()) {
// ++missed_keys;
auto& feature_value = local_shard[key];
feature_value.resize(_dim);
memset(feature_value.data(), 0, sizeof(float) * _dim);
VLOG(0) << "MemorySparseGeoTable pull_sparse key not found!!! "
<< key;
itr = local_shard.find(key);
}
memcpy(select_data, itr.value().data(), _dim * sizeof(float));
VLOG(5) << "DEBUG MemorySparseGeoTable::pull_sparse key: " << key
<< " select_data[0] " << select_data[0]
<< " value[0]: " << itr.value().data()[0];
}
return 0;
});
}
for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) {
tasks[shard_id].wait();
}
return 0;
}
int32_t MemorySparseGeoTable::_push_sparse(const uint64_t* keys,
const float* values, size_t num) {
auto shard_num = _task_pool_size;
std::vector<std::future<int>> tasks(shard_num);
std::vector<std::vector<std::pair<uint64_t, int>>> task_keys(shard_num);
for (size_t i = 0; i < num; ++i) {
int shard_id = keys[i] % shard_num;
task_keys[shard_id].push_back({keys[i], i});
}
for (size_t shard_id = 0; shard_id < shard_num; ++shard_id) {
tasks[shard_id] = _shards_task_pool[shard_id]->enqueue(
[this, shard_id, values, &task_keys]() -> int {
auto& keys = task_keys[shard_id];
auto& local_shard = _local_shards[shard_id];
auto blas = GetBlas<float>();
for (int i = 0; i < keys.size(); ++i) {
uint64_t key = keys[i].first;
uint64_t push_data_idx = keys[i].second;
const float* update_data = values + push_data_idx * _dim;
auto itr = local_shard.find(key);
if (itr == local_shard.end()) {
VLOG(0) << "sparse geo table push not found key!!! " << key;
auto& feature_value = local_shard[key];
feature_value.resize(_dim);
memset(feature_value.data(), 0, sizeof(float) * _dim);
itr = local_shard.find(key);
}
auto& feature_value = itr.value();
float* value_data = feature_value.data();
VLOG(5) << "DEBUG MemorySparseGeoTable::_push_sparse before key: "
<< key << " update_data[0] " << update_data[0]
<< " value[0]: " << value_data[0];
blas.VADD(_dim, update_data, value_data, value_data);
VLOG(5) << "DEBUG MemorySparseGeoTable::_push_sparse after key: "
<< key << " value[0]: " << value_data[0];
}
return 0;
});
}
for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) {
tasks[shard_id].wait();
}
return 0;
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <assert.h>
// #include <pthread.h>
#include <stdint.h>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/distributed/ps/table/accessor.h"
#include "paddle/fluid/distributed/ps/table/common_table.h"
#include "paddle/fluid/distributed/ps/table/depends/feature_value.h"
#include "paddle/fluid/distributed/ps/table/depends/geo_recorder.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace distributed {
class GeoRecorder;
class MemorySparseGeoTable : public SparseTable {
public:
typedef SparseTableShard<uint64_t, FixedFeatureValue> shard_type;
MemorySparseGeoTable() { _geo_recorder = nullptr; }
virtual ~MemorySparseGeoTable() {}
virtual int32_t initialize();
virtual int32_t initialize_shard() { return 0; }
virtual int32_t load(const std::string& path, const std::string& param) {
return 0;
}
virtual int32_t save(const std::string& path, const std::string& param) {
return 0;
}
virtual int32_t flush() { return 0; }
virtual int32_t shrink(const std::string& param) { return 0; }
virtual void clear() { return; }
virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value);
int32_t push_sparse_param(const uint64_t* keys, const float* values,
size_t num);
// TODO(zhaocaibei123): change to pull_sparse, and rename pull_sparse
int32_t pull_geo_param(const uint32_t trainer_id, std::vector<float>* values,
std::vector<uint64_t>* keys);
int32_t push_sparse(const uint64_t* keys, const float* values,
size_t num) override;
int32_t _push_sparse(const uint64_t* keys, const float* values, size_t num);
// int32_t _pull_sparse(float* pull_values, const PullSparseValue&
// pull_value);
private:
std::shared_ptr<GeoRecorder> _geo_recorder;
const int _task_pool_size = 10;
std::vector<std::shared_ptr<::ThreadPool>> _shards_task_pool;
std::unique_ptr<shard_type[]> _local_shards;
int _dim;
};
} // namespace distributed
} // namespace paddle
......@@ -20,6 +20,7 @@
#include "paddle/fluid/distributed/ps/table/common_dense_table.h"
#include "paddle/fluid/distributed/ps/table/common_graph_table.h"
#include "paddle/fluid/distributed/ps/table/common_sparse_table.h"
#include "paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h"
#include "paddle/fluid/distributed/ps/table/sparse_geo_table.h"
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/distributed/ps/table/ssd_sparse_table.h"
......@@ -43,6 +44,7 @@ REGISTER_PSCORE_CLASS(Table, TensorTable);
REGISTER_PSCORE_CLASS(Table, DenseTensorTable);
REGISTER_PSCORE_CLASS(Table, GlobalStepTable);
REGISTER_PSCORE_CLASS(Table, MemorySparseTable);
REGISTER_PSCORE_CLASS(Table, MemorySparseGeoTable);
REGISTER_PSCORE_CLASS(ValueAccessor, CommMergeAccessor);
REGISTER_PSCORE_CLASS(ValueAccessor, CtrCommonAccessor);
REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule);
......
......@@ -35,3 +35,6 @@ cc_test(ctr_accessor_test SRCS ctr_accessor_test.cc DEPS ${COMMON_DEPS} boost ta
set_source_files_properties(memory_sparse_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(memory_sparse_table_test SRCS memory_sparse_table_test.cc DEPS ${COMMON_DEPS} boost table)
set_source_files_properties(memory_geo_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(memory_sparse_geo_table_test SRCS memory_geo_table_test.cc DEPS ${COMMON_DEPS} boost table)
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <ThreadPool.h>
#include <unistd.h>
#include <string>
#include <thread> // NOLINT
#include "google/protobuf/text_format.h"
#include "gtest/gtest.h"
#include "paddle/fluid/distributed/ps.pb.h"
#include "paddle/fluid/distributed/ps/table/depends/sparse_utils.h"
#include "paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h"
#include "paddle/fluid/distributed/ps/table/table.h"
namespace paddle {
namespace distributed {
// MemorySparseGeoTable
TEST(MemorySparseGeoTable, SSUM) {
int emb_dim = 10;
int trainers = 2;
TableParameter table_config;
table_config.set_table_class("MemorySparseGeoTable");
FsClientParameter fs_config;
Table *table = new MemorySparseGeoTable();
TableAccessorParameter *accessor_config = table_config.mutable_accessor();
accessor_config->set_accessor_class("CommMergeAccessor");
accessor_config->set_fea_dim(10);
CommonAccessorParameter *common_config = table_config.mutable_common();
common_config->set_name("sum");
common_config->set_table_name("ssum_test_table");
common_config->set_trainer_num(trainers);
common_config->add_params("Param");
common_config->add_dims(emb_dim);
common_config->add_initializers("fill_constant&1.0");
auto ret = table->initialize(table_config, fs_config);
ASSERT_EQ(ret, 0);
// test push_sparse_param, and create params
std::vector<uint64_t> init_keys = {0, 1, 2, 3, 4};
std::vector<uint32_t> init_fres = {1, 1, 1, 1, 1};
std::vector<float> init_values;
for (size_t i = 0; i < init_keys.size() * emb_dim; i++) {
init_values.push_back(0.0);
}
table->push_sparse_param(init_keys.data(), init_values.data(),
init_keys.size());
std::vector<float> pull_values(init_values.size());
auto value = PullSparseValue(init_keys, init_fres, emb_dim);
table->pull_sparse(pull_values.data(), value);
for (size_t i = 0; i < init_keys.size() * emb_dim; i++) {
ASSERT_TRUE(abs(pull_values[i] - init_values[i]) < 1e-5);
}
std::vector<std::vector<uint64_t>> trainer_keys;
std::vector<std::vector<float>> trainer_values;
trainer_keys.resize(trainers);
trainer_values.resize(trainers);
float start = 0.0;
for (int i = 0; i < trainers; i++) {
trainer_keys[i] = init_keys;
for (size_t j = 0; j < trainer_keys[i].size(); j++) {
auto id = trainer_keys[i][j];
for (int k = 0; k < emb_dim; k++) {
trainer_values[i].push_back(start);
pull_values[id * emb_dim + k] += start;
start += 0.1;
}
}
}
std::shared_ptr<::ThreadPool> pool_ =
std::make_shared<::ThreadPool>(trainers);
std::vector<std::future<void>> task_status;
for (int i = 0; i < trainers; i++) {
auto &push_keys = trainer_keys[i];
auto &push_values = trainer_values[i];
auto task = [table, &push_keys, &push_values] {
table->push_sparse(push_keys.data(), push_values.data(),
push_keys.size());
};
task_status.push_back(pool_->enqueue(std::move(task)));
}
for (auto &status : task_status) {
status.wait();
}
std::vector<std::vector<uint64_t>> geo_pull_ids;
std::vector<std::vector<float>> geo_pull_values;
geo_pull_ids.resize(trainers);
geo_pull_values.resize(trainers);
for (int i = 0; i < trainers; i++) {
table->pull_geo_param(i, &geo_pull_values[i], &geo_pull_ids[i]);
ASSERT_EQ(geo_pull_values[i].size(), geo_pull_ids[i].size() * emb_dim);
for (size_t j = 0; j < geo_pull_ids[i].size(); ++j) {
auto id = geo_pull_ids[i][j];
for (int k = 0; k < emb_dim; k++) {
ASSERT_TRUE(abs(geo_pull_values[i][j * emb_dim + k] -
pull_values[id * emb_dim + k]) < 1e-5);
}
}
}
}
} // namespace distributed
} // namespace paddle
set(eager_deps pten pten_api hook_utils tensor_utils utils global_utils backward pten_tensor legacy autograd_meta grad_node_info grad_tensor_holder gradient_accumulation accumulation_node)
set(eager_deps pten pten_api hook_utils tensor_utils utils global_utils backward pten_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node)
set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy)
set(generated_deps dygraph_function dygraph_node)
......@@ -9,14 +9,12 @@ endif()
add_subdirectory(api)
add_subdirectory(accumulation)
add_subdirectory(legacy)
cc_library(grad_node_info SRCS grad_node_info.cc DEPS pten pten_api)
cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulation)
cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator)
cc_library(autograd_meta SRCS autograd_meta.cc DEPS pten pten_api)
cc_library(utils SRCS utils.cc DEPS pten pten_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils)
cc_library(legacy SRCS ${DYGRAPH_LEGACY} DEPS global_utils proto_desc operator pten pten_api op_registry variable_helper memcpy)
cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info)
add_subdirectory(tests)
cc_library(gradient_accumulation SRCS gradient_accumulation.cc DEPS blas pten pten_api var_type_traits layer math_function)
cc_library(accumulation_node SRCS accumulation_node.cc DEPS gradient_accumulation pten pten_api grad_node_info)
cc_library(accumulation_node SRCS accumulation_node.cc DEPS gradient_accumulator pten pten_api grad_node_info)
......@@ -13,8 +13,8 @@
// limitations under the License.
#include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/eager/accumulation/gradient_accumulation.h"
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include "paddle/pten/api/all.h"
#include "paddle/pten/core/dense_tensor.h"
......@@ -35,7 +35,7 @@ static void CopyOrAddTensor(egr::EagerTensor* tensor,
*tensor = t;
} else {
// Accumulation
egr::TensorAdd(t, tensor);
paddle::imperative::TensorAdd<egr::EagerTensor>(t, tensor);
}
}
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/eager/accumulation/gradient_accumulation.h"
#include <algorithm>
#include <memory>
#include <utility>
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/api/all.h"
#include "paddle/pten/core/convert_utils.h"
#include "unsupported/Eigen/CXX11/Tensor"
#ifdef PADDLE_WITH_XPU
#include "xpu/refactor/math.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#endif
namespace egr {
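// TensorAddFunctor is a place visitor: paddle::platform::VisitPlace dispatches
// on the runtime Place and invokes the matching operator() overload, which
// performs y = x + y over numel_ elements on that device, or throws
// PermissionDenied when the place is not supported by the current build.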
template <typename T>
class TensorAddFunctor : public boost::static_visitor<> {
public:
TensorAddFunctor(int64_t numel, const T* x, T* y)
: numel_(numel), x_(x), y_(y) {}
void operator()(const paddle::platform::CPUPlace& place) const {
paddle::platform::CPUDeviceContext* ctx =
dynamic_cast<paddle::platform::CPUDeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place));
auto blas =
paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext, T>(
*ctx);
blas.AXPY(numel_, 1., x_, y_);
}
// TODO(jiabin): Support xpu here from gradient_accumulator.cc
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void operator()(const paddle::platform::CUDAPlace& place) const {
paddle::platform::CUDADeviceContext* ctx =
dynamic_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place));
auto blas =
paddle::operators::math::GetBlas<paddle::platform::CUDADeviceContext,
T>(*ctx);
blas.AXPY(numel_, 1., x_, y_);
}
#else
void operator()(const paddle::platform::CUDAPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif
// TODO(jiabin): Support Npu here from gradient_accumulator.cc
// there is NO blas in CUDAPinnedPlace
void operator()(const paddle::platform::CUDAPinnedPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#ifdef PADDLE_WITH_ASCEND_CL
void operator()(const paddle::platform::NPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
void operator()(const paddle::platform::NPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif
#ifdef PADDLE_WITH_XPU
void operator()(const paddle::platform::XPUPlace& place) const {
paddle::platform::XPUDeviceContext* ctx =
dynamic_cast<paddle::platform::XPUDeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place));
xpu::add<T>(ctx->x_context(), x_, y_, y_, static_cast<int>(numel_));
}
#else
void operator()(const paddle::platform::XPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif
#ifdef PADDLE_WITH_MLU
void operator()(const paddle::platform::MLUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
void operator()(const paddle::platform::MLUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif
#ifdef PADDLE_WITH_IPU
void operator()(const paddle::platform::IPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
void operator()(const paddle::platform::IPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif
void operator()(const paddle::platform::NPUPinnedPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
private:
int64_t numel_;
const T* x_;
mutable T* y_;
};
template <typename DeviceContext, typename T>
void TensorAddImpl(const std::shared_ptr<pten::DenseTensor>& src,
pten::DenseTensor* dst,
const paddle::platform::Place& place) {
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
paddle::platform::DeviceContext* ctx = pool.Get(place);
auto dev_ctx = dynamic_cast<DeviceContext*>(ctx);
paddle::operators::math::ElementwiseAddTo<DeviceContext, T> func;
func(dev_ctx, *(src.get()), dst);
}
template <typename DeviceContext, typename T>
void TensorAddImpl(const paddle::framework::Tensor& src,
paddle::framework::Tensor* dst,
const paddle::platform::Place& place) {
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
paddle::platform::DeviceContext* ctx = pool.Get(place);
auto dev_ctx = dynamic_cast<DeviceContext*>(ctx);
paddle::operators::math::ElementwiseAddTo<DeviceContext, T> func;
func(dev_ctx, src, dst);
}
void TensorAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) {
// TODO(jiabin): Support other tensor type later
std::shared_ptr<pten::DenseTensor> dst_tensor =
std::dynamic_pointer_cast<pten::DenseTensor>(dst->impl());
std::shared_ptr<pten::DenseTensor> src_tensor =
std::dynamic_pointer_cast<pten::DenseTensor>(src.impl());
auto numel = src_tensor->numel();
if (numel == 0) {
return;
}
PADDLE_ENFORCE_EQ(
dst_tensor->numel(), numel,
paddle::platform::errors::PreconditionNotMet(
"The number of elements of source tensor and destination tensor "
"should be equal, but got the number of elements of source tensor is "
"%zu and the number of elements of destination tensor is %zu.",
numel, dst_tensor->numel()));
auto data_type = pten::TransToProtoVarType(src_tensor->dtype());
auto place = src_tensor->place();
PADDLE_ENFORCE_EQ(pten::TransToProtoVarType(dst_tensor->dtype()), data_type,
paddle::platform::errors::PreconditionNotMet(
"The data type of source tensor and destination tensor "
"should be equal, Otherwise, the calculation results "
"will be incorrect."));
#define PADDLE_TENSOR_ADD(cpp_type) \
if (data_type == paddle::framework::DataTypeTrait<cpp_type>::DataType()) { \
TensorAddFunctor<cpp_type> func( \
numel, src_tensor->data<cpp_type>(), \
dst_tensor->mutable_data<cpp_type>(place)); \
paddle::platform::VisitPlace(place, func); \
return; \
}
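// The macro above is a lightweight type dispatch: for each candidate C++ type
// it compares against the runtime proto data type and, on a match, runs the
// place-visited add functor and returns early.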
// TODO(jiabin): Support NPU here
PADDLE_TENSOR_ADD(float);
// NOTE(phlrain): XPU only supports float
#ifndef PADDLE_WITH_XPU
PADDLE_TENSOR_ADD(double);
// NOTE(chenweihang): only complex grad tensor accumulation is supported for
// now; add SelectedRows support if needed in the future
PADDLE_TENSOR_ADD(paddle::platform::complex<float>);
PADDLE_TENSOR_ADD(paddle::platform::complex<double>);
#endif
#undef PADDLE_TENSOR_ADD
if (data_type == paddle::framework::proto::VarType::FP16) {
if (paddle::platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return TensorAddImpl<paddle::platform::CUDADeviceContext,
paddle::platform::float16>(src_tensor,
dst_tensor.get(), place);
#else
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
paddle::framework::DataTypeToString(data_type), place));
#endif
} else if (paddle::platform::is_cpu_place(place)) {
return TensorAddImpl<paddle::platform::CPUDeviceContext,
paddle::platform::float16>(src_tensor,
dst_tensor.get(), place);
}
}
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
paddle::framework::DataTypeToString(data_type), place));
}
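// VariableAdd handles the framework::Variable path: when the destination holds
// a LoDTensor, a LoDTensor or SelectedRows source is added into it in place;
// otherwise the SelectedRows destination is combined with a LoDTensor source
// into a new Variable that replaces the old one.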
void VariableAdd(const egr::EagerTensor& src_tensor,
egr::EagerTensor* dst_tensor) {
auto& src = src_tensor.Var();
auto* dst = dst_tensor->MutableVar();
if (dst->IsType<paddle::framework::LoDTensor>()) {
if (src.IsType<paddle::framework::LoDTensor>()) {
paddle::imperative::TensorAdd(src, dst);
} else if (src.IsType<pten::SelectedRows>()) {
paddle::imperative::SelectedRowsAddToTensor(src, dst);
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Unexpected branch, output variable type is %s",
paddle::framework::ToTypeName(dst->Type())));
}
} else {
if (src.IsType<paddle::framework::LoDTensor>()) {
paddle::framework::Variable new_dst;
paddle::imperative::SelectedRowsAddTensor(*dst, src, &new_dst);
*dst = std::move(new_dst);
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Unexpected branch, output variable type is %s",
paddle::framework::ToTypeName(dst->Type())));
}
}
}
} // namespace egr
fluid_generated/**
eager_generated/**
\ No newline at end of file
cc_library(scale_node SRCS scale_node.cc DEPS global_utils pten pten_api grad_node_info)
if(NOT ON_INFER)
cc_library(final_dygraph_node SRCS nodes.cc DEPS ${eager_deps})
add_dependencies(final_dygraph_node eager_final_state_codegen)
endif()
cc_library(eager_scale SRCS scale.cc DEPS pten_api pten autograd_meta scale_node)
if(NOT ON_INFER)
cc_library(final_dygraph_function SRCS dygraph_functions.cc DEPS ${eager_deps})
add_dependencies(final_dygraph_function eager_final_state_codegen)
endif()
#add_subdirectory(final_state_generator)
add_subdirectory(final_state_generator)
set(EAGER_GENERETOR_DEPS ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag)
......
......@@ -1220,7 +1220,8 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
// According to op_proto->attrs()
egr::legacy::RunOp("op_type", ins, outs, attr_map,
Controller.Instance().GetCurrentTracer()->TraceOp("op_type", ins, outs,
attr_map,
Controller.Instance().GetExpectedPlace(), {});
// According to fwd_outputs_names
......@@ -1401,7 +1402,8 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
const char* FWD_TRACE_OP_TEMPLATE =
" paddle::framework::AttributeMap attrs = attr_map;\n"
" paddle::framework::AttributeMap default_attrs;\n"
" egr::legacy::RunOp(\"%s\", ins, outs, attrs, \n"
" egr::Controller::Instance().GetCurrentTracer()->TraceOp(\"%s\", ins, "
"outs, attrs, \n"
" egr::Controller::Instance().GetExpectedPlace(),\n"
" &default_attrs, true, {});\n";
std::string trace_op_str =
......@@ -1712,7 +1714,8 @@ static std::string GenerateSingleOpBase(
" // Pass the entire attribute map to TraceOp\n"
" // The underlying kernel will pickup whatever attribute they need "
"at runtime\n"
" egr::legacy::RunOp(\"%s\", %s, %s, %s,\n"
" egr::Controller::Instance().GetCurrentTracer()->TraceOp(\"%s\", %s, "
"%s, %s,\n"
" egr::Controller::Instance().GetExpectedPlace(),\n"
" &this->default_attr_map_, false, {});\n";
std::string trace_opbase_str = paddle::string::Sprintf(
......@@ -1822,7 +1825,8 @@ static std::string GenerateGradNodeCCContents(
// Visit each OpBase
for(auto iter = "grad_node->begin()"; iter < "grad_node->end()"; iter++) {
// Simply pass entire attribute map to kernels
egr::legacy::RunOp("iter->Type()", ins, outs, this->attr_map_,
Controller.Instance().GetCurrentTracer()->TraceOp("iter->Type()", ins,
outs, this->attr_map_,
egr::Controller::Instance().ExpectedPlace(), false, {});
}
......@@ -2054,6 +2058,7 @@ static std::string GenerateDygraphHFileIncludes() {
"#include \"paddle/fluid/eager/autograd_meta.h\"\n"
"#include \"paddle/pten/api/all.h\"\n"
"#include \"paddle/fluid/eager/utils.h\"\n"
"#include \"paddle/fluid/imperative/tracer.h\"\n"
"#include \"paddle/fluid/framework/op_registry.h\"\n\n";
dygraph_forward_api_includes_str +=
......@@ -2084,8 +2089,7 @@ static void GenerateForwardDygraphFile(const std::string& forward_cc_path,
"dygraph_forward_api.h\"\n"
"#include "
"\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n\n"
"#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n"
"#include \"paddle/fluid/eager/legacy/op_runner.h\"\n";
"#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n";
std::string forward_cc_include_str =
paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE);
std::ofstream forward_cc_stream(forward_cc_path, std::ios::out);
......@@ -2099,7 +2103,7 @@ static void GenerateNodeHFile(const std::string& node_h_path,
std::string node_h_include_str =
"#pragma once\n"
"#include \"paddle/fluid/eager/tensor_wrapper.h\"\n"
"#include \"paddle/fluid/eager/legacy/op_runner.h\"\n"
"#include \"paddle/fluid/imperative/tracer.h\"\n"
"#include \"paddle/fluid/eager/grad_node_info.h\"\n\n";
std::ofstream node_h_stream(node_h_path, std::ios::out);
node_h_stream << node_h_include_str;
......
......@@ -2,13 +2,14 @@ set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml")
set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml")
set(tmp_forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc")
set(tmp_forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.h")
set(tmp_nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_node.cc")
set(tmp_nodes_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_node.h")
set(tmp_nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.cc")
set(tmp_nodes_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.h")
set(forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.cc")
set(forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h")
set(nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/node.cc")
set(nodes_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/node.h")
set(nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.cc")
set(nodes_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h")
message("Final State Eager CodeGen")
add_custom_target(eager_final_state_codegen
COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py"
"--api_yaml_path=${api_yaml_path}"
......
......@@ -15,9 +15,45 @@
import sys
import os
if __name__ == "__main__":
assert len(sys.argv) == 2
eager_dir = sys.argv[1]
def GenerateFileStructureForFinalDygraph(eager_dir):
"""
paddle/fluid/eager
|- generated
| |- CMakeLists.txt
| | "add_subdirectory(forwards), add_subdirectory(backwards)"
|
| |- forwards
| |- "dygraph_functions.cc"
| |- "dygraph_functions.h"
|
| |- backwards
| |- "nodes.cc"
| |- "nodes.h"
"""
# Directory Generation
generated_dir = os.path.join(eager_dir, "api/generated/eager_generated")
forwards_dir = os.path.join(generated_dir, "forwards")
nodes_dir = os.path.join(generated_dir, "backwards")
dirs = [generated_dir, forwards_dir, nodes_dir]
for directory in dirs:
if not os.path.exists(directory):
os.mkdir(directory)
# Empty files
dygraph_forward_api_h_path = os.path.join(generated_dir,
"dygraph_functions.h")
empty_files = [dygraph_forward_api_h_path]
empty_files.append(os.path.join(forwards_dir, "dygraph_functions.cc"))
empty_files.append(os.path.join(nodes_dir, "nodes.cc"))
empty_files.append(os.path.join(nodes_dir, "nodes.h"))
for path in empty_files:
if not os.path.exists(path):
open(path, 'a').close()
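# Both generators are idempotent: existing directories and files are left
# untouched, so the script can safely be re-run across repeated builds.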
def GenerateFileStructureForIntermediateDygraph(eager_dir):
"""
paddle/fluid/eager
|- generated
......@@ -79,3 +115,10 @@ if __name__ == "__main__":
with open(generated_level_cmakelist_path, "w") as f:
f.write("add_subdirectory(forwards)\nadd_subdirectory(nodes)")
if __name__ == "__main__":
assert len(sys.argv) == 2
eager_dir = sys.argv[1]
GenerateFileStructureForIntermediateDygraph(eager_dir)
GenerateFileStructureForFinalDygraph(eager_dir)
......@@ -18,10 +18,10 @@
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
// pten deps
#include "paddle/pten/api/all.h"
#include "paddle/pten/api/include/tensor.h"
#include "paddle/pten/api/lib/api_declare.h"
#include "paddle/pten/api/lib/utils/tensor_utils.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
/**
* This class is used by Eager mode for now. It's painful to do this in Eager
* Mode, the better
......@@ -245,8 +245,7 @@ class EagerTensor final {
auto tensor_dense =
std::dynamic_pointer_cast<pten::DenseTensor>(tensor_->impl());
if (tensor_dense && tensor_dense.get()) {
paddle::experimental::SharesStorage(tensor_dense.get(),
framework_tensor);
*framework_tensor = *tensor_dense;
} else {
PADDLE_THROW(paddle::platform::errors::Fatal(
"Unrecognized egr::EagerTensor type, only "
......
......@@ -13,7 +13,7 @@
// limitations under the License.
#include "paddle/fluid/eager/grad_tensor_holder.h"
#include "paddle/fluid/eager/accumulation/gradient_accumulation.h"
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/math/math_function.h"
......@@ -72,17 +72,17 @@ void GradTensorHolder::add(size_t slot_id, size_t rank,
} else {
// Accumulation
if (t.initialized() && buffer_tensor.initialized()) {
TensorAdd(t, &buffer_tensor);
paddle::imperative::TensorAdd<egr::EagerTensor>(t, &buffer_tensor);
} else if (t.Var().IsInitialized() &&
buffer_tensor.Var().IsInitialized()) {
VariableAdd(t, &buffer_tensor);
paddle::imperative::VariableAdd(t, &buffer_tensor);
} else if (t.Var().IsInitialized() && buffer_tensor.initialized()) {
// TODO(jiabin): This can be merged into the upper if case.
buffer_tensor.SyncToVar();
VariableAdd(t, &buffer_tensor);
paddle::imperative::VariableAdd(t, &buffer_tensor);
} else if (t.initialized() && buffer_tensor.Var().IsInitialized()) {
buffer_tensor.SyncToTensor();
TensorAdd(t, &buffer_tensor);
paddle::imperative::TensorAdd<egr::EagerTensor>(t, &buffer_tensor);
} else {
// Cases that should not happen:
// 1. both not init
......
file(GLOB DYGRAPH_LEGACY "*.cpp" "*.cc")
set(DYGRAPH_LEGACY ${DYGRAPH_LEGACY} PARENT_SCOPE)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/eager/legacy/amp_auto_cast.h"
#include <memory>
#include <string>
#include "paddle/fluid/eager/legacy/op_runner.h"
#include "paddle/fluid/eager/legacy/tensor_helper.h"
#include "paddle/fluid/framework/operator.h"
namespace egr {
namespace legacy {
AmpOperators::AmpOperators()
: allow_ops_(new std::unordered_set<std::string>()),
block_ops_(new std::unordered_set<std::string>()),
unsupported_fp16_ops_(new std::unordered_set<std::string>()) {
auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels();
auto fp16_dtype = paddle::framework::proto::VarType::FP16;
for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) {
bool supported = false;
for (auto& kernel_type : it->second) {
if ((paddle::platform::is_gpu_place(kernel_type.first.place_) ||
paddle::platform::is_xpu_place(kernel_type.first.place_)) &&
kernel_type.first.data_type_ == fp16_dtype) {
supported = true;
}
}
if (!supported) {
unsupported_fp16_ops_->insert(it->first);
}
}
}
AmpOperators::~AmpOperators() {}
AmpOperators& AmpOperators::Instance() {
static AmpOperators instance;
return instance;
}
std::shared_ptr<std::unordered_set<std::string>>
AmpOperators::GetMutableAllowOps() {
return allow_ops_;
}
std::shared_ptr<std::unordered_set<std::string>>
AmpOperators::GetMutableBlockOps() {
return block_ops_;
}
std::shared_ptr<std::unordered_set<std::string>>
AmpOperators::GetMutableUnsupportedFp16Ops() {
return unsupported_fp16_ops_;
}
std::ostream& operator<<(std::ostream& os, AmpOperators& ops) {
os << "allow ops: ";
auto allow_ops = ops.GetMutableAllowOps();
std::copy((*allow_ops).begin(), (*allow_ops).end(),
std::ostream_iterator<std::string>(os, " "));
os << "\n";
os << "block ops: ";
auto block_ops = ops.GetMutableBlockOps();
std::copy((*block_ops).begin(), (*block_ops).end(),
std::ostream_iterator<std::string>(os, " "));
os << "\n";
os << "unsupported fp16 ops: ";
auto unsupported_fp16_ops = ops.GetMutableUnsupportedFp16Ops();
std::copy((*unsupported_fp16_ops).begin(), (*unsupported_fp16_ops).end(),
std::ostream_iterator<std::string>(os, " "));
return os;
}
inline std::string GetDtypeStr(
const std::shared_ptr<egr::EagerTensor>& tensor) {
return paddle::framework::DataTypeToString(
egr::legacy::GetDtypeFromVar(tensor->Var()));
}
inline bool NeedCast(const std::shared_ptr<egr::EagerTensor>& tensor) {
auto place = egr::legacy::GetPlaceFromVar(tensor->Var());
auto data_type = egr::legacy::GetDtypeFromVar(tensor->Var());
if (paddle::platform::is_gpu_place(place) ||
paddle::platform::is_cuda_pinned_place(place) ||
paddle::platform::is_xpu_place(place)) {
// CUDAPinnedPlace is added for varbase created by dataloader
if (data_type == paddle::framework::proto::VarType::FP32 ||
data_type == paddle::framework::proto::VarType::FP16) {
return true;
}
}
return false;
}
// NOTE: Trace a cast op, so if a var is cast from fp32 to fp16, then the grad
// var will be cast back from fp16 to fp32 during the backward phase.
static inline std::shared_ptr<egr::EagerTensor> CastToType(
const std::shared_ptr<egr::EagerTensor>& tensor,
const paddle::framework::proto::VarType::Type dst_type) {
NameTensorMap ins = {{"X", {tensor}}};
auto in_data_type = egr::legacy::GetDtypeFromVar(tensor->Var());
paddle::framework::AttributeMap attrs = {{"in_dtype", in_data_type},
{"out_dtype", dst_type}};
auto out = std::shared_ptr<egr::EagerTensor>(new egr::EagerTensor());
NameTensorMap outs = {{"Out", {out}}};
{
AutoCastGuard guard(paddle::imperative::AmpLevel::O0);
paddle::framework::AttributeMap default_attrs;
RunOp("cast", ins, outs, std::move(attrs), {}, &default_attrs, true);
}
return out;
}
static inline std::shared_ptr<egr::EagerTensor> CastToFP16(
const std::shared_ptr<egr::EagerTensor>& tensor) {
auto dst_type = paddle::framework::proto::VarType::FP16;
if (NeedCast(tensor) &&
(egr::legacy::GetDtypeFromVar(tensor->Var()) != dst_type)) {
return CastToType(tensor, dst_type);
}
return tensor;
}
static inline std::shared_ptr<egr::EagerTensor> CastToFP32(
const std::shared_ptr<egr::EagerTensor>& tensor) {
auto dst_type = paddle::framework::proto::VarType::FP32;
if (NeedCast(tensor) &&
(egr::legacy::GetDtypeFromVar(tensor->Var()) != dst_type)) {
return CastToType(tensor, dst_type);
}
return tensor;
}
static inline paddle::framework::proto::VarType::Type GetPromoteType(
const std::string& op_type, const NameTensorMap& ins) {
auto dst_type = paddle::framework::proto::VarType::FP16;
for (const auto& pair : ins) {
for (const auto& tensor : pair.second) {
if (egr::legacy::GetDtypeFromVar(tensor->Var()) ==
paddle::framework::proto::VarType::FP32) {
dst_type = egr::legacy::GetDtypeFromVar(tensor->Var());
break;
}
}
}
// NOTE(juncai): moving_average_abs_max_scale only considers the
// dtype of input(X)
if (op_type == "moving_average_abs_max_scale") {
for (const auto& pair : ins) {
if (pair.first == "X" &&
egr::legacy::GetDtypeFromVar(pair.second.front()->Var()) ==
paddle::framework::proto::VarType::FP16) {
dst_type = paddle::framework::proto::VarType::FP16;
}
}
}
return dst_type;
}
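// AutoCastInputs implements the AMP-O1 input casting policy: inputs of ops on
// the allow list are cast to fp16, inputs of ops on the block list are cast to
// fp32, and for every other op the inputs are promoted to a common dtype
// (fp32 wins when any input is fp32, or when the op has no fp16 kernel).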
NameTensorMap AutoCastInputs(const std::string& op_type,
const NameTensorMap& ins) {
NameTensorMap new_ins(ins);
if (AmpOperators::Instance().GetMutableAllowOps()->count(op_type)) {
for (auto& pair : new_ins) {
// NOTE(zhiqiu): batch_norm and layer_norm only support fp16 for input X.
if ((op_type == "batch_norm" || op_type == "layer_norm" ||
op_type == "sync_batch_norm") &&
pair.first != "X") {
continue;
}
VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
<< GetDtypeStr(*pair.second.cbegin()) << " to float16";
for (auto& var : pair.second) {
var = CastToFP16(var);
}
}
return new_ins;
} else if (AmpOperators::Instance().GetMutableBlockOps()->count(op_type)) {
for (auto& pair : new_ins) {
VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
<< GetDtypeStr(*pair.second.cbegin()) << " to float";
for (auto& var : pair.second) {
var = CastToFP32(var);
}
}
return new_ins;
} else {
auto dst_type = GetPromoteType(op_type, ins);
// NOTE(zhiqiu): if the op has no fp16 kernel, fall back to fp32.
if (dst_type == paddle::framework::proto::VarType::FP16 &&
AmpOperators::Instance().GetMutableUnsupportedFp16Ops()->count(
op_type)) {
dst_type = paddle::framework::proto::VarType::FP32;
}
for (auto& pair : new_ins) {
// NOTE(zhiqiu): batch_norm and layer_norm only support fp16 for input X.
if ((op_type == "batch_norm" || op_type == "layer_norm" ||
op_type == "sync_batch_norm") &&
pair.first == "X" &&
dst_type == paddle::framework::proto::VarType::FP32) {
continue;
}
VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
<< GetDtypeStr(*pair.second.cbegin()) << " to "
<< paddle::framework::DataTypeToString(dst_type);
for (auto& var : pair.second) {
var = (dst_type == paddle::framework::proto::VarType::FP32
? CastToFP32(var)
: CastToFP16(var));
}
}
return new_ins;
}
return new_ins;
}
NameTensorMap CastPureFp16Inputs(const std::string& op_type,
const NameTensorMap& ins) {
NameTensorMap new_ins(ins);
auto dst_type = paddle::framework::proto::VarType::FP16;
if (AmpOperators::Instance().GetMutableUnsupportedFp16Ops()->count(op_type) ||
AmpOperators::Instance().GetMutableBlockOps()->count(op_type)) {
dst_type = paddle::framework::proto::VarType::FP32;
}
for (auto& pair : new_ins) {
if ((op_type == "batch_norm" || op_type == "layer_norm" ||
op_type == "sync_batch_norm") &&
pair.first != "X") {
continue;
}
VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
<< GetDtypeStr(*pair.second.cbegin()) << " to "
<< paddle::framework::DataTypeToString(dst_type);
for (auto& var : pair.second) {
var = (dst_type == paddle::framework::proto::VarType::FP32
? CastToFP32(var)
: CastToFP16(var));
}
}
return new_ins;
}
} // namespace legacy
} // namespace egr
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <set>
#include <string>
#include <tuple>
#include <unordered_set>
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/eager/legacy/type_def.h"
#include "paddle/fluid/imperative/amp_auto_cast.h"
namespace egr {
namespace legacy {
class AmpOperators {
public:
~AmpOperators();
AmpOperators(const AmpOperators& o) = delete;
const AmpOperators& operator=(const AmpOperators& o) = delete;
static AmpOperators& Instance();
std::shared_ptr<std::unordered_set<std::string>> GetMutableAllowOps();
std::shared_ptr<std::unordered_set<std::string>> GetMutableBlockOps();
std::shared_ptr<std::unordered_set<std::string>>
GetMutableUnsupportedFp16Ops();
private:
AmpOperators(); // forbid calling default constructor
// The set of ops that support fp16 calculation and are considered numerically
// safe and performance critical. These ops are always converted to fp16.
std::shared_ptr<std::unordered_set<std::string>> allow_ops_;
// The set of ops that support fp16 calculation and are considered numerically
// dangerous and whose effects may also be observed in downstream ops.
std::shared_ptr<std::unordered_set<std::string>> block_ops_;
// The set of ops that have no fp16 CUDA kernel.
std::shared_ptr<std::unordered_set<std::string>> unsupported_fp16_ops_;
};
std::ostream& operator<<(std::ostream& os, AmpOperators& ops);
// NOTE(zhiqiu): AutoCastGuard is used for RAII.
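// A minimal usage sketch (illustrative only):
//   {
//     AutoCastGuard guard(paddle::imperative::AmpLevel::O0);
//     // ops traced inside this scope run with AMP disabled
//   }  // the previous AMP level is restored when the guard goes out of scope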
class AutoCastGuard {
public:
explicit AutoCastGuard(paddle::imperative::AmpLevel guard_level) {
pre_amp_level_ = Controller::Instance().GetAMPLevel();
if (pre_amp_level_ != guard_level) {
Controller::Instance().SetAMPLevel(guard_level);
}
}
~AutoCastGuard() { Controller::Instance().SetAMPLevel(pre_amp_level_); }
// forbid copy and operator=
AutoCastGuard(const AutoCastGuard& guard) = delete;
AutoCastGuard& operator=(const AutoCastGuard& guard) = delete;
private:
paddle::imperative::AmpLevel pre_amp_level_;
};
NameTensorMap AutoCastInputs(const std::string& op_type,
const NameTensorMap& ins);
NameTensorMap CastPureFp16Inputs(const std::string& op_type,
const NameTensorMap& ins);
} // namespace legacy
} // namespace egr
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/eager/legacy/type_def.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/framework/variable.h"
namespace egr {
namespace legacy {
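// EagerExecutionContext adapts eager-mode NameTensorMap inputs/outputs and the
// attribute/default-attribute maps to the paddle::framework::ExecutionContext
// interface, so existing operator kernels can query names, attributes and
// Variables of eager tensors through the usual framework API.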
class EagerExecutionContext : public paddle::framework::ExecutionContext {
using Variable = paddle::framework::Variable;
public:
EagerExecutionContext(const paddle::framework::OperatorBase& op,
const paddle::framework::Scope& scope,
const paddle::platform::DeviceContext& device_context,
const paddle::framework::RuntimeContext& ctx,
const NameTensorMap& tensor_map_in,
const NameTensorMap& tensor_map_out,
const paddle::framework::AttributeMap& attrs,
const paddle::framework::AttributeMap& default_attrs)
: ExecutionContext(op, scope, device_context, ctx),
tensor_map_in_(tensor_map_in),
tensor_map_out_(tensor_map_out),
attrs_(attrs),
default_attrs_(default_attrs) {}
std::string InputName(const std::string& name) const override {
auto it = tensor_map_in_.find(name);
PADDLE_ENFORCE_NE(it, tensor_map_in_.end(),
paddle::platform::errors::PreconditionNotMet(
"Can not find [%s] in Input", name));
// TODO(jiabin): This is used for egr::EagerTensor temporarily,
// once we have name, remove it.
return it->second[0] ? it->second[0]->name()
: paddle::framework::kEmptyVarName;
}
std::vector<std::string> InputNames(const std::string& name) const override {
auto it = tensor_map_in_.find(name);
PADDLE_ENFORCE_NE(
it, tensor_map_in_.end(),
paddle::platform::errors::NotFound("Can not find [%s] in Input", name));
std::vector<std::string> vec_res;
vec_res.reserve(it->second.size());
for (size_t i = 0; i < it->second.size(); ++i) {
if (it->second[i]) {
// TODO(jiabin): This is used for egr::EagerTensor
// temporarily, once we have name, remove it.
vec_res.push_back(it->second[i]->name());
} else {
vec_res.push_back(paddle::framework::kEmptyVarName);
}
}
return vec_res;
}
std::string OutputName(const std::string& name) const override {
auto it = tensor_map_out_.find(name);
PADDLE_ENFORCE_NE(it, tensor_map_out_.end(),
paddle::platform::errors::NotFound(
"Can not find [%s] in Output", name));
return it->second[0] ? it->second[0]->name()
: paddle::framework::kEmptyVarName;
}
std::vector<std::string> OutputNames(const std::string& name) const override {
auto it = tensor_map_out_.find(name);
PADDLE_ENFORCE_NE(it, tensor_map_out_.end(),
paddle::platform::errors::NotFound(
"Can not find [%s] in Output", name));
std::vector<std::string> vec_res;
vec_res.reserve(it->second.size());
for (size_t i = 0; i < it->second.size(); ++i) {
if (it->second[i]) {
vec_res.push_back(it->second[i]->name());
} else {
vec_res.push_back(paddle::framework::kEmptyVarName);
}
}
return vec_res;
}
bool HasAttr(const std::string& name) const override {
return attrs_.count(name) != 0 || default_attrs_.count(name) != 0;
}
const paddle::framework::AttributeMap& Attrs() const override {
return attrs_;
}
const paddle::framework::Attribute& GetAttr(
const std::string& name) const override {
auto it = attrs_.find(name);
if (it == attrs_.end()) {
it = default_attrs_.find(name);
if (it == default_attrs_.end()) {
PADDLE_THROW(paddle::platform::errors::NotFound(
"Can not find [%s] in attributes of op %s.", name,
this->GetOp().Type()));
}
}
return it->second;
}
std::vector<std::string> InNameList() const override {
std::vector<std::string> vec_temp;
vec_temp.reserve(tensor_map_in_.size());
for (auto& v : tensor_map_in_) {
vec_temp.push_back(v.first);
}
return vec_temp;
}
bool HasInput(const std::string& name) const override {
auto it = tensor_map_in_.find(name);
return (it != tensor_map_in_.end() && it->second.size() > 0);
}
bool HasOutput(const std::string& name) const override {
auto it = tensor_map_out_.find(name);
return (it != tensor_map_out_.end() && it->second.size() > 0);
}
size_t InputSize(const std::string& name) const override {
return InputNames(name).size();
}
size_t OutputSize(const std::string& name) const override {
return OutputNames(name).size();
}
const Variable* InputVar(const std::string& name) const override {
auto it = tensor_map_in_.find(name);
if (it == tensor_map_in_.end()) {
return nullptr;
}
return it->second.empty() || it->second[0] == nullptr
? nullptr
: it->second[0]->MutableVar();
}
Variable* OutputVar(const std::string& name) const override {
auto it = tensor_map_out_.find(name);
if (it == tensor_map_out_.end()) {
return nullptr;
}
return it->second.empty() || it->second[0] == nullptr
? nullptr
: it->second[0]->MutableVar();
}
const std::vector<Variable*> MultiInputVar(
const std::string& name) const override {
auto it = tensor_map_in_.find(name);
if (it == tensor_map_in_.end()) {
return {};
}
std::vector<Variable*> vec_res;
vec_res.reserve(it->second.size());
for (size_t i = 0; i < it->second.size(); ++i) {
vec_res.push_back(it->second[i] ? it->second[i]->MutableVar() : nullptr);
}
return vec_res;
}
std::vector<Variable*> MultiOutputVar(
const std::string& name) const override {
auto it = tensor_map_out_.find(name);
if (it == tensor_map_out_.end()) {
return {};
}
std::vector<Variable*> vec_res;
vec_res.reserve(it->second.size());
for (size_t i = 0; i < it->second.size(); ++i) {
vec_res.push_back(it->second[i] ? it->second[i]->MutableVar() : nullptr);
}
return vec_res;
}
private:
const NameTensorMap& tensor_map_in_;
const NameTensorMap& tensor_map_out_;
const paddle::framework::AttributeMap& attrs_;
const paddle::framework::AttributeMap& default_attrs_;
};
} // namespace legacy
} // namespace egr
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/eager/legacy/type_def.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/framework/var_type.h"
namespace egr {
namespace legacy {
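// EagerInferShapeContext exposes eager-mode NameTensorMap inputs/outputs
// through the paddle::framework::InferShapeContext interface, so operators'
// InferShape functions can run on eager tensors at trace time.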
class EagerInferShapeContext : public paddle::framework::InferShapeContext {
using DDim = paddle::framework::DDim;
public:
EagerInferShapeContext(
const NameTensorMap* in, const NameTensorMap* out,
const paddle::framework::AttributeMap* attr,
const paddle::framework::AttributeMap* default_attr,
const std::string op_type,
const paddle::framework::OpKernelType* op_kernel_type = nullptr)
: tensor_in_(in),
tensor_out_(out),
attrs_(attr),
default_attrs_(default_attr),
op_type_(op_type),
op_kernel_type_(op_kernel_type) {}
bool HasInput(const std::string& name) const override {
// has only one input
auto it = tensor_in_->find(name);
if (it == tensor_in_->end()) {
return false;
}
const auto& in = it->second;
if (in.size() == 0) return false;
PADDLE_ENFORCE_EQ(
in.size(), 1UL,
paddle::platform::errors::PreconditionNotMet(
"Input %s should not have more than one inputs", name));
return in[0] != nullptr;
}
bool HasOutput(const std::string& name) const override {
// has only one output
auto it = tensor_out_->find(name);
if (it == tensor_out_->end()) {
return false;
}
const auto& out = it->second;
if (out.size() == 0) {
return false;
}
PADDLE_ENFORCE_EQ(
out.size(), 1UL,
paddle::platform::errors::PreconditionNotMet(
"Output %s should not have more than one outputs", name));
return out[0] != nullptr;
}
bool HasInputs(const std::string& name) const override {
auto it = tensor_in_->find(name);
if (it == tensor_in_->end() || it->second.empty()) {
return false;
}
for (auto& input : it->second) {
if (input == nullptr) {
return false;
}
}
return true;
}
bool HasOutputs(const std::string& name) const override {
auto it = tensor_out_->find(name);
if (it == tensor_out_->end() || it->second.empty()) {
return false;
}
for (auto& output : it->second) {
if (output == nullptr) {
return false;
}
}
return true;
}
paddle::framework::AttrReader Attrs() const override {
return paddle::framework::AttrReader(*attrs_, *default_attrs_);
}
std::vector<std::string> Inputs(const std::string& name) const override {
std::vector<std::string> vec_res;
auto it = tensor_in_->find(name);
PADDLE_ENFORCE_NE(
it, tensor_in_->end(),
paddle::platform::errors::NotFound("can not find [%s] in input", name));
vec_res.reserve(it->second.size());
for (auto& var : it->second) {
if (var) {
vec_res.push_back(var->name());
} else {
vec_res.push_back(paddle::framework::kEmptyVarName);
}
}
return vec_res;
}
std::vector<std::string> Outputs(const std::string& name) const override {
std::vector<std::string> vec_res;
auto it = tensor_out_->find(name);
PADDLE_ENFORCE_NE(it, tensor_out_->end(),
paddle::platform::errors::NotFound(
"can not find [%s] in output", name));
vec_res.reserve(it->second.size());
for (auto& var : it->second) {
if (var) {
vec_res.push_back(var->name());
} else {
vec_res.push_back(paddle::framework::kEmptyVarName);
}
}
return vec_res;
}
std::string GetInputNameByIdx(size_t idx) const override {
auto& op_proto =
paddle::framework::OpInfoMap::Instance().Get(op_type_).proto_;
PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(),
paddle::platform::errors::OutOfRange(
"The index should be less than the size of inputs of "
"operator %s, but got index is %d and size is %d",
op_type_, idx, op_proto->inputs().size()));
return op_proto->inputs()[idx].name();
}
std::string GetOutputNameByIdx(size_t idx) const override {
auto& op_proto =
paddle::framework::OpInfoMap::Instance().Get(op_type_).proto_;
PADDLE_ENFORCE_LT(
idx, op_proto->outputs().size(),
paddle::platform::errors::OutOfRange(
"The index should be less than the size of outputs of "
"operator %s, but got index is %d and size is %d",
op_type_, idx, op_proto->outputs().size()));
return op_proto->outputs()[idx].name();
}
void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
size_t j = 0) override {
auto in_it = tensor_in_->find(in);
auto out_it = tensor_out_->find(out);
PADDLE_ENFORCE_NE(
in_it, tensor_in_->end(),
paddle::platform::errors::NotFound("can not found [%s] in input", in));
PADDLE_ENFORCE_GT(in_it->second.size(), i,
paddle::platform::errors::PreconditionNotMet(
"Inputs %s should have %llu argument", in, i));
PADDLE_ENFORCE_NE(
out_it, tensor_out_->end(),
paddle::platform::errors::NotFound("can not found [%s] in input", in));
PADDLE_ENFORCE_GT(out_it->second.size(), j,
paddle::platform::errors::PreconditionNotMet(
"Outputs %s should have %llu argument", out, j));
paddle::framework::Variable* in_var = in_it->second[i]->MutableVar();
paddle::framework::Variable* out_var = out_it->second[j]->MutableVar();
PADDLE_ENFORCE_EQ(in_var->Type(), out_var->Type(),
paddle::platform::errors::PreconditionNotMet(
"The type of %s and %s is not the same.", in, out));
if (in_var->IsType<paddle::framework::LoDTensor>()) {
auto& in_lod_tensor = in_var->Get<paddle::framework::LoDTensor>();
auto* out_lod_tensor =
out_var->GetMutable<paddle::framework::LoDTensor>();
out_lod_tensor->Resize(in_lod_tensor.dims());
} else {
auto& in_sele_rows = in_var->Get<pten::SelectedRows>();
auto out_sele_rows = out_var->GetMutable<pten::SelectedRows>();
out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims());
out_sele_rows->set_rows(in_sele_rows.rows());
out_sele_rows->set_height(in_sele_rows.height());
}
}
void ShareAllLoD(const std::string& in,
const std::string& out) const override {
// do nothing
}
void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
size_t j = 0) const override {
// do nothing
}
bool IsRuntime() const override { return true; }
bool IsRunMKLDNNKernel() const override {
return (op_kernel_type_ && (op_kernel_type_->data_layout_ ==
paddle::framework::DataLayout::kMKLDNN));
}
std::vector<paddle::framework::InferShapeVarPtr> GetInputVarPtrs(
const std::string& name) const override {
std::vector<paddle::framework::InferShapeVarPtr> res;
auto it = tensor_in_->find(name);
PADDLE_ENFORCE_NE(it, tensor_in_->end(),
paddle::platform::errors::NotFound(
"Can not find [%s] in inputs.", name));
for (auto& tensor : it->second) {
res.emplace_back(tensor->MutableVar());
}
return res;
}
std::vector<paddle::framework::InferShapeVarPtr> GetOutputVarPtrs(
const std::string& name) const override {
std::vector<paddle::framework::InferShapeVarPtr> res;
auto it = tensor_out_->find(name);
PADDLE_ENFORCE_NE(it, tensor_out_->end(),
paddle::platform::errors::NotFound(
"Can not find [%s] in outputs.", name));
for (auto& tensor : it->second) {
res.emplace_back(tensor->MutableVar());
}
return res;
}
DDim GetInputDim(const std::string& name) const override {
auto it = tensor_in_->find(name);
PADDLE_ENFORCE_NE(
it, tensor_in_->end(),
paddle::platform::errors::NotFound("can not find [%s] in input", name));
PADDLE_ENFORCE_EQ(
it->second.size(), 1UL,
paddle::platform::errors::PreconditionNotMet(
"Input(%s) should hold one element, but now it holds %d", name,
it->second.size()));
return this->GetDim(it->second[0]->MutableVar());
}
std::vector<DDim> GetInputsDim(const std::string& name) const override {
// const std::vector<Variable*>& vars = InputVars(name);
std::vector<DDim> vec_res;
auto it = tensor_in_->find(name);
PADDLE_ENFORCE_NE(it, tensor_in_->end(),
paddle::platform::errors::NotFound(
"can not find [%s] in output", name));
vec_res.reserve(it->second.size());
for (size_t i = 0; i < it->second.size(); ++i) {
if (it->second[i]) {
vec_res.emplace_back(GetDim(it->second[i]->MutableVar()));
} else {
vec_res.emplace_back();
}
}
return vec_res;
}
std::vector<paddle::framework::proto::VarType::Type> GetInputsVarType(
const std::string& name) const override {
std::vector<paddle::framework::proto::VarType::Type> vec_res;
auto it = tensor_in_->find(name);
PADDLE_ENFORCE_NE(
it, tensor_in_->end(),
paddle::platform::errors::NotFound("can not find [%s] in input", name));
vec_res.reserve(it->second.size());
for (size_t i = 0; i < it->second.size(); ++i) {
if (it->second[i]) {
vec_res.emplace_back(
paddle::framework::ToVarType(it->second[i]->MutableVar()->Type()));
} else {
vec_res.emplace_back();
}
}
return vec_res;
}
std::vector<paddle::framework::proto::VarType::Type> GetOutputsVarType(
const std::string& name) const override {
std::vector<paddle::framework::proto::VarType::Type> vec_res;
auto it = tensor_out_->find(name);
PADDLE_ENFORCE_NE(it, tensor_out_->end(),
paddle::platform::errors::NotFound(
"can not find [%s] in output", name));
vec_res.reserve(it->second.size());
for (size_t i = 0; i < it->second.size(); ++i) {
if (it->second[i]) {
vec_res.emplace_back(
paddle::framework::ToVarType(it->second[i]->MutableVar()->Type()));
} else {
vec_res.emplace_back(
static_cast<paddle::framework::proto::VarType::Type>(-1));
}
}
return vec_res;
}
void SetOutputDim(const std::string& name, const DDim& dim) override {
auto it = tensor_out_->find(name);
PADDLE_ENFORCE_NE(it, tensor_out_->end(),
paddle::platform::errors::NotFound(
"can not find [%s] in output", name));
if (it->second[0]) {
SetDim(it->second[0]->MutableVar(), dim);
}
}
void SetOutputsDim(const std::string& name,
const std::vector<DDim>& dims) override {
auto it = tensor_out_->find(name);
PADDLE_ENFORCE_NE(it, tensor_out_->end(),
paddle::platform::errors::NotFound(
"can not find [%s] in output", name));
PADDLE_ENFORCE_EQ(dims.size(), it->second.size(),
paddle::platform::errors::InvalidArgument(
"The number of dims is expected to be equal to the "
"number of Outputs(%s). But receieved: the number of "
"dims = %d, the number of Outputs(%s) = %d.",
name, dims.size(), name, it->second.size()));
for (size_t i = 0; i < dims.size(); ++i) {
if (it->second[i]) {
SetDim(it->second[i]->MutableVar(), dims[i]);
}
}
}
int32_t GetLoDLevel(const std::string& in, size_t i = 0) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"GetLoDLevel function not support in dygraph mode"));
}
void SetLoDLevel(const std::string& out, int32_t lod_level,
size_t j = 0) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"SetLoDLevel function not support in dygraph mode"));
}
protected:
DDim GetDim(paddle::framework::Variable* var) const {
PADDLE_ENFORCE_NOT_NULL(var, paddle::platform::errors::PreconditionNotMet(
"Input variable should not be null"));
if (var->IsType<paddle::framework::LoDTensor>()) {
return var->Get<paddle::framework::LoDTensor>().dims();
} else if (var->IsType<pten::SelectedRows>()) {
return var->Get<pten::SelectedRows>().GetCompleteDims();
} else {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Only LoDTensor/SelectedRows support 'GetDim', but Variables "
"type_id is xx."));
}
}
std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"GetRepeatedDims not support in dygraph runtime"));
}
void SetDim(paddle::framework::Variable* var, const DDim& dim) {
if (var->IsType<paddle::framework::LoDTensor>()) {
var->GetMutable<paddle::framework::LoDTensor>()->Resize(dim);
} else if (var->IsType<pten::SelectedRows>()) {
var->GetMutable<pten::SelectedRows>()->set_height(dim[0]);
} else {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Variable type_id %s, expect LoDTensor/SelectedRows."));
}
}
void SetDims(const std::vector<paddle::framework::Variable*>& vars,
const std::vector<DDim>& dims) {
size_t length = vars.size();
PADDLE_ENFORCE_EQ(
length, dims.size(),
paddle::platform::errors::PreconditionNotMet(
"Vars number [%d] should be equal with dims number [%d]", length,
dims.size()));
for (size_t i = 0; i < length; ++i) {
if (vars[i] == nullptr) {
continue;
}
SetDim(vars[i], dims[i]);
}
}
void SetRepeatedDims(const std::string& name,
const std::vector<DDim>& dims) override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"SetRepeatedDims not support in dygraph runtime"));
}
private:
const NameTensorMap* tensor_in_;
const NameTensorMap* tensor_out_;
const paddle::framework::AttributeMap* attrs_;
const paddle::framework::AttributeMap* default_attrs_;
const std::string op_type_;
const paddle::framework::OpKernelType* op_kernel_type_;
};
} // namespace legacy
} // namespace egr
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/eager/legacy/tensor_helper.h"
#include "paddle/fluid/eager/legacy/type_def.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/framework/var_type_inference.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/pten/api/all.h"
namespace egr {
namespace legacy {
// infer var type context for imperative mode
class TensorRuntimeInferVarTypeContext
: public paddle::framework::InferVarTypeContext {
public:
TensorRuntimeInferVarTypeContext(
const NameTensorMap& inputs, const NameTensorMap& outputs,
const paddle::framework::AttributeMap& attrs_map,
const paddle::framework::AttributeMap& default_attrs_map)
: InferVarTypeContext(nullptr, nullptr),
inputs_(inputs),
outputs_(outputs),
attrs_(attrs_map),
default_attrs_(default_attrs_map) {}
virtual ~TensorRuntimeInferVarTypeContext() {}
paddle::framework::Attribute GetAttr(const std::string& name) const override {
auto it = attrs_.find(name);
if (it == attrs_.end()) {
it = default_attrs_.find(name);
if (it == default_attrs_.end()) {
PADDLE_THROW(paddle::platform::errors::NotFound(
"Can not find [%s] in attributes.", name));
}
}
return it->second;
}
bool HasInput(const std::string& name) const override {
auto it = inputs_.find(name);
return (it != inputs_.end() && it->second.size() > 0);
}
bool HasOutput(const std::string& name) const override {
auto it = outputs_.find(name);
return (it != outputs_.end() && it->second.size() > 0);
}
size_t InputSize(const std::string& name) const {
return inputs_.at(name).size();
}
const std::string& InputVarName(const std::string& name,
const int index = 0) const {
// TODO(jiabin): Support this usage inputs_.at(name)[index]->Name()
auto it = inputs_.find(name);
PADDLE_ENFORCE_NE(it, inputs_.end(),
paddle::platform::errors::PreconditionNotMet(
"Can not find [%s] in Input", name));
return inputs_.at(name)[index]->name();
}
bool InputTypeAnyOf(
const std::string& name,
paddle::framework::proto::VarType::Type type) const override {
auto& inputs = inputs_.at(name);
return std::any_of(
inputs.begin(), inputs.end(),
[&type](const std::shared_ptr<egr::EagerTensor>& var) {
return paddle::framework::ToVarType(var->Var().Type()) == type;
});
}
bool InputTypeAllOf(
const std::string& name,
paddle::framework::proto::VarType::Type type) const override {
auto& inputs = inputs_.at(name);
return std::all_of(
inputs.begin(), inputs.end(),
[&type](const std::shared_ptr<egr::EagerTensor>& var) {
return paddle::framework::ToVarType(var->Var().Type()) == type;
});
}
void SyncTypeAndDataType(const std::string& input_name,
const std::string& output_name,
int index = 0) override {
auto in_tensor = inputs_.at(input_name)[index];
auto out_tensor = outputs_.at(output_name)[index];
if (in_tensor != out_tensor) {
this->SetTensorType(
out_tensor, paddle::framework::ToVarType(in_tensor->Var().Type()));
}
}
void SetOutputType(const std::string& name,
paddle::framework::proto::VarType::Type type,
int index = 0) override {
if (index == paddle::framework::ALL_ELEMENTS) {
for (auto& item : outputs_.at(name)) {
this->SetTensorType(item, type);
}
} else {
auto& var = outputs_.at(name)[index];
this->SetTensorType(var, type);
}
}
void SetTensorType(std::shared_ptr<egr::EagerTensor> out,
paddle::framework::proto::VarType::Type type) {
switch (type) {
case paddle::framework::proto::VarType::LOD_TENSOR: {
out->MutableVar()->GetMutable<paddle::framework::LoDTensor>();
break;
}
case paddle::framework::proto::VarType::SELECTED_ROWS: {
out->MutableVar()->GetMutable<pten::SelectedRows>();
break;
}
default: {
PADDLE_THROW(paddle::platform::errors::NotFound(
"Cannot found var type: %s while running runtime InferVarType",
paddle::framework::ToTypeName(type)));
}
}
}
paddle::framework::proto::VarType::Type GetInputType(
const std::string& name, const int& index = 0) const override {
return paddle::framework::ToVarType(inputs_.at(name)[index]->Var().Type());
}
paddle::framework::proto::VarType::Type GetOutputType(
const std::string& name, const int& index = 0) const override {
// TODO(jiabin): Support SelectedRows when we have it.
return paddle::framework::proto::VarType::LOD_TENSOR;
}
paddle::framework::proto::VarType::Type GetInputDataType(
const std::string& name, const int& index = 0) const override {
return inputs_.at(name)[index]
->Var()
.Get<paddle::framework::LoDTensor>()
.type();
}
void SetOutputDataType(const std::string& name,
paddle::framework::proto::VarType::Type type,
int index = 0) override {
// TODO(jiabin): It seems doesn't make sense to set data_type in EagerMode.
}
bool IsDygraph() const override { return true; }
protected:
bool HasVar(const std::string& name) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"HasVar is not supported in runtime InferVarType"));
}
const std::vector<std::string>& InputVars(
const std::string& name) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"InputVars is not supported in runtime InferVarType"));
}
const std::vector<std::string>& OutputVars(
const std::string& name) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"OutputVars is not supported in runtime InferVarType"));
}
paddle::framework::proto::VarType::Type GetVarType(
const std::string& name) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Do not manipulate var in runtime InferVarType"));
}
void SetVarType(const std::string& name,
paddle::framework::proto::VarType::Type type) override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Do not manipulate var in runtime InferVarType"));
}
paddle::framework::proto::VarType::Type GetVarDataType(
const std::string& name) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Do not manipulate var in runtime InferVarType"));
}
void SetVarDataType(const std::string& name,
paddle::framework::proto::VarType::Type type) override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Do not manipulate var in runtime InferVarType"));
}
std::vector<paddle::framework::proto::VarType::Type> GetVarDataTypes(
const std::string& name) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"GetVarDataTypes is not supported in runtime InferVarType"));
}
void SetVarDataTypes(
const std::string& name,
const std::vector<paddle::framework::proto::VarType::Type>&
multiple_data_type) override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"SetVarDataTypes is not supported in runtime InferVarType"));
}
std::vector<int64_t> GetVarShape(const std::string& name) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Do not handle Shape in runtime InferVarType"));
}
void SetVarShape(const std::string& name,
const std::vector<int64_t>& dims) override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Do not handle Shape in runtime InferVarType"));
}
int32_t GetVarLoDLevel(const std::string& name) const override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Do not handle LoDLevel in runtime InferVarType"));
}
void SetVarLoDLevel(const std::string& name, int32_t lod_level) override {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Do not handle LoDLevel in runtime InferVarType"));
}
private:
const NameTensorMap& inputs_;
const NameTensorMap& outputs_;
const paddle::framework::AttributeMap& attrs_;
const paddle::framework::AttributeMap& default_attrs_;
};
} // namespace legacy
} // namespace egr
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/eager/legacy/op_runner.h"
#include <map>
#include <set>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/eager/legacy/amp_auto_cast.h"
#include "paddle/fluid/eager/legacy/infer_var_type_context.h"
#include "paddle/fluid/eager/legacy/prepared_operator.h"
#include "paddle/fluid/eager/legacy/tensor_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/denormal.h"
#include "paddle/fluid/string/string_helper.h"
DECLARE_bool(use_mkldnn);
DECLARE_string(tracer_mkldnn_ops_on);
DECLARE_string(tracer_mkldnn_ops_off);
namespace egr {
namespace legacy {
void OpRunImpl(const paddle::framework::OperatorBase& op,
const NameTensorMap& ins, const NameTensorMap& outs,
const paddle::framework::AttributeMap& attrs,
const paddle::framework::AttributeMap& default_attrs,
const paddle::platform::Place& place) {
VLOG(6) << "Get Opertor With Kernel";
auto* op_kernel =
dynamic_cast<const paddle::framework::OperatorWithKernel*>(&op);
PADDLE_ENFORCE_NOT_NULL(
op_kernel, paddle::platform::errors::PermissionDenied(
"Only support operator with kernel in Dygraph mode."));
auto& info = op.Info();
if (info.infer_var_type_) {
VLOG(6) << "Run InferVarType";
egr::legacy::TensorRuntimeInferVarTypeContext infer_var_type_ctx(
ins, outs, attrs, default_attrs);
VLOG(9) << "Actual Run InferVarType";
info.infer_var_type_(&infer_var_type_ctx);
}
VLOG(6) << "Initialize output tensor";
// Initialize output tensor
for (auto& tensor_pair : outs) {
for (auto& tensor : tensor_pair.second) {
if (tensor && tensor.get() && (!tensor->Var().IsInitialized())) {
InitializeVariable(tensor->MutableVar(),
paddle::framework::proto::VarType::LOD_TENSOR);
}
}
}
/**
* [ Why need temporary inputs here? ]
*
* PrepareData should not change the original input tensor in place.
* Suppose the user defines an int tensor and feeds it to an op whose
* overridden GetExpectedKernelForVar converts this tensor to float type
* during execution. After the dynamic graph has executed, the user-defined
* variable would be lost: the user could no longer get the originally
* defined int tensor, because it had been converted to float. This should
* be regarded as a bug in certain usage scenarios.
*
* In static graph mode, when an op is executed, a temporary scope
* `transfer_scope` is created before PrepareData; the transformed data is
* stored in that temporary scope and discarded after the op finishes. In
* the previous dynamic graph implementation, however, the original input
* was overwritten directly.
*/
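// A minimal sketch of the hazard described above (illustrative pseudo-code
// only; the tensor and op names here are hypothetical, not part of this file):
//
//   Tensor x = CreateIntTensor();                    // user-defined INT32 tensor
//   RunOp("op_that_casts_to_float", /*ins=*/{{"X", {x}}}, ...);
//   // expected: x is still INT32 afterwards; with an in-place overwrite
//   // in PrepareData, x would silently become FLOAT32.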
VLOG(6) << "Prepare Op";
auto prepared_op = egr::legacy::PreparedOp::Prepare(
ins, outs, *op_kernel, place, attrs, default_attrs);
VLOG(6) << "Prepare Data";
auto tmp_ins_ptr =
egr::legacy::PrepareData(*op_kernel, ins, prepared_op.kernel_type());
VLOG(6) << "Run Prepared Op";
if (tmp_ins_ptr == nullptr) {
prepared_op.Run(ins, outs, attrs, default_attrs);
} else {
prepared_op.Run(*tmp_ins_ptr, outs, attrs, default_attrs);
}
VLOG(6) << "Run Prepared Op end";
// TODO(jiabin): Set the output var's grad Forward DataType
}
void RunOp(const std::string& type, const NameTensorMap& ins,
const NameTensorMap& outs, paddle::framework::AttributeMap attrs,
const paddle::platform::Place& place,
paddle::framework::AttributeMap* default_attrs,
bool override_default_attr_map,
const std::map<std::string, std::string>& inplace_map) {
VLOG(1) << "Run Op: " << type;
if (FLAGS_use_mkldnn) {
// if both lists are empty, all ops are enabled (the default for
// FLAGS_use_mkldnn=1)
// if the ops_on list is not empty, only ops from that list are enabled
if (!FLAGS_tracer_mkldnn_ops_on.empty()) {
auto is_on = FLAGS_tracer_mkldnn_ops_on.find(type) != std::string::npos;
attrs["use_mkldnn"] = is_on;
} else {
// if the ops_on list is empty, all ops are enabled except the types in the off list
auto is_off = FLAGS_tracer_mkldnn_ops_off.find(type) != std::string::npos;
attrs["use_mkldnn"] = !is_off;
}
}
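// Worked example of the two lists above (the flag values are hypothetical):
//   FLAGS_tracer_mkldnn_ops_on  = "conv2d,relu"  -> only conv2d and relu get
//                                                   attrs["use_mkldnn"] = true
//   FLAGS_tracer_mkldnn_ops_on  = ""  together with
//   FLAGS_tracer_mkldnn_ops_off = "matmul_v2"    -> every op except matmul_v2
//                                                   gets attrs["use_mkldnn"] = true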
auto op = paddle::framework::OpRegistry::CreateOp(type, {}, {}, {}, false);
PADDLE_ENFORCE_NOT_NULL(default_attrs,
paddle::platform::errors::PermissionDenied(
"Detected default_attrs = nullptr."));
if (override_default_attr_map) {
const auto& op_info = op->Info();
auto* attr_checker = op_info.Checker();
if (attr_checker) {
attr_checker->Check(&attrs, true, /*only_check_exist_value=*/true);
}
static paddle::framework::AttributeMap empty_attrs_map = {};
*default_attrs = attr_checker == nullptr
? empty_attrs_map
: attr_checker->GetDefaultAttrMap();
}
auto amp_level = egr::Controller::Instance().GetAMPLevel();
VLOG(6) << "Check AMP status";
NameTensorMap new_ins = ins;
if (amp_level == paddle::imperative::AmpLevel::O1) {
VLOG(5) << "Auto mixed precision run operator: " << type;
new_ins = AutoCastInputs(type, ins);
} else if (amp_level == paddle::imperative::AmpLevel::O2) {
VLOG(5) << "Pure fp16 run operator: " << type;
new_ins = CastPureFp16Inputs(type, ins);
}
try {
VLOG(6) << "Get Device id";
if (paddle::platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::platform::SetDeviceId(place.device);
#else
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with GPU if use CUDAPlace."));
#endif
} else if (paddle::platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
paddle::platform::SetXPUDeviceId(place.device);
#else
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with XPU if use XPUPlace."));
#endif
} else if (paddle::platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
paddle::platform::SetNPUDeviceId(place.device);
#else
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU if use NPUPlace."));
#endif
}
VLOG(6) << "Step in OpRunImpl";
OpRunImpl(*op, new_ins, outs, attrs, *default_attrs, place);
} catch (paddle::platform::EnforceNotMet& exception) {
paddle::framework::AppendErrorOpHint(type, &exception);
throw std::move(exception);
} catch (std::exception& ex) {
PADDLE_THROW(paddle::platform::errors::Fatal(
"Operator %s raises an %s exception.\n"
"The exception content is\n:%s.",
type, paddle::platform::demangle(typeid(ex).name()), ex.what()));
} catch (...) {
// NOTE: this branch represents a very serious bug with
// low probability of occurrence, and we can't get its
// exception content here.
PADDLE_THROW(paddle::platform::errors::Fatal(
"Operator %s raises an unknown exception.", type));
}
VLOG(6) << "Finish Run Op";
// TODO(jiabin): Support this later
// if (enable_program_desc_tracing_) {
// VLOG(5) << "Trace op " << type << " into ProgramDesc";
// program_desc_tracer_->InsertOp(type, new_ins, outs, attrs);
// }
}
} // namespace legacy
} // namespace egr
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/eager/legacy/prepared_operator.h"
#include "paddle/fluid/imperative/prepared_operator.h"
#include "paddle/fluid/eager/legacy/infer_shape_context.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/pten_utils.h"
#include "paddle/utils/small_vector.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
#endif
DECLARE_bool(check_nan_inf);
DECLARE_bool(run_pten_kernel);
namespace egr {
namespace legacy {
const paddle::framework::Tensor* GetTensorFromVar(
const paddle::framework::Variable& var) {
if (var.IsType<paddle::framework::LoDTensor>()) {
return &(var.Get<paddle::framework::LoDTensor>());
} else if (var.IsType<pten::SelectedRows>()) {
return &(var.Get<pten::SelectedRows>().value());
} else {
return nullptr;
}
}
static const paddle::framework::Attribute& GetAttr(
const paddle::framework::AttributeMap& attrs,
const paddle::framework::AttributeMap& default_attrs,
const std::string& name) {
auto it = attrs.find(name);
bool found = it != attrs.end();
if (!found) {
it = default_attrs.find(name);
found = it != default_attrs.end();
}
PADDLE_ENFORCE_EQ(found, true,
paddle::platform::errors::NotFound(
"(%s) is not found in AttributeMap.", name));
return it->second;
}
static void HandleComplexGradToRealGrad(const NameTensorMap& outs) {
// TODO(jiabin): Support complex forward datatype later.
}
PreparedOp::PreparedOp(
const paddle::framework::OperatorBase& op,
const paddle::framework::RuntimeContext& ctx,
const paddle::framework::OpKernelType& kernel_type,
const paddle::framework::OperatorWithKernel::OpKernelFunc& func,
paddle::platform::DeviceContext* dev_ctx)
: op_(op),
ctx_(ctx),
kernel_type_(kernel_type),
func_(func),
dev_ctx_(dev_ctx) {}
PreparedOp::PreparedOp(
const paddle::framework::OperatorBase& op,
const paddle::framework::RuntimeContext& ctx,
const paddle::framework::OpKernelType& kernel_type,
const paddle::framework::KernelSignature& kernel_signature,
const pten::Kernel& pt_kernel, paddle::platform::DeviceContext* dev_ctx)
: op_(op),
ctx_(ctx),
kernel_type_(kernel_type),
func_(nullptr),
dev_ctx_(dev_ctx),
run_pten_kernel_(true),
pt_kernel_signature_(kernel_signature),
pt_kernel_(pt_kernel) {}
PreparedOp PrepareImpl(const NameTensorMap& ins, const NameTensorMap& outs,
const paddle::framework::OperatorWithKernel& op,
const paddle::platform::Place& place,
const paddle::framework::AttributeMap& attrs,
const paddle::framework::AttributeMap& default_attrs) {
VLOG(6) << "Preparing an Op";
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
paddle::framework::RuntimeContext ctx({}, {});
#ifdef PADDLE_WITH_MKLDNN
// The MKLDNN variant of the code reads attributes in some of the
// GetKernelTypeForVar and GetKernelType functions, so we need to copy the
// attributes there. The const qualifier of Attrs() had to be discarded to
// overwrite them.
if (FLAGS_use_mkldnn) {
auto& mutable_op_attrs =
const_cast<paddle::framework::AttributeMap&>(op.Attrs());
mutable_op_attrs = default_attrs;
for (auto& attr : attrs) {
mutable_op_attrs[attr.first] = attr.second;
}
}
#endif
// 1. get expected kernel key
auto dygraph_exe_ctx = egr::legacy::EagerExecutionContext(
op, paddle::framework::Scope(), *dev_ctx, ctx, ins, outs, attrs,
default_attrs);
auto expected_kernel_key = op.GetExpectedKernelType(dygraph_exe_ctx);
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
// fit for pten
pten::KernelSignature pt_kernel_signature;
pten::KernelKey pt_kernel_key;
std::string pt_kernel_name;
if (pten::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) {
pt_kernel_signature = op.GetExpectedPtenKernelArgs(dygraph_exe_ctx);
VLOG(6) << pt_kernel_signature;
pt_kernel_name = pt_kernel_signature.name;
pt_kernel_key = TransOpKernelTypeToPtenKernelKey(expected_kernel_key);
auto pt_kernel = pten::KernelFactory::Instance().SelectKernel(
pt_kernel_name, pt_kernel_key);
if (pt_kernel.IsValid()) {
VLOG(6) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name
<< " | kernel key: " << pt_kernel_key
<< " | kernel: " << pt_kernel;
// TODO(chenweihang): using CPUKernel when miss device kernel case
return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature,
pt_kernel, dev_ctx);
} else {
VLOG(6) << "Dynamic mode ChoosePtenKernel - kernel `" << pt_kernel_name
<< "` not found.";
}
}
// 2. check if op[type] has kernel registered.
auto& all_op_kernels = op.AllOpKernels();
auto kernels_iter = all_op_kernels.find(op.Type());
if (kernels_iter == all_op_kernels.end() ||
kernels_iter->second.find(expected_kernel_key) ==
kernels_iter->second.end()
#ifdef PADDLE_WITH_XPU
||
paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
!paddle::platform::is_xpu_support_op(op.Type(),
expected_kernel_key) ||
paddle::platform::is_in_xpu_black_list(op.Type())
#endif
) {
if (pten::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) {
auto pt_cpu_kernel_key =
FallBackToCpu(expected_kernel_key, pt_kernel_key, op);
auto pt_cpu_kernel = pten::KernelFactory::Instance().SelectKernel(
pt_kernel_name, pt_cpu_kernel_key);
if (pt_cpu_kernel.IsValid()) {
VLOG(6) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name
<< " | kernel key: " << pt_cpu_kernel_key
<< " | kernel: " << pt_cpu_kernel;
return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature,
pt_cpu_kernel, dev_ctx);
}
}
}
PADDLE_ENFORCE_NE(
kernels_iter, all_op_kernels.end(),
paddle::platform::errors::NotFound(
"There are no kernels which are registered in the %s operator.",
op.Type()));
auto& kernels = kernels_iter->second;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_XPU
if (paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
(kernel_iter == kernels.end() ||
!paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) ||
paddle::platform::is_in_xpu_black_list(op.Type()))) {
VLOG(3) << "missing XPU kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = paddle::platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
paddle::platform::is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = paddle::platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
// TODO(jiabin): Add operator.cc's line 1000 part back when we need that
// case
PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
paddle::platform::errors::NotFound(
"Operator %s does not have kernel for %s.", op.Type(),
KernelTypeToString(expected_kernel_key)));
if (!(expected_kernel_key.place_ == place)) {
dev_ctx = pool.Get(expected_kernel_key.place_);
}
VLOG(6) << "Construct Prepared Op";
return PreparedOp(op, ctx, expected_kernel_key, kernel_iter->second, dev_ctx);
}
PreparedOp PreparedOp::Prepare(
const NameTensorMap& ins, const NameTensorMap& outs,
const paddle::framework::OperatorWithKernel& op,
const paddle::platform::Place& place,
const paddle::framework::AttributeMap& attrs,
const paddle::framework::AttributeMap& default_attrs) {
return PrepareImpl(ins, outs, op, place, attrs, default_attrs);
}
static void PreparedOpRunImpl(
const paddle::framework::OperatorBase& op,
const paddle::framework::RuntimeContext& ctx,
const paddle::framework::OpKernelType& kernel_type,
const paddle::framework::OperatorWithKernel::OpKernelFunc& func,
paddle::platform::DeviceContext* dev_ctx, const NameTensorMap& ins,
const NameTensorMap& outs, const paddle::framework::AttributeMap& attrs,
const paddle::framework::AttributeMap& default_attrs) {
// TODO(zjl): remove scope in dygraph
VLOG(6) << "Runing Prepared Op";
paddle::framework::Scope scope;
EagerInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, &default_attrs,
op.Type(), &kernel_type);
op.Info().infer_shape_(&infer_shape_ctx);
func(EagerExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, attrs,
default_attrs));
if (FLAGS_check_nan_inf) {
paddle::framework::details::CheckOpHasNanOrInfInEager<EagerTensor>(
op.Type(), outs, dev_ctx->GetPlace());
}
/**
* [ Why need handle complex gradient to real gradient? ]
*
* After the introduction of complex number calculations, Ops that support
* complex number calculations generally support type promotion, such as
* x(float32) + y(complex64) = out(complex64), then the type of the grad
* tensor should be dout(complex64), dx(float32), dy (complex64).
*
* But because dout is complex64, dx is also complex64 after the grad op
* kernel has executed; we need to recognize this situation and convert
* dx back to float32. HandleComplexGradToRealGrad does this.
*/
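// Worked example of the promotion described above (illustrative only):
//   forward : x(float32) + y(complex64) -> out(complex64)
//   backward: dout(complex64) -> dy(complex64) and dx(complex64) come out of
//             the grad kernel; dx must then be converted back to float32 so
//             that it matches the dtype of x.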
if (paddle::framework::IsComplexType(kernel_type.data_type_)) {
HandleComplexGradToRealGrad(outs);
}
VLOG(6) << "Finish Runing Prepared Op";
}
static void PreparedOpRunPtImpl(
const paddle::framework::OperatorBase& op,
const paddle::framework::OpKernelType& kernel_type,
const paddle::framework::KernelSignature& pt_kernel_signature,
const pten::Kernel& pt_kernel, paddle::platform::DeviceContext* dev_ctx,
const NameTensorMap& ins, const NameTensorMap& outs,
const paddle::framework::AttributeMap& attrs,
const paddle::framework::AttributeMap& default_attrs) {
EagerInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, &default_attrs,
op.Type());
static_cast<const paddle::framework::OperatorWithKernel&>(op).InferShape(
&infer_shape_ctx);
paddle::imperative::PreparePtenData<EagerTensor>(
pt_kernel, pt_kernel_signature,
static_cast<paddle::imperative::NameTensorMap>(ins));
pten::KernelContext pt_kernel_context;
paddle::imperative::BuildDygraphPtenKernelContext<EagerTensor>(
pt_kernel_signature, pt_kernel,
static_cast<paddle::imperative::NameTensorMap>(ins),
static_cast<paddle::imperative::NameTensorMap>(outs), attrs,
default_attrs, dev_ctx, &pt_kernel_context);
pt_kernel(&pt_kernel_context);
// TODO(chenweihang): add debug flags later
// TODO(chenweihang): deal with complex cases later
}
void PreparedOp::Run(const NameTensorMap& ins, const NameTensorMap& outs,
const paddle::framework::AttributeMap& attrs,
const paddle::framework::AttributeMap& default_attrs) {
if (run_pten_kernel_) {
PreparedOpRunPtImpl(op_, kernel_type_, pt_kernel_signature_, pt_kernel_,
dev_ctx_, ins, outs, attrs, default_attrs);
} else {
PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, dev_ctx_, ins, outs,
attrs, default_attrs);
}
}
std::shared_ptr<NameTensorMap> PrepareData(
const paddle::framework::OperatorWithKernel& op, const NameTensorMap& ins,
const paddle::framework::OpKernelType& expected_kernel_key) {
std::shared_ptr<NameTensorMap> tmp_ins_ptr = nullptr;
for (const auto& name_pair : ins) {
for (size_t i = 0; i < name_pair.second.size(); ++i) {
auto& egr_tensor = name_pair.second[i];
const auto* tensor = GetTensorFromVar(egr_tensor->Var());
if (tensor && tensor->IsInitialized()) {
auto kernel_type_for_var = op.GetKernelTypeForVar(
name_pair.first, *tensor, expected_kernel_key);
if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
continue;
} else {
// TODO(jiabin): Support Cache later
VLOG(3) << "Transform Variable " << egr_tensor->name() << " from "
<< kernel_type_for_var << " to " << expected_kernel_key;
paddle::framework::Tensor out;
TransformData(expected_kernel_key, kernel_type_for_var, *tensor,
&out);
if (NeedTransformDataType(kernel_type_for_var, expected_kernel_key)) {
// To avoid NameVarMap copy construction overhead in general
// scenarios, if inplace transformed, return original input
// directly
if (tmp_ins_ptr == nullptr) {
tmp_ins_ptr = std::make_shared<NameTensorMap>(ins);
}
auto tmp_egr_tensor =
std::make_shared<EagerTensor>(egr_tensor->name());
SetTensorToVariable(egr_tensor->Var(), out,
tmp_egr_tensor->MutableVar());
(*tmp_ins_ptr)[name_pair.first][i] = tmp_egr_tensor;
} else {
// if the dtype is the same, an in-place transform will not change the
// original value, so transform in place to avoid an extra copy
SetTensorToVariable(egr_tensor->Var(), out,
egr_tensor->MutableVar());
}
}
}
}
}
return tmp_ins_ptr;
}
} // namespace legacy
} // namespace egr
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/eager/legacy/execution_context.h"
#include "paddle/fluid/eager/legacy/type_def.h"
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/type_defs.h"
DECLARE_bool(use_mkldnn);
namespace paddle {
namespace framework {
class Variable;
} // namespace framework
} // namespace paddle
namespace pten {
class DenseTensor;
} // namespace pten
namespace egr {
namespace legacy {
const paddle::framework::Tensor* GetTensorFromVar(
const paddle::framework::Variable& var);
std::shared_ptr<NameTensorMap> PrepareData(
const paddle::framework::OperatorWithKernel& op, const NameTensorMap& ins,
const paddle::framework::OpKernelType& expected_kernel_key);
class PreparedOp {
public:
PreparedOp(const paddle::framework::OperatorBase& op,
const paddle::framework::RuntimeContext& ctx,
const paddle::framework::OpKernelType& kernel_type,
const paddle::framework::OperatorWithKernel::OpKernelFunc& func,
paddle::platform::DeviceContext* dev_ctx);
PreparedOp(const paddle::framework::OperatorBase& op,
const paddle::framework::RuntimeContext& ctx,
const paddle::framework::OpKernelType& kernel_type,
const paddle::framework::KernelSignature& kernel_signature,
const pten::Kernel& pt_kernel,
paddle::platform::DeviceContext* dev_ctx);
static PreparedOp Prepare(
const NameTensorMap& ins, const NameTensorMap& outs,
const paddle::framework::OperatorWithKernel& op,
const paddle::platform::Place& place,
const paddle::framework::AttributeMap& attrs,
const paddle::framework::AttributeMap& default_attrs);
void Run(const NameTensorMap& in, const NameTensorMap& out,
const paddle::framework::AttributeMap& attrs,
const paddle::framework::AttributeMap& default_attrs);
const paddle::framework::OpKernelType& kernel_type() const {
return kernel_type_;
}
private:
const paddle::framework::OperatorBase& op_;
const paddle::framework::RuntimeContext& ctx_;
paddle::framework::OpKernelType kernel_type_;
paddle::framework::OperatorWithKernel::OpKernelFunc func_;
paddle::platform::DeviceContext* dev_ctx_;
// NOTE(chenweihang): Similar op members are used to adapt to the
// new pten kernel; if there is a better design in the future,
// we may polish the implementation here
bool run_pten_kernel_{false};
paddle::framework::KernelSignature pt_kernel_signature_;
pten::Kernel pt_kernel_;
};
} // namespace legacy
} // namespace egr
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/eager/legacy/tensor_helper.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/platform/place.h"
namespace egr {
namespace legacy {
void InitializeVariable(paddle::framework::Variable *var,
paddle::framework::proto::VarType::Type var_type) {
if (var_type == paddle::framework::proto::VarType::LOD_TENSOR) {
var->GetMutable<paddle::framework::LoDTensor>();
} else if (var_type == paddle::framework::proto::VarType::SELECTED_ROWS) {
var->GetMutable<pten::SelectedRows>();
} else if (var_type == paddle::framework::proto::VarType::FEED_MINIBATCH) {
var->GetMutable<paddle::framework::FeedList>();
} else if (var_type == paddle::framework::proto::VarType::FETCH_LIST) {
var->GetMutable<paddle::framework::FetchList>();
} else if (var_type == paddle::framework::proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<paddle::framework::Scope *>>();
} else if (var_type == paddle::framework::proto::VarType::LOD_RANK_TABLE) {
var->GetMutable<paddle::framework::LoDRankTable>();
} else if (var_type == paddle::framework::proto::VarType::LOD_TENSOR_ARRAY) {
var->GetMutable<paddle::framework::LoDTensorArray>();
} else if (var_type == paddle::framework::proto::VarType::STRINGS) {
var->GetMutable<paddle::framework::Strings>();
} else if (var_type == paddle::framework::proto::VarType::VOCAB) {
var->GetMutable<paddle::framework::Vocab>();
} else if (var_type == paddle::framework::proto::VarType::PLACE_LIST) {
var->GetMutable<paddle::platform::PlaceList>();
} else if (var_type == paddle::framework::proto::VarType::READER) {
var->GetMutable<paddle::framework::ReaderHolder>();
} else if (var_type == paddle::framework::proto::VarType::RAW) {
// GetMutable will be called in operator
} else {
PADDLE_THROW(paddle::platform::errors::Unavailable(
"paddle::framework::Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, RAW].",
var_type));
}
}
void CopyVariable(const paddle::framework::Variable &src_var,
paddle::framework::Variable *dst_var) {
// only support cpu now
auto cpu_place = paddle::platform::CPUPlace();
if (src_var.IsType<paddle::framework::LoDTensor>()) {
auto *tmp_grad_tensor = dst_var->GetMutable<paddle::framework::LoDTensor>();
auto &src_tensor = src_var.Get<paddle::framework::LoDTensor>();
tmp_grad_tensor->set_lod(src_tensor.lod());
paddle::framework::TensorCopy(src_tensor, cpu_place, tmp_grad_tensor);
} else if (src_var.IsType<pten::SelectedRows>()) {
auto &src_slr = src_var.Get<pten::SelectedRows>();
auto *tmp_grad_slr = dst_var->GetMutable<pten::SelectedRows>();
tmp_grad_slr->set_rows(src_slr.rows());
tmp_grad_slr->set_height(src_slr.height());
auto &src_t = src_slr.value();
auto *dst_t = tmp_grad_slr->mutable_value();
paddle::framework::TensorCopy(src_t, cpu_place, dst_t);
} else {
PADDLE_THROW(paddle::platform::errors::Unavailable(
"Unknown variable type to copy."));
}
}
paddle::framework::proto::VarType::Type GetDtypeFromVar(
const paddle::framework::Variable &var) {
if (var.IsType<paddle::framework::LoDTensor>()) {
return var.Get<paddle::framework::LoDTensor>().type();
} else if (var.IsType<pten::SelectedRows>()) {
return var.Get<pten::SelectedRows>().value().type();
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Variable type is %s, expect LoDTensor or SelectedRows.",
paddle::framework::ToTypeName(var.Type())));
}
}
const paddle::platform::Place &GetPlaceFromVar(
const paddle::framework::Variable &var) {
if (var.IsType<paddle::framework::LoDTensor>()) {
return var.Get<paddle::framework::LoDTensor>().place();
} else if (var.IsType<pten::SelectedRows>()) {
return var.Get<pten::SelectedRows>().place();
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Variable type is %s, expect LoDTensor or SelectedRows.",
paddle::framework::ToTypeName(var.Type())));
}
}
} // namespace legacy
} // namespace egr
......@@ -36,11 +36,6 @@
using namespace egr; // NOLINT
using namespace egr_utils_api; // NOLINT
// Disable pten path
DECLARE_bool(run_pten_kernel);
TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }
TEST(Benchmark, EagerScaleCPU) {
// Prepare Device Contexts
eager_test::InitEnv(paddle::platform::CPUPlace());
......
......@@ -35,10 +35,6 @@
using namespace egr; // NOLINT
using namespace egr_utils_api; // NOLINT
DECLARE_bool(run_pten_kernel);
TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(Benchmark, EagerScaleCUDA) {
......
......@@ -34,11 +34,6 @@
#include "gperftools/profiler.h"
#endif
// Disable pten path
DECLARE_bool(run_pten_kernel);
TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }
namespace paddle {
namespace imperative {
......
......@@ -34,11 +34,6 @@
#include "gperftools/profiler.h"
#endif
// Disable pten path
DECLARE_bool(run_pten_kernel);
TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
namespace paddle {
......
......@@ -214,7 +214,7 @@ void benchmark_fluid_scale(const std::shared_ptr<imperative::VarBase>& X,
{std::shared_ptr<imperative::VarBase>(
new imperative::VarBase(true, "Out"))}}};
tracer.TraceOp("scale", ins, outs, attrs, place, true);
tracer.TraceOp<VarBase>("scale", ins, outs, attrs, place, true);
tmp_out = outs["Out"][0];
}
......@@ -250,7 +250,7 @@ void benchmark_fluid_matmul(const std::shared_ptr<imperative::VarBase>& X,
{std::shared_ptr<imperative::VarBase>(
new imperative::VarBase(true, "Out"))}}};
tracer.TraceOp("matmul_v2", ins, outs, attrs, place, true);
tracer.TraceOp<VarBase>("matmul_v2", ins, outs, attrs, place, true);
tmp_out = outs["Out"][0];
}
......@@ -288,7 +288,7 @@ void benchmark_fluid_mlp(
{std::shared_ptr<imperative::VarBase>(
new imperative::VarBase(true, "Out"))}}};
tracer.TraceOp("matmul_v2", ins, outs, attrs, place, true);
tracer.TraceOp<VarBase>("matmul_v2", ins, outs, attrs, place, true);
// EW-Add0
ins = {{"X", outs["Out"]}, {"Y", {Bs[i]}}};
......@@ -296,7 +296,7 @@ void benchmark_fluid_mlp(
{std::shared_ptr<imperative::VarBase>(
new imperative::VarBase(true, "Out"))}}};
tracer.TraceOp("elementwise_add", ins, outs, attrs, place, true);
tracer.TraceOp<VarBase>("elementwise_add", ins, outs, attrs, place, true);
input0 = outs["Out"][0];
}
......@@ -307,7 +307,7 @@ void benchmark_fluid_mlp(
new imperative::VarBase(true, "Out"))}}};
attrs = {{"reduce_all", true}};
tracer.TraceOp("reduce_sum", ins, outs, attrs, place, true);
tracer.TraceOp<VarBase>("reduce_sum", ins, outs, attrs, place, true);
auto* engine = tracer.GetEngine();
std::vector<std::shared_ptr<imperative::VarBase>> grad_tensors{nullptr};
......
......@@ -286,4 +286,43 @@ void EagerUtils::CheckAndRetainGrad(
}
}
paddle::experimental::Tensor EagerUtils::SyncToPtenTensors(
const egr::EagerTensor& tensor) {
const_cast<EagerTensor*>(&tensor)->SyncToTensor();
return *tensor.Tensor().get();
}
std::vector<paddle::experimental::Tensor> EagerUtils::SyncToPtenTensors(
const std::vector<egr::EagerTensor>& tensors) {
std::vector<paddle::experimental::Tensor> res;
size_t num = tensors.size();
res.reserve(num);
for (size_t i = 0; i < num; i++) {
const_cast<EagerTensor*>(&(tensors[i]))->SyncToTensor();
res.push_back(*tensors[i].Tensor().get());
}
return res;
}
egr::EagerTensor EagerUtils::CreateEagerTensorFromTensor(
const paddle::experimental::Tensor& tensor) {
egr::EagerTensor ret;
ret.set_tensor(std::make_shared<paddle::experimental::Tensor>(tensor));
return ret;
}
std::vector<egr::EagerTensor> EagerUtils::CreateEagerTensorFromTensor(
const std::vector<paddle::experimental::Tensor>& tensors) {
std::vector<egr::EagerTensor> res;
size_t num = tensors.size();
res.reserve(num);
for (size_t i = 0; i < num; i++) {
egr::EagerTensor tmp;
tmp.set_tensor(std::make_shared<paddle::experimental::Tensor>(tensors[i]));
res.emplace_back(std::move(tmp));
}
return res;
}
} // namespace egr
......@@ -170,6 +170,16 @@ class EagerUtils {
static void CheckAndRetainGrad(const egr::EagerTensor& tensor);
static void CheckAndRetainGrad(const std::vector<egr::EagerTensor>& tensors);
static paddle::experimental::Tensor SyncToPtenTensors(
const egr::EagerTensor& tensor);
static std::vector<paddle::experimental::Tensor> SyncToPtenTensors(
const std::vector<egr::EagerTensor>& tensors);
static egr::EagerTensor CreateEagerTensorFromTensor(
const paddle::experimental::Tensor& tensor);
static std::vector<egr::EagerTensor> CreateEagerTensorFromTensor(
const std::vector<paddle::experimental::Tensor>& tensors);
};
} // namespace egr
......@@ -293,7 +293,7 @@ if(WITH_DISTRIBUTE)
ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc
pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
device_context scope framework_proto trainer_desc_proto glog fs shell
fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer
fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper metrics lodtensor_printer
lod_rank_table feed_fetch_method collective_helper ${GLOB_DISTRIBUTE_DEPS}
graph_to_program_pass variable_helper data_feed_proto timer monitor
heter_service_proto fleet_executor ${BRPC_DEP})
......@@ -315,7 +315,7 @@ if(WITH_DISTRIBUTE)
pull_dense_worker.cc section_worker.cc heter_section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
index_sampler index_wrapper sampler index_dataset_proto
lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method
graph_to_program_pass variable_helper timer monitor heter_service_proto fleet heter_server brpc fleet_executor)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
......@@ -336,7 +336,7 @@ if(WITH_DISTRIBUTE)
ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc
pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper metrics lodtensor_printer feed_fetch_method
graph_to_program_pass variable_helper timer monitor fleet_executor)
endif()
elseif(WITH_PSLIB)
......
......@@ -25,12 +25,10 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/api/ext/op_kernel_info.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/pten/core/kernel_context.h"
#include "paddle/pten/core/kernel_registry.h"
DECLARE_bool(run_pten_kernel);
namespace paddle {
namespace framework {
......@@ -279,10 +277,6 @@ static void RunKernelFunc(pten::KernelContext* ctx,
void RegisterKernelWithMetaInfo(
const std::vector<OpKernelInfo>& op_kernel_infos) {
PADDLE_ENFORCE_EQ(FLAGS_run_pten_kernel, true,
platform::errors::Unimplemented(
"Custom Kernel depends on pten kernel enabled,"));
for (size_t i = 0; i < op_kernel_infos.size(); ++i) {
auto& kernel_info = op_kernel_infos[i];
auto op_type = OpKernelInfoHelper::GetOpName(kernel_info);
......
......@@ -212,11 +212,13 @@ TEST(CustomKernel, custom_kernel_dot) {
kernel_context.EmplaceBackAttr(fake_attr_int64_vec);
kernel_context.EmplaceBackAttr(fake_attr_int_vec);
auto out_meta = pten::DotInferMeta(dense_x->meta(), dense_y->meta());
auto dense_out = std::make_shared<pten::DenseTensor>(
pten::make_intrusive<paddle::experimental::SharedStorage>(
pten::TransToFluidPlace(backend)),
std::move(out_meta));
pten::DenseTensorMeta());
pten::MetaTensor meta_out(dense_out.get());
pten::DotInferMeta(*dense_x, *dense_y, &meta_out);
kernel_context.EmplaceBackOutput(dense_out.get()); // idx:0 index:[0,1)
// fake_input_vec: idx:1, index:[1,3)
......
......@@ -37,7 +37,7 @@ limitations under the License. */
#include "paddle/pten/api/lib/api_declare.h"
#include "paddle/pten/api/lib/ext_compat_utils.h"
#include "paddle/pten/api/lib/utils/tensor_utils.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/compat/convert_utils.h"
#include "paddle/utils/any.h"
namespace paddle {
......@@ -110,8 +110,8 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
const std::vector<std::string>& outputs,
const std::vector<std::string>& attrs) {
VLOG(3) << "Custom Operator: Start run KernelFunc.";
std::vector<paddle::experimental::Tensor> custom_ins;
std::vector<std::vector<paddle::experimental::Tensor>> custom_vec_ins;
// prepare CustomOpKernelContext
paddle::CustomOpKernelContext kernel_ctx;
for (auto& in_name : inputs) {
VLOG(3) << "Custom Operator: input name - " << in_name;
if (detail::IsDuplicableVar(in_name)) {
......@@ -136,7 +136,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
custom_t.set_impl(std::make_shared<pten::DenseTensor>(*x));
custom_vec_in.emplace_back(custom_t);
}
custom_vec_ins.emplace_back(custom_vec_in);
kernel_ctx.EmplaceBackInputs(std::move(custom_vec_in));
} else {
auto* x = ctx.Input<Tensor>(in_name);
PADDLE_ENFORCE_NOT_NULL(x, platform::errors::NotFound(
......@@ -146,33 +146,32 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
"Input tensor (%s) is not initialized.", in_name));
paddle::experimental::Tensor custom_in;
custom_in.set_impl(std::make_shared<pten::DenseTensor>(*x));
custom_ins.emplace_back(custom_in);
kernel_ctx.EmplaceBackInput(std::move(custom_in));
}
}
std::vector<paddle::any> custom_attrs;
for (auto& attr_str : attrs) {
auto attr_name_and_type = detail::ParseAttrStr(attr_str);
auto attr_name = attr_name_and_type[0];
auto attr_type_str = attr_name_and_type[1];
if (attr_type_str == "bool") {
custom_attrs.emplace_back(ctx.Attr<bool>(attr_name));
kernel_ctx.EmplaceBackAttr(ctx.Attr<bool>(attr_name));
} else if (attr_type_str == "int") {
custom_attrs.emplace_back(ctx.Attr<int>(attr_name));
kernel_ctx.EmplaceBackAttr(ctx.Attr<int>(attr_name));
} else if (attr_type_str == "float") {
custom_attrs.emplace_back(ctx.Attr<float>(attr_name));
kernel_ctx.EmplaceBackAttr(ctx.Attr<float>(attr_name));
} else if (attr_type_str == "int64_t") {
custom_attrs.emplace_back(ctx.Attr<int64_t>(attr_name));
kernel_ctx.EmplaceBackAttr(ctx.Attr<int64_t>(attr_name));
} else if (attr_type_str == "std::string") {
custom_attrs.emplace_back(ctx.Attr<std::string>(attr_name));
kernel_ctx.EmplaceBackAttr(ctx.Attr<std::string>(attr_name));
} else if (attr_type_str == "std::vector<int>") {
custom_attrs.emplace_back(ctx.Attr<std::vector<int>>(attr_name));
kernel_ctx.EmplaceBackAttr(ctx.Attr<std::vector<int>>(attr_name));
} else if (attr_type_str == "std::vector<float>") {
custom_attrs.emplace_back(ctx.Attr<std::vector<float>>(attr_name));
kernel_ctx.EmplaceBackAttr(ctx.Attr<std::vector<float>>(attr_name));
} else if (attr_type_str == "std::vector<int64_t>") {
custom_attrs.emplace_back(ctx.Attr<std::vector<int64_t>>(attr_name));
kernel_ctx.EmplaceBackAttr(ctx.Attr<std::vector<int64_t>>(attr_name));
} else if (attr_type_str == "std::vector<std::string>") {
custom_attrs.emplace_back(ctx.Attr<std::vector<std::string>>(attr_name));
kernel_ctx.EmplaceBackAttr(ctx.Attr<std::vector<std::string>>(attr_name));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported `%s` type value as custom attribute now. "
......@@ -185,39 +184,75 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
}
}
VLOG(3) << "Custom Operator: Run ComputeFunc.";
try {
auto outs = func(custom_ins, custom_vec_ins, custom_attrs);
VLOG(3) << "Custom Operator: push outputs into CustomOpKernelContext.";
// cache the target tensor pointers
std::vector<Tensor*> true_out_ptrs;
for (size_t i = 0; i < outputs.size(); ++i) {
auto out_name = outputs[i];
if (detail::IsDuplicableVar(out_name)) {
PADDLE_ENFORCE(i == 0UL && outputs.size() == 1UL,
platform::errors::PreconditionNotMet(
"If custom operator's outputs contains `paddle::Vec("
")` type, "
"it only can hold one output."));
auto vec_out = ctx.MultiOutput<Tensor>(out_name);
PADDLE_ENFORCE_NE(vec_out.empty(), true,
platform::errors::NotFound(
"Output vector<tensor> (%s) is empty.", out_name));
std::vector<paddle::experimental::Tensor> custom_vec_out;
for (size_t j = 0; j < vec_out.size(); ++j) {
auto* out = vec_out[j];
PADDLE_ENFORCE_NOT_NULL(
out,
platform::errors::NotFound(
"The %d-th tensor in output vector<tensor> (%s) is nullptr.", j,
out_name));
true_out_ptrs.emplace_back(out);
paddle::experimental::Tensor custom_t;
// here only can copy the output tensor into context
custom_t.set_impl(std::make_shared<pten::DenseTensor>(*out));
custom_vec_out.emplace_back(custom_t);
}
kernel_ctx.EmplaceBackOutputs(std::move(custom_vec_out));
} else {
auto* out = ctx.Output<Tensor>(out_name);
PADDLE_ENFORCE_NOT_NULL(
out, platform::errors::NotFound("Output tensor (%s) is nullptr.",
out_name));
true_out_ptrs.emplace_back(out);
paddle::experimental::Tensor custom_out;
// here only can copy the output tensor into context
custom_out.set_impl(std::make_shared<pten::DenseTensor>(*out));
kernel_ctx.EmplaceBackOutput(std::move(custom_out));
}
}
VLOG(3) << "Custom Operator: Share outputs into ExecutionContext.";
for (size_t i = 0; i < outputs.size(); ++i) {
auto out_name = outputs[i];
if (detail::IsDuplicableVar(out_name)) {
PADDLE_ENFORCE(i == 0UL && outputs.size() == 1UL,
platform::errors::PreconditionNotMet(
"If custom operator's outputs contains `paddle::Vec("
")` type, "
"it only can hold one output."));
auto vec_true_outs = ctx.MultiOutput<Tensor>(out_name);
PADDLE_ENFORCE_EQ(
vec_true_outs.size(), outs.size(),
platform::errors::InvalidArgument(
"The number of element in custom operator outputs is wrong, "
"expected contains %d Tensors, but actually contains %d "
"Tensors.",
vec_true_outs.size(), outs.size()));
for (size_t j = 0; j < vec_true_outs.size(); ++j) {
experimental::SharesStorage(
std::dynamic_pointer_cast<pten::DenseTensor>(outs.at(j).impl())
.get(),
vec_true_outs.at(j));
}
} else {
auto* true_out = ctx.Output<Tensor>(out_name);
experimental::SharesStorage(
std::dynamic_pointer_cast<pten::DenseTensor>(outs.at(i).impl())
.get(),
true_out);
try {
VLOG(3) << "Custom Operator: Run ComputeFunc.";
func(&kernel_ctx);
// sync output tensor data into original output
auto* calc_outs = kernel_ctx.AllMutableOutput();
PADDLE_ENFORCE_EQ(
true_out_ptrs.size(), calc_outs->size(),
platform::errors::InvalidArgument(
"The number of element in custom operator outputs is wrong, "
"expected contains %d Tensors, but actually contains %d "
"Tensors.",
true_out_ptrs.size(), calc_outs->size()));
for (size_t i = 0; i < true_out_ptrs.size(); ++i) {
auto* true_out = true_out_ptrs.at(i);
auto calc_out =
std::dynamic_pointer_cast<pten::DenseTensor>(calc_outs->at(i).impl());
// assign meta info
auto* true_out_meta = pten::DenseTensorUtils::GetMutableMeta(true_out);
true_out_meta->dims = calc_out->dims();
true_out_meta->dtype = calc_out->dtype();
true_out_meta->layout = calc_out->layout();
// lod and offset no need to be reset
// reset holder if needed
if (true_out->Holder() != calc_out->Holder()) {
true_out->ResetHolder(calc_out->Holder());
}
}
} catch (platform::EnforceNotMet& exception) {
......@@ -613,7 +648,7 @@ void RegisterOperatorWithMetaInfo(
auto op_name = OpMetaInfoHelper::GetOpName(base_op_meta);
if (OpInfoMap::Instance().Has(op_name)) {
LOG(WARNING) << "Operator (" << op_name << ")has been registered.";
LOG(WARNING) << "Operator (" << op_name << ") has been registered.";
return;
}
......
......@@ -340,6 +340,7 @@ InMemoryDataFeed<T>::InMemoryDataFeed() {
this->thread_id_ = 0;
this->thread_num_ = 1;
this->parse_ins_id_ = false;
this->parse_uid_ = false;
this->parse_content_ = false;
this->parse_logkey_ = false;
this->enable_pv_merge_ = false;
......@@ -498,6 +499,11 @@ void InMemoryDataFeed<T>::SetParseInsId(bool parse_ins_id) {
parse_ins_id_ = parse_ins_id;
}
template <typename T>
void InMemoryDataFeed<T>::SetParseUid(bool parse_uid) {
parse_uid_ = parse_uid;
}
template <typename T>
void InMemoryDataFeed<T>::LoadIntoMemory() {
#ifdef _LINUX
......@@ -1047,6 +1053,7 @@ void MultiSlotInMemoryDataFeed::Init(
use_slots_shape_.push_back(local_shape);
}
}
uid_slot_ = multi_slot_desc.uid_slot();
feed_vec_.resize(use_slots_.size());
const int kEstimatedFeasignNumPerSlot = 5; // Magic Number
for (size_t i = 0; i < all_slot_num; i++) {
......@@ -1160,6 +1167,19 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
"\nWe detect the feasign number of this slot is %d, "
"which is illegal.",
str, i, num));
#ifdef PADDLE_WITH_PSLIB
if (parse_uid_ && all_slots_[i] == uid_slot_) {
PADDLE_ENFORCE(num == 1 && all_slots_type_[i][0] == 'u',
platform::errors::PreconditionNotMet(
"The uid has to be uint64 and single.\n"
"please check this error line: %s",
str));
char* uidptr = endptr;
uint64_t feasign = (uint64_t)strtoull(uidptr, &uidptr, 10);
instance->uid_ = feasign;
}
#endif
if (idx != -1) {
if (all_slots_type_[i][0] == 'f') { // float
for (int j = 0; j < num; ++j) {
......
......@@ -191,6 +191,7 @@ struct Record {
uint64_t search_id;
uint32_t rank;
uint32_t cmatch;
std::string uid_;
};
inline SlotRecord make_slotrecord() {
......@@ -562,6 +563,7 @@ class DataFeed {
virtual void SetThreadNum(int thread_num) {}
// This function will do nothing at default
virtual void SetParseInsId(bool parse_ins_id) {}
virtual void SetParseUid(bool parse_uid) {}
virtual void SetParseContent(bool parse_content) {}
virtual void SetParseLogKey(bool parse_logkey) {}
virtual void SetEnablePvMerge(bool enable_pv_merge) {}
......@@ -645,6 +647,7 @@ class DataFeed {
std::vector<std::string> ins_id_vec_;
std::vector<std::string> ins_content_vec_;
platform::Place place_;
std::string uid_slot_;
// The input type of pipe reader, 0 for one sample, 1 for one batch
int input_type_;
......@@ -709,6 +712,7 @@ class InMemoryDataFeed : public DataFeed {
virtual void SetThreadId(int thread_id);
virtual void SetThreadNum(int thread_num);
virtual void SetParseInsId(bool parse_ins_id);
virtual void SetParseUid(bool parse_uid);
virtual void SetParseContent(bool parse_content);
virtual void SetParseLogKey(bool parse_logkey);
virtual void SetEnablePvMerge(bool enable_pv_merge);
......@@ -737,6 +741,7 @@ class InMemoryDataFeed : public DataFeed {
int thread_id_;
int thread_num_;
bool parse_ins_id_;
bool parse_uid_;
bool parse_content_;
bool parse_logkey_;
bool enable_pv_merge_;
......
......@@ -22,7 +22,10 @@ message Slot {
repeated int32 shape = 5; // we can define N-D Tensor
}
message MultiSlotDesc { repeated Slot slots = 1; }
message MultiSlotDesc {
repeated Slot slots = 1;
optional string uid_slot = 2;
}
message DataFeedDesc {
optional string name = 1;
......
......@@ -57,6 +57,8 @@ DatasetImpl<T>::DatasetImpl() {
parse_logkey_ = false;
preload_thread_num_ = 0;
global_index_ = 0;
shuffle_by_uid_ = false;
parse_uid_ = false;
}
// set filelist, file_idx_ will reset to zero.
......@@ -150,6 +152,12 @@ void DatasetImpl<T>::SetMergeBySid(bool is_merge) {
merge_by_sid_ = is_merge;
}
template <typename T>
void DatasetImpl<T>::SetShuffleByUid(bool enable_shuffle_uid) {
shuffle_by_uid_ = enable_shuffle_uid;
parse_uid_ = true;
}
template <typename T>
void DatasetImpl<T>::SetEnablePvMerge(bool enable_pv_merge) {
enable_pv_merge_ = enable_pv_merge;
......@@ -664,11 +672,14 @@ void MultiSlotDataset::GlobalShuffle(int thread_num) {
<< input_channel_->Size();
auto get_client_id = [this, fleet_ptr](const Record& data) -> size_t {
if (!this->merge_by_insid_) {
return fleet_ptr->LocalRandomEngine()() % this->trainer_num_;
} else {
if (this->merge_by_insid_) {
return XXH64(data.ins_id_.data(), data.ins_id_.length(), 0) %
this->trainer_num_;
} else if (this->shuffle_by_uid_) {
return XXH64(data.uid_.data(), data.uid_.length(), 0) %
this->trainer_num_;
} else {
return fleet_ptr->LocalRandomEngine()() % this->trainer_num_;
}
};
......@@ -902,6 +913,7 @@ void DatasetImpl<T>::CreateReaders() {
readers_[i]->SetFeaNum(&total_fea_num_);
readers_[i]->SetFileList(filelist_);
readers_[i]->SetParseInsId(parse_ins_id_);
readers_[i]->SetParseUid(parse_uid_);
readers_[i]->SetParseContent(parse_content_);
readers_[i]->SetParseLogKey(parse_logkey_);
readers_[i]->SetEnablePvMerge(enable_pv_merge_);
......@@ -972,6 +984,7 @@ void DatasetImpl<T>::CreatePreLoadReaders() {
preload_readers_[i]->SetFeaNumMutex(&mutex_for_fea_num_);
preload_readers_[i]->SetFeaNum(&total_fea_num_);
preload_readers_[i]->SetParseInsId(parse_ins_id_);
preload_readers_[i]->SetParseUid(parse_uid_);
preload_readers_[i]->SetParseContent(parse_content_);
preload_readers_[i]->SetParseLogKey(parse_logkey_);
preload_readers_[i]->SetEnablePvMerge(enable_pv_merge_);
......
......@@ -21,7 +21,10 @@ TEST(DataTypeTransform, GPUTransform) {
auto cpu_place = paddle::platform::CPUPlace();
auto gpu_place = paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(gpu_place);
context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu_place, context.stream())
.get());
context.PartialInitWithAllocator();
auto kernel_fp16 = paddle::framework::OpKernelType(
paddle::framework::proto::VarType::FP16, gpu_place,
paddle::framework::DataLayout::kAnyLayout,
......