Commit eb9df7b9 authored by J jingqinghe

Merge https://github.com/PaddlePaddle/Paddle into fix_hdfs_download

Too many changes to show.

To preserve performance, only 1000 of 1000+ files are displayed.
@@ -63,8 +63,29 @@ if(WIN32)
    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+   foreach(flag_var
+       CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+       CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
+       CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+       CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
+     if(${flag_var} MATCHES "/MD")
+       string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+     endif()
+   endforeach(flag_var)
  endif()
+ # windows build turn off warnings.
+ foreach(flag_var
+     CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+     CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
+     CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+     CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
+   string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
+ endforeach(flag_var)
+ foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
+   set(${flag_var} "${${flag_var}} /w")
+ endforeach(flag_var)
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP")
  message(STATUS "Using parallel compiling (/MP)")
......
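The loop added above follows the common CMake pattern for pinning MSVC to the static C runtime: every per-configuration flag variable is scanned and any /MD (dynamic CRT) switch is rewritten to /MT. A minimal self-contained sketch of the same pattern, with an illustrative project name that is not part of this commit:

# Hypothetical standalone example of the /MD -> /MT rewrite used above.
cmake_minimum_required(VERSION 3.10)
project(static_crt_demo CXX)

if(MSVC)
  foreach(flag_var
      CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
      CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
    if(${flag_var} MATCHES "/MD")
      # Swap the dynamic CRT switch for the static one in place.
      string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
    endif()
    # Print the resulting flags so the effect is visible at configure time.
    message(STATUS "${flag_var} = ${${flag_var}}")
  endforeach()
endif()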
@@ -107,6 +107,9 @@ function(select_nvcc_arch_flags out_variable)
  elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
    set(cuda_arch_bin "50")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
+   if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
+     add_definitions("-DSUPPORTS_CUDA_FP16")
+   endif()
    set(cuda_arch_bin "60 61")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
......
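The Pascal branch above gates the FP16 macro on the CUDA toolkit version. A small self-contained sketch of that version-guard pattern, assuming a CUDA-enabled toolchain (the project name and status messages are illustrative only, not taken from this commit):

# Hypothetical example: define SUPPORTS_CUDA_FP16 only for CUDA >= 10.0,
# mirroring the guard added for the Pascal branch above.
cmake_minimum_required(VERSION 3.10)
project(fp16_guard_demo LANGUAGES CXX CUDA)

if(NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
  add_definitions("-DSUPPORTS_CUDA_FP16")
  message(STATUS "CUDA ${CMAKE_CUDA_COMPILER_VERSION}: SUPPORTS_CUDA_FP16 enabled")
else()
  message(STATUS "CUDA ${CMAKE_CUDA_COMPILER_VERSION}: FP16 macro not defined")
endif()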
@@ -22,23 +22,8 @@ SET(CRYPTOPP_TAG CRYPTOPP_8_2_0)
IF(WIN32)
  SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/cryptopp-static.lib" CACHE FILEPATH "cryptopp library." FORCE)
- SET(CRYPTOPP_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
- set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
- set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
- set(CompilerFlags
-       CMAKE_CXX_FLAGS
-       CMAKE_CXX_FLAGS_DEBUG
-       CMAKE_CXX_FLAGS_RELEASE
-       CMAKE_C_FLAGS
-       CMAKE_C_FLAGS_DEBUG
-       CMAKE_C_FLAGS_RELEASE
-       )
- foreach(CompilerFlag ${CompilerFlags})
-   string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
- endforeach()
ELSE(WIN32)
  SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/libcryptopp.a" CACHE FILEPATH "cryptopp library." FORCE)
- SET(CRYPTOPP_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
ENDIF(WIN32)

set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS}

@@ -48,7 +33,7 @@ set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS}
  -DCMAKE_INSTALL_LIBDIR=${CRYPTOPP_INSTALL_DIR}/lib
  -DCMAKE_INSTALL_PREFIX=${CRYPTOPP_INSTALL_DIR}
  -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
- -DCMAKE_CXX_FLAGS=${CRYPTOPP_CMAKE_CXX_FLAGS}
+ -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
  -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
  -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
  -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......
@@ -4,7 +4,7 @@ endif()
INCLUDE(ExternalProject)
SET(XPU_PROJECT "extern_xpu")
-SET(XPU_URL "https://kunlun1.su.bcebos.com/xpu.tar.gz" CACHE STRING "" FORCE)
+SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu.tar.gz" CACHE STRING "" FORCE)
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
......
@@ -90,20 +90,6 @@ macro(safe_set_nvflag flag_name)
  endif()
endmacro()

-macro(safe_set_static_flag) # set c_flags and cxx_flags to static or shared
-  if (BUILD_SHARED_LIBS)
-    return() # if build shared libs, the flags keep same with '/MD'
-  endif(BUILD_SHARED_LIBS)
-  foreach(flag_var
-      CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-      CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
-      CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-      CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-    if(${flag_var} MATCHES "/MD")
-      string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-    endif(${flag_var} MATCHES "/MD")
-  endforeach(flag_var)
-endmacro()

CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
if(NOT UINT64_MAX_EXISTS)

@@ -229,20 +215,3 @@ endforeach()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}")

-if(WIN32)
-  # windows build turn off warnings.
-  if(MSVC_STATIC_CRT)
-    safe_set_static_flag()
-  endif()
-  foreach(flag_var
-      CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-      CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
-      CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-      CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-    string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
-  endforeach(flag_var)
-  foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
-    set(${flag_var} "${${flag_var}} /w")
-  endforeach(flag_var)
-endif()
@@ -446,6 +446,9 @@ function(nv_library TARGET_NAME)
        message(FATAL "Please specify source file or library in nv_library.")
      endif()
    endif(nv_library_SRCS)
+   if (WIN32)
+     set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
+   endif(WIN32)
  endif()
endfunction(nv_library)

@@ -461,6 +464,9 @@ function(nv_binary TARGET_NAME)
      add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
      common_link(${TARGET_NAME})
    endif()
+   if (WIN32)
+     set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
+   endif(WIN32)
  endif()
endfunction(nv_binary)

@@ -482,6 +488,9 @@ function(nv_test TARGET_NAME)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
+   if (WIN32)
+     set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
+   endif(WIN32)
  endif()
endfunction(nv_test)

@@ -712,6 +721,7 @@ function(proto_library TARGET_NAME)
  set(proto_hdrs)
  paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
  cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
+ add_dependencies(extern_xxhash ${TARGET_NAME})
endfunction()

function(py_proto_compile TARGET_NAME)
......
@@ -13,18 +13,17 @@
# limitations under the License.

# make package for paddle fluid shared and static library
-set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
-  "A path setting fluid shared and static libraries")
-set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING
-  "A path setting fluid inference shared and static libraries")
+set(PADDLE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_install_dir" CACHE STRING
+  "A path setting paddle shared and static libraries")
+set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_dir" CACHE STRING
+  "A path setting paddle inference shared and static libraries")

-# TODO(zhaolong)
-# At present, the size of static lib in Windows exceeds the system limit,
-# so the generation of static lib is temporarily turned off.
+# At present, the size of static lib in Windows is very large,
+# so we need to crop the library size.
if(WIN32)
    #todo: remove the option
-   option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." OFF)
+   option(WITH_STATIC_LIB "Compile demo with static/shared library, default use dynamic." OFF)
    if(NOT PYTHON_EXECUTABLE)
        FIND_PACKAGE(PythonInterp REQUIRED)
    endif()
@@ -142,14 +141,14 @@ set(inference_lib_deps third_party paddle_fluid paddle_fluid_c paddle_fluid_shar
add_custom_target(inference_lib_dist DEPENDS ${inference_lib_deps})

-set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/threadpool")
+set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/threadpool")
copy(inference_lib_dist
        SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h
        DSTS ${dst_dir})

# Only GPU need cudaErrorMessage.pb
IF(WITH_GPU)
-   set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data")
+   set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data")
    copy(inference_lib_dist
            SRCS ${cudaerror_INCLUDE_DIR}
            DSTS ${dst_dir})
@@ -158,65 +157,66 @@ ENDIF()
# CMakeCache Info
copy(inference_lib_dist
        SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
-       DSTS ${FLUID_INFERENCE_INSTALL_DIR})
+       DSTS ${PADDLE_INFERENCE_INSTALL_DIR})

-copy_part_of_thrid_party(inference_lib_dist ${FLUID_INFERENCE_INSTALL_DIR})
+copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR})

set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
if(WIN32)
    if(WITH_STATIC_LIB)
-       set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.lib)
+       set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.lib
+           ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.*)
    else()
        set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.dll
            ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.lib)
    endif()
+   copy(inference_lib_dist
+           SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
+           DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib
+           ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
else(WIN32)
    set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*)
-endif(WIN32)
-if(WIN32 AND NOT WITH_STATIC_LIB)
-   copy(inference_lib_dist
-           SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
-           DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib
-           ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib)
-else()
-   copy(inference_lib_dist
-           SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
-           DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib)
-endif()
+   copy(inference_lib_dist
+           SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
+           DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
+endif(WIN32)

copy(inference_lib_dist
        SRCS ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
-       DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/internal)
+       DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/internal)
copy(inference_lib_dist
        SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io/crypto/cipher.h
-       DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/crypto/)
+       DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/crypto/)
include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io)

# CAPI inference library for only inference
-set(FLUID_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_c_install_dir" CACHE STRING
-  "A path setting CAPI fluid inference shared")
+set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING
+  "A path setting CAPI paddle inference shared")
-copy_part_of_thrid_party(inference_lib_dist ${FLUID_INFERENCE_C_INSTALL_DIR})
+copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_C_INSTALL_DIR})

set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
-set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_fluid_c.*)
+if(WIN32)
+   set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/${CMAKE_BUILD_TYPE}/paddle_fluid_c.*)
+else(WIN32)
+   set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_fluid_c.*)
+endif(WIN32)

copy(inference_lib_dist
        SRCS ${src_dir}/inference/capi/paddle_c_api.h ${paddle_fluid_c_lib}
-       DSTS ${FLUID_INFERENCE_C_INSTALL_DIR}/paddle/include ${FLUID_INFERENCE_C_INSTALL_DIR}/paddle/lib)
+       DSTS ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib)

# fluid library for both train and inference
set(fluid_lib_deps inference_lib_dist)
add_custom_target(fluid_lib_dist ALL DEPENDS ${fluid_lib_deps})

-set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
+set(dst_dir "${PADDLE_INSTALL_DIR}/paddle/fluid")
set(module "inference")
-if(WIN32 AND NOT WITH_STATIC_LIB)
+if(WIN32)
    copy(fluid_lib_dist
        SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
        DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
    )
else()
    copy(fluid_lib_dist
        SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
        DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
@@ -273,22 +273,22 @@ copy(fluid_lib_dist
        DSTS ${dst_dir}/${module}
)

-set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3")
+set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/eigen3")
copy(inference_lib_dist
        SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
        DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported)

-set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost")
+set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/boost")
copy(inference_lib_dist
        SRCS ${BOOST_INCLUDE_DIR}/boost
        DSTS ${dst_dir})

-set(dst_dir "${FLUID_INSTALL_DIR}/third_party/dlpack")
+set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/dlpack")
copy(inference_lib_dist
        SRCS ${DLPACK_INCLUDE_DIR}/dlpack
        DSTS ${dst_dir})

-set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
+set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/install/zlib")
copy(inference_lib_dist
        SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
        DSTS ${dst_dir} ${dst_dir}/lib)

@@ -296,8 +296,8 @@ copy(inference_lib_dist
# CMakeCache Info
copy(fluid_lib_dist
-       SRCS ${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
-       DSTS ${FLUID_INSTALL_DIR} ${FLUID_INSTALL_DIR}
+       SRCS ${PADDLE_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
+       DSTS ${PADDLE_INSTALL_DIR} ${PADDLE_INSTALL_DIR}
)
# paddle fluid version

@@ -323,6 +323,6 @@ function(version version_file)
  endif()
endfunction()

-version(${FLUID_INSTALL_DIR}/version.txt)
-version(${FLUID_INFERENCE_INSTALL_DIR}/version.txt)
-version(${FLUID_INFERENCE_C_INSTALL_DIR}/version.txt)
+version(${PADDLE_INSTALL_DIR}/version.txt)
+version(${PADDLE_INFERENCE_INSTALL_DIR}/version.txt)
+version(${PADDLE_INFERENCE_C_INSTALL_DIR}/version.txt)
@@ -26,4 +26,7 @@ if(WITH_GPU)
  set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG")
endif()

+if(WIN32)
+  set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props)
+endif()
@@ -62,9 +62,9 @@ function(op_library TARGET)
      endif()
    endif()
    if(WITH_XPU)
-     string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}")
-     if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${XPU_FILE}.cc)
-       list(APPEND xpu_cc_srcs xpu/${XPU_FILE}.cc)
+     string(REPLACE "_op" "_op_xpu" XPU_FILE "${TARGET}")
+     if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${XPU_FILE}.cc)
+       list(APPEND xpu_cc_srcs ${XPU_FILE}.cc)
      endif()
    endif()
  else()

@@ -83,7 +83,7 @@ function(op_library TARGET)
      list(APPEND mkldnn_cc_srcs ${src})
    elseif(${src} MATCHES ".*\\.cu.cc$")
      list(APPEND cu_cc_srcs ${src})
-   elseif(WITH_XPU AND ${src} MATCHES ".*_xpu_op.cc$")
+   elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
      list(APPEND xpu_cc_srcs ${src})
    elseif(${src} MATCHES ".*\\.cc$")
      list(APPEND cc_srcs ${src})

@@ -127,7 +127,8 @@ function(op_library TARGET)
      "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
      "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
      "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
-     "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op")
+     "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
+     "fused_bn_add_activation_op")
  if ("${TARGET}" STREQUAL "${manual_pybind_op}")
    set(pybind_flag 1)
  endif()
......
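The renaming above changes the lookup convention for XPU kernel sources: an operator target now pairs with a sibling <name>_op_xpu.cc file in the same directory instead of xpu/<name>_xpu_op.cc. A hedged sketch of the lookup with a hypothetical operator name (softmax_op is only an example target, not taken from this commit):

# Illustrative only: how a hypothetical "softmax_op" target picks up its XPU
# source under the new naming convention shown above.
set(TARGET "softmax_op")
if(WITH_XPU)
  string(REPLACE "_op" "_op_xpu" XPU_FILE "${TARGET}")   # -> "softmax_op_xpu"
  if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${XPU_FILE}.cc)
    list(APPEND xpu_cc_srcs ${XPU_FILE}.cc)               # -> softmax_op_xpu.cc
  endif()
endif()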
<?xml version="1.0" encoding="utf-8"?>
<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemDefinitionGroup>
<CudaCompile>
<!-- Project schema: Host properties -->
<UseHostDefines>true</UseHostDefines>
<Emulation>false</Emulation>
<HostDebugInfo Condition="'$(Configuration)' == 'Debug'">true</HostDebugInfo>
<HostDebugInfo Condition="'$(Configuration)' != 'Debug'">false</HostDebugInfo>
<FastMath>false</FastMath>
<Optimization>InheritFromHost</Optimization>
<Runtime>InheritFromHost</Runtime>
<RuntimeChecks>InheritFromHost</RuntimeChecks>
<TypeInfo>InheritFromHost</TypeInfo>
<Warning>InheritFromHost</Warning>
<BaseCommandLineTemplate>-ccbin "%(VCBinDir)" -x cu [GenerateRelocatableDeviceCode] [Include] [RequiredIncludes] [InterleaveSourceInPTX] [GPUDebugInfo] [GenerateLineInfo] [Keep] [KeepDir] [MaxRegCount] [PtxAsOptionV] [TargetMachinePlatform] [NvccCompilation] [CudaRuntime] [AdditionalOptions]</BaseCommandLineTemplate>
<BuildCommandLineTemplate>--use-local-env</BuildCommandLineTemplate>
<BuildDynamicCommandLineTemplate>[CodeGeneration]</BuildDynamicCommandLineTemplate>
<CleanCommandLineTemplate>-clean</CleanCommandLineTemplate>
<!-- <HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] [ProgramDataBaseFileName] $(CudaForceSynchronousPdbWrites) /Zi [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate> -->
<HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] [ProgramDataBaseFileName] $(CudaForceSynchronousPdbWrites) [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate>
<DriverApiCommandLineTemplate>%(BaseCommandLineTemplate) [CompileOut] "%(FullPath)"</DriverApiCommandLineTemplate>
<RuntimeApiCommandLineTemplate>%(BaseCommandLineTemplate) [HostDebugInfo] [Emulation] [FastMath] [Defines] %(HostCommandLineTemplate) [CompileOut] "%(FullPath)"</RuntimeApiCommandLineTemplate>
<CommandLineTemplate>
# (Approximate command-line. Settings inherited from host are not visible below.)
# (Please see the output window after a build for the full command-line)
# Driver API (NVCC Compilation Type is .cubin, .gpu, or .ptx)
set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)"
"$(CudaToolkitNvccPath)" %(BuildCommandLineTemplate) %(DriverApiCommandLineTemplate)
# Runtime API (NVCC Compilation Type is hybrid object or .c file)
set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)"
"$(CudaToolkitNvccPath)" %(BuildCommandLineTemplate) %(RuntimeApiCommandLineTemplate)
</CommandLineTemplate>
<ExecutionDescription>Compiling CUDA source file %(Identity)...</ExecutionDescription>
<ExclusionDescription>Skipping CUDA source file %(Identity) (excluded from build).</ExclusionDescription>
<!-- Miscellaneous -->
<PropsCacheOutputFile>%(Filename)%(Extension).cache</PropsCacheOutputFile>
<PropsCacheOutputPath>$(IntDir)%(PropsCacheOutputFile)</PropsCacheOutputPath>
<CudaCompileCoreProject>$(MSBuildProjectFullPath)</CudaCompileCoreProject>
</CudaCompile>
<CudaLink>
<PerformDeviceLink>true</PerformDeviceLink>
<LinkOut>$(IntDir)$(TargetName).device-link.obj</LinkOut>
<AdditionalLibraryDirectories></AdditionalLibraryDirectories>
<UseHostLibraryDirectories>true</UseHostLibraryDirectories>
<AdditionalDependencies></AdditionalDependencies>
<UseHostLibraryDependencies>true</UseHostLibraryDependencies>
<GPUDebugInfo>InheritFromProject</GPUDebugInfo>
<Optimization>InheritFromProject</Optimization>
<!-- Implicitly inherited from the project via @(CudaCompile) -->
<CodeGeneration></CodeGeneration>
<RuntimeChecks></RuntimeChecks>
<Runtime></Runtime>
<TargetMachinePlatform></TargetMachinePlatform>
<TypeInfo></TypeInfo>
<Warning></Warning>
<Inputs></Inputs>
<!-- <HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] /Zi [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate> -->
<HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate>
<LinkCommandLineTemplate>"$(CudaToolkitNvccPath)" -dlink [LinkOut] %(HostCommandLineTemplate) [AdditionalLibraryDirectories] [AdditionalDependencies] [AdditionalOptions] [CodeGeneration] [GPUDebugInfo] [TargetMachinePlatform] [Inputs]</LinkCommandLineTemplate>
<CommandLineTemplate>
# (Approximate command-line. Settings inherited from host are not visible below.)
# (Please see the output window after a build for the full command-line)
%(LinkCommandLineTemplate)
</CommandLineTemplate>
</CudaLink>
<Link>
<AdditionalLibraryDirectories>%(AdditionalLibraryDirectories);$(CudaToolkitLibDir)</AdditionalLibraryDirectories>
</Link>
<ClCompile>
<AdditionalIncludeDirectories>%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir)</AdditionalIncludeDirectories>
</ClCompile>
</ItemDefinitionGroup>
</Project>
@@ -39,6 +39,7 @@ set(third_party_deps)
#    REPOSITORY    ${TARGET_REPOSITORY}
#    TAG           ${TARGET_TAG}
#    DIR           ${TARGET_SOURCE_DIR})

FUNCTION(cache_third_party TARGET)
    SET(options "")
    SET(oneValueArgs URL REPOSITORY TAG DIR)

@@ -269,6 +270,10 @@ if(WITH_PSLIB)
    endif()
endif(WITH_PSLIB)

+if(NOT WIN32 AND NOT APPLE)
+    include(external/gloo)
+    list(APPEND third_party_deps extern_gloo)
+endif()

if(WITH_BOX_PS)
    include(external/box_ps)

@@ -276,10 +281,6 @@ if(WITH_BOX_PS)
endif(WITH_BOX_PS)

if(WITH_DISTRIBUTE)
-   if(WITH_GLOO)
-       include(external/gloo)
-       list(APPEND third_party_deps extern_gloo)
-   endif()
    if(WITH_GRPC)
        list(APPEND third_party_deps extern_grpc)
......
# Paddle Inference golang API

## Installation

-First, build with cmake with `-DON_INFER=ON`. The build directory will then contain ``fluid_inference_c_install_dir``; move that directory into the current directory and rename it to `paddle_c`.
+First, build with cmake with `-DON_INFER=ON`. The build directory will then contain ``paddle_inference_c_install_dir``; move that directory into the current directory and rename it to `paddle_c`.

## Using Paddle Inference in Go

First, create the inference config
......
@@ -14,8 +14,6 @@ limitations under the License. */
#include "paddle/fluid/framework/attribute.h"

-#include <vector>

namespace paddle {
namespace framework {
......
@@ -30,6 +30,8 @@ namespace paddle {
namespace framework {

class ProgramDesc;
+class OpDesc;
+class VarDesc;

// Each Protobuf Message, we provide a XXXBind class. In that class, we optimize
// read/write speed. Only when we want the protobuf message, the local changes
......
@@ -12,17 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/c/c_api.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/init.h"

extern "C" {
......
@@ -24,6 +24,15 @@ limitations under the License. */
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/platform/device_context.h"

+namespace paddle {
+namespace framework {
+class OpInfoMap;
+}  // namespace framework
+namespace platform {
+class DeviceContextPool;
+}  // namespace platform
+}  // namespace paddle

#ifdef __cplusplus
extern "C" {
#endif
......
@@ -277,7 +277,7 @@ class ChannelObject {
    size_t finished = 0;
    while (finished < n && WaitForWrite(lock)) {
      size_t m =
-         std::min(n - finished, capacity_ + reading_count_ - data_.size());
+         (std::min)(n - finished, capacity_ + reading_count_ - data_.size());
      for (size_t i = 0; i < m; i++) {
        data_.push_back(std::move(p[finished++]));
      }
......
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.

+#include <cstring>
#include <random>

#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor.h"
......
@@ -21,6 +21,8 @@ limitations under the License. */
namespace paddle {
namespace framework {

+class Tensor;

void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
                     Tensor* out);
......
@@ -527,6 +527,8 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) {
        VLOG(0) << "error: the number of ids is a negative number: " << num;
        VLOG(0) << "please check line<" << instance_cout << "> in file<"
                << filename << ">";
+       VLOG(0) << "Error occured when parsing " << i
+               << " th slot with total slots number: " << all_slots_.size();
        return false;
      } else if (num == 0) {
        VLOG(0)
@@ -536,42 +538,66 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) {
            "characters.";
        VLOG(0) << "please check line<" << instance_cout << "> in file<"
                << filename << ">";
+       VLOG(0) << "Error occured when parsing " << i
+               << " th slot with total slots number: " << all_slots_.size();
        return false;
      } else if (errno == ERANGE || num > INT_MAX) {
        VLOG(0) << "error: the number of ids greater than INT_MAX";
        VLOG(0) << "please check line<" << instance_cout << "> in file<"
                << filename << ">";
+       VLOG(0) << "Error occured when parsing " << i
+               << " th slot with total slots number: " << all_slots_.size();
        return false;
      }
      if (all_slots_type_[i] == "float") {
-       for (int i = 0; i < num; ++i) {
+       for (int j = 0; j < num; ++j) {
          strtof(endptr, &endptr);
          if (errno == ERANGE) {
            VLOG(0) << "error: the value is out of the range of "
                       "representable values for float";
            VLOG(0) << "please check line<" << instance_cout << "> in file<"
                    << filename << ">";
+           VLOG(0) << "Error occured when parsing " << i
+                   << " th slot with total slots number: "
+                   << all_slots_.size();
+           VLOG(0) << "and in this slot: " << j
+                   << " th id with total id number: " << num;
            return false;
          }
-         if (i + 1 != num && endptr - str == len) {
+         if (j + 1 != num && endptr - str == len) {
            VLOG(0) << "error: there is a wrong with the number of ids.";
+           VLOG(0) << "Error occured when parsing " << i
+                   << " th slot with total slots number: "
+                   << all_slots_.size();
+           VLOG(0) << "and in this slot: " << j
+                   << " th id with total id number: " << num;
            VLOG(0) << "please check line<" << instance_cout << "> in file<"
                    << filename << ">";
            return false;
          }
        }
      } else if (all_slots_type_[i] == "uint64") {
-       for (int i = 0; i < num; ++i) {
+       for (int j = 0; j < num; ++j) {
          strtoull(endptr, &endptr, 10);
          if (errno == ERANGE) {
            VLOG(0) << "error: the value is out of the range of "
                       "representable values for uint64_t";
+           VLOG(0) << "Error occured when parsing " << i
+                   << " th slot with total slots number: "
+                   << all_slots_.size();
+           VLOG(0) << "and in this slot: " << j
+                   << " th id with total id number: " << num;
            VLOG(0) << "please check line<" << instance_cout << "> in file<"
                    << filename << ">";
            return false;
          }
-         if (i + 1 != num && endptr - str == len) {
+         if (j + 1 != num && endptr - str == len) {
            VLOG(0) << "error: there is a wrong with the number of ids.";
+           VLOG(0) << "Error occured when parsing " << i
+                   << " th slot with total slots number: "
+                   << all_slots_.size();
+           VLOG(0) << "and in this slot: " << j
+                   << " th id with total id number: " << num;
            VLOG(0) << "please check line<" << instance_cout << "> in file<"
                    << filename << ">";
            return false;
@@ -632,8 +658,13 @@ bool MultiSlotDataFeed::ParseOneInstanceFromPipe(
            "The number of ids can not be zero, you need padding "
            "it in data generator; or if there is something wrong with "
            "the data, please check if the data contains unresolvable "
-           "characters.\nplease check this error line: %s",
-           str));
+           "characters.\nplease check this error line: %s, \n Specifically, "
+           "something wrong happened(the length of this slot's feasign is 0)"
+           "when we parse the %d th slots."
+           "Maybe something wrong around this slot",
+           "\nWe detect the feasign number of this slot is %d, "
+           "which is illegal.",
+           str, i, num));

      if (idx != -1) {
        (*instance)[idx].Init(all_slots_type_[i]);
        if ((*instance)[idx].GetType()[0] == 'f') {  // float
@@ -683,8 +714,13 @@ bool MultiSlotDataFeed::ParseOneInstance(std::vector<MultiSlotType>* instance) {
            "The number of ids can not be zero, you need padding "
            "it in data generator; or if there is something wrong with "
            "the data, please check if the data contains unresolvable "
-           "characters.\nplease check this error line: %s.",
-           str));
+           "characters.\nplease check this error line: %s, \n Specifically, "
+           "something wrong happened(the length of this slot's feasign is 0)"
+           "when we parse the %d th slots."
+           "Maybe something wrong around this slot",
+           "\nWe detect the feasign number of this slot is %d, "
+           "which is illegal.",
+           str, i, num));

      if (idx != -1) {
        (*instance)[idx].Init(all_slots_type_[i]);
@@ -916,8 +952,13 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
            "The number of ids can not be zero, you need padding "
            "it in data generator; or if there is something wrong with "
            "the data, please check if the data contains unresolvable "
-           "characters.\nplease check this error line: %s.",
-           str));
+           "characters.\nplease check this error line: %s, \n Specifically, "
+           "something wrong happened(the length of this slot's feasign is 0)"
+           "when we parse the %d th slots."
+           "Maybe something wrong around this slot",
+           "\nWe detect the feasign number of this slot is %d, "
+           "which is illegal.",
+           str, i, num));

      if (idx != -1) {
        if (all_slots_type_[i][0] == 'f') {  // float
          for (int j = 0; j < num; ++j) {
@@ -982,8 +1023,13 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstance(Record* instance) {
            "The number of ids can not be zero, you need padding "
            "it in data generator; or if there is something wrong with "
            "the data, please check if the data contains unresolvable "
-           "characters.\nplease check this error line: %s.",
-           str));
+           "characters.\nplease check this error line: %s, \n Specifically, "
+           "something wrong happened(the length of this slot's feasign is 0)"
+           "when we parse the %d th slots."
+           "Maybe something wrong around this slot",
+           "\nWe detect the feasign number of this slot is %d, "
+           "which is illegal.",
+           str, i, num));

      if (idx != -1) {
        if (all_slots_type_[i][0] == 'f') {  // float
......
@@ -41,6 +41,15 @@ limitations under the License. */
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/string/string_helper.h"

+namespace paddle {
+namespace framework {
+class DataFeedDesc;
+class LoDTensor;
+class Scope;
+class Variable;
+}  // namespace framework
+}  // namespace paddle

namespace paddle {
namespace framework {

@@ -418,6 +427,7 @@ class MultiSlotType {
  std::string DebugString() {
    std::stringstream ss;

    ss << "\ntype: " << type_ << "\n";
    ss << "offset: ";
    ss << "[";
......
@@ -17,10 +17,10 @@ limitations under the License. */
#include <string>
#include <unordered_map>

-#include "paddle/fluid/framework/data_feed.h"

namespace paddle {
namespace framework {
+class DataFeed;

typedef std::shared_ptr<DataFeed> (*Createdata_feedFunction)();
typedef std::unordered_map<std::string, Createdata_feedFunction> data_feedMap;
data_feedMap g_data_feed_map;
......
@@ -16,10 +16,13 @@ limitations under the License. */
#include <memory>
#include <string>

#include "paddle/fluid/framework/data_feed.h"

namespace paddle {
namespace framework {
+class DataFeed;

class DataFeedFactory {
 public:
  static std::string DataFeedTypeList();
......
@@ -13,8 +13,8 @@
// limitations under the License.

#include "paddle/fluid/framework/data_layout_transform.h"

#include <string>
-#include <vector>

#include "paddle/fluid/operators/math/math_function.h"

#ifdef PADDLE_WITH_MKLDNN
......
@@ -17,10 +17,18 @@
#include <map>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"

+namespace paddle {
+namespace framework {
+class OpKernelType;
+class Tensor;
+}  // namespace framework
+}  // namespace paddle

#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
......
@@ -15,7 +15,6 @@
#include "paddle/fluid/framework/data_layout_transform.h"

#include "gtest/gtest.h"
-#include "paddle/fluid/platform/device_context.h"

TEST(DataTransform, DataLayoutFunction) {
  auto place = paddle::platform::CPUPlace();
......
@@ -18,8 +18,13 @@ limitations under the License. */
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/data_type_transform.h"

+namespace paddle {
+namespace framework {
+class Variable;
+}  // namespace framework
+}  // namespace paddle

#ifdef PADDLE_WITH_MKLDNN
-#include <algorithm>
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
......
@@ -30,6 +30,10 @@ limitations under the License. */
namespace paddle {
namespace framework {

+class OpKernelType;
+class Tensor;
+class Variable;

void TransformData(const OpKernelType &expected_kernel_type,
                   const OpKernelType &kernel_type_for_var,
                   const Tensor &input_tensor, Tensor *out);
......
@@ -13,7 +13,6 @@
// limitations under the License.

#include "paddle/fluid/framework/data_type.h"
-#include <stdint.h>
#include <string>
#include <unordered_map>
......
@@ -15,12 +15,19 @@ limitations under the License. */
#pragma once

#include <string>
#include <typeindex>

-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"

+namespace paddle {
+namespace platform {
+struct bfloat16;
+struct float16;
+}  // namespace platform
+}  // namespace paddle

namespace paddle {
namespace framework {
......
@@ -14,9 +14,17 @@
#include "paddle/fluid/framework/data_type.h"

#include <string>

#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor.h"

+namespace paddle {
+namespace platform {
+struct bfloat16;
+struct float16;
+}  // namespace platform
+}  // namespace paddle

TEST(DataType, float16) {
  using paddle::framework::Tensor;
  using paddle::platform::CPUPlace;
......
@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once

#include <utility>

#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"

@@ -23,6 +24,9 @@ limitations under the License. */
namespace paddle {
namespace framework {

+class OpKernelType;
+class Tensor;

using KernelTypePair = std::pair<OpKernelType, OpKernelType>;

void TransDataType(const OpKernelType& kernel_type_for_var,
......
@@ -13,12 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/dataset_factory.h"

-#include <memory>
#include <string>
#include <unordered_map>

-#include "paddle/fluid/framework/data_set.h"

namespace paddle {
namespace framework {
typedef std::unique_ptr<Dataset> (*CreateDatasetFunction)();
......
@@ -16,6 +16,7 @@ limitations under the License. */
#include <memory>
#include <string>

#include "paddle/fluid/framework/data_set.h"

namespace paddle {
......
@@ -18,6 +18,7 @@ limitations under the License. */
#include <stdexcept>
#include <string>
#include <vector>

#include "paddle/fluid/framework/dim.h"

namespace paddle {
......
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <sstream>
-#include <vector>

#include "gtest/gtest.h"
#include "paddle/fluid/framework/ddim.h"
......
@@ -74,6 +74,7 @@ set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto
        eager_deletion_pass
        buffer_shared_inplace_op_pass
        buffer_shared_cross_op_memory_reuse_pass
+       inplace_addto_op_pass
        set_reader_device_info_utils
        add_reader_dependency_pass)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
......
@@ -12,7 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/details/all_reduce_op_handle.h"

#include <algorithm>

#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
@@ -34,14 +36,24 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                     const std::vector<platform::Place> &places,
                                     const platform::NCCLCommunicator *ctxs)
    : NCCLOpHandleBase(node, places, ctxs), local_scopes_(local_scopes) {
- PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+ PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
+                   platform::errors::InvalidArgument(
+                       "The number of places and the number of local scopes "
+                       "should be equal, but got number of places is %d and "
+                       "number of local scopes is %d.",
+                       places_.size(), local_scopes_.size()));
}
#else
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                     const std::vector<Scope *> &local_scopes,
                                     const std::vector<platform::Place> &places)
    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
- PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+ PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
+                   platform::errors::InvalidArgument(
+                       "The number of places and the number of local scopes "
+                       "should be equal, but got number of places is %d and "
+                       "number of local scopes is %d.",
+                       places_.size(), local_scopes_.size()));
}
#endif
@@ -60,13 +72,25 @@ void AllReduceOpHandle::AllReduceImpl(
    const std::vector<VarHandle *> &in_var_handles,
    const std::vector<VarHandle *> &out_var_handles) {
  size_t num_places = places_.size();
- PADDLE_ENFORCE_EQ(
-     in_var_handles.size(), num_places,
-     "The NoDummyInputSize should be equal to the number of places.");
+ PADDLE_ENFORCE_EQ(in_var_handles.size(), num_places,
+                   platform::errors::InvalidArgument(
+                       "The NoDummyInputSize should be equal "
+                       "to the number of places, but got NoDummyInputSize is "
+                       "%d and the number of places is %d.",
+                       in_var_handles.size(), num_places));
  PADDLE_ENFORCE_EQ(
      in_var_handles.size(), out_var_handles.size(),
-     "The NoDummyInputSize and NoDummyOutputSize should be equal.");
- PADDLE_ENFORCE_EQ(local_exec_scopes_.size(), num_places);
+     platform::errors::InvalidArgument(
+         "The NoDummyInputSize and NoDummyOutputSize should be "
+         "equal, but got NoDummyInputSize is %d and NoDummyOutputSize is %d.",
+         in_var_handles.size(), out_var_handles.size()));
+ PADDLE_ENFORCE_EQ(
+     local_exec_scopes_.size(), num_places,
+     platform::errors::InvalidArgument(
+         "The number of local scopes should be equal "
+         "to the number of places, but got the number of local scopes is "
+         "%d and the number of places is %d.",
+         in_var_handles.size(), num_places));

  std::vector<const void *> lod_tensor_data;
  std::vector<platform::Place> places;
@@ -78,23 +102,36 @@ void AllReduceOpHandle::AllReduceImpl(
  for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
    auto &local_scope = local_exec_scopes_[i];
    auto var = local_scope->FindVar(in_var_handles[i]->name());
-   PADDLE_ENFORCE_NOT_NULL(var, "%s is not found int scope.",
-                           in_var_handles[i]->name());
+   PADDLE_ENFORCE_NOT_NULL(var, platform::errors::NotFound(
+                                    "Variable %s is not found in local scope.",
+                                    in_var_handles[i]->name()));
    auto &lod_tensor = var->Get<LoDTensor>();

    if (i == 0) {
      numel = static_cast<int64_t>(lod_tensor.numel());
      // only enforce place0, we will enforce other palce numel == place0 numel
      PADDLE_ENFORCE_GT(
-         numel, 0, platform::errors::InvalidArgument(
-                       "The numel of tensos=[%s] must > 0. But now numel=[%d]",
-                       in_var_handles[i]->name(), numel));
+         numel, 0,
+         platform::errors::PreconditionNotMet(
+             "The numel of tensor %s should be > 0, but got numel is %d.",
+             in_var_handles[i]->name(), numel));
      dtype = lod_tensor.type();
      is_gpu_place = platform::is_gpu_place(lod_tensor.place());
    }
-   PADDLE_ENFORCE_EQ(numel, static_cast<int64_t>(lod_tensor.numel()));
-   PADDLE_ENFORCE_EQ(dtype, lod_tensor.type());
-   PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()));
+   PADDLE_ENFORCE_EQ(
+       numel, static_cast<int64_t>(lod_tensor.numel()),
+       platform::errors::PreconditionNotMet(
+           "The size of tensors of the same variable in different local "
+           "scopes should be equal."));
+   PADDLE_ENFORCE_EQ(
+       dtype, lod_tensor.type(),
+       platform::errors::PreconditionNotMet(
+           "The dtype of tensors of the same variable in different local "
+           "scopes should be equal."));
+   PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()),
+                     platform::errors::PreconditionNotMet(
+                         "The place type of tensors of the same variable "
+                         "in different local scopes should be equal."));

    lod_tensor_data.emplace_back(lod_tensor.data<void>());
    places.emplace_back(lod_tensor.place());
...@@ -102,8 +139,12 @@ void AllReduceOpHandle::AllReduceImpl( ...@@ -102,8 +139,12 @@ void AllReduceOpHandle::AllReduceImpl(
VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name() VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name()
<< ", out_name:" << out_var_handles[i]->name(); << ", out_name:" << out_var_handles[i]->name();
PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(), PADDLE_ENFORCE_EQ(
"The name of input and output should be equal."); in_var_handles[i]->name(), out_var_handles[i]->name(),
platform::errors::InvalidArgument(
"The name of input and output of all_reduce op should be equal, "
"but got input is %s and output is %s.",
in_var_handles[i]->name(), out_var_handles[i]->name()));
} }
std::vector<std::string> grad_var_names; std::vector<std::string> grad_var_names;
...@@ -122,7 +163,9 @@ void AllReduceOpHandle::AllReduceFunc( ...@@ -122,7 +163,9 @@ void AllReduceOpHandle::AllReduceFunc(
const std::vector<std::string> &out_var_names) { const std::vector<std::string> &out_var_names) {
if (is_gpu_place(places[0])) { if (is_gpu_place(places[0])) {
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, "nccl_ctxs should not be nullptr."); PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_,
platform::errors::InvalidArgument(
"The nccl context should not be NULL."));
ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype); ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype);
std::vector<std::function<void()>> all_reduce_calls; std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
...@@ -134,7 +177,8 @@ void AllReduceOpHandle::AllReduceFunc( ...@@ -134,7 +177,8 @@ void AllReduceOpHandle::AllReduceFunc(
} }
NCCLAllReduceFunc(all_reduce_calls); NCCLAllReduceFunc(all_reduce_calls);
#else #else
PADDLE_THROW("Not compiled with CUDA."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif #endif
} else { // Special handle CPU only Operator's gradient. Like CRF } else { // Special handle CPU only Operator's gradient. Like CRF
auto &trg = *local_exec_scopes_[0] auto &trg = *local_exec_scopes_[0]
......
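All of the changes in this handle follow one pattern: each bare-string PADDLE_ENFORCE* gains a typed platform::errors::* error whose message reports both the expected and the actual values. A minimal sketch of that pattern, assuming only paddle/fluid/platform/enforce.h; the helper name and its arguments are illustrative and not part of this diff:

#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace framework {
namespace details {

// Illustrative helper: check that every place has a matching input handle.
inline void CheckNoDummyInputSize(size_t num_inputs, size_t num_places) {
  // Old style: PADDLE_ENFORCE_EQ(num_inputs, num_places,
  //     "The NoDummyInputSize should be equal to the number of places.");
  // New style: attach an error type and echo both operands in the message.
  PADDLE_ENFORCE_EQ(
      num_inputs, num_places,
      platform::errors::InvalidArgument(
          "The NoDummyInputSize should be equal to the number of places, "
          "but got NoDummyInputSize is %d and the number of places is %d.",
          num_inputs, num_places));
}

}  // namespace details
}  // namespace framework
}  // namespace paddle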
...@@ -20,6 +20,17 @@ ...@@ -20,6 +20,17 @@
#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
class NCCLCommunicator;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/framework/details/nccl_op_handle.h"
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
......
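The header hunk above adds forward declarations rather than new #include lines: a header that only names ir::Node or platform::NCCLCommunicator through pointers does not need their full definitions. A hedged sketch of the idea; ExampleHandle is a made-up type used only for illustration:

// Forward declarations mirror the ones added by this diff; the full
// definitions are only pulled in by the .cc files that dereference them.
namespace paddle {
namespace framework {
namespace ir {
class Node;
}  // namespace ir
}  // namespace framework
namespace platform {
class NCCLCommunicator;
}  // namespace platform
}  // namespace paddle

namespace paddle {
namespace framework {
namespace details {

// A handle can now mention both types in its interface without including
// the heavy graph or NCCL headers.
struct ExampleHandle {
  ir::Node *node{nullptr};
  platform::NCCLCommunicator *nccl_ctxs{nullptr};
};

}  // namespace details
}  // namespace framework
}  // namespace paddle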
...@@ -89,8 +89,19 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( ...@@ -89,8 +89,19 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor(
places_(std::move(places)), places_(std::move(places)),
graphs_(std::move(graphs)) { graphs_(std::move(graphs)) {
VLOG(3) << "build AsyncSSAGraphExecutor"; VLOG(3) << "build AsyncSSAGraphExecutor";
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size()); platform::errors::InvalidArgument(
"The number of places and the number of local scopes "
"should be equal, but got number of places is %d and "
"number of local scopes is %d.",
places_.size(), local_scopes_.size()));
PADDLE_ENFORCE_EQ(
local_scopes_.size(), local_exec_scopes_.size(),
platform::errors::InvalidArgument(
"The number of local scopes and the number of local execution scopes "
"should be equal, but got number of local scopes is %d and "
"number of local execution scopes is %d.",
local_scopes_.size(), local_exec_scopes_.size()));
// set the correct size of thread pool to each device. // set the correct size of thread pool to each device.
strategy_.num_threads_ = strategy_.num_threads_ < places_.size() strategy_.num_threads_ = strategy_.num_threads_ < places_.size()
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -31,10 +32,15 @@ void BroadcastOpHandle::RunImpl() { ...@@ -31,10 +32,15 @@ void BroadcastOpHandle::RunImpl() {
auto out_var_handles = DynamicCast<VarHandle>(outputs_); auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL, PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
"The number of input should be one."); platform::errors::PreconditionNotMet(
PADDLE_ENFORCE_EQ( "The number of inputs should be 1, but got %d.",
out_var_handles.size(), places_.size(), in_var_handles.size()));
"The number of output should equal to the number of places."); PADDLE_ENFORCE_EQ(out_var_handles.size(), places_.size(),
platform::errors::PreconditionNotMet(
"The number of outputs and the number of places should "
"be equal, but got the number of outputs is %d and the "
"number of places is %d.",
out_var_handles.size(), places_.size()));
VarHandle *in_var_handle = in_var_handles[0]; VarHandle *in_var_handle = in_var_handles[0];
...@@ -47,7 +53,9 @@ void BroadcastOpHandle::BroadcastOneVar( ...@@ -47,7 +53,9 @@ void BroadcastOpHandle::BroadcastOneVar(
const std::vector<Scope *> &var_scopes) { const std::vector<Scope *> &var_scopes) {
auto *in_var = auto *in_var =
var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name()); var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name());
PADDLE_ENFORCE_NOT_NULL(in_var); PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound("Variable %s is not found in scopes.",
in_var_handle.name()));
Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var); Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
if (UNLIKELY(!in_tensor.IsInitialized())) { if (UNLIKELY(!in_tensor.IsInitialized())) {
VLOG(3) << "in var " << in_var_handle.name() << "not inited, return!"; VLOG(3) << "in var " << in_var_handle.name() << "not inited, return!";
...@@ -103,7 +111,7 @@ void BroadcastOpHandle::BroadcastOneVar( ...@@ -103,7 +111,7 @@ void BroadcastOpHandle::BroadcastOneVar(
broadcast_calls.emplace_back( broadcast_calls.emplace_back(
[send_recv_buffer, numel, type, root_id, &nccl_ctx] { [send_recv_buffer, numel, type, root_id, &nccl_ctx] {
PADDLE_ENFORCE(platform::dynload::ncclBcast( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
send_recv_buffer, numel, static_cast<ncclDataType_t>(type), send_recv_buffer, numel, static_cast<ncclDataType_t>(type),
root_id, nccl_ctx.comm_, nccl_ctx.stream())); root_id, nccl_ctx.comm_, nccl_ctx.stream()));
}); });
...@@ -131,7 +139,8 @@ void BroadcastOpHandle::BroadcastOneVar( ...@@ -131,7 +139,8 @@ void BroadcastOpHandle::BroadcastOneVar(
nccl_ctxs_->DevCtx(p)->Wait(); nccl_ctxs_->DevCtx(p)->Wait();
} }
#else #else
PADDLE_THROW("CUDA is not enabled."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif #endif
} }
} }
...@@ -154,10 +163,13 @@ void BroadcastOpHandle::InitOutputValue( ...@@ -154,10 +163,13 @@ void BroadcastOpHandle::InitOutputValue(
auto t_out_p = out_var_handle->place(); auto t_out_p = out_var_handle->place();
auto *out_var = var_scopes.at(out_var_handle->scope_idx()) auto *out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name()); ->FindVar(out_var_handle->name());
PADDLE_ENFORCE_NOT_NULL(out_var); PADDLE_ENFORCE_NOT_NULL(out_var, platform::errors::NotFound(
"Variable %s is not found in scopes.",
out_var_handle->name()));
if (is_gpu_place(in_tensor.place())) { if (is_gpu_place(in_tensor.place())) {
PADDLE_ENFORCE(platform::is_gpu_place(t_out_p), PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true,
"Places of input and output must be all on GPU."); platform::errors::PreconditionNotMet(
"Places of input and output must be all on GPU."));
} else { } else {
t_out_p = platform::CPUPlace(); t_out_p = platform::CPUPlace();
} }
......
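broadcast_op_handle.cc also swaps the raw PADDLE_ENFORCE(...) around CUDA and NCCL runtime calls for PADDLE_ENFORCE_CUDA_SUCCESS(...), which inspects the returned status code. A small sketch of the same usage under a CUDA build; the function and its parameters are illustrative:

#include <cuda_runtime.h>
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace framework {
namespace details {

// Record an event on a stream, throwing a Paddle exception (rather than
// doing a bare boolean check) when either CUDA call fails.
inline void RecordDisableTimingEvent(cudaStream_t stream, cudaEvent_t *event) {
  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaEventCreateWithFlags(event, cudaEventDisableTiming));
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(*event, stream));
}

}  // namespace details
}  // namespace framework
}  // namespace paddle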
...@@ -24,6 +24,20 @@ ...@@ -24,6 +24,20 @@
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct VarHandle;
} // namespace details
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
struct NCCLContextMap;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#endif #endif
......
...@@ -21,13 +21,15 @@ ...@@ -21,13 +21,15 @@
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
struct DummyVarHandle;
struct VarHandle;
namespace f = paddle::framework; namespace f = paddle::framework;
namespace p = paddle::platform; namespace p = paddle::platform;
...@@ -77,7 +79,8 @@ struct TestBroadcastOpHandle { ...@@ -77,7 +79,8 @@ struct TestBroadcastOpHandle {
} }
nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_)); nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_));
#else #else
PADDLE_THROW("CUDA is not support."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif #endif
} else { } else {
int count = 8; int count = 8;
...@@ -111,7 +114,8 @@ struct TestBroadcastOpHandle { ...@@ -111,7 +114,8 @@ struct TestBroadcastOpHandle {
op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
place_list_, nccl_ctxs_.get()); place_list_, nccl_ctxs_.get());
#else #else
PADDLE_THROW("CUDA is not support."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif #endif
} else { } else {
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
...@@ -169,7 +173,9 @@ struct TestBroadcastOpHandle { ...@@ -169,7 +173,9 @@ struct TestBroadcastOpHandle {
float val_scalar = 0.0) { float val_scalar = 0.0) {
auto var = param_scopes_[input_scope_idx]->FindVar(varname); auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var); PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
varname));
auto lod_tensor = var->GetMutable<f::LoDTensor>(); auto lod_tensor = var->GetMutable<f::LoDTensor>();
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims))); std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
for (size_t k = 0; k < send_vector.size(); ++k) { for (size_t k = 0; k < send_vector.size(); ++k) {
...@@ -192,7 +198,9 @@ struct TestBroadcastOpHandle { ...@@ -192,7 +198,9 @@ struct TestBroadcastOpHandle {
} }
auto var = param_scopes_[input_scope_idx]->FindVar(varname); auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var); PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
varname));
auto selected_rows = var->GetMutable<f::SelectedRows>(); auto selected_rows = var->GetMutable<f::SelectedRows>();
auto value = selected_rows->mutable_value(); auto value = selected_rows->mutable_value();
value->mutable_data<float>(kDims, place_list_[input_scope_idx]); value->mutable_data<float>(kDims, place_list_[input_scope_idx]);
...@@ -209,13 +217,24 @@ struct TestBroadcastOpHandle { ...@@ -209,13 +217,24 @@ struct TestBroadcastOpHandle {
const std::vector<float>& send_vector, const std::vector<float>& send_vector,
const std::vector<int64_t>& rows, int height) { const std::vector<int64_t>& rows, int height) {
auto var = param_scopes_[input_scope_idx]->FindVar(varname); auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var); PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
varname));
auto& selected_rows = var->Get<f::SelectedRows>(); auto& selected_rows = var->Get<f::SelectedRows>();
auto rt = selected_rows.value(); auto rt = selected_rows.value();
PADDLE_ENFORCE_EQ(selected_rows.height(), height, "height is not equal."); PADDLE_ENFORCE_EQ(selected_rows.height(), height,
platform::errors::InvalidArgument(
"The height of SelectedRows is not equal to "
"the expected, expect %d, but got %ld.",
height, selected_rows.height()));
for (size_t k = 0; k < selected_rows.rows().size(); ++k) { for (size_t k = 0; k < selected_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(selected_rows.rows()[k], rows[k]); PADDLE_ENFORCE_EQ(
selected_rows.rows()[k], rows[k],
platform::errors::InvalidArgument(
"The item at position %zu of rows of SelectedRows "
"is not equal to the expected, expect %ld, but got %ld.",
k, rows[k], selected_rows.rows()[k]));
} }
p::CPUPlace cpu_place; p::CPUPlace cpu_place;
...@@ -233,9 +252,15 @@ struct TestBroadcastOpHandle { ...@@ -233,9 +252,15 @@ struct TestBroadcastOpHandle {
framework::Scope* scope) { framework::Scope* scope) {
p::CPUPlace cpu_place; p::CPUPlace cpu_place;
auto var = scope->FindVar(varname); auto var = scope->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var); PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
varname));
auto tensor = var->Get<f::LoDTensor>(); auto tensor = var->Get<f::LoDTensor>();
PADDLE_ENFORCE_EQ(tensor.lod(), lod, "lod is not equal."); PADDLE_ENFORCE_EQ(tensor.lod(), lod,
platform::errors::InvalidArgument(
"The LoD of tensor is not equal to "
"the expected, expect %s, but got %s.",
lod, tensor.lod()));
f::Tensor result_tensor; f::Tensor result_tensor;
f::TensorCopySync(tensor, cpu_place, &result_tensor); f::TensorCopySync(tensor, cpu_place, &result_tensor);
float* ct = result_tensor.mutable_data<float>(cpu_place); float* ct = result_tensor.mutable_data<float>(cpu_place);
......
...@@ -235,7 +235,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -235,7 +235,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
AppendPass("reduce_mode_multi_devices_pass").get(); AppendPass("reduce_mode_multi_devices_pass").get();
break; break;
default: default:
PADDLE_THROW("Unknown reduce strategy."); PADDLE_THROW(
platform::errors::Unimplemented("Unknown reduce strategy."));
} }
} }
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy", multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
......
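The build_strategy.cc hunk applies the same treatment to PADDLE_THROW: the bare message becomes a typed platform::errors::Unimplemented error. A self-contained sketch with a made-up enum standing in for the real reduce strategy:

#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace framework {
namespace details {

// Illustrative stand-in for the real reduce strategy enum.
enum class ReduceKind { kAllReduce, kReduce };

inline void AppendMultiDevicesPass(ReduceKind kind) {
  switch (kind) {
    case ReduceKind::kAllReduce:
      // ... append the all_reduce multi-devices pass ...
      break;
    case ReduceKind::kReduce:
      // ... append the reduce_mode multi-devices pass ...
      break;
    default:
      PADDLE_THROW(
          platform::errors::Unimplemented("Unknown reduce strategy."));
  }
}

}  // namespace details
}  // namespace framework
}  // namespace paddle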
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "boost/optional.hpp" #include "boost/optional.hpp"
#include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
...@@ -26,6 +27,18 @@ ...@@ -26,6 +27,18 @@
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
class Graph;
class PassBuilder;
} // namespace ir
} // namespace framework
namespace platform {
class NCCLCommunicator;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#endif #endif
...@@ -119,6 +132,9 @@ struct BuildStrategy { ...@@ -119,6 +132,9 @@ struct BuildStrategy {
// Turn on inplace by default. // Turn on inplace by default.
bool enable_inplace_{true}; bool enable_inplace_{true};
// Turn off inplace addto by default.
bool enable_addto_{false};
// FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
// num_trainers is 1, so the current fields of build_strategy doesn't tell if // num_trainers is 1, so the current fields of build_strategy doesn't tell if
// it's distributed model. // it's distributed model.
......
...@@ -19,6 +19,8 @@ ...@@ -19,6 +19,8 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
struct VarHandleBase;
ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
platform::Place place, platform::Place place,
size_t scope_idx) size_t scope_idx)
......
...@@ -24,9 +24,21 @@ ...@@ -24,9 +24,21 @@
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
class OperatorBase;
class Scope;
namespace ir {
class Node;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
struct VarHandleBase;
class ComputationOpHandle : public OpHandleBase { class ComputationOpHandle : public OpHandleBase {
public: public:
ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place, ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
......
...@@ -12,11 +12,12 @@ ...@@ -12,11 +12,12 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include <memory> #include <memory>
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
...@@ -47,15 +48,19 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( ...@@ -47,15 +48,19 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
if (dynamic_cast<StreamGarbageCollector *>(gc_)) { if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
platform::CUDADeviceGuard guard( platform::CUDADeviceGuard guard(
BOOST_GET_CONST(platform::CUDAPlace, place).device); BOOST_GET_CONST(platform::CUDAPlace, place).device);
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); PADDLE_ENFORCE_CUDA_SUCCESS(
PADDLE_ENFORCE_NOT_NULL(event_); cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
PADDLE_ENFORCE_NOT_NULL(event_, platform::errors::InvalidArgument(
"The cuda envet created is NULL."));
} }
} }
#endif #endif
PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument( PADDLE_ENFORCE_NE(vars.empty(), true,
"Variable names are empty.")); platform::errors::InvalidArgument(
"The variables to be deleted are empty."));
for (auto *var : var_infos_) { for (auto *var : var_infos_) {
PADDLE_ENFORCE_NOT_NULL(var); PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument(
"The memory optimization info is NULL."));
} }
} }
...@@ -64,7 +69,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { ...@@ -64,7 +69,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
if (event_) { if (event_) {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace()); auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace());
platform::CUDADeviceGuard guard(gpu_place.device); platform::CUDADeviceGuard guard(gpu_place.device);
PADDLE_ENFORCE(cudaEventDestroy(event_)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
} }
#endif #endif
} }
...@@ -78,12 +83,17 @@ void EagerDeletionOpHandle::InitCUDA() { ...@@ -78,12 +83,17 @@ void EagerDeletionOpHandle::InitCUDA() {
} }
void EagerDeletionOpHandle::CallOnce() { void EagerDeletionOpHandle::CallOnce() {
PADDLE_ENFORCE(vars_.empty(), "vars_ must be initialized here"); PADDLE_ENFORCE_EQ(
vars_.empty(), true,
platform::errors::InvalidArgument(
"The variables to be deleted should be initialized here."));
Scope *exec_scope = local_exec_scopes_[0]; Scope *exec_scope = local_exec_scopes_[0];
for (auto *var_info : var_infos_) { for (auto *var_info : var_infos_) {
auto *var = exec_scope->FindVar(var_info->Name()); auto *var = exec_scope->FindVar(var_info->Name());
PADDLE_ENFORCE_NOT_NULL(var, "Variable %s should not be nullptr", PADDLE_ENFORCE_NOT_NULL(
var_info->Name()); var, platform::errors::NotFound(
"The variable(%s) to be inplaced is not found in scope.",
var_info->Name()));
vars_.emplace_back(var); vars_.emplace_back(var);
} }
} }
...@@ -119,8 +129,9 @@ void EagerDeletionOpHandle::RunImpl() { ...@@ -119,8 +129,9 @@ void EagerDeletionOpHandle::RunImpl() {
garbages.emplace_back(t.MoveMemoryHolder()); garbages.emplace_back(t.MoveMemoryHolder());
} }
} else { } else {
PADDLE_THROW("Type %s of %s is not supported eager deletion", PADDLE_THROW(platform::errors::Unimplemented(
framework::ToTypeName(var->Type()), var_info->Name()); "Type %s of variable %s is not supported in eager deletion.",
framework::ToTypeName(var->Type()), var_info->Name()));
} }
} }
...@@ -137,8 +148,9 @@ void EagerDeletionOpHandle::ClearGarbages( ...@@ -137,8 +148,9 @@ void EagerDeletionOpHandle::ClearGarbages(
auto callback_stream = auto callback_stream =
reinterpret_cast<StreamGarbageCollector *>(gc_)->stream(); reinterpret_cast<StreamGarbageCollector *>(gc_)->stream();
auto callback_func = [=]() { auto callback_func = [=]() {
PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, compute_stream));
PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(callback_stream, event_, 0));
}; };
gc_->Add(std::move(*garbages), callback_func); gc_->Add(std::move(*garbages), callback_func);
} else { } else {
......
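Several hunks in eager_deletion_op_handle.cc, and later in the broadcast and gather handles, replace a bare PADDLE_ENFORCE_NOT_NULL(var) with a platform::errors::NotFound error that names the missing variable. A minimal sketch of that lookup pattern; FindVarChecked is a hypothetical helper, not a function in this diff:

#include <string>
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace framework {
namespace details {

// Look a variable up in a scope and fail with a NotFound error that carries
// the variable name instead of a silent null check.
inline Variable *FindVarChecked(Scope *scope, const std::string &name) {
  auto *var = scope->FindVar(name);
  PADDLE_ENFORCE_NOT_NULL(
      var, platform::errors::NotFound("Variable %s is not found in scope.",
                                      name));
  return var;
}

}  // namespace details
}  // namespace framework
}  // namespace paddle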
...@@ -19,12 +19,23 @@ ...@@ -19,12 +19,23 @@
#include <string> #include <string>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle { namespace paddle {
namespace framework { namespace framework {
class Scope; class Scope;
class GarbageCollector;
namespace ir {
class Node;
} // namespace ir
namespace ir { namespace ir {
class MemOptVarInfo; class MemOptVarInfo;
......
...@@ -13,8 +13,7 @@ ...@@ -13,8 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/exception_holder.h"
#include <memory>
#include <unordered_map>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator.h"
......
...@@ -12,12 +12,14 @@ ...@@ -12,12 +12,14 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include <deque> #include <deque>
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/fetch_async_op_handle.h" #include "paddle/fluid/framework/details/fetch_async_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
...@@ -48,7 +50,9 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( ...@@ -48,7 +50,9 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
bootstrap_ops_.emplace_back(op); bootstrap_ops_.emplace_back(op);
} }
} }
PADDLE_ENFORCE_GT(op_deps_.size(), 0, "The graph doesn't have operators."); PADDLE_ENFORCE_GT(op_deps_.size(), 0,
platform::errors::PreconditionNotMet(
"The graph doesn't have operators."));
PrepareAtomicOpDeps(); PrepareAtomicOpDeps();
} }
......
...@@ -15,9 +15,14 @@ ...@@ -15,9 +15,14 @@
#include "paddle/fluid/framework/details/fetch_async_op_handle.h" #include "paddle/fluid/framework/details/fetch_async_op_handle.h"
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector>
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
......
...@@ -22,6 +22,18 @@ ...@@ -22,6 +22,18 @@
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
class LoDTensor;
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
......
...@@ -19,6 +19,8 @@ ...@@ -19,6 +19,8 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
struct VarHandleBase;
FetchBarrierOpHandle::FetchBarrierOpHandle( FetchBarrierOpHandle::FetchBarrierOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes, ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places) const std::vector<platform::Place> &places)
......
...@@ -24,6 +24,15 @@ ...@@ -24,6 +24,15 @@
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
class Scope;
namespace ir {
class Node;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
...@@ -32,6 +41,8 @@ namespace details { ...@@ -32,6 +41,8 @@ namespace details {
// all places if there are multiple places, must init with // all places if there are multiple places, must init with
// multiple dev_ctxes_ !!!! // multiple dev_ctxes_ !!!!
struct VarHandleBase;
struct FetchBarrierOpHandle : public OpHandleBase { struct FetchBarrierOpHandle : public OpHandleBase {
public: public:
FetchBarrierOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes, FetchBarrierOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
......
...@@ -13,9 +13,11 @@ ...@@ -13,9 +13,11 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/fetch_op_handle.h"
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
...@@ -138,8 +140,10 @@ void FetchOpHandle::RunImpl() { ...@@ -138,8 +140,10 @@ void FetchOpHandle::RunImpl() {
auto *var_handle = static_cast<VarHandle *>(inputs_[i]); auto *var_handle = static_cast<VarHandle *>(inputs_[i]);
auto &scope = scopes.at(var_handle->scope_idx()); auto &scope = scopes.at(var_handle->scope_idx());
auto *var = scope->FindVar(var_handle->name()); auto *var = scope->FindVar(var_handle->name());
PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope", PADDLE_ENFORCE_NOT_NULL(
var_handle->name()); var,
platform::errors::NotFound(
"Cannot find variable %s in execution scope.", var_handle->name()));
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
auto &t = var->Get<framework::LoDTensor>(); auto &t = var->Get<framework::LoDTensor>();
......
...@@ -22,6 +22,17 @@ ...@@ -22,6 +22,17 @@
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
......
...@@ -12,8 +12,10 @@ ...@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h" #include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
#include <algorithm> #include <algorithm>
#include <utility> #include <utility>
#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/framework/details/variable_visitor.h"
...@@ -56,10 +58,20 @@ void FusedAllReduceOpHandle::RunImpl() { ...@@ -56,10 +58,20 @@ void FusedAllReduceOpHandle::RunImpl() {
size_t place_num = places_.size(); size_t place_num = places_.size();
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_var_handles.size(), place_num * num_of_all_reduce_, in_var_handles.size(), place_num * num_of_all_reduce_,
"The NoDummyInputSize should be equal to the number of places."); platform::errors::PreconditionNotMet(
"The number of input variable handles should be equal to the number "
"of places plus the number of all reduce handles, "
"but got the number of input variable handles is %d, the "
"number of places is %d, and the number of all reduce handles "
"is %d.",
in_var_handles.size(), place_num, num_of_all_reduce_));
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(), in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal."); platform::errors::PreconditionNotMet(
"The number of input variable handles should be equal to the number "
"of output variable handles, but got the number of input variable "
"handles is %d, and the number of output variable handles is %d.",
in_var_handles.size(), out_var_handles.size()));
// Note: some gradient op doesn't have CUDAKernel, so the gradients of // Note: some gradient op doesn't have CUDAKernel, so the gradients of
// those op are in CPUPlace, in this case, the all reduce should not be fused. // those op are in CPUPlace, in this case, the all reduce should not be fused.
...@@ -106,7 +118,13 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc( ...@@ -106,7 +118,13 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
dtype = ele_dtype; dtype = ele_dtype;
} }
PADDLE_ENFORCE_EQ(ele_dtype, dtype); PADDLE_ENFORCE_EQ(
ele_dtype, dtype,
platform::errors::InvalidArgument(
"The DataType of grad tensors of fused_all_reduce_op_handle "
"must be consistent. The current dtype is %s, but the "
"previous dtype is %s.",
DataTypeToString(ele_dtype), DataTypeToString(dtype)));
// Check whether the address space is contiguous. // Check whether the address space is contiguous.
std::sort( std::sort(
...@@ -130,16 +148,29 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc( ...@@ -130,16 +148,29 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
"input[%d] address: 0X%02x. The offset: %d", "input[%d] address: 0X%02x. The offset: %d",
k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k, k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k,
next_address, k, infer_next_address, offset); next_address, k, infer_next_address, offset);
PADDLE_ENFORCE_EQ(infer_next_address, next_address, PADDLE_ENFORCE_EQ(
"The address is not consistent."); infer_next_address, next_address,
platform::errors::InvalidArgument(
"The infered address of the next tensor should be equal to the "
"real address of the next tensor. But got infered address is %p "
"and real address is %p.",
infer_next_address, next_address));
} }
} }
if (!FLAGS_skip_fused_all_reduce_check) { if (!FLAGS_skip_fused_all_reduce_check) {
for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) { for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
for (size_t j = 1; j < num_of_all_reduce_; ++j) { for (size_t j = 1; j < num_of_all_reduce_; ++j) {
PADDLE_ENFORCE_EQ(grads_tensor.at(0).at(j).first, PADDLE_ENFORCE_EQ(
grads_tensor.at(scope_idx).at(j).first); grads_tensor.at(0).at(j).first,
grads_tensor.at(scope_idx).at(j).first,
platform::errors::InvalidArgument(
"The variable name of grad tensors of "
"fused_all_reduce_op_handle "
"must be consistent. The current name is %s, but the "
"previous name is %s.",
grads_tensor.at(0).at(j).first,
grads_tensor.at(scope_idx).at(j).first));
} }
} }
} }
...@@ -167,7 +198,9 @@ bool FusedAllReduceOpHandle::InputIsInDifferentPlace( ...@@ -167,7 +198,9 @@ bool FusedAllReduceOpHandle::InputIsInDifferentPlace(
for (size_t j = 0; j < in_var_handles.size(); j += place_num) { for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
auto var_name = in_var_handles[j]->name(); auto var_name = in_var_handles[j]->name();
auto var = local_scope->FindVar(var_name); auto var = local_scope->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name); PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound(
"The variable '%s' is not found in local scope.", var_name));
auto &lod_tensor = var->Get<LoDTensor>(); auto &lod_tensor = var->Get<LoDTensor>();
if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) { if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) {
return true; return true;
...@@ -185,14 +218,24 @@ void FusedAllReduceOpHandle::GetGradLoDTensor( ...@@ -185,14 +218,24 @@ void FusedAllReduceOpHandle::GetGradLoDTensor(
size_t place_num = places_.size(); size_t place_num = places_.size();
for (size_t j = 0; j < in_var_handles.size(); j += place_num) { for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
auto var_name = in_var_handles[j]->name(); auto var_name = in_var_handles[j]->name();
PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name()); PADDLE_ENFORCE_EQ(
var_name, out_var_handles[j]->name(),
platform::errors::InvalidArgument(
"The name of input variable should be equal "
"to the name of output variable. But got the name of input "
"variable is %s and the name of output variable is %s.",
var_name, out_var_handles[j]->name()));
auto var = local_scope->FindVar(var_name); auto var = local_scope->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name); PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound(
"The variable '%s' is not found in local scope.", var_name));
auto &lod_tensor = var->Get<LoDTensor>(); auto &lod_tensor = var->Get<LoDTensor>();
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
platform::is_same_place(lod_tensor.place(), places_.at(scope_idx)), platform::is_same_place(lod_tensor.place(), places_.at(scope_idx)),
true, "%s(%d) is not in the right place.", var_name, scope_idx); true, platform::errors::InvalidArgument(
"The variable '%s' at scope %d is not in the right place.",
var_name, scope_idx));
grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor)); grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor));
} }
} }
...@@ -204,16 +247,26 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel( ...@@ -204,16 +247,26 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
size_t size_of_dtype = 0; size_t size_of_dtype = 0;
for (size_t i = 0; i < grad_tensor.size(); ++i) { for (size_t i = 0; i < grad_tensor.size(); ++i) {
// Get dtype // Get dtype
auto ele_type = grad_tensor.at(i).second->type(); auto ele_dtype = grad_tensor.at(i).second->type();
if (i == 0) { if (i == 0) {
*dtype = ele_type; *dtype = ele_dtype;
size_of_dtype = framework::SizeOfType(ele_type); size_of_dtype = framework::SizeOfType(ele_dtype);
} }
PADDLE_ENFORCE_EQ(ele_type, *dtype); PADDLE_ENFORCE_EQ(
ele_dtype, *dtype,
platform::errors::InvalidArgument(
"The DataType of grad tensors of fused_all_reduce_op_handle "
"must be consistent. The current dtype is %s, but the "
"previous dtype is %s.",
DataTypeToString(ele_dtype), DataTypeToString(*dtype)));
// Get element number // Get element number
int64_t len = grad_tensor.at(i).second->numel(); int64_t len = grad_tensor.at(i).second->numel();
PADDLE_ENFORCE_GT(len, 0); PADDLE_ENFORCE_GT(
len, 0, platform::errors::InvalidArgument(
"The size of grad tensors of fused_all_reduce_op_handle "
"must be > 0, but got %d.",
len));
*numel += *numel +=
platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype; platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
} }
......
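The fused all-reduce hunk above checks that every gradient tensor shares the dtype of the first one and has a positive element count. A compact sketch of that loop under the new error style, assuming LoDTensor and DataTypeToString from the framework headers; CheckGradTensors is a hypothetical helper:

#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace framework {
namespace details {

// Every grad tensor must share the dtype of the first one and contribute a
// positive element count. Assumes the list is non-empty.
inline void CheckGradTensors(
    const std::vector<std::pair<std::string, const LoDTensor *>> &grads) {
  auto dtype = grads.front().second->type();
  for (auto &g : grads) {
    PADDLE_ENFORCE_EQ(
        g.second->type(), dtype,
        platform::errors::InvalidArgument(
            "The DataType of grad tensors must be consistent. The current "
            "dtype is %s, but the previous dtype is %s.",
            DataTypeToString(g.second->type()), DataTypeToString(dtype)));
    PADDLE_ENFORCE_GT(
        g.second->numel(), 0,
        platform::errors::InvalidArgument(
            "The numel of grad tensor %s must be > 0, but got %d.", g.first,
            g.second->numel()));
  }
}

}  // namespace details
}  // namespace framework
}  // namespace paddle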
...@@ -17,10 +17,22 @@ ...@@ -17,10 +17,22 @@
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/all_reduce_op_handle.h" #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
class NCCLCommunicator;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/framework/details/nccl_op_handle.h"
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h" #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -32,7 +33,15 @@ void FusedBroadcastOpHandle::RunImpl() { ...@@ -32,7 +33,15 @@ void FusedBroadcastOpHandle::RunImpl() {
WaitInputVarGenerated(); WaitInputVarGenerated();
size_t place_num = places_.size(); size_t place_num = places_.size();
PADDLE_ENFORCE_EQ(in_var_handles.size() * place_num, out_var_handles.size()); PADDLE_ENFORCE_EQ(
in_var_handles.size() * place_num, out_var_handles.size(),
platform::errors::PreconditionNotMet(
"The number of input variable handles plus the number "
"of places should be equal to the number of output variable handles, "
"but got the number of input variable handles is %d, the "
"number of places is %d, and the number of output variable handles "
"is %d.",
in_var_handles.size(), place_num, out_var_handles.size()));
for (size_t i = 0; i < in_var_handles.size(); ++i) { for (size_t i = 0; i < in_var_handles.size(); ++i) {
BroadcastOneVar( BroadcastOneVar(
......
...@@ -25,6 +25,17 @@ ...@@ -25,6 +25,17 @@
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
struct NCCLContextMap;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#endif #endif
......
...@@ -13,15 +13,26 @@ ...@@ -13,15 +13,26 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h" #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include <memory> #include <memory>
#include <unordered_map> #include <unordered_map>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/framework/details/broadcast_op_handle_test.h" #include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
struct VarHandle;
struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
std::vector<std::string> out_varnames_; std::vector<std::string> out_varnames_;
std::vector<std::unique_ptr<ir::Node>> nodes_; std::vector<std::unique_ptr<ir::Node>> nodes_;
...@@ -49,7 +60,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { ...@@ -49,7 +60,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
op_handle_ = new FusedBroadcastOpHandle( op_handle_ = new FusedBroadcastOpHandle(
nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
#else #else
PADDLE_THROW("CUDA is not supported."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif #endif
} else { } else {
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/gather_op_handle.h" #include "paddle/fluid/framework/details/gather_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/framework/details/variable_visitor.h"
...@@ -32,13 +33,20 @@ void GatherOpHandle::RunImpl() { ...@@ -32,13 +33,20 @@ void GatherOpHandle::RunImpl() {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(), in_var_handles.size(), places_.size(),
"The number of output should equal to the number of places."); platform::errors::InvalidArgument(
"The number of input variables should be equal "
"to the number of places, but got the number of input variables is "
"%d and the number of places is %d.",
in_var_handles.size(), places_.size()));
VarHandle *out_var_handle; VarHandle *out_var_handle;
{ {
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs()); auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, PADDLE_ENFORCE_EQ(
"The number of output should be one."); out_var_handles.size(), 1,
platform::errors::InvalidArgument(
"The number of output variables should be 1, but got %d.",
out_var_handles.size()));
out_var_handle = out_var_handles.front(); out_var_handle = out_var_handles.front();
} }
...@@ -47,10 +55,14 @@ void GatherOpHandle::RunImpl() { ...@@ -47,10 +55,14 @@ void GatherOpHandle::RunImpl() {
auto in_0_handle = in_var_handles[0]; auto in_0_handle = in_var_handles[0];
auto pre_in_var = auto pre_in_var =
var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name()); var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
PADDLE_ENFORCE_NOT_NULL(pre_in_var); PADDLE_ENFORCE_NOT_NULL(
pre_in_var,
platform::errors::NotFound("The variable '%s' is not found in the scope.",
in_0_handle->name()));
PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(), PADDLE_ENFORCE_EQ(pre_in_var->IsType<framework::SelectedRows>(), true,
"Currently, gather_op only can gather SelectedRows."); platform::errors::Unimplemented(
"Currently, gather_op only supports SelectedRows."));
// Wait input done, this Wait is asynchronous operation // Wait input done, this Wait is asynchronous operation
WaitInputVarGenerated(); WaitInputVarGenerated();
...@@ -63,7 +75,10 @@ void GatherOpHandle::RunImpl() { ...@@ -63,7 +75,10 @@ void GatherOpHandle::RunImpl() {
for (auto *in_handle : in_var_handles) { for (auto *in_handle : in_var_handles) {
auto *in_var = auto *in_var =
var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name()); var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
PADDLE_ENFORCE_NOT_NULL(in_var); PADDLE_ENFORCE_NOT_NULL(
in_var,
platform::errors::NotFound(
"The variable '%s' is not found in the scope.", in_handle->name()));
VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var); VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var);
auto &in_sr_value = in_var->Get<framework::SelectedRows>(); auto &in_sr_value = in_var->Get<framework::SelectedRows>();
...@@ -76,15 +91,19 @@ void GatherOpHandle::RunImpl() { ...@@ -76,15 +91,19 @@ void GatherOpHandle::RunImpl() {
// NOTE: The Places of all input tensor must be all on CPU or all on GPU. // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
platform::Place t_out_p = out_var_handle->place(); platform::Place t_out_p = out_var_handle->place();
if (platform::is_gpu_place(pre_in_value.place())) { if (platform::is_gpu_place(pre_in_value.place())) {
PADDLE_ENFORCE(platform::is_gpu_place(t_out_p), PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true,
"Places of input and output must be all on GPU."); platform::errors::PreconditionNotMet(
"Places of input and output must be all on GPU."));
} else { } else {
t_out_p = platform::CPUPlace(); t_out_p = platform::CPUPlace();
} }
auto out_var = var_scopes.at(out_var_handle->scope_idx()) auto out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name()); ->FindVar(out_var_handle->name());
PADDLE_ENFORCE_NOT_NULL(out_var); PADDLE_ENFORCE_NOT_NULL(
out_var,
platform::errors::NotFound("The variable '%s' is not found in the scope.",
out_var_handle->name()));
auto out_value = out_var->GetMutable<framework::SelectedRows>(); auto out_value = out_var->GetMutable<framework::SelectedRows>();
out_value->set_height(pre_in_value.height()); out_value->set_height(pre_in_value.height());
out_value->set_rows(out_rows); out_value->set_rows(out_rows);
......
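gather_op_handle.cc also shows the rewrite applied to boolean conditions: PADDLE_ENFORCE(cond, "msg") becomes PADDLE_ENFORCE_EQ(cond, true, ...) with a typed error. A small sketch under that assumption; CheckOutputOnGPU is a hypothetical helper:

#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace framework {
namespace details {

// Hypothetical helper showing the boolean-condition rewrite.
inline void CheckOutputOnGPU(const platform::Place &out_place) {
  // Old style: PADDLE_ENFORCE(platform::is_gpu_place(out_place),
  //     "Places of input and output must be all on GPU.");
  PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true,
                    platform::errors::PreconditionNotMet(
                        "Places of input and output must be all on GPU."));
}

}  // namespace details
}  // namespace framework
}  // namespace paddle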
...@@ -24,6 +24,14 @@ ...@@ -24,6 +24,14 @@
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
......
...@@ -13,15 +13,17 @@ ...@@ -13,15 +13,17 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/gather_op_handle.h" #include "paddle/fluid/framework/details/gather_op_handle.h"
#include <memory> #include <memory>
#include <unordered_map> #include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h" #include "gtest/gtest.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
struct DummyVarHandle;
namespace f = paddle::framework; namespace f = paddle::framework;
namespace p = paddle::platform; namespace p = paddle::platform;
...@@ -60,7 +62,8 @@ struct TestGatherOpHandle { ...@@ -60,7 +62,8 @@ struct TestGatherOpHandle {
ctxs_.emplace_back(new p::CUDADeviceContext(p)); ctxs_.emplace_back(new p::CUDADeviceContext(p));
} }
#else #else
PADDLE_THROW("CUDA is not support."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif #endif
} else { } else {
int count = 8; int count = 8;
...@@ -141,7 +144,9 @@ struct TestGatherOpHandle { ...@@ -141,7 +144,9 @@ struct TestGatherOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) { ++input_scope_idx) {
auto in_var = param_scopes_.at(input_scope_idx)->FindVar("input"); auto in_var = param_scopes_.at(input_scope_idx)->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var); PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"The variable '%s' is not found in the scope.", "input"));
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>(); auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value(); auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]); value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
...@@ -155,7 +160,9 @@ struct TestGatherOpHandle { ...@@ -155,7 +160,9 @@ struct TestGatherOpHandle {
} }
auto out_var = param_scopes_.at(output_scope_idx)->FindVar("out"); auto out_var = param_scopes_.at(output_scope_idx)->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var); PADDLE_ENFORCE_NOT_NULL(
out_var, platform::errors::NotFound(
"The variable '%s' is not found in the scope.", "out"));
auto out_selected_rows = out_var->GetMutable<f::SelectedRows>(); auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input"); auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input");
...@@ -173,9 +180,19 @@ struct TestGatherOpHandle { ...@@ -173,9 +180,19 @@ struct TestGatherOpHandle {
auto& out_select_rows = out_var->Get<f::SelectedRows>(); auto& out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value(); auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal."); PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
platform::errors::InvalidArgument(
"The height of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
height, out_select_rows.height()));
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]); PADDLE_ENFORCE_EQ(
out_select_rows.rows()[k], rows[k % rows.size()],
platform::errors::InvalidArgument(
"The item at position %d of rows of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
k, rows[k % rows.size()], out_select_rows.rows()[k]));
} }
f::Tensor result_tensor; f::Tensor result_tensor;
...@@ -207,6 +224,7 @@ TEST(GatherTester, TestGPUGatherTestSelectedRows) { ...@@ -207,6 +224,7 @@ TEST(GatherTester, TestGPUGatherTestSelectedRows) {
test_op.TestGatherSelectedRows(input_scope_idx); test_op.TestGatherSelectedRows(input_scope_idx);
} }
#endif #endif
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -20,16 +20,21 @@ ...@@ -20,16 +20,21 @@
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/framework/ir/graph.h" namespace paddle {
#include "paddle/fluid/framework/ir/pass.h" namespace framework {
class OpDesc;
} // namespace framework
} // namespace paddle
namespace paddle { namespace paddle {
namespace framework { namespace framework {
......
...@@ -19,6 +19,12 @@ ...@@ -19,6 +19,12 @@
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
class Tensor;
} // namespace framework
} // namespace paddle
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
......
...@@ -46,14 +46,17 @@ class NCCLOpHandleBase : public OpHandleBase { ...@@ -46,14 +46,17 @@ class NCCLOpHandleBase : public OpHandleBase {
} }
virtual ~NCCLOpHandleBase() { virtual ~NCCLOpHandleBase() {
for (auto& ev : inter_events_) { for (auto& ev : inter_events_) {
PADDLE_ENFORCE(cudaEventDestroy(ev.second)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
} }
for (auto& ev : exter_events_) { for (auto& ev : exter_events_) {
PADDLE_ENFORCE(cudaEventDestroy(ev.second)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
} }
} }
void SetRunEnv(int run_order, bool use_hierarchical_allreduce) { void SetRunEnv(int run_order, bool use_hierarchical_allreduce) {
PADDLE_ENFORCE(run_order >= 0, "run_order must >= 0"); PADDLE_ENFORCE_GE(
run_order, 0,
platform::errors::InvalidArgument(
"The argument run_order must be >= 0, but got %d.", run_order));
run_order_ = run_order; run_order_ = run_order;
use_hierarchical_allreduce_ = use_hierarchical_allreduce; use_hierarchical_allreduce_ = use_hierarchical_allreduce;
...@@ -74,8 +77,11 @@ class NCCLOpHandleBase : public OpHandleBase { ...@@ -74,8 +77,11 @@ class NCCLOpHandleBase : public OpHandleBase {
return; return;
} }
PADDLE_ENFORCE(places_.size() == 1, PADDLE_ENFORCE_EQ(places_.size(), 1,
"HierarchicalAllReduce run one proc with one card mode."); platform::errors::InvalidArgument(
"HierarchicalAllReduce can only run "
"one proccess with one card mode, but got %d cards.",
places_.size()));
for (auto& p : places_) { for (auto& p : places_) {
auto ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order); auto ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order);
...@@ -88,11 +94,11 @@ class NCCLOpHandleBase : public OpHandleBase { ...@@ -88,11 +94,11 @@ class NCCLOpHandleBase : public OpHandleBase {
continue; continue;
} }
PADDLE_ENFORCE(cudaSetDevice(dev_id)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
PADDLE_ENFORCE(cudaEventCreateWithFlags(&inter_events_[dev_id], PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
cudaEventDisableTiming)); &inter_events_[dev_id], cudaEventDisableTiming));
PADDLE_ENFORCE(cudaEventCreateWithFlags(&exter_events_[dev_id], PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
cudaEventDisableTiming)); &exter_events_[dev_id], cudaEventDisableTiming));
VLOG(10) << "Create events on dev_id:" << dev_id VLOG(10) << "Create events on dev_id:" << dev_id
<< ", inter_event:" << &inter_events_[dev_id] << ", inter_event:" << &inter_events_[dev_id]
<< ", exter_event:" << &exter_events_[dev_id]; << ", exter_event:" << &exter_events_[dev_id];
...@@ -102,7 +108,10 @@ class NCCLOpHandleBase : public OpHandleBase { ...@@ -102,7 +108,10 @@ class NCCLOpHandleBase : public OpHandleBase {
void FlatNCCLAllReduce(platform::Place place, const void* sendbuff, void FlatNCCLAllReduce(platform::Place place, const void* sendbuff,
void* recvbuff, size_t count, ncclDataType_t datatype, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op) { ncclRedOp_t op) {
PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0"); PADDLE_ENFORCE_GE(
run_order_, 0,
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_); auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_);
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
auto& nccl_ctx = flat_nccl_ctxs->at(dev_id); auto& nccl_ctx = flat_nccl_ctxs->at(dev_id);
...@@ -113,14 +122,17 @@ class NCCLOpHandleBase : public OpHandleBase { ...@@ -113,14 +122,17 @@ class NCCLOpHandleBase : public OpHandleBase {
<< ", dev_id:" << dev_id << ", dtype:" << datatype << ", dev_id:" << dev_id << ", dtype:" << datatype
<< ", place:" << place; << ", place:" << place;
PADDLE_ENFORCE(platform::dynload::ncclAllReduce( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, count, datatype, op, comm, stream)); sendbuff, recvbuff, count, datatype, op, comm, stream));
} }
void NCCLAllReduce(platform::Place place, const void* sendbuff, void NCCLAllReduce(platform::Place place, const void* sendbuff,
void* recvbuff, size_t count, ncclDataType_t datatype, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op) { ncclRedOp_t op) {
PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0"); PADDLE_ENFORCE_GE(
run_order_, 0,
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
if (!use_hierarchical_allreduce_) { if (!use_hierarchical_allreduce_) {
FlatNCCLAllReduce(place, sendbuff, recvbuff, count, datatype, op); FlatNCCLAllReduce(place, sendbuff, recvbuff, count, datatype, op);
return; return;
...@@ -132,7 +144,10 @@ class NCCLOpHandleBase : public OpHandleBase { ...@@ -132,7 +144,10 @@ class NCCLOpHandleBase : public OpHandleBase {
void HierarchicalAllReduce(platform::Place place, const void* sendbuff, void HierarchicalAllReduce(platform::Place place, const void* sendbuff,
void* recvbuff, size_t count, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op) { ncclDataType_t datatype, ncclRedOp_t op) {
PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0"); PADDLE_ENFORCE_GE(
run_order_, 0,
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
InterReduce(place, sendbuff, recvbuff, count, datatype, op); InterReduce(place, sendbuff, recvbuff, count, datatype, op);
// When a trainer is not in exter allreduce ring // When a trainer is not in exter allreduce ring
// they need not to call this. // they need not to call this.
...@@ -157,14 +172,13 @@ class NCCLOpHandleBase : public OpHandleBase { ...@@ -157,14 +172,13 @@ class NCCLOpHandleBase : public OpHandleBase {
<< ", dtype:" << datatype << ", place:" << place << ", dtype:" << datatype << ", place:" << place
<< ", stream:" << stream; << ", stream:" << stream;
PADDLE_ENFORCE(platform::dynload::ncclReduce( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream)); sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream));
cudaEventRecord(inter_events_.at(dev_id), stream); cudaEventRecord(inter_events_.at(dev_id), stream);
if (FLAGS_sync_nccl_allreduce) { if (FLAGS_sync_nccl_allreduce) {
PADDLE_ENFORCE(cudaStreamSynchronize(stream), PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
"sync HierarchicalAllReduce inter stream error");
} }
} }
...@@ -172,7 +186,9 @@ class NCCLOpHandleBase : public OpHandleBase { ...@@ -172,7 +186,9 @@ class NCCLOpHandleBase : public OpHandleBase {
void* recvbuff, size_t count, ncclDataType_t datatype, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op) { ncclRedOp_t op) {
auto nccl_ctxs = nccl_ctxs_->GetHierarchicalExterCtx(run_order_); auto nccl_ctxs = nccl_ctxs_->GetHierarchicalExterCtx(run_order_);
PADDLE_ENFORCE(nccl_ctxs_, "can't get exter %d nccl_ctxs", run_order_); PADDLE_ENFORCE_NOT_NULL(
nccl_ctxs_, platform::errors::NotFound(
"Can't get exter %d nccl contexts.", run_order_));
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
auto& nccl_ctx = nccl_ctxs->at(dev_id); auto& nccl_ctx = nccl_ctxs->at(dev_id);
auto stream = nccl_ctx.stream(); auto stream = nccl_ctx.stream();
...@@ -185,14 +201,13 @@ class NCCLOpHandleBase : public OpHandleBase { ...@@ -185,14 +201,13 @@ class NCCLOpHandleBase : public OpHandleBase {
cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0); cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0);
PADDLE_ENFORCE(platform::dynload::ncclAllReduce( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, count, datatype, op, comm, stream)); sendbuff, recvbuff, count, datatype, op, comm, stream));
cudaEventRecord(exter_events_.at(dev_id), stream); cudaEventRecord(exter_events_.at(dev_id), stream);
if (FLAGS_sync_nccl_allreduce) { if (FLAGS_sync_nccl_allreduce) {
PADDLE_ENFORCE(cudaStreamSynchronize(stream), PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
"sync HierarchicalAllReduce exter stream error");
} }
} }
...@@ -210,8 +225,8 @@ class NCCLOpHandleBase : public OpHandleBase { ...@@ -210,8 +225,8 @@ class NCCLOpHandleBase : public OpHandleBase {
<< ", stream:" << stream; << ", stream:" << stream;
cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0); cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0);
PADDLE_ENFORCE(platform::dynload::ncclBcast(sendbuff, count, datatype, 0, PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
comm, stream)); sendbuff, count, datatype, 0, comm, stream));
} }
protected: protected:
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/op_handle_base.h"
#include <map> #include <map>
#include <unordered_set> #include <unordered_set>
...@@ -46,8 +47,8 @@ void OpHandleBase::InitCUDA() { ...@@ -46,8 +47,8 @@ void OpHandleBase::InitCUDA() {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
for (auto &p : dev_ctxes_) { for (auto &p : dev_ctxes_) {
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device; int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
PADDLE_ENFORCE(cudaSetDevice(dev_id)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
PADDLE_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
} }
if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) { if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
...@@ -61,17 +62,22 @@ void OpHandleBase::InitCUDA() { ...@@ -61,17 +62,22 @@ void OpHandleBase::InitCUDA() {
} }
} }
} else { } else {
PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL, PADDLE_ENFORCE_EQ(
"%s should have only one dev_ctx.", Name()); dev_ctxes_.size(), 1UL,
platform::errors::InvalidArgument(
"Operator %s should have only one dev_ctx, but got %d.", Name(),
dev_ctxes_.size()));
auto &place = dev_ctxes_.begin()->first; auto &place = dev_ctxes_.begin()->first;
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
for (auto &out_var : outputs_) { for (auto &out_var : outputs_) {
auto *out_var_handle = dynamic_cast<VarHandle *>(out_var); auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
if (out_var_handle) { if (out_var_handle) {
PADDLE_ENFORCE(platform::is_same_place(place, out_var_handle->place()), PADDLE_ENFORCE_EQ(
"The place of output(%s) is not consistent with the " platform::is_same_place(place, out_var_handle->place()), true,
"place of current op(%s).", platform::errors::InvalidArgument(
out_var_handle->Name(), Name()); "The place of output(%s) is not consistent with the "
"place of current op(%s).",
out_var_handle->Name(), Name()));
out_var_handle->SetGenerateEvent(events_.at(dev_id)); out_var_handle->SetGenerateEvent(events_.at(dev_id));
} }
} }
...@@ -85,25 +91,37 @@ void OpHandleBase::Run(bool use_cuda) { ...@@ -85,25 +91,37 @@ void OpHandleBase::Run(bool use_cuda) {
InitCUDA(); InitCUDA();
} }
#else #else
PADDLE_ENFORCE(!use_cuda); PADDLE_ENFORCE_EQ(use_cuda, false,
platform::errors::InvalidArgument(
"Argument use_cuda should be false when Paddle is not "
"compiled with CUDA."));
#endif #endif
// skip running current op, used with inplace_addto_op_pass
if (skip_running_) {
VLOG(4) << "skip running: " << Name();
return;
}
RunImpl(); RunImpl();
} }
void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_NOT_NULL(waited_ctx); PADDLE_ENFORCE_NOT_NULL(waited_ctx, platform::errors::InvalidArgument(
"Argument waited_ctx is NULL."));
if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) { if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) {
for (auto &dev_ctx : dev_ctxes_) { for (auto &dev_ctx : dev_ctxes_) {
PADDLE_ENFORCE_NOT_NULL(dev_ctx.second); PADDLE_ENFORCE_NOT_NULL(
dev_ctx.second,
platform::errors::InvalidArgument("The device context is NULL."));
dev_ctx.second->Wait(); dev_ctx.second->Wait();
} }
} else { } else {
auto stream = auto stream =
static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream(); static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream();
for (auto &ev : events_) { for (auto &ev : events_) {
PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0));
} }
} }
#else #else
...@@ -138,10 +156,11 @@ void OpHandleBase::WaitInputVarGenerated() { ...@@ -138,10 +156,11 @@ void OpHandleBase::WaitInputVarGenerated() {
auto stream = auto stream =
static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place)) static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
->stream(); ->stream();
PADDLE_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#else #else
PADDLE_THROW("Doesn't compile the GPU."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif #endif
} }
// There are nothing to do when the place is CPUPlace. // There are nothing to do when the place is CPUPlace.
...@@ -162,10 +181,11 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { ...@@ -162,10 +181,11 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
auto stream = static_cast<platform::CUDADeviceContext *>( auto stream = static_cast<platform::CUDADeviceContext *>(
dev_ctxes_.at(in_var_handle->place())) dev_ctxes_.at(in_var_handle->place()))
->stream(); ->stream();
PADDLE_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#else #else
PADDLE_THROW("Doesn't compile the GPU."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif #endif
} }
// There are nothing to do when the place is CPUPlace. // There are nothing to do when the place is CPUPlace.
...@@ -235,7 +255,9 @@ void OpHandleBase::SetLocalExecScopes( ...@@ -235,7 +255,9 @@ void OpHandleBase::SetLocalExecScopes(
auto scopes = GetLocalScopes(); auto scopes = GetLocalScopes();
for (auto *scope : scopes) { for (auto *scope : scopes) {
auto iter = scope_map.find(scope); auto iter = scope_map.find(scope);
PADDLE_ENFORCE(iter != scope_map.end(), "Local scope not found"); PADDLE_ENFORCE_NE(
iter, scope_map.end(),
platform::errors::NotFound("Local scope not found in scope map."));
local_exec_scopes_.emplace_back(iter->second); local_exec_scopes_.emplace_back(iter->second);
} }
} }
......
...@@ -18,15 +18,28 @@ ...@@ -18,15 +18,28 @@
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h"
namespace paddle {
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle { namespace paddle {
namespace framework { namespace framework {
class Scope; class Scope;
namespace details {
struct VarHandleBase;
} // namespace details
namespace ir {
class Node;
} // namespace ir
namespace details { namespace details {
...@@ -52,6 +65,10 @@ class OpHandleBase { ...@@ -52,6 +65,10 @@ class OpHandleBase {
virtual Priority GetPriority() const { return kNormal; } virtual Priority GetPriority() const { return kNormal; }
virtual bool GetSkipRunning() const { return skip_running_; }
virtual void SetSkipRunning(bool skip_runing) { skip_running_ = skip_runing; }
virtual std::string Name() const = 0; virtual std::string Name() const = 0;
void Run(bool use_cuda); void Run(bool use_cuda);
...@@ -131,6 +148,7 @@ class OpHandleBase { ...@@ -131,6 +148,7 @@ class OpHandleBase {
std::map<platform::Place, platform::DeviceContext *> dev_ctxes_; std::map<platform::Place, platform::DeviceContext *> dev_ctxes_;
std::vector<Scope *> local_exec_scopes_; std::vector<Scope *> local_exec_scopes_;
bool skip_running_ = false;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
std::unordered_map<int, cudaEvent_t> events_; std::unordered_map<int, cudaEvent_t> events_;
......
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/grad_op_desc_maker.h" #include "paddle/fluid/framework/grad_op_desc_maker.h"
#include "paddle/fluid/framework/inplace_op_inference.h" #include "paddle/fluid/framework/inplace_op_inference.h"
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
...@@ -186,19 +187,20 @@ struct OpInfoFiller<T, kOpProtoAndCheckerMaker> { ...@@ -186,19 +187,20 @@ struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
void operator()(const char* op_type, OpInfo* info) const { void operator()(const char* op_type, OpInfo* info) const {
PADDLE_ENFORCE_EQ(info->proto_, nullptr, PADDLE_ENFORCE_EQ(info->proto_, nullptr,
platform::errors::AlreadyExists( platform::errors::AlreadyExists(
"OpProto of %s has been registered", op_type)); "OpProto of %s has been registered.", op_type));
PADDLE_ENFORCE_EQ(info->checker_, nullptr, PADDLE_ENFORCE_EQ(info->checker_, nullptr,
platform::errors::AlreadyExists( platform::errors::AlreadyExists(
"OpAttrChecker of %s has been registered", op_type)); "OpAttrChecker of %s has been registered.", op_type));
info->proto_ = new proto::OpProto; info->proto_ = new proto::OpProto;
info->checker_ = new OpAttrChecker(); info->checker_ = new OpAttrChecker();
T maker; T maker;
maker(info->proto_, info->checker_); maker(info->proto_, info->checker_);
info->proto_->set_type(op_type); info->proto_->set_type(op_type);
PADDLE_ENFORCE( PADDLE_ENFORCE_EQ(
info->proto_->IsInitialized(), info->proto_->IsInitialized(), true,
"Fail to initialize %s's OpProto, because %s is not initialized", platform::errors::PreconditionNotMet(
op_type, info->proto_->InitializationErrorString()); "Fail to initialize %s's OpProto, because %s is not initialized.",
op_type, info->proto_->InitializationErrorString()));
} }
}; };
......
...@@ -13,9 +13,11 @@ ...@@ -13,9 +13,11 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <utility> #include <utility>
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle { namespace paddle {
...@@ -104,7 +106,12 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( ...@@ -104,7 +106,12 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
places_(places), places_(places),
graphs_(std::move(graphs)), graphs_(std::move(graphs)),
feed_status_(places.size(), FeedStatus::kNone) { feed_status_(places.size(), FeedStatus::kNone) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
platform::errors::InvalidArgument(
"The number of places and the number of local scopes "
"should be equal, but got number of places is %d and "
"number of local scopes is %d.",
places_.size(), local_scopes_.size()));
PADDLE_ENFORCE_EQ(places_.size(), graphs_.size(), PADDLE_ENFORCE_EQ(places_.size(), graphs_.size(),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <algorithm> #include <algorithm>
#include <map> #include <map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
...@@ -32,9 +33,13 @@ struct ReduceLoDTensor { ...@@ -32,9 +33,13 @@ struct ReduceLoDTensor {
template <typename T> template <typename T>
void apply() const { void apply() const {
PADDLE_ENFORCE(!src_tensors_.empty()); PADDLE_ENFORCE_NE(src_tensors_.empty(), true,
platform::errors::InvalidArgument(
"The number of tensors to be reduced is 0."));
auto &t0 = *src_tensors_[0]; auto &t0 = *src_tensors_[0];
PADDLE_ENFORCE_NE(t0.numel(), 0); PADDLE_ENFORCE_NE(t0.numel(), 0,
platform::errors::InvalidArgument(
"The size of first tensor to be reduced is 0."));
dst_tensor_.Resize(t0.dims()); dst_tensor_.Resize(t0.dims());
T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace()); T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
...@@ -45,8 +50,19 @@ struct ReduceLoDTensor { ...@@ -45,8 +50,19 @@ struct ReduceLoDTensor {
continue; continue;
} }
PADDLE_ENFORCE_EQ(t.dims(), t0.dims()); PADDLE_ENFORCE_EQ(t.dims(), t0.dims(),
PADDLE_ENFORCE_EQ(t.type(), t0.type()); platform::errors::InvalidArgument(
"The shape of tensors to be reduced must be "
"consistent. The shape of current tensor is %s, "
"but the shape of the first tensor is %s.",
t.dims(), t0.dims()));
PADDLE_ENFORCE_EQ(t.type(), t0.type(),
platform::errors::InvalidArgument(
"The type of tensors to be reduced must be "
"consistent. The type of current tensor is %s, "
"but the type of the first tensor is %s.",
t.type(), t0.type()));
std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst, std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
[](T a, T b) -> T { return a + b; }); [](T a, T b) -> T { return a + b; });
} }
...@@ -88,7 +104,9 @@ struct GatherLocalSelectedRowsFunctor { ...@@ -88,7 +104,9 @@ struct GatherLocalSelectedRowsFunctor {
in_places_(in_places), in_places_(in_places),
out_place_(out_place), out_place_(out_place),
dst_selected_rows_(dst_selected_rows) { dst_selected_rows_(dst_selected_rows) {
PADDLE_ENFORCE_EQ(src_selected_rows.empty(), false); PADDLE_ENFORCE_NE(src_selected_rows.empty(), true,
platform::errors::InvalidArgument(
"The number of selected_rows to be gathered is 0."));
std::vector<int64_t> out_rows; std::vector<int64_t> out_rows;
......
...@@ -13,7 +13,9 @@ ...@@ -13,7 +13,9 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/reduce_op_handle.h"
#include <memory> #include <memory>
#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/framework/details/variable_visitor.h"
...@@ -116,8 +118,15 @@ void ReduceOpHandle::GatherSelectedRows( ...@@ -116,8 +118,15 @@ void ReduceOpHandle::GatherSelectedRows(
merged_dev_ctx->Wait(); merged_dev_ctx->Wait();
scope->EraseVars(std::vector<std::string>{gathered_var_name}); scope->EraseVars(std::vector<std::string>{gathered_var_name});
PADDLE_ENFORCE(client->Gather(vars, &remote, *merged_dev_ctx, scope)); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE(remote.size() == vars.size()); client->Gather(vars, &remote, *merged_dev_ctx, scope), true,
platform::errors::PreconditionNotMet("Gather SelectedRows failed."));
PADDLE_ENFORCE_EQ(remote.size(), vars.size(),
platform::errors::PreconditionNotMet(
"The number of remotes should be equal to the number "
"of variables to be gathered, but got the number of "
"remotes is %d and the number of variables is %d.",
remote.size(), vars.size()));
// 4. merged local selected rows. // 4. merged local selected rows.
std::vector<const SelectedRows *> all; std::vector<const SelectedRows *> all;
...@@ -151,14 +160,19 @@ void ReduceOpHandle::RunImpl() { ...@@ -151,14 +160,19 @@ void ReduceOpHandle::RunImpl() {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(), in_var_handles.size(), places_.size(),
"The number of output should equal to the number of places."); platform::errors::InvalidArgument(
"The number of inputs should equal to the number of places, but got "
"the number of inputs is %d and the number of places is %d.",
in_var_handles.size(), places_.size()));
VarHandle *out_var_handle; VarHandle *out_var_handle;
{ {
auto out_var_handles = DynamicCast<VarHandle>(outputs_); auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL, PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL,
"The number of output should be one."); platform::errors::InvalidArgument(
"The number of output should be one, but got %d.",
out_var_handles.size()));
out_var_handle = out_var_handles.front(); out_var_handle = out_var_handles.front();
} }
...@@ -168,7 +182,10 @@ void ReduceOpHandle::RunImpl() { ...@@ -168,7 +182,10 @@ void ReduceOpHandle::RunImpl() {
auto pre_in_var = auto pre_in_var =
var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name()); var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
PADDLE_ENFORCE_NOT_NULL(pre_in_var);
PADDLE_ENFORCE_NOT_NULL(pre_in_var, platform::errors::NotFound(
"Variable %s is not found in scope.",
in_0_handle->name()));
// NOTE: The Places of all input tensor must be all on CPU or all on GPU. // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
std::vector<platform::Place> in_places; // used to get dev_ctx std::vector<platform::Place> in_places; // used to get dev_ctx
...@@ -176,21 +193,29 @@ void ReduceOpHandle::RunImpl() { ...@@ -176,21 +193,29 @@ void ReduceOpHandle::RunImpl() {
in_places.emplace_back(in_handle->place()); in_places.emplace_back(in_handle->place());
auto in_var = auto in_var =
var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name()); var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound("Variable %s is not found in scope.",
in_handle->name()));
VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var); VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var);
} }
auto out_var = var_scopes.at(out_var_handle->scope_idx()) auto out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name()); ->FindVar(out_var_handle->name());
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(
out_var, platform::errors::NotFound("Variable %s is not found in scope.",
out_var_handle->name()));
// NOTE: The tensors' Place of input and output must be all on GPU or all on // NOTE: The tensors' Place of input and output must be all on GPU or all on
// CPU. // CPU.
auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place(); auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place();
platform::Place t_out_p; platform::Place t_out_p;
if (platform::is_gpu_place(in_p)) { if (platform::is_gpu_place(in_p)) {
PADDLE_ENFORCE(platform::is_gpu_place(out_var_handle->place()), PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_var_handle->place()), true,
"Places of input and output must be all on GPU."); platform::errors::PreconditionNotMet(
"Places of input and output must be all on GPU."));
t_out_p = out_var_handle->place(); t_out_p = out_var_handle->place();
} else { } else {
t_out_p = platform::CPUPlace(); t_out_p = platform::CPUPlace();
...@@ -229,7 +254,10 @@ void ReduceOpHandle::RunImpl() { ...@@ -229,7 +254,10 @@ void ReduceOpHandle::RunImpl() {
in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p, in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
out_var->GetMutable<framework::SelectedRows>()); out_var->GetMutable<framework::SelectedRows>());
} else { } else {
PADDLE_THROW("only support double or float when gather SelectedRows"); PADDLE_THROW(platform::errors::Unimplemented(
"Only support double or float when gather SelectedRows, but got "
"%s.",
framework::DataTypeToString(in_selected_rows[0]->value().type())));
} }
#endif #endif
}); });
...@@ -292,7 +320,7 @@ void ReduceOpHandle::RunImpl() { ...@@ -292,7 +320,7 @@ void ReduceOpHandle::RunImpl() {
size_t numel = static_cast<size_t>(lod_tensor.numel()); size_t numel = static_cast<size_t>(lod_tensor.numel());
all_reduce_calls.emplace_back( all_reduce_calls.emplace_back(
[buffer, recvbuffer, type, numel, root_id, &nccl_ctx] { [buffer, recvbuffer, type, numel, root_id, &nccl_ctx] {
PADDLE_ENFORCE(platform::dynload::ncclReduce( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
buffer, recvbuffer, numel, static_cast<ncclDataType_t>(type), buffer, recvbuffer, numel, static_cast<ncclDataType_t>(type),
ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream())); ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream()));
}); });
...@@ -306,10 +334,13 @@ void ReduceOpHandle::RunImpl() { ...@@ -306,10 +334,13 @@ void ReduceOpHandle::RunImpl() {
} }
}); });
#else #else
PADDLE_THROW("CUDA is not enabled."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif #endif
} else { } else {
PADDLE_THROW("Place should be CPUPlace or CUDAPlace."); PADDLE_THROW(platform::errors::InvalidArgument(
"The place of tensor should be CPUPlace or CUDAPlace, but got %s.",
lod_tensors[0]->place()));
} }
} }
} }
......
...@@ -24,6 +24,21 @@ ...@@ -24,6 +24,21 @@
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
class SelectedRows;
namespace details {
struct VarHandle;
} // namespace details
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
struct NCCLContextMap;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#endif #endif
......
...@@ -13,7 +13,9 @@ ...@@ -13,7 +13,9 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/reduce_op_handle.h"
#include <unordered_map> #include <unordered_map>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
...@@ -69,7 +71,8 @@ struct TestReduceOpHandle { ...@@ -69,7 +71,8 @@ struct TestReduceOpHandle {
} }
nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_)); nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
#else #else
PADDLE_THROW("CUDA is not support."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif #endif
} else { } else {
int count = 8; int count = 8;
...@@ -103,7 +106,8 @@ struct TestReduceOpHandle { ...@@ -103,7 +106,8 @@ struct TestReduceOpHandle {
op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
gpu_list_, nccl_ctxs_.get())); gpu_list_, nccl_ctxs_.get()));
#else #else
PADDLE_THROW("CUDA is not support."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif #endif
} else { } else {
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
...@@ -164,7 +168,10 @@ struct TestReduceOpHandle { ...@@ -164,7 +168,10 @@ struct TestReduceOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) { ++input_scope_idx) {
auto in_var = param_scopes_[input_scope_idx]->FindVar("input"); auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"Variable %s is not found in scope.", "input"));
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>(); auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value(); auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]); value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
...@@ -178,7 +185,9 @@ struct TestReduceOpHandle { ...@@ -178,7 +185,9 @@ struct TestReduceOpHandle {
} }
auto out_var = param_scopes_[output_scope_idx]->FindVar("out"); auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var); PADDLE_ENFORCE_NOT_NULL(out_var,
platform::errors::NotFound(
"Variable %s is not found in scope.", "out"));
auto out_selected_rows = out_var->GetMutable<f::SelectedRows>(); auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
auto in_var = param_scopes_[output_scope_idx]->FindVar("input"); auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
...@@ -196,9 +205,18 @@ struct TestReduceOpHandle { ...@@ -196,9 +205,18 @@ struct TestReduceOpHandle {
auto &out_select_rows = out_var->Get<f::SelectedRows>(); auto &out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value(); auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal."); PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
platform::errors::InvalidArgument(
"The height of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
height, out_select_rows.height()));
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]); PADDLE_ENFORCE_EQ(
out_select_rows.rows()[k], rows[k % rows.size()],
platform::errors::InvalidArgument(
"The item at position %d of rows of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
k, rows[k % rows.size()], out_select_rows.rows()[k]));
} }
f::Tensor result_tensor; f::Tensor result_tensor;
...@@ -208,7 +226,7 @@ struct TestReduceOpHandle { ...@@ -208,7 +226,7 @@ struct TestReduceOpHandle {
for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) { for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5); ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
} }
} } // namespace details
void TestReduceLodTensors(size_t output_scope_idx) { void TestReduceLodTensors(size_t output_scope_idx) {
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims))); std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
...@@ -220,7 +238,9 @@ struct TestReduceOpHandle { ...@@ -220,7 +238,9 @@ struct TestReduceOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) { ++input_scope_idx) {
auto in_var = param_scopes_[input_scope_idx]->FindVar("input"); auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var); PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"Variable %s is not found in scope.", "input"));
auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>(); auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]); in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
in_lod_tensor->set_lod(lod); in_lod_tensor->set_lod(lod);
...@@ -230,7 +250,9 @@ struct TestReduceOpHandle { ...@@ -230,7 +250,9 @@ struct TestReduceOpHandle {
} }
auto out_var = param_scopes_[output_scope_idx]->FindVar("out"); auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var); PADDLE_ENFORCE_NOT_NULL(out_var,
platform::errors::NotFound(
"Variable %s is not found in scope.", "out"));
auto out_lodtensor = out_var->GetMutable<f::LoDTensor>(); auto out_lodtensor = out_var->GetMutable<f::LoDTensor>();
auto in_var = param_scopes_[output_scope_idx]->FindVar("input"); auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
...@@ -254,7 +276,7 @@ struct TestReduceOpHandle { ...@@ -254,7 +276,7 @@ struct TestReduceOpHandle {
ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5); ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5);
} }
} }
}; }; // namespace details
TEST(ReduceTester, TestCPUReduceTestSelectedRows) { TEST(ReduceTester, TestCPUReduceTestSelectedRows) {
TestReduceOpHandle test_op; TestReduceOpHandle test_op;
......
...@@ -24,6 +24,16 @@ ...@@ -24,6 +24,16 @@
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
class OpDesc;
class Scope;
namespace ir {
class Node;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
......
...@@ -13,9 +13,17 @@ ...@@ -13,9 +13,17 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include <string> #include <string>
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
class Tensor;
} // namespace framework
} // namespace paddle
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
......
...@@ -21,6 +21,18 @@ ...@@ -21,6 +21,18 @@
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
class Scope;
namespace ir {
class Node;
} // namespace ir
} // namespace framework
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
......
...@@ -17,7 +17,9 @@ ...@@ -17,7 +17,9 @@
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
......
...@@ -13,10 +13,12 @@ ...@@ -13,10 +13,12 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include <stdexcept> #include <stdexcept>
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/framework/variable_helper.h"
...@@ -37,7 +39,13 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( ...@@ -37,7 +39,13 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
var_infos_(std::move(var_infos)), var_infos_(std::move(var_infos)),
places_(std::move(places)), places_(std::move(places)),
scope_monitor_(places_, local_exec_scopes_) { scope_monitor_(places_, local_exec_scopes_) {
PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size()); PADDLE_ENFORCE_EQ(
local_scopes_.size(), local_exec_scopes_.size(),
platform::errors::InvalidArgument(
"The number of local scopes and the number of local execution scopes "
"should be equal, but got number of local scopes is %d and "
"number of local execution scopes is %d.",
local_scopes_.size(), local_exec_scopes_.size()));
PrepareLocalExeScopes(); PrepareLocalExeScopes();
} }
......
...@@ -13,13 +13,26 @@ ...@@ -13,13 +13,26 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
class Scope;
class Tensor;
class Variable;
namespace ir {
class MemOptVarInfo;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
...@@ -29,7 +42,8 @@ static inline const Tensor &GetTensorFromVar(const Variable *var) { ...@@ -29,7 +42,8 @@ static inline const Tensor &GetTensorFromVar(const Variable *var) {
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
return var->Get<LoDTensor>(); return var->Get<LoDTensor>();
} else { } else {
PADDLE_THROW("Variable must be type of LoDTensor"); PADDLE_THROW(platform::errors::InvalidArgument(
"Variable must be type of LoDTensor."));
} }
} }
...@@ -37,20 +51,27 @@ static inline Tensor *GetMutableTensorFromVar(Variable *var) { ...@@ -37,20 +51,27 @@ static inline Tensor *GetMutableTensorFromVar(Variable *var) {
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
return var->GetMutable<LoDTensor>(); return var->GetMutable<LoDTensor>();
} else { } else {
PADDLE_THROW("Variable must be type of LoDTensor"); PADDLE_THROW(platform::errors::InvalidArgument(
"Variable must be type of LoDTensor."));
} }
} }
ShareTensorBufferFunctor::ShareTensorBufferFunctor( ShareTensorBufferFunctor::ShareTensorBufferFunctor(
Scope *scope, size_t scope_idx, const std::string &op_type, Scope *scope, size_t scope_idx, const std::string &op_type,
const std::vector<const ir::MemOptVarInfo *> &in_var_infos, const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
const std::vector<std::string> &out_var_names) const std::vector<std::string> &out_var_names, bool share_dims)
: scope_(scope), : scope_(scope),
scope_idx_(scope_idx), scope_idx_(scope_idx),
op_type_(op_type), op_type_(op_type),
in_var_infos_(in_var_infos), in_var_infos_(in_var_infos),
out_var_names_(out_var_names) { out_var_names_(out_var_names),
PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size()); share_dims_(share_dims) {
PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size(),
platform::errors::PreconditionNotMet(
"The number of input variables and output variables "
"should be equal, but got number of input variables is "
"%d and number of output variables is %d.",
in_var_infos_.size(), out_var_names_.size()));
for (size_t i = 0; i < in_var_infos_.size(); ++i) { for (size_t i = 0; i < in_var_infos_.size(); ++i) {
AddReuseVarPair(in_var_infos_[i], out_var_names_[i]); AddReuseVarPair(in_var_infos_[i], out_var_names_[i]);
} }
...@@ -67,32 +88,58 @@ ShareTensorBufferFunctor::ReusedVars() const { ...@@ -67,32 +88,58 @@ ShareTensorBufferFunctor::ReusedVars() const {
void ShareTensorBufferFunctor::AddReuseVarPair( void ShareTensorBufferFunctor::AddReuseVarPair(
const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name) { const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name) {
PADDLE_ENFORCE_NOT_NULL(in_var_info, "in_var_info cannot be nullptr"); PADDLE_ENFORCE_NOT_NULL(
in_var_info,
platform::errors::InvalidArgument(
"The input variables to be inplaced should not be NULL."));
PADDLE_ENFORCE_NE(in_var_info->Name(), out_var_name, PADDLE_ENFORCE_NE(in_var_info->Name(), out_var_name,
"in/out cannot have same name: %s", out_var_name); platform::errors::InvalidArgument(
"The input variable and output variable to be inplaced "
"cannot have the same name: %s.",
out_var_name));
in_var_infos_.emplace_back(in_var_info); in_var_infos_.emplace_back(in_var_info);
out_var_names_.emplace_back(out_var_name); out_var_names_.emplace_back(out_var_name);
} }
void ShareTensorBufferFunctor::CallOnce() { void ShareTensorBufferFunctor::CallOnce() {
PADDLE_ENFORCE(in_out_vars_.empty(), "in_out_vars_ must be initialized here"); PADDLE_ENFORCE(in_out_vars_.empty(),
platform::errors::InvalidArgument(
"The input-output variable pairs to be "
"inplaced should be initialized here."));
for (size_t i = 0; i < in_var_infos_.size(); ++i) { for (size_t i = 0; i < in_var_infos_.size(); ++i) {
auto *in_var = exec_scope_->FindVar(in_var_infos_[i]->Name()); auto *in_var = exec_scope_->FindVar(in_var_infos_[i]->Name());
auto *out_var = exec_scope_->FindVar(out_var_names_[i]); auto *out_var = exec_scope_->FindVar(out_var_names_[i]);
PADDLE_ENFORCE_NOT_NULL(in_var); PADDLE_ENFORCE_NOT_NULL(
PADDLE_ENFORCE_NOT_NULL(out_var); in_var, platform::errors::NotFound(
PADDLE_ENFORCE_NE(in_var, out_var); "The variable(%s) to be inplaced is not found in scope.",
in_var_infos_[i]->Name()));
PADDLE_ENFORCE_NOT_NULL(
out_var, platform::errors::NotFound(
"The variable(%s) to be inplaced is not found in scope.",
out_var_names_[i]));
PADDLE_ENFORCE_NE(
in_var, out_var,
platform::errors::PreconditionNotMet(
"The input variable and output variable to be inplaced "
"cannot be the same variable(%s).",
out_var_names_[i]));
in_out_vars_.emplace_back(in_var, out_var); in_out_vars_.emplace_back(in_var, out_var);
} }
} }
void ShareTensorBufferFunctor::operator()(Scope *exec_scope) { void ShareTensorBufferFunctor::operator()(Scope *exec_scope) {
if (!exec_scope_) { if (!exec_scope_) {
PADDLE_ENFORCE_NOT_NULL(exec_scope); PADDLE_ENFORCE_NOT_NULL(exec_scope,
platform::errors::InvalidArgument(
"The given execution scope should not be NULL "
"if the cached scope is NULL."));
exec_scope_ = exec_scope; exec_scope_ = exec_scope;
CallOnce(); CallOnce();
} else { } else {
PADDLE_ENFORCE(exec_scope_ == exec_scope, "Scope must be the same"); PADDLE_ENFORCE_EQ(exec_scope_, exec_scope,
platform::errors::InvalidArgument(
"The given execution scope and the cached execution "
"scope should be the same."));
} }
for (size_t i = 0; i < in_var_infos_.size(); ++i) { for (size_t i = 0; i < in_var_infos_.size(); ++i) {
...@@ -115,6 +162,13 @@ void ShareTensorBufferFunctor::operator()(Scope *exec_scope) { ...@@ -115,6 +162,13 @@ void ShareTensorBufferFunctor::operator()(Scope *exec_scope) {
} else { } else {
out_tensor->ShareBufferWith(in_tensor); out_tensor->ShareBufferWith(in_tensor);
// NOTE(zhiqiu): In the case of inplace addto, if the operator of
// the in_out_vars is skipped during running, we should set the dims of
// output as the same as input.
if (share_dims_) {
out_tensor->Resize(in_tensor.dims());
}
VLOG(2) << "Share tensor buffer when running " << op_type_ << " : " VLOG(2) << "Share tensor buffer when running " << op_type_ << " : "
<< in_var_info->Name() << " -> " << out_var_names_[i]; << in_var_info->Name() << " -> " << out_var_names_[i];
} }
......
...@@ -19,11 +19,21 @@ ...@@ -19,11 +19,21 @@
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace framework {
class Scope;
namespace ir {
class MemOptVarInfo;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
...@@ -40,11 +50,13 @@ class ShareTensorBufferFunctor { ...@@ -40,11 +50,13 @@ class ShareTensorBufferFunctor {
ShareTensorBufferFunctor( ShareTensorBufferFunctor(
Scope *scope, size_t scope_idx, const std::string &op_type, Scope *scope, size_t scope_idx, const std::string &op_type,
const std::vector<const ir::MemOptVarInfo *> &in_var_infos, const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
const std::vector<std::string> &out_var_names); const std::vector<std::string> &out_var_names, bool share_dims = false);
void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info, void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info,
const std::string &out_var_name); const std::string &out_var_name);
void SetShareDims(bool share_dims) { share_dims_ = share_dims; }
void operator()(Scope *exec_scope); void operator()(Scope *exec_scope);
std::unordered_map<std::string, std::string> ReusedVars() const; std::unordered_map<std::string, std::string> ReusedVars() const;
...@@ -66,6 +78,11 @@ class ShareTensorBufferFunctor { ...@@ -66,6 +78,11 @@ class ShareTensorBufferFunctor {
std::vector<std::string> out_var_names_; std::vector<std::string> out_var_names_;
std::vector<std::pair<const Variable *, Variable *>> in_out_vars_; std::vector<std::pair<const Variable *, Variable *>> in_out_vars_;
// NOTE(zhiqiu): In the case of inplace addto, if the operator of
// the in_out_vars is skipped during running, we should set the dims of output
// as the same as input.
bool share_dims_{false};
}; };
} // namespace details } // namespace details
......
...@@ -13,18 +13,30 @@ ...@@ -13,18 +13,30 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h" #include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
#include <string> #include <string>
#include <unordered_set> #include <unordered_set>
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
class MemOptVarInfo;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
class ComputationOpHandle;
ComputationOpHandle *GetUniquePendingComputationOpHandle( ComputationOpHandle *GetUniquePendingComputationOpHandle(
ShareTensorBufferOpHandle *share_tensor_op) { ShareTensorBufferOpHandle *share_tensor_op) {
ComputationOpHandle *result_op = nullptr; ComputationOpHandle *result_op = nullptr;
...@@ -32,26 +44,35 @@ ComputationOpHandle *GetUniquePendingComputationOpHandle( ...@@ -32,26 +44,35 @@ ComputationOpHandle *GetUniquePendingComputationOpHandle(
for (ir::Node *pending_op : out_var->outputs) { for (ir::Node *pending_op : out_var->outputs) {
auto &op = pending_op->Wrapper<OpHandleBase>(); auto &op = pending_op->Wrapper<OpHandleBase>();
auto *compute_op = dynamic_cast<ComputationOpHandle *>(&op); auto *compute_op = dynamic_cast<ComputationOpHandle *>(&op);
PADDLE_ENFORCE_NOT_NULL(compute_op); PADDLE_ENFORCE_NOT_NULL(
compute_op,
platform::errors::PreconditionNotMet(
"The pending OpHandle should be ComputationOpHandle."));
if (result_op == nullptr) { if (result_op == nullptr) {
result_op = compute_op; result_op = compute_op;
} else { } else {
PADDLE_ENFORCE_EQ(result_op, compute_op); PADDLE_ENFORCE_EQ(
result_op, compute_op,
platform::errors::PreconditionNotMet(
"The pending OpHandle should be the unique one."));
} }
} }
} }
PADDLE_ENFORCE_NOT_NULL(result_op); PADDLE_ENFORCE_NOT_NULL(result_op,
platform::errors::PreconditionNotMet(
"The pending OpHandle should not be NULL."));
return result_op; return result_op;
} }
ShareTensorBufferOpHandle::ShareTensorBufferOpHandle( ShareTensorBufferOpHandle::ShareTensorBufferOpHandle(
ir::Node *node, Scope *scope, size_t scope_idx, const std::string &op_type, ir::Node *node, Scope *scope, size_t scope_idx, const std::string &op_type,
const std::vector<const ir::MemOptVarInfo *> &in_var_infos, const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
const std::vector<std::string> &out_var_names) const std::vector<std::string> &out_var_names, bool share_dims)
: OpHandleBase(node), : OpHandleBase(node),
functor_(scope, scope_idx, op_type, in_var_infos, out_var_names) {} functor_(scope, scope_idx, op_type, in_var_infos, out_var_names,
share_dims) {}
std::unordered_map<std::string, std::string> std::unordered_map<std::string, std::string>
ShareTensorBufferOpHandle::ReusedVars() const { ShareTensorBufferOpHandle::ReusedVars() const {
...@@ -63,6 +84,10 @@ void ShareTensorBufferOpHandle::AddReuseVarPair( ...@@ -63,6 +84,10 @@ void ShareTensorBufferOpHandle::AddReuseVarPair(
functor_.AddReuseVarPair(in_var_info, out_var_name); functor_.AddReuseVarPair(in_var_info, out_var_name);
} }
void ShareTensorBufferOpHandle::SetShareDims(bool share_dims) {
functor_.SetShareDims(share_dims);
}
void ShareTensorBufferOpHandle::InitCUDA() { void ShareTensorBufferOpHandle::InitCUDA() {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
int dev_id = int dev_id =
......
...@@ -17,21 +17,34 @@ ...@@ -17,21 +17,34 @@
#include <unordered_map> #include <unordered_map>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
namespace paddle {
namespace framework {
class Scope;
namespace ir {
class MemOptVarInfo;
class Node;
} // namespace ir
} // namespace framework
} // namespace paddle
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
class ComputationOpHandle;
class ShareTensorBufferOpHandle : public OpHandleBase { class ShareTensorBufferOpHandle : public OpHandleBase {
public: public:
ShareTensorBufferOpHandle( ShareTensorBufferOpHandle(
ir::Node *node, Scope *scope, size_t scope_idx, ir::Node *node, Scope *scope, size_t scope_idx,
const std::string &op_type, const std::string &op_type,
const std::vector<const ir::MemOptVarInfo *> &in_vars_infos, const std::vector<const ir::MemOptVarInfo *> &in_vars_infos,
const std::vector<std::string> &out_var_names); const std::vector<std::string> &out_var_names, bool share_dims = false);
std::unordered_map<std::string, std::string> ReusedVars() const; std::unordered_map<std::string, std::string> ReusedVars() const;
...@@ -42,6 +55,8 @@ class ShareTensorBufferOpHandle : public OpHandleBase { ...@@ -42,6 +55,8 @@ class ShareTensorBufferOpHandle : public OpHandleBase {
void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info, void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info,
const std::string &out_var_name); const std::string &out_var_name);
void SetShareDims(bool share_dims);
const ShareTensorBufferFunctor &Functor() const { return functor_; } const ShareTensorBufferFunctor &Functor() const { return functor_; }
protected: protected:
......
@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h"
#include <algorithm>
#include <utility>
#include "dgc/dgc.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
@@ -38,18 +40,23 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle(
      is_encoded_(is_encoded),
      nranks_(nranks) {
  // TODO(gongwb) :polish them!
-  PADDLE_ENFORCE_EQ(is_encoded, true);
+  PADDLE_ENFORCE_EQ(is_encoded, true, platform::errors::InvalidArgument(
+                                          "The argument is_encoded is false."));
  VLOG(1) << "Use dgc allreduce mode"
          << ", nranks:" << nranks_;
-  PADDLE_ENFORCE_GT(local_scopes_.size(), 0);
+  PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
+                    platform::errors::PreconditionNotMet(
+                        "The number of local scope should be > 0, but got %zu.",
+                        local_scopes_.size()));
  auto nranks_name = g_dgc_nranks;
  for (size_t i = 0; i < local_scopes_.size(); ++i) {
    auto *local_scope = local_scopes_[i];
    auto nranks_var = local_scope->FindVar(nranks_name);
-    if (nranks_var == nullptr) {
-      PADDLE_THROW("not find nranks_var:%s", nranks_name);
-    }
+    PADDLE_ENFORCE_NOT_NULL(
+        nranks_var, platform::errors::NotFound(
+                        "Variable %s is not found in scope.", nranks_name));
    float *dgc_nranks = nranks_var->GetMutable<LoDTensor>()->data<float>();
    *dgc_nranks = nranks;
@@ -64,10 +71,18 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
  PADDLE_ENFORCE_EQ(
      in_var_handles.size(), places_.size(),
-      "The NoDummyInputSize should be equal to the number of places.");
+      platform::errors::PreconditionNotMet(
+          "The number of input variables should be equal to the number of "
+          "places, but got the number of input variables is %zu and the the "
+          "number of places is %zu.",
+          in_var_handles.size(), places_.size()));
  PADDLE_ENFORCE_EQ(
      in_var_handles.size(), out_var_handles.size(),
-      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
+      platform::errors::PreconditionNotMet(
+          "The number of input variables should be equal to the number of "
+          "output variables, but got the number of input variables is %zu and "
+          "the the number of output variables is %zu.",
+          in_var_handles.size(), out_var_handles.size()));
  std::vector<const LoDTensor *> ins;
  std::vector<LoDTensor *> gathers;
@@ -80,14 +95,17 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
    auto encode_var_name = original_name + g_dgc_encoded;
    auto *in_var = local_scope->FindVar(encode_var_name);
-    PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
+    PADDLE_ENFORCE_NOT_NULL(
+        in_var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                           encode_var_name));
    auto &in = in_var->Get<LoDTensor>();
    ins.emplace_back(&in);
    auto gather_var_name = original_name + g_dgc_gather;
    auto *gather_var = local_scope->FindVar(gather_var_name);
-    PADDLE_ENFORCE_NOT_NULL(gather_var, "%s should not be null",
-                            gather_var_name);
+    PADDLE_ENFORCE_NOT_NULL(
+        gather_var, platform::errors::NotFound(
+                        "Variable %s is not found in scope.", gather_var));
    auto *gather = gather_var->GetMutable<LoDTensor>();
    gathers.emplace_back(gather);
@@ -100,14 +118,26 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
    }
  }
-  PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place()));
-  PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place()));
-  PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
+  PADDLE_ENFORCE_EQ(
+      platform::is_gpu_place(ins[0]->place()), true,
+      platform::errors::InvalidArgument(
+          "The place of input variable should be CUDAPlace, but got %s.",
+          ins[0]->place()));
+  PADDLE_ENFORCE_EQ(
+      platform::is_gpu_place(outs[0]->place()), true,
+      platform::errors::InvalidArgument(
+          "The place of input variable should be CUDAPlace, but got %s.",
+          outs[0]->place()));
+  PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, platform::errors::PreconditionNotMet(
+                                          "The nccl contexts are NULL."));
  int dtype = -1;
  size_t in_numel = 0;
  size_t out_numel = 0;
-  PADDLE_ENFORCE(nranks_ > 1);
+  PADDLE_ENFORCE_GT(
+      nranks_, 1,
+      platform::errors::PreconditionNotMet(
+          "The number of ranks should be > 1, but got %d.", nranks_));
  std::vector<std::function<void()>> all_gather_calls;
  std::vector<std::function<void()>> sparse_reduce_calls;
@@ -123,8 +153,16 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
    dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype;
    in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel;
-    PADDLE_ENFORCE(in_numel % 2 == 0);
-    PADDLE_ENFORCE(in_numel / 2 == static_cast<size_t>(k));
+    PADDLE_ENFORCE_EQ(in_numel % 2, 0,
+                      platform::errors::InvalidArgument(
+                          "The number of elements of input variable should be "
+                          "even, but got %zu.",
+                          in_numel));
+    PADDLE_ENFORCE_EQ(in_numel / 2, static_cast<size_t>(k),
+                      platform::errors::InvalidArgument(
+                          "The number of elements of input variable should be "
+                          "even, but got %zu.",
+                          in_numel));
    out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel;
    int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
@@ -154,7 +192,8 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
      PADDLE_ENFORCE_EQ(paddle::communication::dgc::sparseReduce(
                            gather_buff, k, out_tensor_buf,
                            static_cast<int>(out_numel), nranks_, stream),
-                        true);
+                        true, platform::errors::Unavailable(
+                                  "Calling sparseReduce() failed."));
    });
  }
@@ -187,11 +226,16 @@ void SparseAllReduceOpHandle::SparseAllReduceFunc(
int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) {
  auto original_name = paddle::framework::GradOriginalVarName(grad_name);
  auto var_name = original_name + g_dgc_k;
-  PADDLE_ENFORCE(local_scopes_.size() > 0);
+  PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
+                    platform::errors::PreconditionNotMet(
+                        "The number of local scope should be > 0, but got %zu.",
+                        local_scopes_.size()));
  auto *scope = local_exec_scopes_[0];
  auto var = scope->FindVar(var_name);
-  PADDLE_ENFORCE_NOT_NULL(var);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                      var_name));
  auto tensor = var->Get<LoDTensor>().data<float>();
  return *tensor;
}
@@ -202,15 +246,22 @@ bool SparseAllReduceOpHandle::IsEncoded() {
  }
  auto counter_name = g_dgc_counter_name;
  auto step_name = g_dgc_rampup_begin_step;
-  PADDLE_ENFORCE(local_scopes_.size() > 0);
+  PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
+                    platform::errors::PreconditionNotMet(
+                        "The number of local scope should be > 0, but got %zu.",
+                        local_scopes_.size()));
  auto *local_scope = local_exec_scopes_[0];
  auto count_var = local_scope->FindVar(counter_name);
  auto step_var = local_scope->FindVar(step_name);
-  if (count_var == nullptr || step_var == nullptr) {
-    PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name,
-                 step_var);
-  }
+  PADDLE_ENFORCE_NOT_NULL(
+      count_var, platform::errors::NotFound(
+                     "Variable %s is not found in scope.", counter_name));
+  PADDLE_ENFORCE_NOT_NULL(
+      step_var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                           step_var));
  float count = *count_var->Get<LoDTensor>().data<float>();
  float step = *step_var->Get<LoDTensor>().data<float>();
......
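Almost every change in the file above is the same migration: the old single-argument PADDLE_ENFORCE / PADDLE_THROW calls are replaced by the typed PADDLE_ENFORCE_EQ / PADDLE_ENFORCE_GT / PADDLE_ENFORCE_NOT_NULL macros carrying a platform::errors::* object with a formatted message. Below is a minimal sketch of the pattern, assuming only the macros and error types visible in the diff; CheckRanks() itself is a hypothetical helper, not Paddle code:

#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/errors.h"

// Sketch: the enforce-with-structured-error pattern used throughout this PR.
void CheckRanks(int nranks) {
  // Old style:  PADDLE_ENFORCE(nranks > 1);
  // New style:  typed comparison macro + error category + printf-style message.
  PADDLE_ENFORCE_GT(nranks, 1,
                    paddle::platform::errors::PreconditionNotMet(
                        "The number of ranks should be > 1, but got %d.",
                        nranks));
}

The structured form carries an explicit error category (InvalidArgument, PreconditionNotMet, NotFound, ...) and a formatted description, which makes the thrown message far more actionable than the bare boolean check it replaces.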
@@ -23,6 +23,17 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/nccl_helper.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+class NCCLCommunicator;
+}  // namespace platform
+}  // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph_executor.h"
#include "paddle/fluid/framework/details/fetch_async_op_handle.h"
namespace paddle {
@@ -27,8 +28,9 @@ void ClearFetchOp(ir::Graph* graph, std::vector<OpHandleBase*>* fetch_ops) {
    PADDLE_ENFORCE_EQ(dynamic_cast<FetchOpHandle*>(op) != nullptr ||
                          dynamic_cast<FetchAsyncOpHandle*>(op) != nullptr,
                      true,
-                      "The input ops of ClearFetchOp function should be "
-                      "FetchOpHandle or FetchAsyncOpHandle.");
+                      platform::errors::PreconditionNotMet(
+                          "The input ops of ClearFetchOp function should be "
+                          "FetchOpHandle or FetchAsyncOpHandle."));
    for (auto& out_var : op->Node()->outputs) {
      graph->RemoveNode(out_var);
    }
......
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/platform/profiler.h"
@@ -138,7 +139,10 @@ inline FetchResultType ThreadedSSAGraphExecutor::RunImpl(
        }
      }
    }
-    PADDLE_ENFORCE(ready_ops.empty());
+    PADDLE_ENFORCE_EQ(
+        ready_ops.empty(), true,
+        platform::errors::Fatal("After the execution of computation graph, "
+                                "there are unexecuted operators left."));
  }
  // Wait FetchOps.
@@ -165,9 +169,8 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
    FetchResultType *fetch_data, bool return_merged) {
  std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
  std::unordered_set<VarHandleBase *> local_ready_vars;
-  std::unordered_set<std::string> fetch_tensor_set(fetch_tensors.begin(),
-                                                   fetch_tensors.end());
-  for (auto &fetch_var_name : fetch_tensor_set) {
+  for (auto &fetch_var_name : fetch_tensors) {
    for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
      auto it = var_map.find(fetch_var_name);
      if (it != var_map.end()) {
@@ -231,7 +234,11 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
      ready_ops->insert(static_cast<OpHandleBase *>(op));
    }
  }
-  PADDLE_ENFORCE_EQ(local_ready_vars.size(), 0);
+  PADDLE_ENFORCE_EQ(
+      local_ready_vars.size(), 0,
+      platform::errors::Fatal(
+          "The number of ready variables should be 0, but got %d.",
+          local_ready_vars.size()));
}
void ThreadedSSAGraphExecutor::InsertPendingOp(
@@ -277,7 +284,9 @@ void ThreadedSSAGraphExecutor::PrepareOpDeps() {
    }
  }
  op_deps_->num_ops_ = ready_ops.size() + pending_ops.size();
-  PADDLE_ENFORCE_GT(op_deps_->num_ops_, 0, "The graph doesn't have operators.");
+  PADDLE_ENFORCE_GT(
+      op_deps_->num_ops_, 0,
+      platform::errors::InvalidArgument("The graph doesn't have operators."));
  for (auto ready_var : ready_vars) {
    pending_vars.erase(ready_var);
......
@@ -14,6 +14,8 @@
#pragma once
+#include <ThreadPool.h>  // ThreadPool in thrird party
#include <deque>
#include <functional>
#include <list>
@@ -24,8 +26,6 @@
#include <utility>
#include <vector>
-#include <ThreadPool.h>  // ThreadPool in thrird party
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/details/exception_holder.h"
#include "paddle/fluid/framework/details/execution_strategy.h"
......
@@ -24,6 +24,14 @@
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/place.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
namespace paddle {
namespace framework {
namespace details {
@@ -54,8 +62,10 @@ struct VarHandleBase {
  void AddOutput(OpHandleBase* out, ir::Node* node) {
    if (pending_ops_.find(out) == pending_ops_.end()) {
-      PADDLE_ENFORCE(out != nullptr, "The output of %s should not be nullptr",
-                     this->Node()->Name());
+      PADDLE_ENFORCE_NOT_NULL(out,
+                              platform::errors::InvalidArgument(
+                                  "The output added to VarHandle %s is NULL.",
+                                  this->Node()->Name()));
      pending_ops_.insert(out);
      node_->outputs.push_back(node);
    }
@@ -120,7 +130,10 @@ struct VarHandle : public VarHandleBase {
  bool HasEvent() { return has_event_; }
  const cudaEvent_t& GetEvent() {
-    PADDLE_ENFORCE(HasEvent(), "The event is not set.");
+    PADDLE_ENFORCE_EQ(
+        HasEvent(), true,
+        platform::errors::PreconditionNotMet(
+            "The cuda event is not set, maybe InitCUDA() is not called."));
    return event_;
  }
......
@@ -13,7 +13,16 @@
// limitations under the License.
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/framework/selected_rows.h"
+namespace paddle {
+namespace framework {
+class LoDTensor;
+class Variable;
+}  // namespace framework
+}  // namespace paddle
namespace paddle {
namespace framework {
namespace details {
@@ -24,7 +33,9 @@ static void VisitVariable(Variable* var, Func* func) {
  } else if (var->IsType<SelectedRows>()) {
    (*func)(var->GetMutable<SelectedRows>());
  } else {
-    PADDLE_THROW("Not supported type %s", ToTypeName(var->Type()));
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "VisitVariable is not supported for type %s.",
+        ToTypeName(var->Type())));
  }
}
@@ -35,7 +46,8 @@ static void VisitVariable(const Variable& var, Func* func) {
  } else if (var.IsType<SelectedRows>()) {
    (*func)(var.Get<SelectedRows>());
  } else {
-    PADDLE_THROW("Not supported type %s", ToTypeName(var.Type()));
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "VisitVariable is not supported for type %s.", ToTypeName(var.Type())));
  }
}
@@ -50,7 +62,8 @@ struct TensorVisitor {
  template <typename T>
  void operator()() {
-    PADDLE_THROW("Not Support to get LoDTensor from %s", typeid(T).name());
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Getting tensor from type %s is not supported.", typeid(T).name()));
  }
};
@@ -78,8 +91,8 @@ struct ShareDimsAndLoDVisitor {
  template <typename T>
  void operator()(const T&) {
-    PADDLE_ENFORCE("ShareDimsAndLoD is not supported by type %s",
-                   typeid(T).name());
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "ShareDimsAndLoD is not supported for type %s.", typeid(T).name()));
  }
};
@@ -89,42 +102,54 @@ void VariableVisitor::ShareDimsAndLoD(const Variable& src, Variable* trg) {
}
struct EnforceShapeAndDTypeEQVisitor {
-  const Variable* trg_;
+  const Variable* dst_;
  void operator()(const LoDTensor& src) {
-    auto& tensor = trg_->Get<LoDTensor>();
-    PADDLE_ENFORCE_EQ(
-        src.place().which(), tensor.place().which(),
-        "The Places of the two Variable must be all on CPU or all on GPU.");
-    PADDLE_ENFORCE_EQ(src.type(), tensor.type(),
-                      "The dtype of the two Variable is not equal.");
-    PADDLE_ENFORCE_EQ(src.dims(), tensor.dims(),
-                      "The dims of the two Variable is not equal.");
-    PADDLE_ENFORCE_EQ(src.lod(), tensor.lod(),
-                      "The lod of the two Variable is not equal.");
-    PADDLE_ENFORCE_EQ(src.layout(), tensor.layout(),
-                      "The layout of the two Variable's tensor is not equal.");
+    auto& tensor = dst_->Get<LoDTensor>();
+    PADDLE_ENFORCE_EQ(src.place().which(), tensor.place().which(),
+                      platform::errors::PreconditionNotMet(
+                          "The place type of the two variables is not equal."));
+    PADDLE_ENFORCE_EQ(src.type(), tensor.type(),
+                      platform::errors::PreconditionNotMet(
+                          "The dtype of the two variables is not equal."));
+    PADDLE_ENFORCE_EQ(
+        src.dims(), tensor.dims(),
+        platform::errors::PreconditionNotMet(
+            "The layout of the two variables' tensors is not equal."));
+    PADDLE_ENFORCE_EQ(src.lod(), tensor.lod(),
+                      platform::errors::PreconditionNotMet(
+                          "The lod of the two variable is not equal."));
+    PADDLE_ENFORCE_EQ(
+        src.layout(), tensor.layout(),
+        platform::errors::PreconditionNotMet(
+            "The layout of the two variables' tensors tensor is not equal."));
  }
  void operator()(const SelectedRows& src) {
-    auto& selected_rows = trg_->Get<SelectedRows>();
-    PADDLE_ENFORCE_EQ(
-        src.place().which(), selected_rows.place().which(),
-        "The Places of the two Variable must be all on CPU or all on GPU.");
-    PADDLE_ENFORCE_EQ(src.value().type(), selected_rows.value().type(),
-                      "The dtype of the two Variable is not equal.");
-    PADDLE_ENFORCE_EQ(src.value().layout(), selected_rows.value().layout(),
-                      "The layout of the two Variable's tensor is not equal.");
-    PADDLE_ENFORCE_EQ(src.height(), selected_rows.height(),
-                      "The height of the two Variable is not equal.");
-    PADDLE_ENFORCE_EQ(src.GetCompleteDims(), selected_rows.GetCompleteDims(),
-                      "The dims of the two Variable is not equal.");
+    auto& selected_rows = dst_->Get<SelectedRows>();
+    PADDLE_ENFORCE_EQ(src.place().which(), selected_rows.place().which(),
+                      platform::errors::PreconditionNotMet(
+                          "The place type of the two variables is not equal."));
+    PADDLE_ENFORCE_EQ(src.value().type(), selected_rows.value().type(),
+                      platform::errors::PreconditionNotMet(
+                          "The dtype of the two variables is not equal."));
+    PADDLE_ENFORCE_EQ(
+        src.value().layout(), selected_rows.value().layout(),
+        platform::errors::PreconditionNotMet(
+            "The layout of the two variables' tensors is not equal."));
+    PADDLE_ENFORCE_EQ(src.height(), selected_rows.height(),
+                      platform::errors::PreconditionNotMet(
+                          "The height of the two variables is not equal."));
+    PADDLE_ENFORCE_EQ(src.GetCompleteDims(), selected_rows.GetCompleteDims(),
+                      platform::errors::PreconditionNotMet(
+                          "The dims of the two variables is not equal."));
  }
  template <typename T>
  void operator()(const T&) {
-    PADDLE_ENFORCE("EnforceShapeAndDTypeEQ is not supported by type %s",
-                   typeid(T).name());
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "EnforceShapeAndDTypeEQ is not supported for type %s.",
+        typeid(T).name()));
  }
};
......
@@ -17,6 +17,13 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/variable.h"
+namespace paddle {
+namespace framework {
+class Tensor;
+class Variable;
+}  // namespace framework
+}  // namespace paddle
namespace paddle {
namespace framework {
namespace details {
......
@@ -13,11 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/device_worker.h"
+#include "xxhash.h"  // NOLINT
namespace paddle {
namespace framework {
+class LoDTensor;
+class Scope;
void DeviceWorker::SetRootScope(Scope* root_scope) { root_scope_ = root_scope; }
void DeviceWorker::SetDataFeed(DataFeed* data_feed) {
......
@@ -39,6 +39,18 @@ limitations under the License. */
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/timer.h"
+namespace paddle {
+namespace framework {
+class LoDTensor;
+class ProgramDesc;
+class Scope;
+class Tensor;
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
@@ -62,7 +74,9 @@ class PullDenseWorker {
  virtual void Initialize(const TrainerDesc& param);
#ifdef PADDLE_WITH_CUDA
  void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); }
+#endif
+#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
  void AddPlace(const paddle::platform::Place place) {
    places_.push_back(place);
  }
@@ -123,9 +137,9 @@ class PullDenseWorker {
#ifdef PADDLE_WITH_CUDA
  std::vector<cudaStream_t> copy_streams_;
+#endif
  std::vector<paddle::platform::Place> places_;
  std::vector<Scope*> thread_scopes_;
-#endif
};
// should incorporate different type of device
@@ -149,6 +163,7 @@ class DeviceWorker {
  virtual void SetDataFeed(DataFeed* data_feed);
  virtual void SetWorkerNum(int num) {}
  virtual void CacheProgram(const ProgramDesc& main_program) {}
+  virtual void GetXpuOpIndex() {}
  virtual void SetNeedDumpField(bool need_dump_field) {
    need_dump_field_ = need_dump_field;
  }
......
@@ -20,6 +20,8 @@ limitations under the License. */
namespace paddle {
namespace framework {
+class DeviceWorker;
typedef std::shared_ptr<DeviceWorker> (*Createdevice_workerFunction)();
typedef std::unordered_map<std::string, Createdevice_workerFunction>
    device_workerMap;
......
@@ -16,11 +16,14 @@ limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/device_worker.h"
namespace paddle {
namespace framework {
+class DeviceWorker;
class DeviceWorkerFactory {
 public:
  static std::string DeviceWorkerTypeList();
......
@@ -13,9 +13,10 @@
// limitations under the License.
#include "paddle/fluid/framework/device_worker.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/trainer.h"
namespace paddle {
namespace framework {
......
@@ -97,6 +97,7 @@ message AsyncConfig {
  optional int32 thread_pool_size = 6 [ default = 1 ];
  optional int32 send_wait_times = 7 [ default = 1 ];
  optional bool runtime_split_send_recv = 8 [ default = false ];
+  optional bool launch_barrier = 9 [ default = true ];
}
message PipelineConfig { optional int32 micro_batch = 1 [ default = 1 ]; }
@@ -127,6 +128,7 @@ message DistributedStrategy {
  optional int32 conv_workspace_size_limit = 22 [ default = 4000 ];
  optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ];
  optional bool adaptive_localsgd = 24 [ default = false ];
+  optional bool fp16_allreduce = 25 [ default = false ];
  optional RecomputeConfig recompute_configs = 101;
  optional AMPConfig amp_configs = 102;
......
@@ -11,10 +11,17 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+#include "paddle/fluid/framework/dlpack_tensor.h"
#include <unordered_map>
#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/dlpack_tensor.h"
+namespace paddle {
+namespace platform {
+struct bfloat16;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
namespace paddle {
namespace framework {
......
(The diffs of the remaining 106 files are collapsed in this view.)