“2c34f34338d8c3a1a781be6e2822077320543f4d”上不存在“mobile/src/framework/program/program-optimize/node.cpp”
未验证 提交 c3158527 编写于 作者: Z zhang wenhui 提交者: GitHub

【NPU】Merge NPU ccl code (#32381)

* add allreduce and broadcast without test (#31024)

add allreduce and broadcast without test

* Refactor HCCLCommContext to be compatible with Paddle (#31359)

Refactor HCCLCommContext to be compatible with Paddle (#31359)

* [NPU] add npu kernel for communication op (#31437)

* add allreduce and broadcast without test

* add c_broadcast_test case

* build c_comm_init and c_create_group operators

* make the whole thing compile

* add broadcast and init op test case but run failed

* make unit test compile

* fix broadcast test bug and change into hcom for ccl

* change c_comm_init and c_create_group ops accordingly

* make tests compile

* transfer code to 27

* compiled successfully in 28, but run failed

* test broadcast in 28, but failed

* make hcom primitives work

* change hccl data type for base.h

* fix broadcast bug

* make attributes work

* fix group name bug

* add allreduce but test failed

* allreduce bug for qiuliang

* allreduce finished

* add allgather and reducescatter

* merge all op code

* add allgather test

* finish run all ccl op test exclude send/recv

* all all op and test exclude send/recv

* send_v2_npu.cc recv_v2_npiu.cc compiled

* fix ccl core dump bug and test allgather, reducescatter, broadcast op

* fix allreduce bug just for test

* hcom send&recv test pass, without hcom_destroy

* for qiuliang test

* Ascend Send&Recv Test Pass

* all op (ex send/recv) ok

* fix bug

* merge all ccl op

* style merge to PaddlePaddle

* merge style

* new merge style

* merge style 2

* insert an empty at the end

* disable ctest for hcom to pass ci
Co-authored-by: Nvoid-main <voidmain1313113@gmail.com>
Co-authored-by: Nf2hkop <f2huestc@outlook.com>

* Add auto-increasing tag id for Hcom OPs (#31702)

* add c_reduce_sum op (#31793)

add c_reduce_sum op

* update Ascendrc hccl to 20.3 (#32126)

update Ascendrc hccl to 20.3 (#32126)

* fix merge code

* change cmake.txt1

* [NPU] Support npu kernel for c sync stream op (#31386)

* sync stream npu op

* add with_ascend_acl

* update c++ unittest

* compile all failed

* try to pre commit

* after pre commit

* merge&compile&test hccl successfully!

* fix code style

* fix code style

* fix bugs about hccl

* fix some bugs

* fix code style

* fix style

* fix style

* fix

* fixed

* merge develop
Co-authored-by: Nlw921014 <liuwei921014@yeah.net>
Co-authored-by: NVoid Main <voidmain1313113@gmail.com>
Co-authored-by: Nf2hkop <f2huestc@outlook.com>
Co-authored-by: Nxiayanming <41795079@qq.com>
上级 bc90916e
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License # limitations under the License
cmake_minimum_required(VERSION 3.15) cmake_minimum_required(VERSION 3.10)
cmake_policy(VERSION 3.10) cmake_policy(VERSION 3.10)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
...@@ -34,7 +34,7 @@ option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) ...@@ -34,7 +34,7 @@ option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF)
# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON # NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON
# to develop some acl related functionality on x86 # to develop some acl related functionality on x86
option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
...@@ -65,7 +65,7 @@ if(WITH_MUSL) ...@@ -65,7 +65,7 @@ if(WITH_MUSL)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy")
endif() endif()
if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
endif() endif()
...@@ -103,7 +103,7 @@ if(WIN32) ...@@ -103,7 +103,7 @@ if(WIN32)
endif() endif()
endforeach(flag_var) endforeach(flag_var)
endif() endif()
# NOTE(Avin0323): Less parallel count result in faster compilation. # NOTE(Avin0323): Less parallel count result in faster compilation.
math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3")
# windows build turn off warnings, use parallel compiling. # windows build turn off warnings, use parallel compiling.
......
...@@ -26,7 +26,8 @@ if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_str ...@@ -26,7 +26,8 @@ if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_str
add_definitions(-DPADDLE_WITH_ASCEND_STRING) add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif() endif()
if(WITH_ASCEND)
if(WITH_ASCEND OR WITH_ASCEND_CL)
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64) set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common) set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share) set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
...@@ -49,7 +50,6 @@ if(WITH_ASCEND) ...@@ -49,7 +50,6 @@ if(WITH_ASCEND)
INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR}) INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL) ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib}) SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
...@@ -65,6 +65,7 @@ endif() ...@@ -65,6 +65,7 @@ endif()
if(WITH_ASCEND_CL) if(WITH_ASCEND_CL)
set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so)
set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so) set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so) set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
...@@ -78,6 +79,9 @@ if(WITH_ASCEND_CL) ...@@ -78,6 +79,9 @@ if(WITH_ASCEND_CL)
ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL) ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib}) SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib})
ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL) ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib}) SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib})
add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler) add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler)
......
...@@ -447,9 +447,20 @@ function(cc_test TARGET_NAME) ...@@ -447,9 +447,20 @@ function(cc_test TARGET_NAME)
cc_test_build(${TARGET_NAME} cc_test_build(${TARGET_NAME}
SRCS ${cc_test_SRCS} SRCS ${cc_test_SRCS}
DEPS ${cc_test_DEPS}) DEPS ${cc_test_DEPS})
cc_test_run(${TARGET_NAME} # we dont test hcom op, because it need complex configuration
COMMAND ${TARGET_NAME} # with more than one machine
ARGS ${cc_test_ARGS}) if(NOT ("${TARGET_NAME}" STREQUAL "c_broadcast_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_allreduce_sum_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_allreduce_max_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_reducescatter_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_allgather_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "send_v2_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "c_reduce_sum_op_npu_test" OR
"${TARGET_NAME}" STREQUAL "recv_v2_op_npu_test"))
cc_test_run(${TARGET_NAME}
COMMAND ${TARGET_NAME}
ARGS ${cc_test_ARGS})
endif()
endif() endif()
endfunction(cc_test) endfunction(cc_test)
...@@ -807,7 +818,7 @@ function(py_test TARGET_NAME) ...@@ -807,7 +818,7 @@ function(py_test TARGET_NAME)
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif() endif()
if (WIN32) if (WIN32)
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150) set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
endif() endif()
......
...@@ -29,9 +29,9 @@ set(third_party_deps) ...@@ -29,9 +29,9 @@ set(third_party_deps)
# 2. REPOSITORY: specify git REPOSITORY of 3rd party # 2. REPOSITORY: specify git REPOSITORY of 3rd party
# 3. TAG: specify git tag/branch/commitID of 3rd party # 3. TAG: specify git tag/branch/commitID of 3rd party
# 4. DIR: overwrite the original SOURCE_DIR when cache directory # 4. DIR: overwrite the original SOURCE_DIR when cache directory
# #
# The function Return 1 PARENT_SCOPE variables: # The function Return 1 PARENT_SCOPE variables:
# - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add, # - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add,
# and you no longer need to set any donwnload steps in ExternalProject_Add. # and you no longer need to set any donwnload steps in ExternalProject_Add.
# For example: # For example:
# Cache_third_party(${TARGET} # Cache_third_party(${TARGET}
...@@ -52,7 +52,7 @@ FUNCTION(cache_third_party TARGET) ...@@ -52,7 +52,7 @@ FUNCTION(cache_third_party TARGET)
SET(${TARGET_NAME}_DOWNLOAD_CMD SET(${TARGET_NAME}_DOWNLOAD_CMD
GIT_REPOSITORY ${cache_third_party_REPOSITORY}) GIT_REPOSITORY ${cache_third_party_REPOSITORY})
IF(cache_third_party_TAG) IF(cache_third_party_TAG)
LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD
GIT_TAG ${cache_third_party_TAG}) GIT_TAG ${cache_third_party_TAG})
ENDIF() ENDIF()
ELSEIF(cache_third_party_URL) ELSEIF(cache_third_party_URL)
...@@ -130,7 +130,7 @@ ENDFUNCTION() ...@@ -130,7 +130,7 @@ ENDFUNCTION()
# Correction of flags on different Platform(WIN/MAC) and Print Warning Message # Correction of flags on different Platform(WIN/MAC) and Print Warning Message
if (APPLE) if (APPLE)
if(WITH_MKL) if(WITH_MKL)
MESSAGE(WARNING MESSAGE(WARNING
"Mac is not supported with MKL in Paddle yet. Force WITH_MKL=OFF.") "Mac is not supported with MKL in Paddle yet. Force WITH_MKL=OFF.")
set(WITH_MKL OFF CACHE STRING "Disable MKL for building on mac" FORCE) set(WITH_MKL OFF CACHE STRING "Disable MKL for building on mac" FORCE)
endif() endif()
...@@ -141,7 +141,7 @@ if(WIN32 OR APPLE) ...@@ -141,7 +141,7 @@ if(WIN32 OR APPLE)
SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE) SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE)
if(WITH_LIBXSMM) if(WITH_LIBXSMM)
MESSAGE(WARNING MESSAGE(WARNING
"Windows, Mac are not supported with libxsmm in Paddle yet." "Windows, Mac are not supported with libxsmm in Paddle yet."
"Force WITH_LIBXSMM=OFF") "Force WITH_LIBXSMM=OFF")
SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM in Windows and MacOS" FORCE) SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM in Windows and MacOS" FORCE)
...@@ -276,7 +276,7 @@ endif(WITH_BOX_PS) ...@@ -276,7 +276,7 @@ endif(WITH_BOX_PS)
if(WITH_ASCEND OR WITH_ASCEND_CL) if(WITH_ASCEND OR WITH_ASCEND_CL)
include(external/ascend) include(external/ascend)
if(WITH_ASCEND) if(WITH_ASCEND OR WITH_ASCEND_CL)
list(APPEND third_party_deps extern_ascend) list(APPEND third_party_deps extern_ascend)
endif() endif()
if(WITH_ASCEND_CL) if(WITH_ASCEND_CL)
...@@ -290,7 +290,7 @@ if (WITH_PSCORE) ...@@ -290,7 +290,7 @@ if (WITH_PSCORE)
include(external/leveldb) include(external/leveldb)
list(APPEND third_party_deps extern_leveldb) list(APPEND third_party_deps extern_leveldb)
include(external/brpc) include(external/brpc)
list(APPEND third_party_deps extern_brpc) list(APPEND third_party_deps extern_brpc)
......
...@@ -43,6 +43,6 @@ cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_conte ...@@ -43,6 +43,6 @@ cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_conte
cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell) cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
if(WITH_ASCEND) if(WITH_ASCEND OR WITH_ASCEND_CL)
cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend_ge ascend_graph) cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend_ge ascend_graph)
endif(WITH_ASCEND) endif()
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifdef PADDLE_WITH_ASCEND #ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/framework/fleet/ascend_wrapper.h" #include "paddle/fluid/framework/fleet/ascend_wrapper.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#ifdef PADDLE_WITH_ASCEND #ifdef PADDLE_WITH_ASCEND_CL
#include <glog/logging.h> #include <glog/logging.h>
#include <map> #include <map>
...@@ -29,7 +29,6 @@ limitations under the License. */ ...@@ -29,7 +29,6 @@ limitations under the License. */
#include "paddle/fluid/platform/timer.h" #include "paddle/fluid/platform/timer.h"
#include "ge/ge_api.h" #include "ge/ge_api.h"
#include "ge/ge_api_types.h"
#include "graph/attr_value.h" #include "graph/attr_value.h"
#include "graph/tensor.h" #include "graph/tensor.h"
#include "graph/types.h" #include "graph/types.h"
......
...@@ -36,6 +36,11 @@ ...@@ -36,6 +36,11 @@
#endif #endif
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
#include <hccl/hccl.h>
#include <hccl/hccl_types.h>
#endif
#if defined(PADDLE_WITH_XPU_BKCL) #if defined(PADDLE_WITH_XPU_BKCL)
#include "xpu/bkcl.h" #include "xpu/bkcl.h"
#endif #endif
...@@ -50,6 +55,10 @@ class Communicator; ...@@ -50,6 +55,10 @@ class Communicator;
class NCCLCommunicator; class NCCLCommunicator;
#endif #endif
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL
class Communicator;
class HCCLCommunicator;
#endif
#if defined(PADDLE_WITH_XPU_BKCL) #if defined(PADDLE_WITH_XPU_BKCL)
class BKCLCommunicator; class BKCLCommunicator;
...@@ -162,6 +171,9 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< ...@@ -162,6 +171,9 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
#endif #endif
operators::CudnnRNNCache, operators::CudnnRNNCache,
#endif #endif
#if defined(PADDLE_WITH_ASCEND_CL)
HcclRootInfo,
#endif
#if defined(PADDLE_WITH_XPU_BKCL) #if defined(PADDLE_WITH_XPU_BKCL)
BKCLUniqueId, platform::BKCLCommunicator, BKCLUniqueId, platform::BKCLCommunicator,
#endif #endif
......
...@@ -11,7 +11,7 @@ foreach(src ${OPS}) ...@@ -11,7 +11,7 @@ foreach(src ${OPS})
set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS}) set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS})
endforeach() endforeach()
register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op c_gen_hccl_id_op gen_hccl_id_op DEPS ${COLLECTIVE_DEPS})
if(WITH_NCCL OR WITH_RCCL) if(WITH_NCCL OR WITH_RCCL)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper)
...@@ -19,12 +19,6 @@ if(WITH_NCCL OR WITH_RCCL) ...@@ -19,12 +19,6 @@ if(WITH_NCCL OR WITH_RCCL)
op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
endif() endif()
if(WITH_ASCEND)
op_library(gen_nccl_id_op)
op_library(c_gen_nccl_id_op)
endif()
if(WITH_GLOO) if(WITH_GLOO)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper)
endif() endif()
...@@ -35,5 +29,38 @@ if(WITH_XPU_BKCL) ...@@ -35,5 +29,38 @@ if(WITH_XPU_BKCL)
op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS}) op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
endif() endif()
if(WITH_ASCEND_CL)
cc_library(gen_hccl_id_op_helper SRCS gen_hccl_id_op_helper.cc DEPS dynload_warpctc dynamic_loader scope)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper gen_hccl_id_op_helper)
op_library(c_gen_hccl_id_op DEPS ${COLLECTIVE_DEPS})
op_library(gen_hccl_id_op DEPS ${COLLECTIVE_DEPS})
endif()
set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE) set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE)
set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency") set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency")
if(WITH_ASCEND_CL)
set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hccl_op c_gen_hccl_id_op gen_hccl_id_op_helper
gen_hccl_id_op op_registry ascend_hccl flags
dynamic_loader dynload_warpctc scope device_context enforce executor)
cc_test(c_broadcast_op_npu_test SRCS c_broadcast_op_npu_test.cc
DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_allreduce_sum_op_npu_test SRCS c_allreduce_sum_op_npu_test.cc
DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_reducescatter_op_npu_test SRCS c_reducescatter_op_npu_test.cc
DEPS c_reducescatter_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_allgather_op_npu_test SRCS c_allgather_op_npu_test.cc
DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc
DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc
DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(send_v2_op_npu_test SRCS send_v2_op_npu_test.cc
DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc
DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc
DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc
DEPS op_registry elementwise_add_op c_sync_calc_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
endif()
...@@ -42,6 +42,10 @@ class CAllGatherOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -42,6 +42,10 @@ class CAllGatherOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the allgather result"); AddOutput("Out", "(Tensor) the allgather result");
AddAttr<int>("ring_id", "(int default 0) communication ring id.") AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0); .SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for all gather.")
.SetDefault("tag");
#endif
AddAttr<bool>( AddAttr<bool>(
"use_calc_stream", "use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.") "(bool default false) eject CUDA operations to calculation stream.")
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allgather_op.h"
#include <memory>
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class CAllGatherOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto in = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
HcclDataType dtype = platform::ToHCCLDataType(in->type());
int ring_id = ctx.Attr<int>("ring_id");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto place = ctx.GetPlace();
auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
int nranks = comm->nranks();
framework::DDim out_dims = in->dims();
out_dims[0] *= nranks;
out->mutable_data<T>(out_dims, place);
uint64_t send_numel = in->numel();
void *send_buff = reinterpret_cast<void *>(const_cast<T *>(in->data<T>()));
void *recv_buff = reinterpret_cast<void *>(out->data<T>());
aclrtStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::NPUDeviceContext *>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
VLOG(3) << "begin hccl allgather, parameter is: "
<< ", group is " << group << ", ring_id is " << ring_id
<< ", nranks is " << nranks;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllGather(
send_buff, recv_buff, send_numel, dtype, comm->comm(),
reinterpret_cast<void *>(stream)));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(c_allgather, ops::CAllGatherOpASCENDKernel<int8_t>,
ops::CAllGatherOpASCENDKernel<int>,
ops::CAllGatherOpASCENDKernel<float>,
ops::CAllGatherOpASCENDKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/c_allgather_op.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(c_allgather);
USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(c_allgather, NPU);
DECLARE_string(selected_npus);
template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
std::string debugstring = "";
for (auto ele : data) {
debugstring += std::to_string(ele) + std::string(",");
}
VLOG(2) << preStr << ":" << std::endl << debugstring;
}
void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
HcclRootInfo* hccl_id) {
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
std::vector<int> rank_ids{0, 1};
f::AttributeMap gen_hccl_id;
std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
gen_hccl_id["rank"] = rank_id;
gen_hccl_id["endpoint"] = endpointList[rank_id];
std::vector<std::string> other_endpoints = {
endpointList[rank_id == 0 ? 1 : 0]};
gen_hccl_id["other_endpoints"] = other_endpoints;
auto out = scope->Var("Out");
auto id = out->GetMutable<HcclRootInfo>();
VLOG(3) << "break";
auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {},
{{"Out", {"Out"}}}, gen_hccl_id);
VLOG(3) << "break";
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
memcpy(hccl_id, id, 1024);
}
void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
HcclRootInfo* hccl_id) {
auto x = scope->Var("X");
auto id = x->GetMutable<HcclRootInfo>();
memcpy(id, hccl_id, 1024);
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
// std::vector<int> rank_ids{0, 1};
f::AttributeMap comm_init_attrs;
comm_init_attrs["ring_id"] = 0;
comm_init_attrs["rank_ids"] = 2;
comm_init_attrs["rank"] = rank_id;
comm_init_attrs["device_id"] = device_id;
// comm_init_attrs["rank_ids"] = rank_ids;
auto comm_init_op = f::OpRegistry::CreateOp(
"c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
}
void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
// init
auto x = scope->Var("Data");
auto tensor_x = x->GetMutable<f::LoDTensor>();
std::vector<float> init;
int rank_id = atoi(getenv("RANK_ID"));
int num1 = 1;
int num2 = 4;
for (int64_t i = 0; i < num1 * num2; ++i) {
init.push_back(1.0 + rank_id);
}
PrintDebugInfo("input data", init);
TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num1, num2});
ctx.Wait();
auto place = ctx.GetPlace();
auto out = scope->Var("OutData");
auto tensor_out = out->GetMutable<f::LoDTensor>();
tensor_out->Resize({num1, num2});
tensor_out->mutable_data<float>(place); // allocate
ctx.Wait();
// run
f::AttributeMap attrs;
attrs["tag"] = std::string("tagx");
attrs["ring_id"] = 0;
attrs["nranks"] = 2;
auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"Data"}}},
{{"Out", {"OutData"}}}, attrs);
for (int i = 0; i < 10; i++) {
op->Run(*scope, place);
}
ctx.Wait();
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
PrintDebugInfo("output data", out_vec);
EXPECT_EQ(out_vec.size(), init.size() * 2);
for (uint32_t i = 0; i < out_vec.size() / 2; i++) {
EXPECT_EQ(out_vec[i], 1.0);
}
for (uint32_t i = out_vec.size() / 2; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], 2.0);
}
}
TEST(c_allgather, NPU) {
f::Scope scope;
HcclRootInfo hccl_id;
// only support one device, if more than one device, use first default
p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
PrepareUniqueId(&scope, ctx, &hccl_id);
Prepare(&scope, ctx, &hccl_id);
TestHCCLAllGatherOp(&scope, ctx);
}
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
c_allreduce_max, ops::CAllReduceOpASCENDKernel<ops::kRedMax, int>,
ops::CAllReduceOpASCENDKernel<ops::kRedMax, int8_t>,
ops::CAllReduceOpASCENDKernel<ops::kRedMax, float>,
ops::CAllReduceOpASCENDKernel<ops::kRedMax, plat::float16>)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/c_allgather_op.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(c_allreduce_max);
USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU);
DECLARE_string(selected_npus);
template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
std::string debugstring = "";
for (auto ele : data) {
debugstring += std::to_string(ele) + std::string(",");
}
VLOG(2) << preStr << ":" << std::endl << debugstring;
}
void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
HcclRootInfo* hccl_id) {
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
std::vector<int> rank_ids{0, 1};
f::AttributeMap gen_hccl_id;
std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
gen_hccl_id["rank"] = rank_id;
gen_hccl_id["endpoint"] = endpointList[rank_id];
std::vector<std::string> other_endpoints = {
endpointList[rank_id == 0 ? 1 : 0]};
gen_hccl_id["other_endpoints"] = other_endpoints;
auto out = scope->Var("Out");
auto id = out->GetMutable<HcclRootInfo>();
VLOG(3) << "break";
auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {},
{{"Out", {"Out"}}}, gen_hccl_id);
VLOG(3) << "break";
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
memcpy(hccl_id, id, 1024);
}
void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
HcclRootInfo* hccl_id) {
auto x = scope->Var("X");
auto id = x->GetMutable<HcclRootInfo>();
memcpy(id, hccl_id, 1024);
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
// std::vector<int> rank_ids{0, 1};
f::AttributeMap comm_init_attrs;
comm_init_attrs["ring_id"] = 0;
comm_init_attrs["rank_ids"] = 2;
comm_init_attrs["rank"] = rank_id;
comm_init_attrs["device_id"] = device_id;
// comm_init_attrs["rank_ids"] = rank_ids;
auto comm_init_op = f::OpRegistry::CreateOp(
"c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
}
void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
// init
auto x = scope->Var("Data");
auto tensor_x = x->GetMutable<f::LoDTensor>();
std::vector<float> init;
int rank_id = atoi(getenv("RANK_ID"));
int num1 = 100;
int num2 = 100;
for (int64_t i = 0; i < num1 * num2; ++i) {
init.push_back(1.0 + rank_id * 3);
}
PrintDebugInfo("input data", init);
TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num1, num2});
ctx.Wait();
auto place = ctx.GetPlace();
auto out = scope->Var("OutData");
auto tensor_out = out->GetMutable<f::LoDTensor>();
tensor_out->Resize({num1, num2});
tensor_out->mutable_data<float>(place); // allocate
ctx.Wait();
// run
f::AttributeMap attrs;
attrs["tag"] = std::string("tagx");
attrs["ring_id"] = 0;
auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"Data"}}},
{{"Out", {"OutData"}}}, attrs);
for (int i = 0; i < 10; i++) {
op->Run(*scope, place);
}
ctx.Wait();
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
PrintDebugInfo("output data", out_vec);
EXPECT_EQ(out_vec.size(), init.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], 4.0);
}
}
TEST(c_allreduce_max, NPU) {
f::Scope scope;
HcclRootInfo hccl_id;
// only support one device, if more than one device, use first default
p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
PrepareUniqueId(&scope, ctx, &hccl_id);
Prepare(&scope, ctx, &hccl_id);
TestHCCLAllReduceOp(&scope, ctx);
}
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
c_allreduce_min, ops::CAllReduceOpASCENDKernel<ops::kRedMin, int>,
ops::CAllReduceOpASCENDKernel<ops::kRedMin, int8_t>,
ops::CAllReduceOpASCENDKernel<ops::kRedMin, float>,
ops::CAllReduceOpASCENDKernel<ops::kRedMin, plat::float16>)
...@@ -19,9 +19,11 @@ limitations under the License. */ ...@@ -19,9 +19,11 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/collective_helper.h"
#endif #endif
...@@ -38,6 +40,10 @@ limitations under the License. */ ...@@ -38,6 +40,10 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/gloo_wrapper.h" #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif #endif
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -113,6 +119,73 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> { ...@@ -113,6 +119,73 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
} }
}; };
template <ReduceType red_type, typename T>
class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto in = ctx.Input<framework::LoDTensor>("X");
auto out = ctx.Output<framework::LoDTensor>("Out");
auto place = ctx.GetPlace();
HcclDataType dtype = platform::ToHCCLDataType(in->type());
int64_t numel = in->numel();
void* sendbuff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
void* recvbuff = reinterpret_cast<void*>(out->data<T>());
int ring_id = ctx.Attr<int>("ring_id");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM;
switch (red_type) {
case kRedSum:
hccl_red_type = HCCL_REDUCE_SUM;
break;
case kRedMax:
hccl_red_type = HCCL_REDUCE_MAX;
break;
case kRedMin:
hccl_red_type = HCCL_REDUCE_MIN;
break;
case kRedProd:
hccl_red_type = HCCL_REDUCE_PROD;
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid reduce type: %d", red_type));
}
VLOG(3) << "begin hccl allreduce, parameter is: "
<< "input num: " << numel << "dtype: " << dtype
<< "hccl_red_type: " << hccl_red_type << ", group is: " << group;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce(
sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(),
reinterpret_cast<void*>(stream)));
out->Resize(in->dims());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
template <ReduceType red_type, typename T> template <ReduceType red_type, typename T>
class CAllReduceOpXPUKernel : public framework::OpKernel<T> { class CAllReduceOpXPUKernel : public framework::OpKernel<T> {
public: public:
...@@ -240,6 +313,10 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -240,6 +313,10 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the allreduced result."); AddOutput("Out", "(Tensor) the allreduced result.");
AddAttr<int>("ring_id", "(int default 0) communication ring id.") AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0); .SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for all reduce.")
.SetDefault("tag");
#endif
AddAttr<bool>( AddAttr<bool>(
"use_calc_stream", "use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.") "(bool default false) eject CUDA operations to calculation stream.")
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
c_allreduce_prod, ops::CAllReduceOpASCENDKernel<ops::kRedProd, int>,
ops::CAllReduceOpASCENDKernel<ops::kRedProd, int8_t>,
ops::CAllReduceOpASCENDKernel<ops::kRedProd, float>,
ops::CAllReduceOpASCENDKernel<ops::kRedProd, plat::float16>)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
c_allreduce_sum, ops::CAllReduceOpASCENDKernel<ops::kRedSum, int>,
ops::CAllReduceOpASCENDKernel<ops::kRedSum, int8_t>,
ops::CAllReduceOpASCENDKernel<ops::kRedSum, float>,
ops::CAllReduceOpASCENDKernel<ops::kRedSum, plat::float16>)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(c_allreduce_sum);
USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
DECLARE_string(selected_npus);
template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
std::string debugstring = "";
for (auto ele : data) {
debugstring += std::to_string(ele) + std::string(",");
}
VLOG(3) << preStr << ":" << std::endl << debugstring;
}
void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
HcclRootInfo* hccl_id) {
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
std::vector<int> rank_ids{0, 1};
f::AttributeMap gen_hccl_id;
std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
gen_hccl_id["rank"] = rank_id;
gen_hccl_id["endpoint"] = endpointList[rank_id];
std::vector<std::string> other_endpoints = {
endpointList[rank_id == 0 ? 1 : 0]};
gen_hccl_id["other_endpoints"] = other_endpoints;
auto out = scope->Var("Out");
auto id = out->GetMutable<HcclRootInfo>();
VLOG(3) << "break";
auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {},
{{"Out", {"Out"}}}, gen_hccl_id);
VLOG(3) << "break";
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
memcpy(hccl_id, id, 1024);
}
void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
HcclRootInfo* hccl_id) {
auto x = scope->Var("X");
auto id = x->GetMutable<HcclRootInfo>();
memcpy(id, hccl_id, 1024);
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
// std::vector<int> rank_ids{0, 1};
f::AttributeMap comm_init_attrs;
comm_init_attrs["ring_id"] = 0;
comm_init_attrs["rank_ids"] = 2;
comm_init_attrs["rank"] = rank_id;
comm_init_attrs["device_id"] = device_id;
// comm_init_attrs["rank_ids"] = rank_ids;
auto comm_init_op = f::OpRegistry::CreateOp(
"c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
}
void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
int iter) {
// init
auto x = scope->Var("Data");
auto tensor_x = x->GetMutable<f::LoDTensor>();
int rank_id = atoi(getenv("RANK_ID"));
int num1 = 3;
int num2 = 128;
std::vector<float> init;
for (int64_t i = 0; i < num1 * num2; ++i) {
init.push_back(1.0 + rank_id);
}
PrintDebugInfo("input data", init);
auto place = ctx.GetPlace();
TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num1, num2});
ctx.Wait();
auto out = scope->Var("OutData");
auto tensor_out = out->GetMutable<f::LoDTensor>();
tensor_out->Resize({num1, num2});
tensor_out->mutable_data<float>(place); // allocate
ctx.Wait();
// run
f::AttributeMap attrs;
attrs["tag"] = std::string("tagx_" + std::to_string(iter));
attrs["ring_id"] = 0;
auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"Data"}}},
{{"Out", {"OutData"}}}, attrs);
for (int i = 0; i < 10; i++) {
op->Run(*scope, place);
}
ctx.Wait();
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
PrintDebugInfo("output data", out_vec);
EXPECT_EQ(out_vec.size(), init.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], 3.0);
}
}
TEST(c_allreduce_sum, NPU) {
f::Scope scope;
HcclRootInfo hccl_id;
p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
// only support one device, if more than one device, use first default
PrepareUniqueId(&scope, ctx, &hccl_id);
Prepare(&scope, ctx, &hccl_id);
for (int i = 0; i < 1; i++) {
VLOG(2) << "iter num: " << i;
TestHCCLAllReduceOp(&scope, ctx, i);
}
}
...@@ -42,6 +42,10 @@ class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -42,6 +42,10 @@ class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(0); .SetDefault(0);
AddAttr<int>("root", "(int default 0) root id for broadcasting.") AddAttr<int>("root", "(int default 0) root id for broadcasting.")
.SetDefault(0); .SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
.SetDefault("tag");
#endif
AddAttr<bool>( AddAttr<bool>(
"use_calc_stream", "use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.") "(bool default false) eject CUDA operations to calculation stream.")
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class CBroadcastOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto x = ctx.Input<framework::LoDTensor>("X");
void* ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
int numel = x->numel();
HcclDataType dtype = platform::ToHCCLDataType(x->type());
auto out = ctx.Output<framework::LoDTensor>("Out");
int ring_id = ctx.Attr<int>("ring_id");
auto place = ctx.GetPlace();
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int root = ctx.Attr<int>("root");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
VLOG(3) << "begin hccl broadcast, parameter is: "
<< "root " << root << ", group is " << group
<< ", comm: " << comm->comm() << ", stream: " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved "
<< framework::product(out->dims());
dev_ctx->Wait();
if (out != x) {
framework::TensorCopy(*static_cast<const framework::Tensor*>(x), place,
*platform::DeviceContextPool::Instance().Get(place),
static_cast<framework::Tensor*>(out));
}
dev_ctx->Wait();
out->Resize(x->dims());
out->set_lod(x->lod());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(c_broadcast, ops::CBroadcastOpASCENDKernel<int>,
ops::CBroadcastOpASCENDKernel<int8_t>,
ops::CBroadcastOpASCENDKernel<float>,
ops::CBroadcastOpASCENDKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(c_broadcast);
USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(c_broadcast, NPU);
DECLARE_string(selected_npus);
template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
std::string debugstring = "";
for (auto ele : data) {
debugstring += std::to_string(ele) + std::string(",");
}
VLOG(2) << preStr << ":" << std::endl << debugstring;
}
void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
HcclRootInfo* hccl_id) {
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
std::vector<int> rank_ids{0, 1};
f::AttributeMap gen_hccl_id;
std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
gen_hccl_id["rank"] = rank_id;
gen_hccl_id["endpoint"] = endpointList[rank_id];
std::vector<std::string> other_endpoints = {
endpointList[rank_id == 0 ? 1 : 0]};
gen_hccl_id["other_endpoints"] = other_endpoints;
auto out = scope->Var("Out");
auto id = out->GetMutable<HcclRootInfo>();
VLOG(3) << "break";
auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {},
{{"Out", {"Out"}}}, gen_hccl_id);
VLOG(3) << "break";
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
memcpy(hccl_id, id, 1024);
}
void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
HcclRootInfo* hccl_id) {
auto x = scope->Var("X");
auto id = x->GetMutable<HcclRootInfo>();
memcpy(id, hccl_id, 1024);
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
// std::vector<int> rank_ids{0, 1};
f::AttributeMap comm_init_attrs;
comm_init_attrs["ring_id"] = 0;
comm_init_attrs["rank_ids"] = 2;
comm_init_attrs["rank"] = rank_id;
comm_init_attrs["device_id"] = device_id;
// comm_init_attrs["rank_ids"] = rank_ids;
auto comm_init_op = f::OpRegistry::CreateOp(
"c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
}
void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
// init
auto x = scope->Var("Data");
auto tensor_x = x->GetMutable<f::LoDTensor>();
int num = 2;
std::vector<float> init;
int rank_id = atoi(getenv("RANK_ID"));
for (int64_t i = 0; i < num * num; ++i) {
init.push_back(1.0 + rank_id);
}
PrintDebugInfo("input data", init);
TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num, num});
ctx.Wait();
auto place = ctx.GetPlace();
auto out = scope->Var("OutData");
auto tensor_out = out->GetMutable<f::LoDTensor>();
tensor_out->Resize({num, num});
tensor_out->mutable_data<float>(place); // allocate
ctx.Wait();
// run
f::AttributeMap attrs;
attrs["tag"] = std::string("tagx");
attrs["root"] = 0;
attrs["ring_id"] = 0;
auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"Data"}}},
{{"Out", {"OutData"}}}, attrs);
for (int i = 0; i < 10; i++) {
op->Run(*scope, place);
}
ctx.Wait();
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
PrintDebugInfo("output data", out_vec);
EXPECT_EQ(out_vec.size(), init.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], 1.0);
}
}
TEST(c_broadcast, NPU) {
f::Scope scope;
HcclRootInfo hccl_id;
// only support one device, if more than one device, use first default
p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
PrepareUniqueId(&scope, ctx, &hccl_id);
Prepare(&scope, ctx, &hccl_id);
TestHCCLBroadcastOp(&scope, ctx);
}
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace paddle {
namespace operators {
class CCommInitOpAscend : public framework::OperatorBase {
public:
CCommInitOpAscend(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
PADDLE_ENFORCE_EQ(is_npu_place(place), true,
platform::errors::PreconditionNotMet(
"CCommInitOpAscend can run on npu place only."));
auto var = scope.FindVar(Input("X"));
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::InvalidArgument("Input con not be empty."));
#if defined(PADDLE_WITH_ASCEND_CL)
HcclRootInfo* hccl_id = var->GetMutable<HcclRootInfo>();
int rank_ids = Attr<int>("rank_ids");
int rank_id = Attr<int>("rank");
int rid = Attr<int>("ring_id");
int device_id = BOOST_GET_CONST(platform::NPUPlace, place).device;
if (Attr<int>("device_id") >= 0) {
device_id = Attr<int>("device_id");
}
platform::HCCLCommContext::Instance().CreateHCCLComm(
hccl_id, rank_ids, rank_id, device_id, rid);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
class CCommInitOpAscendMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Raw variable contains a NCCL UniqueId instaces.");
AddComment(R"DOC(
CCommInit operator
Initialize collective communicatoin context within this trainer
)DOC");
AddAttr<int>("rank_ids",
"(int) The number of ranks of distributed trainers");
AddAttr<int>("rank",
"(int) The rank of the trainer in distributed training.");
AddAttr<int>("device_id",
"(int) The deivce_id on which to initialize the communicator."
"Now, you only have to set this attr manually for pipeline "
"training. Otherwise, make it as default.")
.SetDefault(-1);
AddAttr<int>("ring_id", "(int default 0) user specified ring id")
.SetDefault(0);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(c_comm_init_hccl, ops::CCommInitOpAscend,
ops::CCommInitOpAscendMaker);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#endif
namespace paddle {
namespace operators {
#ifdef PADDLE_WITH_ASCEND_CL
class CGenHCCLIdOp : public framework::OperatorBase {
public:
CGenHCCLIdOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
int rank = Attr<int>("rank");
framework::Scope& local_scope = scope.NewScope();
std::function<std::string(size_t)> func = [&](size_t i) -> std::string {
return Output("Out");
};
if (rank == 0) {
std::vector<std::string> endpoint_list =
Attr<std::vector<std::string>>("other_endpoints");
SendBroadCastHCCLID(endpoint_list, 1, func, local_scope);
} else {
std::string endpoint = Attr<std::string>("endpoint");
RecvBroadCastHCCLID(endpoint, 1, func, local_scope);
}
scope.DeleteScope(&local_scope);
}
};
#else
class CGenHCCLIdOp : public framework::OperatorBase {
public:
CGenHCCLIdOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {}
};
#endif
class CGenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
VLOG(3) << "ele";
AddOutput("Out", "Raw variable contains a HCCL UniqueId instaces.");
AddComment(R"DOC(
CGenHCCLId operator
For trainer 0: generate a new UniqueId and send it to all the other trainers.
For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server.
)DOC");
AddAttr<std::string>("endpoint",
"(string), e.g. 127.0.0.1:6175 "
"current listen endpoint");
AddAttr<std::vector<std::string>>(
"other_endpoints",
"['trainer1_ip:port', 'trainer2_ip:port', ...] "
"list of other trainer endpoints")
.SetDefault({});
AddAttr<int>("rank",
"(int default 0) "
"The rank of the trainer in distributed training.")
.SetDefault(0);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(c_gen_hccl_id, ops::CGenHCCLIdOp, ops::CGenHCCLIdOpMaker);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(c_reduce_max,
ops::CReduceOpASCENDKernel<ops::kRedMax, int>,
ops::CReduceOpASCENDKernel<ops::kRedMax, int8_t>,
ops::CReduceOpASCENDKernel<ops::kRedMax, float>,
ops::CReduceOpASCENDKernel<ops::kRedMax, plat::float16>)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(c_reduce_min,
ops::CReduceOpASCENDKernel<ops::kRedMin, int>,
ops::CReduceOpASCENDKernel<ops::kRedMin, int8_t>,
ops::CReduceOpASCENDKernel<ops::kRedMin, float>,
ops::CReduceOpASCENDKernel<ops::kRedMin, plat::float16>)
...@@ -25,7 +25,7 @@ limitations under the License. */ ...@@ -25,7 +25,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/collective_helper.h"
#endif #endif
...@@ -42,6 +42,10 @@ limitations under the License. */ ...@@ -42,6 +42,10 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/gloo_wrapper.h" #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif #endif
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -119,6 +123,85 @@ class CReduceOpCPUKernel : public framework::OpKernel<T> { ...@@ -119,6 +123,85 @@ class CReduceOpCPUKernel : public framework::OpKernel<T> {
} }
}; };
template <ReduceType red_type, typename T>
class CReduceOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto in = ctx.Input<framework::LoDTensor>("X");
auto out = ctx.Output<framework::LoDTensor>("Out");
auto place = ctx.GetPlace();
HcclDataType dtype = platform::ToHCCLDataType(in->type());
int64_t numel = in->numel();
void* sendbuff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
void* recvbuff = reinterpret_cast<void*>(out->data<T>());
int ring_id = ctx.Attr<int>("ring_id");
int root_id = ctx.Attr<int>("root_id");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int rank_id = comm->rank();
HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM;
switch (red_type) {
case kRedSum:
hccl_red_type = HCCL_REDUCE_SUM;
break;
case kRedMax:
hccl_red_type = HCCL_REDUCE_MAX;
break;
case kRedMin:
hccl_red_type = HCCL_REDUCE_MIN;
break;
case kRedProd:
hccl_red_type = HCCL_REDUCE_PROD;
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid reduce type: %d", red_type));
}
VLOG(3) << "begin hccl reduce, parameter is: "
<< "input num: " << numel << "root_id: " << root_id
<< "dtype: " << dtype << "hccl_red_type: " << hccl_red_type
<< ", group is: " << group;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce(
sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(),
reinterpret_cast<void*>(stream)));
if (rank_id != root_id) {
auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place);
memory::Copy(npu_place, reinterpret_cast<void*>(out->data<T>()),
npu_place,
reinterpret_cast<void*>(const_cast<T*>(in->data<T>())),
numel * sizeof(T), stream);
}
out->Resize(in->dims());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
template <ReduceType red_type, typename T> template <ReduceType red_type, typename T>
class CReduceOpXPUKernel : public framework::OpKernel<T> { class CReduceOpXPUKernel : public framework::OpKernel<T> {
public: public:
...@@ -251,6 +334,10 @@ class CReduceOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -251,6 +334,10 @@ class CReduceOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the reduced result."); AddOutput("Out", "(Tensor) the reduced result.");
AddAttr<int>("ring_id", "(int default 0) communication ring id.") AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0); .SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for reduce.")
.SetDefault("tag");
#endif
AddAttr<int>("root_id", "(int default 0) root id.").SetDefault(0); AddAttr<int>("root_id", "(int default 0) root id.").SetDefault(0);
AddAttr<bool>( AddAttr<bool>(
"use_calc_stream", "use_calc_stream",
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(c_reduce_prod,
ops::CReduceOpASCENDKernel<ops::kRedProd, int>,
ops::CReduceOpASCENDKernel<ops::kRedProd, int8_t>,
ops::CReduceOpASCENDKernel<ops::kRedProd, float>,
ops::CReduceOpASCENDKernel<ops::kRedProd, plat::float16>)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
struct float16;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(c_reduce_sum,
ops::CReduceOpASCENDKernel<ops::kRedSum, int>,
ops::CReduceOpASCENDKernel<ops::kRedSum, int8_t>,
ops::CReduceOpASCENDKernel<ops::kRedSum, float>,
ops::CReduceOpASCENDKernel<ops::kRedSum, plat::float16>)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/c_reduce_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(c_reduce_sum);
USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU);
DECLARE_string(selected_npus);
template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
std::string debugstring = "";
for (auto ele : data) {
debugstring += std::to_string(ele) + std::string(",");
}
VLOG(3) << preStr << ":" << std::endl << debugstring;
}
void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
HcclRootInfo* hccl_id) {
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
std::vector<int> rank_ids{0, 1};
f::AttributeMap gen_hccl_id;
std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
gen_hccl_id["rank"] = rank_id;
gen_hccl_id["endpoint"] = endpointList[rank_id];
std::vector<std::string> other_endpoints = {
endpointList[rank_id == 0 ? 1 : 0]};
gen_hccl_id["other_endpoints"] = other_endpoints;
auto out = scope->Var("Out");
auto id = out->GetMutable<HcclRootInfo>();
VLOG(3) << "break";
auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {},
{{"Out", {"Out"}}}, gen_hccl_id);
VLOG(3) << "break";
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
memcpy(hccl_id, id, 1024);
}
void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
HcclRootInfo* hccl_id) {
auto x = scope->Var("X");
auto id = x->GetMutable<HcclRootInfo>();
memcpy(id, hccl_id, 1024);
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
// std::vector<int> rank_ids{0, 1};
f::AttributeMap comm_init_attrs;
comm_init_attrs["ring_id"] = 0;
comm_init_attrs["rank_ids"] = 2;
comm_init_attrs["rank"] = rank_id;
comm_init_attrs["device_id"] = device_id;
// comm_init_attrs["rank_ids"] = rank_ids;
auto comm_init_op = f::OpRegistry::CreateOp(
"c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
}
void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) {
// init
auto x = scope->Var("Data");
auto tensor_x = x->GetMutable<f::LoDTensor>();
int rank_id = atoi(getenv("RANK_ID"));
int num1 = 3;
int num2 = 128;
std::vector<float> init;
for (int64_t i = 0; i < num1 * num2; ++i) {
init.push_back(1.0 + rank_id);
}
PrintDebugInfo("input data", init);
auto place = ctx.GetPlace();
TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num1, num2});
ctx.Wait();
auto out = scope->Var("OutData");
auto tensor_out = out->GetMutable<f::LoDTensor>();
tensor_out->Resize({num1, num2});
tensor_out->mutable_data<float>(place); // allocate
ctx.Wait();
// run
f::AttributeMap attrs;
attrs["tag"] = std::string("tagx_" + std::to_string(iter));
attrs["ring_id"] = 0;
int root_id = 0;
attrs["root_id"] = root_id;
auto op = f::OpRegistry::CreateOp("c_reduce_sum", {{"X", {"Data"}}},
{{"Out", {"OutData"}}}, attrs);
op->Run(*scope, place);
ctx.Wait();
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
PrintDebugInfo("output data", out_vec);
EXPECT_EQ(out_vec.size(), init.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
if (rank_id == root_id) {
EXPECT_EQ(out_vec[i], 3.0);
} else {
EXPECT_EQ(out_vec[i], init[i]);
}
}
}
TEST(c_reduce_sum, NPU) {
f::Scope scope;
HcclRootInfo hccl_id;
// only support one device, if more than one device, use first default
p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
PrepareUniqueId(&scope, ctx, &hccl_id);
Prepare(&scope, ctx, &hccl_id);
for (int i = 0; i < 2; i++) {
VLOG(2) << "iter num: " << i;
TestHCCLReduceOp(&scope, ctx, i);
}
}
...@@ -49,6 +49,10 @@ class CReduceScatterOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -49,6 +49,10 @@ class CReduceScatterOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("nranks", AddAttr<int>("nranks",
"Total trainer count of the distributed training job") "Total trainer count of the distributed training job")
.SetDefault(1); .SetDefault(1);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for reduce scatter.")
.SetDefault("tag");
#endif
AddAttr<bool>( AddAttr<bool>(
"use_calc_stream", "use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.") "(bool default false) eject CUDA operations to calculation stream.")
......
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class CReduceScatterOpAscendKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto in = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
int ring_id = ctx.Attr<int>("ring_id");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto place = ctx.GetPlace();
auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
int nranks = comm->nranks();
auto out_dims = in->dims();
PADDLE_ENFORCE_EQ(out_dims[0] % nranks, 0,
platform::errors::InvalidArgument(
"The input tensor X's "
"dim[0] (%d) should be divisible by nranks(%d)",
out_dims[0], nranks));
out_dims[0] = out_dims[0] / nranks;
out->mutable_data<T>(out_dims, place);
uint64_t recv_numel = in->numel() / nranks;
void* inputPtr = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
void* outputPtr = reinterpret_cast<void*>(out->data<T>());
HcclDataType dtype = platform::ToHCCLDataType(in->type());
aclrtStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
VLOG(3) << "begin hccl reduce scatter, parameter is: "
<< "recv_numel: " << recv_numel << "dtype: " << dtype
<< "hccl_red_type: " << HCCL_REDUCE_SUM << ", group is: " << group;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclReduceScatter(
inputPtr, outputPtr, recv_numel, dtype, HCCL_REDUCE_SUM, comm->comm(),
reinterpret_cast<void*>(stream)));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(c_reducescatter,
ops::CReduceScatterOpAscendKernel<int8_t>,
ops::CReduceScatterOpAscendKernel<int>,
ops::CReduceScatterOpAscendKernel<float>,
ops::CReduceScatterOpAscendKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/c_allgather_op.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(c_reducescatter);
USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(c_reducescatter, NPU);
DECLARE_string(selected_npus);
template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
std::string debugstring = "";
for (auto ele : data) {
debugstring += std::to_string(ele) + std::string(",");
}
VLOG(2) << preStr << ":" << std::endl << debugstring;
}
void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
HcclRootInfo* hccl_id) {
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
std::vector<int> rank_ids{0, 1};
f::AttributeMap gen_hccl_id;
std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
gen_hccl_id["rank"] = rank_id;
gen_hccl_id["endpoint"] = endpointList[rank_id];
std::vector<std::string> other_endpoints = {
endpointList[rank_id == 0 ? 1 : 0]};
gen_hccl_id["other_endpoints"] = other_endpoints;
auto out = scope->Var("Out");
auto id = out->GetMutable<HcclRootInfo>();
VLOG(3) << "break";
auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {},
{{"Out", {"Out"}}}, gen_hccl_id);
VLOG(3) << "break";
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
memcpy(hccl_id, id, 1024);
}
void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
HcclRootInfo* hccl_id) {
auto x = scope->Var("X");
auto id = x->GetMutable<HcclRootInfo>();
memcpy(id, hccl_id, 1024);
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
// std::vector<int> rank_ids{0, 1};
f::AttributeMap comm_init_attrs;
comm_init_attrs["ring_id"] = 0;
comm_init_attrs["rank_ids"] = 2;
comm_init_attrs["rank"] = rank_id;
comm_init_attrs["device_id"] = device_id;
// comm_init_attrs["rank_ids"] = rank_ids;
auto comm_init_op = f::OpRegistry::CreateOp(
"c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
}
void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) {
// init
auto x = scope->Var("Data");
auto tensor_x = x->GetMutable<f::LoDTensor>();
std::vector<float> init;
int num1 = 4;
int num2 = 1;
for (int64_t i = 0; i < num1 * num2; ++i) {
init.push_back(1.0);
}
PrintDebugInfo("input data", init);
TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num1, num2});
ctx.Wait();
auto place = ctx.GetPlace();
auto out = scope->Var("OutData");
auto tensor_out = out->GetMutable<f::LoDTensor>();
tensor_out->Resize({num1, num2});
tensor_out->mutable_data<float>(place); // allocate
ctx.Wait();
// run
f::AttributeMap attrs;
attrs["tag"] = std::string("tagx");
attrs["ring_id"] = 0;
attrs["nranks"] = 2;
auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"Data"}}},
{{"Out", {"OutData"}}}, attrs);
int iter_num = 10;
for (int i = 0; i < iter_num; i++) {
op->Run(*scope, place);
ctx.Wait();
}
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
PrintDebugInfo("output data", out_vec);
EXPECT_EQ(out_vec.size(), init.size() / 2);
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], 2.0);
}
}
TEST(c_reducescatter, NPU) {
f::Scope scope;
HcclRootInfo hccl_id;
// only support one device, if more than one device, use first default
p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
PrepareUniqueId(&scope, ctx, &hccl_id);
Prepare(&scope, ctx, &hccl_id);
TestHCCLReduceScatterOp(&scope, ctx);
}
...@@ -61,6 +61,16 @@ class CSyncCalcStreamCudaKernel : public framework::OpKernel<T> { ...@@ -61,6 +61,16 @@ class CSyncCalcStreamCudaKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream()));
#endif #endif
#elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32)
auto place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(is_npu_place(place), true,
platform::errors::PreconditionNotMet(
"Sync stream op can run on npu place only for now."));
auto dev_ctx = static_cast<platform::NPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(dev_ctx->stream()));
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with GPU.")); "PaddlePaddle should compile with GPU."));
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, NPU);
USE_NO_KERNEL_OP(c_sync_calc_stream);
template <typename T>
void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
// init
auto x = scope->Var("X");
auto tensor_x = x->GetMutable<f::LoDTensor>();
auto y = scope->Var("Y");
auto tensor_y = y->GetMutable<f::LoDTensor>();
std::vector<T> init_x;
for (int64_t i = 0; i < 10 * 10; ++i) {
init_x.push_back(static_cast<T>(1.0));
}
std::vector<T> init_y;
for (int64_t i = 0; i < 10 * 10; ++i) {
init_y.push_back(static_cast<T>(2.0));
}
TensorFromVector(init_x, ctx, tensor_x);
tensor_x->Resize({10, 10});
TensorFromVector(init_y, ctx, tensor_y);
tensor_y->Resize({10, 10});
f::AttributeMap attrs;
auto place = ctx.GetPlace();
auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<f::LoDTensor>();
// sync data
auto sync_op0 = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}},
{{"Out", {"Out"}}}, attrs);
sync_op0->Run(*scope, place);
// run
auto op =
f::OpRegistry::CreateOp("elementwise_add", {{"X", {"X"}}, {"Y", {"Y"}}},
{{"Out", {"Out"}}}, attrs);
op->Run(*scope, place);
// sync op run
auto sync_op = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}},
{{"Out", {"Out"}}}, attrs);
sync_op->Run(*scope, place);
std::vector<T> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
// sync op copy
auto sync_op2 = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}},
{{"Out", {"Out"}}}, attrs);
sync_op2->Run(*scope, place);
float expected = 3.0;
EXPECT_EQ(out_vec.size(), init_x.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], static_cast<T>(expected));
}
}
TEST(c_sync_calc_stream, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx);
}
...@@ -19,6 +19,11 @@ limitations under the License. */ ...@@ -19,6 +19,11 @@ limitations under the License. */
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
#endif #endif
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -56,9 +61,8 @@ template <typename T> ...@@ -56,9 +61,8 @@ template <typename T>
class CSyncCommStreamCudaKernel : public framework::OpKernel<T> { class CSyncCommStreamCudaKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
int ring_id = ctx.Attr<int>("ring_id"); int ring_id = ctx.Attr<int>("ring_id");
auto stream = auto stream =
...@@ -70,6 +74,16 @@ class CSyncCommStreamCudaKernel : public framework::OpKernel<T> { ...@@ -70,6 +74,16 @@ class CSyncCommStreamCudaKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
#endif #endif
#elif defined(PADDLE_WITH_ASCEND_CL)
auto place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(is_npu_place(place), true,
platform::errors::PreconditionNotMet(
"Sync stream op can run on npu place only for now."));
int ring_id = ctx.Attr<int>("ring_id");
auto stream =
platform::HCCLCommContext::Instance().Get(ring_id, place)->stream();
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream));
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with GPU.")); "PaddlePaddle should compile with GPU."));
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(c_broadcast);
USE_NO_KERNEL_OP(c_sync_comm_stream);
USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(c_broadcast, NPU);
DECLARE_string(selected_npus);
template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
std::string debugstring = "";
for (auto ele : data) {
debugstring += std::to_string(ele) + std::string(",");
}
VLOG(2) << preStr << ":" << std::endl << debugstring;
}
void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
HcclRootInfo* hccl_id) {
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
std::vector<int> rank_ids{0, 1};
f::AttributeMap gen_hccl_id;
std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
gen_hccl_id["rank"] = rank_id;
gen_hccl_id["endpoint"] = endpointList[rank_id];
std::vector<std::string> other_endpoints = {
endpointList[rank_id == 0 ? 1 : 0]};
gen_hccl_id["other_endpoints"] = other_endpoints;
auto out = scope->Var("Out");
auto id = out->GetMutable<HcclRootInfo>();
VLOG(3) << "break";
auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {},
{{"Out", {"Out"}}}, gen_hccl_id);
VLOG(3) << "break";
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
memcpy(hccl_id, id, 1024);
}
void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
HcclRootInfo* hccl_id) {
auto x = scope->Var("X");
auto id = x->GetMutable<HcclRootInfo>();
memcpy(id, hccl_id, 1024);
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
// std::vector<int> rank_ids{0, 1};
f::AttributeMap comm_init_attrs;
comm_init_attrs["ring_id"] = 0;
comm_init_attrs["rank_ids"] = 2;
comm_init_attrs["rank"] = rank_id;
comm_init_attrs["device_id"] = device_id;
// comm_init_attrs["rank_ids"] = rank_ids;
auto comm_init_op = f::OpRegistry::CreateOp(
"c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
}
void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
// init
auto x = scope->Var("Data");
auto tensor_x = x->GetMutable<f::LoDTensor>();
int num = 2;
std::vector<float> init;
int rank_id = atoi(getenv("RANK_ID"));
std::cout << "rank_id:" << rank_id << std::endl;
for (int64_t i = 0; i < num * num; ++i) {
init.push_back(1.0 + rank_id);
std::cout << init[0];
}
std::cout << std::endl;
TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num, num});
ctx.Wait();
auto place = ctx.GetPlace();
auto out = scope->Var("OutData");
auto tensor_out = out->GetMutable<f::LoDTensor>();
tensor_out->Resize({num, num});
tensor_out->mutable_data<float>(place); // allocate
ctx.Wait();
// run
f::AttributeMap attrs;
attrs["tag"] = std::string("tagx");
attrs["root"] = 0;
attrs["ring_id"] = 0;
auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"Data"}}},
{{"Out", {"OutData"}}}, attrs);
op->Run(*scope, place);
// comm sync
auto sync_op = f::OpRegistry::CreateOp(
"c_sync_comm_stream", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
sync_op->Run(*scope, place);
// ctx.Wait();
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
EXPECT_EQ(out_vec.size(), init.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], 1.0);
}
}
TEST(c_sync_comm_stream_op, NPU) {
f::Scope scope;
HcclRootInfo hccl_id;
// only support one device, if more than one device, use first default
p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
PrepareUniqueId(&scope, ctx, &hccl_id);
Prepare(&scope, ctx, &hccl_id);
TestHCCLBroadcastOp(&scope, ctx);
}
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <ostream>
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/hccl_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
namespace paddle {
namespace operators {
#ifdef PADDLE_WITH_ASCEND_CL
class GenHCCLIdOp : public framework::OperatorBase {
public:
GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
std::vector<std::string> trainers =
Attr<std::vector<std::string>>("trainers");
int trainer_id = Attr<int>("trainer_id");
std::string endpoint = trainers[trainer_id];
PADDLE_ENFORCE_GE(trainer_id, 0, platform::errors::InvalidArgument(
"trainer_id %d is less than 0. Its "
"valid range is [0, trainer_size)"));
PADDLE_ENFORCE_LT(
trainer_id, static_cast<int>(trainers.size()),
platform::errors::OutOfRange("trainer_id %d is out of range. Its valid "
"range is [0, trainer_size)",
trainer_id));
int hccl_comm_num = Attr<int>("hccl_comm_num");
int use_hierarchical_allreduce = Attr<bool>("use_hierarchical_allreduce");
int inter_nranks = Attr<int>("hierarchical_allreduce_inter_nranks");
int inter_trainer_id = -1;
int exter_trainer_id = -1;
if (use_hierarchical_allreduce) {
PADDLE_ENFORCE_GT(
trainers.size(), 1,
platform::errors::PreconditionNotMet(
"The number of collective trainers %llu <= 1", trainers.size()));
PADDLE_ENFORCE_GT(
inter_nranks, 1,
platform::errors::PreconditionNotMet(
"inter_nranks %d <= 1 while in hierarchical allreduce mode",
inter_nranks));
PADDLE_ENFORCE_EQ(
trainers.size() % inter_nranks, 0,
platform::errors::PreconditionNotMet(
"The number of trainers %llu mod inter_nranks %d is not equal 0",
trainers.size(), inter_nranks));
inter_trainer_id = trainer_id % inter_nranks;
if (trainer_id % inter_nranks == 0) {
exter_trainer_id = trainer_id / inter_nranks;
}
}
std::ostringstream ss;
for (size_t i = 0; i < trainers.size(); i++) {
ss << trainers[i] << ",";
}
VLOG(1) << "trainer_id:" << trainer_id
<< ", use_hierarchical_allreduce:" << use_hierarchical_allreduce
<< ", hccl_comm_num:" << hccl_comm_num
<< ", inter_nranks:" << inter_nranks
<< ", inter_trainer_id:" << inter_trainer_id
<< ", exter_trainer_id:" << exter_trainer_id
<< ", trainers:" << ss.str();
int server_fd = -1;
/// 1. init flat
std::function<std::string(size_t)> func = platform::GetFlatHCCLVarName;
if (trainer_id == 0) {
// server endpoints
std::vector<std::string> flat_endpoints;
flat_endpoints.insert(flat_endpoints.begin(), trainers.begin() + 1,
trainers.end());
SendBroadCastHCCLID(flat_endpoints, hccl_comm_num, func, scope);
} else {
server_fd = CreateListenSocket(endpoint);
RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope);
}
/// 2. hierarchical inter ncclid
func = platform::GetHierarchicalInterHCCLVarName;
if (inter_trainer_id == 0) {
std::ostringstream ss;
ss << endpoint;
std::vector<std::string> inter_endpoints;
for (int i = trainer_id + 1; i < trainer_id + inter_nranks &&
i < static_cast<int>(trainers.size());
i++) {
ss << ",";
inter_endpoints.push_back(trainers[i]);
ss << trainers[i];
}
VLOG(1) << "Hierarchical inter ring endpoints:" << ss.str();
SendBroadCastHCCLID(inter_endpoints, hccl_comm_num, func, scope);
} else if (inter_trainer_id > 0) {
VLOG(1) << "Hierarchical inter ring";
RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope);
}
/// 3. hierarchical exter ncclid
func = platform::GetHierarchicalExterHCCLVarName;
if (exter_trainer_id == 0) {
std::ostringstream ss;
std::vector<std::string> exter_endpoints;
ss << endpoint;
for (size_t i = inter_nranks; i < trainers.size(); i += inter_nranks) {
ss << ",";
exter_endpoints.push_back(trainers[i]);
ss << trainers[i];
}
VLOG(1) << "Hierarchical exter ring endpoints:" << ss.str();
SendBroadCastHCCLID(exter_endpoints, hccl_comm_num, func, scope);
} else if (exter_trainer_id > 0) {
VLOG(1) << "Hierarchical exter ring";
RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope);
}
// close socket server
if (trainer_id != 0) {
CloseSocket(server_fd);
}
}
};
#else
class GenHCCLIdOp : public framework::OperatorBase {
public:
GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {}
};
#endif
class GenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddOutput("HCCLID", "Raw variable contains a HCCL UniqueId instaces.");
AddComment(R"DOC(
GenHCCLId operator
For trainer 0: generate a new UniqueId and send it to all the other trainers.
For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server.
)DOC");
AddAttr<std::vector<std::string>>(
"trainers",
"['trainer0_ip:port', 'trainer1_ip:port', ...] "
"list of all trainer endpoints")
.SetDefault({});
AddAttr<int>("trainer_id",
"(int) "
"The index of the trainer in distributed training.");
AddAttr<int>("hccl_comm_num",
"(int default 1) "
"The number of nccl communicator num.")
.SetDefault(1);
AddAttr<bool>("use_hierarchical_allreduce",
"(bool default false) "
"Wheter to use hierarchical allreduce.")
.SetDefault(false);
AddAttr<int>("hierarchical_allreduce_inter_nranks",
"(int default 1) "
"Wheter to use hierarchical allreduce.")
.SetDefault(-1);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(gen_hccl_id, ops::GenHCCLIdOp, ops::GenHCCLIdOpMaker);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#include <arpa/inet.h>
#include <netdb.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <algorithm>
#include <ostream>
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/split.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace paddle {
namespace operators {
constexpr char COMM_HEAD[] = "_pd_gen_comm_id_";
#define HCCL_UNIQUE_ID_BYTES 1024
// Check system calls, such as socket, bind.
#define CHECK_SYS_CALL(call, name) \
do { \
int retval; \
CHECK_SYS_CALL_VAL(call, name, retval); \
} while (false)
#define CHECK_SYS_CALL_VAL(call, name, retval) \
do { \
RETRY_SYS_CALL_VAL(call, name, retval); \
if (retval == -1) { \
PADDLE_THROW(platform::errors::Unavailable("Call to %s failed: %s", \
name, strerror(errno))); \
} \
} while (false)
#define RETRY_SYS_CALL_VAL(call, name, retval) \
do { \
retval = (call); \
if (retval == -1 && \
(errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
LOG(WARNING) << "Call " << name << " returned " << strerror(errno) \
<< " retry"; \
} else { \
break; \
} \
} while (true)
static int SocketSend(int fd, const char* buffer, int size) {
int offset = 0;
int bytes = 0;
while (offset < size) {
bytes = send(fd, buffer + offset, size - offset, 0);
if (bytes == -1) {
if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
// send failed
return -1;
} else {
bytes = 0;
}
}
offset += bytes;
}
return offset;
}
static int SocketRecv(int fd, char* buffer, int size) {
int offset = 0;
int bytes = 0;
while (offset < size) {
bytes = recv(fd, buffer + offset, size - offset, 0);
if (bytes == 0) {
// closed by client, maybe probing alive client
return 0;
}
if (bytes == -1) {
if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
return -1;
} else {
bytes = 0;
}
}
offset += bytes;
}
return offset;
}
static void BindOrConnectFailed(int timeout, int* try_times, int* total_time,
const char* op, const std::string& ep) {
PADDLE_ENFORCE_LT(
*total_time, timeout,
platform::errors::Unavailable("%s addr=%s timeout, failed reason: %s", op,
ep.c_str(), strerror(errno)));
++(*try_times);
int retry_time = std::min(*try_times * 500, 3000); // max 3 seconds
*total_time += retry_time;
LOG(WARNING) << op << " addr=" << ep << " failed " << *try_times
<< " times with reason: " << strerror(errno) << " retry after "
<< retry_time / 1000.0 << " seconds";
std::this_thread::sleep_for(std::chrono::milliseconds(retry_time));
}
int CreateListenSocket(const std::string& ep) {
auto addr = paddle::string::Split(ep, ':');
PADDLE_ENFORCE_EQ(
addr.size(), 2UL,
platform::errors::InvalidArgument(
"The endpoint should contain host and port, but got %s.", ep));
std::string host = addr[0];
int port = std::stoi(addr[1]);
// creating socket fd
int server_fd = -1;
CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", server_fd);
// NOTE. Solutions to `Address already in use`.
// 1. Reuse addr&port. Otherwise, once the server closes the socket
// before client, the server will enter TIME-WAIT status. If we bind port
// again, the error `Address already in use` will appear.
// 2. Or we can close the client first to ensure that the server does
// not enter the TIME-WAIT state. But this is obviously not as convenient
// as the reuse method.
int opt = 1;
#if defined(SO_REUSEPORT)
// since Linux kernel 3.9
CHECK_SYS_CALL(setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT,
&opt, sizeof(opt)),
"setsockopt");
#else
CHECK_SYS_CALL(
setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)),
"setsockopt");
#endif
struct sockaddr_in address;
address.sin_family = AF_INET;
address.sin_addr.s_addr = INADDR_ANY;
address.sin_port = htons(port);
// TODO(wangxi) Set from env, default 900s=15min
int timeout = 900 * 1000;
int try_times = 0;
int total_time = 0;
while (true) {
int ret_val = -1;
RETRY_SYS_CALL_VAL(
bind(server_fd, (struct sockaddr*)&address, sizeof(address)), "bind",
ret_val);
if (ret_val == -1) {
BindOrConnectFailed(timeout, &try_times, &total_time, "bind", ep);
continue;
}
break;
}
CHECK_SYS_CALL(listen(server_fd, 3), "listen");
LOG(INFO) << "Server listening on: " << ep << " successful.";
return server_fd;
}
void CloseSocket(int fd) { CHECK_SYS_CALL(close(fd), "close"); }
static int SocketAccept(int server_fd, const char* head) {
struct sockaddr_in client_addr;
socklen_t addr_length = sizeof(client_addr);
char buffer[1024] = {0};
int conn = -1;
while (true) {
CHECK_SYS_CALL_VAL(
accept(server_fd, reinterpret_cast<struct sockaddr*>(&client_addr),
&addr_length),
"accept", conn);
int ret_val = SocketRecv(conn, buffer, strlen(head));
if (ret_val > 0 && strncmp(buffer, head, strlen(head)) == 0) {
break; // accept client
} else {
VLOG(3) << "socket read failed with ret_val=" << ret_val;
CloseSocket(conn);
}
}
return conn;
}
static int ConnectAddr(const std::string& ep, const char* head) {
auto addr = paddle::string::Split(ep, ':');
PADDLE_ENFORCE_EQ(
addr.size(), 2UL,
platform::errors::InvalidArgument(
"The endpoint should contain host and port, but got %s.", ep));
std::string host = addr[0];
int port = std::stoi(addr[1]);
int sock = -1;
CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", sock);
struct sockaddr_in server_addr;
memset(&server_addr, 0, sizeof(server_addr));
server_addr.sin_family = AF_INET;
server_addr.sin_port = htons(port);
char* ip = NULL;
struct hostent* hp = NULL;
hp = gethostbyname(host.c_str());
PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument(
"Fail to get host by name %s.", host));
int i = 0;
while (hp->h_addr_list[i] != NULL) {
ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]);
VLOG(3) << "gethostbyname host:" << host << " ->ip: " << ip;
break;
}
PADDLE_ENFORCE_GT(inet_pton(AF_INET, ip, &server_addr.sin_addr), 0,
platform::errors::Unavailable("Open address %s failed: %s",
ep, strerror(errno)));
// TODO(wangxi) Set from env, default 900s=15min
int timeout = 900 * 1000;
int try_times = 0;
int total_time = 0;
while (true) {
int ret_val = -1;
RETRY_SYS_CALL_VAL(
connect(sock, (struct sockaddr*)&server_addr, sizeof(server_addr)),
"connect", ret_val);
if (ret_val == -1) {
BindOrConnectFailed(timeout, &try_times, &total_time, "connect", ep);
continue;
}
CHECK_SYS_CALL(SocketSend(sock, head, strlen(head)), "send");
break;
}
return sock;
}
static void RecvHCCLID(int conn, HcclRootInfo* hccl_id) {
char buffer[1024] = {0};
static_assert(HCCL_UNIQUE_ID_BYTES <= 1024,
"hccl id bytes must <= buffer size");
CHECK_SYS_CALL(SocketRecv(conn, buffer, HCCL_UNIQUE_ID_BYTES),
"recv hccl id");
memcpy(hccl_id, buffer, HCCL_UNIQUE_ID_BYTES);
}
static void SendHCCLID(int conn, HcclRootInfo* hccl_id) {
char buffer[1024] = {0};
memcpy(buffer, hccl_id, HCCL_UNIQUE_ID_BYTES);
CHECK_SYS_CALL(SocketSend(conn, buffer, HCCL_UNIQUE_ID_BYTES),
"send hccl id");
}
void SendBroadCastHCCLID(std::vector<std::string> servers, int hccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope) {
// connect with server
std::vector<int> connects;
for (auto server : servers) {
VLOG(3) << "connecting endpoint: " << server;
int conn = ConnectAddr(server, COMM_HEAD);
connects.push_back(conn);
}
VLOG(3) << "connecting completed...";
for (int i = 0; i < hccl_comm_num; ++i) {
std::string var_name = func(i);
auto var = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable with name %s is not found",
var_name.c_str()));
auto hccl_id = var->GetMutable<HcclRootInfo>();
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(hccl_id));
int j = 0;
for (auto conn : connects) {
VLOG(3) << "sending hccl_id_var: " << var_name << " to " << servers[j]
<< " hccl_comm_no: " << i;
SendHCCLID(conn, hccl_id);
++j;
}
VLOG(3) << "sending completed...";
}
// close client
for (auto conn : connects) {
CloseSocket(conn);
}
}
void RecvBroadCastHCCLID(std::string endpoint, int hccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope) {
int server = CreateListenSocket(endpoint);
RecvBroadCastHCCLID(server, endpoint, hccl_comm_num, func, scope);
CloseSocket(server);
}
void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int hccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope) {
int client = SocketAccept(server_fd, COMM_HEAD);
for (int i = 0; i < hccl_comm_num; ++i) {
std::string var_name = func(i);
auto var = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable with name %s is not found",
var_name.c_str()));
auto hccl_id = var->GetMutable<HcclRootInfo>();
VLOG(3) << "trainer: " << endpoint << " receiving hccl_id_var: " << var_name
<< " from trainer 0, hccl_comm_no: " << i;
RecvHCCLID(client, hccl_id);
}
VLOG(3) << "receiving completed...";
CloseSocket(client);
}
} // namespace operators
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <string>
#include <vector>
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace operators {
int CreateListenSocket(const std::string& ep);
void CloseSocket(int fd);
void SendBroadCastHCCLID(std::vector<std::string> servers, int nccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope);
// server listen on endpoint, then recv nccl id
void RecvBroadCastHCCLID(std::string endpoint, int nccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope);
// recv nccl id from socket
void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int nccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope);
} // namespace operators
} // namespace paddle
...@@ -70,6 +70,12 @@ class RecvOpV2Maker : public framework::OpProtoAndCheckerMaker { ...@@ -70,6 +70,12 @@ class RecvOpV2Maker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("peer", "(int default 0) rank id for sender.").SetDefault(0); AddAttr<int>("peer", "(int default 0) rank id for sender.").SetDefault(0);
AddAttr<int>("dtype", "(int default 5('float32')) data type of tensor.") AddAttr<int>("dtype", "(int default 5('float32')) data type of tensor.")
.SetDefault(5); .SetDefault(5);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
.SetDefault("tag");
AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
.SetDefault(0);
#endif
AddAttr<std::vector<int>>("out_shape", "shape of the output tensor.") AddAttr<std::vector<int>>("out_shape", "shape of the output tensor.")
.SetDefault(std::vector<int>()); .SetDefault(std::vector<int>());
AddAttr<bool>( AddAttr<bool>(
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/recv_v2_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class CRecvOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto x = ctx.Output<framework::LoDTensor>("Out");
void* ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
int numel = x->numel();
HcclDataType dtype = platform::ToHCCLDataType(x->type());
int ring_id = ctx.Attr<int>("ring_id");
auto place = ctx.GetPlace();
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int nranks = comm->nranks();
int peer = ctx.Attr<int>("peer");
PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument(
"The nranks must be 2, but (%d)", nranks));
int root = peer;
VLOG(3) << "begin hccl recv, parameter is: "
<< "root " << root << ", comm: " << comm->comm()
<< ", stream: " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(recv_v2, ops::CRecvOpASCENDKernel<int>,
ops::CRecvOpASCENDKernel<int8_t>,
ops::CRecvOpASCENDKernel<float>,
ops::CRecvOpASCENDKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#include "paddle/fluid/operators/collective/recv_v2_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(recv_v2);
USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(recv_v2, NPU);
void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
HcclRootInfo* hccl_id) {
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
std::vector<int> rank_ids{0, 1};
f::AttributeMap gen_hccl_id;
std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
gen_hccl_id["rank"] = rank_id;
gen_hccl_id["endpoint"] = endpointList[rank_id];
std::vector<std::string> other_endpoints = {
endpointList[rank_id == 0 ? 1 : 0]};
gen_hccl_id["other_endpoints"] = other_endpoints;
auto out = scope->Var("Out");
auto id = out->GetMutable<HcclRootInfo>();
VLOG(3) << "break";
auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {},
{{"Out", {"Out"}}}, gen_hccl_id);
VLOG(3) << "break";
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
memcpy(hccl_id, id, 1024);
}
void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
HcclRootInfo* hccl_id) {
auto x = scope->Var("X");
auto id = x->GetMutable<HcclRootInfo>();
memcpy(id, hccl_id, 1024);
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
// std::vector<int> rank_ids{0, 1};
f::AttributeMap comm_init_attrs;
comm_init_attrs["ring_id"] = 0;
comm_init_attrs["rank_ids"] = 2;
comm_init_attrs["rank"] = rank_id;
comm_init_attrs["device_id"] = device_id;
// comm_init_attrs["rank_ids"] = rank_ids;
auto comm_init_op = f::OpRegistry::CreateOp(
"c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
}
void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx) {
std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
int num = atoi(getenv("DATA_SIZE"));
EXPECT_GT(num, 0);
EXPECT_LT(num, 1 << 15);
int rank_id = atoi(getenv("RANK_ID"));
VLOG(3) << "rank_id:" << rank_id << std::endl;
ctx.Wait();
auto place = ctx.GetPlace();
auto out = scope->Var("Data");
auto tensor_out = out->GetMutable<f::LoDTensor>();
tensor_out->Resize({num, num});
tensor_out->mutable_data<float>(place); // allocate
ctx.Wait();
f::AttributeMap attrs;
attrs["tag"] = std::string("srtest");
attrs["peer"] = atoi(getenv("SRC_RANK"));
attrs["ring_id"] = 0;
attrs["srTag"] = 0;
std::vector<int> out_shape;
out_shape.push_back(num);
out_shape.push_back(num);
attrs["out_shape"] = out_shape;
auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Data"}}}, attrs);
VLOG(3) << "CreateOp recv_v2";
for (int i = 0; i < 10; i++) {
op->Run(*scope, place);
}
VLOG(3) << "Run op recv_v2";
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
std::vector<float> init(num * num, 1.0 * atoi(getenv("DEST_RANK")));
EXPECT_EQ(out_vec == init, true);
}
TEST(recv_v2, NPU) {
f::Scope scope;
HcclRootInfo hccl_id;
char* npu_id = getenv("FLAGS_selected_npus");
VLOG(3) << "Select npu:" << npu_id;
p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
PrepareUniqueId(&scope, ctx, &hccl_id);
Prepare(&scope, ctx, &hccl_id);
TestHcomRecvOp(&scope, ctx);
}
...@@ -50,6 +50,12 @@ class SendOpV2Maker : public framework::OpProtoAndCheckerMaker { ...@@ -50,6 +50,12 @@ class SendOpV2Maker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.") AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
.SetDefault(0); .SetDefault(0);
AddAttr<int>("peer", "(int default 0) rank id for receiver.").SetDefault(0); AddAttr<int>("peer", "(int default 0) rank id for receiver.").SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
.SetDefault("tag");
AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
.SetDefault(0);
#endif
AddAttr<bool>( AddAttr<bool>(
"use_calc_stream", "use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.") "(bool default false) eject CUDA operations to calculation stream.")
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/send_v2_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class CSendOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto x = ctx.Input<framework::LoDTensor>("X");
void* ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
int numel = x->numel();
HcclDataType dtype = platform::ToHCCLDataType(x->type());
int ring_id = ctx.Attr<int>("ring_id");
auto place = ctx.GetPlace();
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int nranks = comm->nranks();
int rank = comm->rank();
PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument(
"The nranks must be 2, but (%d)", nranks));
int root = rank;
VLOG(3) << "begin hccl send, parameter is: "
<< "root " << root << ", comm: " << comm->comm()
<< ", stream: " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(send_v2, ops::CSendOpASCENDKernel<int>,
ops::CSendOpASCENDKernel<int8_t>,
ops::CSendOpASCENDKernel<float>,
ops::CSendOpASCENDKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#include "paddle/fluid/operators/collective/send_v2_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(send_v2);
USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(send_v2, NPU);
void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
HcclRootInfo* hccl_id) {
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
std::vector<int> rank_ids{0, 1};
f::AttributeMap gen_hccl_id;
std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
gen_hccl_id["rank"] = rank_id;
gen_hccl_id["endpoint"] = endpointList[rank_id];
std::vector<std::string> other_endpoints = {
endpointList[rank_id == 0 ? 1 : 0]};
gen_hccl_id["other_endpoints"] = other_endpoints;
auto out = scope->Var("Out");
auto id = out->GetMutable<HcclRootInfo>();
VLOG(3) << "break";
auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {},
{{"Out", {"Out"}}}, gen_hccl_id);
VLOG(3) << "break";
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
memcpy(hccl_id, id, 1024);
}
void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
HcclRootInfo* hccl_id) {
auto x = scope->Var("X");
auto id = x->GetMutable<HcclRootInfo>();
memcpy(id, hccl_id, 1024);
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
// std::vector<int> rank_ids{0, 1};
f::AttributeMap comm_init_attrs;
comm_init_attrs["ring_id"] = 0;
comm_init_attrs["rank_ids"] = 2;
comm_init_attrs["rank"] = rank_id;
comm_init_attrs["device_id"] = device_id;
// comm_init_attrs["rank_ids"] = rank_ids;
auto comm_init_op = f::OpRegistry::CreateOp(
"c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
}
void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx) {
std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
auto x = scope->Var("Data");
auto tensor_x = x->GetMutable<f::LoDTensor>();
int num = atoi(getenv("DATA_SIZE"));
EXPECT_GT(num, 0);
EXPECT_LT(num, 1 << 15);
std::vector<float> init(num * num, 1.0 * atoi(getenv("DEST_RANK")));
int rank_id = atoi(getenv("RANK_ID"));
VLOG(3) << "rank id:" << rank_id;
TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num, num});
ctx.Wait();
auto place = ctx.GetPlace();
ctx.Wait();
f::AttributeMap attrs;
attrs["tag"] = std::string("srtest");
attrs["peer"] = atoi(getenv("DEST_RANK"));
attrs["ring_id"] = 0;
attrs["srTag"] = 0;
auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"Data"}}}, {}, attrs);
for (int i = 0; i < 10; i++) {
op->Run(*scope, place);
}
VLOG(3) << "send run over";
ctx.Wait();
}
TEST(send_v2, NPU) {
f::Scope scope;
HcclRootInfo hccl_id;
char* npu_id = getenv("FLAGS_selected_npus");
VLOG(3) << "Select npu:" << npu_id;
p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
PrepareUniqueId(&scope, ctx, &hccl_id);
Prepare(&scope, ctx, &hccl_id);
TestHcomSendOp(&scope, ctx);
}
...@@ -106,11 +106,11 @@ ELSE() ...@@ -106,11 +106,11 @@ ELSE()
ENDIF() ENDIF()
IF(WITH_ASCEND_CL) IF(WITH_ASCEND_CL)
cc_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) cc_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
ENDIF() ENDIF()
IF(WITH_GPU) IF(WITH_GPU)
nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
ENDIF() ENDIF()
IF(WITH_ROCM) IF(WITH_ROCM)
hip_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) hip_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
...@@ -136,7 +136,7 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool ...@@ -136,7 +136,7 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool
place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS}) ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS})
cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) cc_library(collective_helper SRCS collective_helper.cc collective_helper_npu.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce)
if(WITH_GPU OR WITH_ROCM) if(WITH_GPU OR WITH_ROCM)
cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info) cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info)
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#ifdef PADDLE_WITH_ASCEND #ifdef PADDLE_WITH_ASCEND_CL
namespace paddle { namespace paddle {
namespace platform { namespace platform {
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "boost/variant.hpp" #include "boost/variant.hpp"
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/hccl.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
...@@ -126,6 +127,113 @@ class NCCLCommContext { ...@@ -126,6 +127,113 @@ class NCCLCommContext {
}; };
#endif #endif
#if defined(PADDLE_WITH_ASCEND_CL)
// In order to apply hierarchical communication with HCCL, we need
// a communication ring contains HCCL communicators associated to a global
// HCCLUniqueId. E.g. for a hierarchical case,
//
// 11 - 12 21 - 22
// | | | |
// 13 - 14 - 23 - 24
// | |
// 31 - 32 - 41 - 42
// | | | |
// 33 - 34 43 - 44
//
// we group (14,23,32,41) as the top, and (11,12,13,14), (21,22,23,24),
// (31,32,33,34), (41,42,43,44) as bottoms respectively.
//
// We could also use a single communication ring for the flatten case
//
// The HCCLComm instance is created and reversed in the HCCLCommContext
// singleton with a global user specified group id.
class NPUDeviceContext;
#define ENV_RANK_TABLE_FILE "RANK_TABLE_FILE"
#define ENV_RANK_ID "PADDLE_TRAINER_ID"
class HCCLComm {
public:
virtual int ring_id() const = 0;
virtual int nranks() const = 0;
virtual int rank() const = 0;
virtual int device_id() const = 0;
virtual HcclComm comm() const = 0;
virtual aclrtStream stream() const = 0;
virtual NPUDeviceContext* dev_context() const = 0;
virtual ~HCCLComm() = default;
};
// A singleton HCCL communicator context reserves communication ring ids
class HCCLCommContext {
public:
static HCCLCommContext& Instance() {
static HCCLCommContext comm_ctx;
return comm_ctx;
}
HCCLComm* CreateHCCLComm(HcclRootInfo* hccl_id, int nranks, int rank,
int dev_id, int ring_id);
// a latter comm with the same dev_id and the same ring_id
// will override the former
HCCLComm* AssignHCCLComm(HcclComm comm, int nranks, int rank, int dev_id,
int ring_id);
// retrieve a communicator by the ring id in multiprocessing mode
HCCLComm* Get(int ring_id) const {
PADDLE_ENFORCE_GT(
comm_map_.count(ring_id), 0,
platform::errors::InvalidArgument(
"Communicator in ring id %d has not been initialized.", ring_id));
PADDLE_ENFORCE_EQ(comm_map_.at(ring_id).size(), 1,
platform::errors::InvalidArgument(
"One device id should be specified to retrieve from "
"multiple communicators."));
return comm_map_.at(ring_id).begin()->second.get();
}
// retrieve a communicator by the ring id and the device id
HCCLComm* Get(int ring_id, int dev_id) const {
PADDLE_ENFORCE_GT(
comm_map_.count(ring_id), 0,
platform::errors::InvalidArgument(
"Communicator of ring id %d has not been initialized.", ring_id));
PADDLE_ENFORCE_GT(
comm_map_.at(ring_id).count(dev_id), 0,
platform::errors::InvalidArgument(
"Communicator at device id %d has not been initialized in ring %d.",
dev_id, ring_id));
return comm_map_.at(ring_id).at(dev_id).get();
}
// retrieve a communicator by the ring id and place
HCCLComm* Get(int ring_id, Place place) const {
return Get(ring_id, BOOST_GET_CONST(NPUPlace, place).device);
}
private:
// Init global hcom
HCCLCommContext() {}
// we may use group feature in the feature
// HCCLCommContext() { InitHcomWorldGroup(); }
HcclComm comm_;
public:
~HCCLCommContext() {}
std::once_flag once_flag_;
std::mutex comm_map_mutex_;
// ring id to dev-HCCLComm
std::map<int, std::map<int, std::unique_ptr<HCCLComm>>> comm_map_;
// void InitHcomWorldGroup();
void ReleaseHCCLComms();
DISABLE_COPY_AND_ASSIGN(HCCLCommContext);
};
#endif
#if defined(PADDLE_WITH_XPU_BKCL) #if defined(PADDLE_WITH_XPU_BKCL)
// In order to apply hierarchical communication with BKCL, we need // In order to apply hierarchical communication with BKCL, we need
// a communication ring contains BKCL communicators associated to a global // a communication ring contains BKCL communicators associated to a global
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include <utility>
namespace paddle {
namespace platform {
class HCCLCommImpl : public HCCLComm {
public:
void set_ring_id(int ring_id) { ring_id_ = ring_id; }
int ring_id() const override { return ring_id_; }
void set_nranks(int nranks) { nranks_ = nranks; }
int nranks() const override { return nranks_; }
void set_rank(int rank) { rank_ = rank; }
int rank() const override { return rank_; }
int device_id() const override {
return BOOST_GET_CONST(NPUPlace, dev_ctx_->GetPlace()).device;
}
~HCCLCommImpl() {
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclCommDestroy(comm_));
}
void set_comm(HcclComm comm) { comm_ = comm; }
HcclComm comm() const override { return comm_; }
aclrtStream stream() const override { return dev_ctx_->stream(); }
void set_dev_ctx(std::unique_ptr<NPUDeviceContext>&& dev_ctx) {
dev_ctx_ = std::move(dev_ctx);
}
NPUDeviceContext* dev_context() const override { return dev_ctx_.get(); }
private:
int ring_id_;
int nranks_;
int rank_;
HcclComm comm_;
std::unique_ptr<NPUDeviceContext> dev_ctx_;
};
HCCLComm* HCCLCommContext::CreateHCCLComm(HcclRootInfo* hccl_id, int nranks,
int rank, int dev_id, int ring_id) {
PADDLE_ENFORCE_NOT_NULL(hccl_id,
platform::errors::InvalidArgument(
"The hccl unique id should not be null."));
PADDLE_ENFORCE_GT(
nranks, 1,
platform::errors::InvalidArgument(
"Expected nranks > 1. But received nranks is %d.", nranks));
PADDLE_ENFORCE_GE(rank, 0,
platform::errors::InvalidArgument(
"Expected rank >= 0. But received rank is %d.", rank));
PADDLE_ENFORCE_LT(
rank, nranks,
platform::errors::InvalidArgument(
"Expected rank < nranks. But received rank is %d, nranks is %d.",
rank, nranks));
PADDLE_ENFORCE_GE(
dev_id, 0,
platform::errors::InvalidArgument(
"Expected dev_id >= 0. But received dev_id is %d.", dev_id));
HcclComm comm;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSetDevice(dev_id));
VLOG(1) << "initialized comm: " << &comm << ", nranks: " << nranks
<< ", hccl_id: " << hccl_id << ", rank: " << rank;
PADDLE_ENFORCE_NPU_SUCCESS(
platform::dynload::HcclCommInitRootInfo(nranks, hccl_id, rank, &comm));
VLOG(1) << "initialized comm: " << &comm << ", nranks: " << nranks
<< ", hccl_id: " << hccl_id << ", rank: " << rank;
auto* comm_wrapper = AssignHCCLComm(comm, nranks, rank, dev_id, ring_id);
VLOG(1) << "hccl communicator of rank " << rank << " in ring " << ring_id
<< " has been created on device " << dev_id
<< ", with comm: " << comm_wrapper->comm();
std::call_once(once_flag_, []() {
std::atexit([]() { HCCLCommContext::Instance().ReleaseHCCLComms(); });
});
return comm_wrapper;
}
HCCLComm* HCCLCommContext::AssignHCCLComm(HcclComm comm, int nranks, int rank,
int dev_id, int ring_id) {
std::unique_ptr<NPUDeviceContext> dev_ctx(
new NPUDeviceContext(NPUPlace(dev_id)));
HCCLCommImpl* c = new HCCLCommImpl;
c->set_ring_id(ring_id);
c->set_nranks(nranks);
c->set_rank(rank);
c->set_comm(comm);
c->set_dev_ctx(std::move(dev_ctx));
comm_map_mutex_.lock();
if (comm_map_.count(ring_id) == 0) {
comm_map_.emplace(ring_id, std::map<int, std::unique_ptr<HCCLComm>>());
}
auto& dev2comm = comm_map_[ring_id];
dev2comm.emplace(dev_id, std::unique_ptr<HCCLComm>(c));
comm_map_mutex_.unlock();
if (ring_id == 0) {
auto* dev_ctx = static_cast<platform::NPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(
platform::NPUPlace(dev_id)));
dev_ctx->set_hccl_comm(comm);
}
return comm_map_[ring_id][dev_id].get();
}
void HCCLCommContext::ReleaseHCCLComms() {
for (auto& p : comm_map_) {
for (auto& q : p.second) {
q.second.reset();
}
}
}
} // namespace platform
} // namespace paddle
#endif
...@@ -189,14 +189,6 @@ class NPUDeviceContext : public DeviceContext { ...@@ -189,14 +189,6 @@ class NPUDeviceContext : public DeviceContext {
/*! \brief Return npu stream in the device context. */ /*! \brief Return npu stream in the device context. */
aclrtStream stream() const; aclrtStream stream() const;
#ifdef PADDLE_WITH_ASCEND_HCCL
/*! \brief Return bkcl context. */
HCCLContext_t hccl_context() const { return hccl_context_; }
/*! \brief Set bkcl context. */
void set_hccl_context(HCCLContext_t context) { hccl_context_ = context; }
#endif
template <typename Callback> template <typename Callback>
void AddStreamCallback(Callback&& callback) const { void AddStreamCallback(Callback&& callback) const {
return stream_->AddCallback(callback); return stream_->AddCallback(callback);
...@@ -204,11 +196,28 @@ class NPUDeviceContext : public DeviceContext { ...@@ -204,11 +196,28 @@ class NPUDeviceContext : public DeviceContext {
void WaitStreamCallback() const { return stream_->WaitCallback(); } void WaitStreamCallback() const { return stream_->WaitCallback(); }
#if defined(PADDLE_WITH_ASCEND_CL)
/*! \brief Return hccl communicators. */
HcclComm hccl_comm() const { return hccl_comm_; }
/*! \brief Set hccl communicators. */
void set_hccl_comm(HcclComm comm) { hccl_comm_ = comm; }
#endif
// template <typename Callback>
// void AddStreamCallback(Callback&& callback) const {
// return stream_->AddCallback(callback);
// }
// void WaitStreamCallback() const { return stream_->WaitCallback(); }
private: private:
NPUPlace place_; NPUPlace place_;
aclrtContext context_; aclrtContext context_;
#ifdef PADDLE_WITH_ASCEND_HCCL
HCCLContext_t hccl_context_; #ifdef PADDLE_WITH_ASCEND_CL
// HCCLContext_t hccl_context_;
HcclComm hccl_comm_{nullptr};
#endif #endif
// Need to be the same with other DeviceContext, // Need to be the same with other DeviceContext,
......
...@@ -9,7 +9,7 @@ endif() ...@@ -9,7 +9,7 @@ endif()
# There is no macOS version of NCCL. # There is no macOS version of NCCL.
# Disable nvrtc and cuda_driver api on MacOS and Windows, and only do a early test on Linux. # Disable nvrtc and cuda_driver api on MacOS and Windows, and only do a early test on Linux.
if (NOT APPLE AND NOT WIN32) if (NOT APPLE AND NOT WIN32)
list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc) list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc)
if (WITH_NCCL) if (WITH_NCCL)
list(APPEND CUDA_SRCS nccl.cc) list(APPEND CUDA_SRCS nccl.cc)
endif() endif()
...@@ -32,6 +32,8 @@ endif(CUPTI_FOUND) ...@@ -32,6 +32,8 @@ endif(CUPTI_FOUND)
if(WITH_ROCM) if(WITH_ROCM)
hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader) hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
elseif (WITH_ASCEND_CL)
cc_library(dynload_warpctc SRCS warpctc.cc hccl.cc DEPS dynamic_loader warpctc)
else() else()
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
......
...@@ -36,6 +36,13 @@ DEFINE_string(nccl_dir, "", ...@@ -36,6 +36,13 @@ DEFINE_string(nccl_dir, "",
"For instance, /usr/local/cuda/lib64. If default, " "For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH"); "dlopen will search cuda from LD_LIBRARY_PATH");
DEFINE_string(hccl_dir, "",
"Specify path for loading hccl library, such as libhccl.so. "
"For instance, "
"/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/. If "
"default, "
"dlopen will search hccl from LD_LIBRARY_PATH");
DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
DEFINE_string( DEFINE_string(
...@@ -392,6 +399,24 @@ void* GetNCCLDsoHandle() { ...@@ -392,6 +399,24 @@ void* GetNCCLDsoHandle() {
warning_msg); warning_msg);
#endif #endif
} }
void* GetHCCLDsoHandle() {
std::string warning_msg(
"You may need to install 'hccl2' from Huawei official website: "
"before install PaddlePaddle.");
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {},
warning_msg);
#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true);
#elif defined(PADDLE_WITH_ASCEND_CL)
return GetDsoHandleFromSearchPath(FLAGS_hccl_dir, "libhccl.so", true, {},
warning_msg);
#else
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {},
warning_msg);
#endif
}
void* GetTensorRtDsoHandle() { void* GetTensorRtDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
......
...@@ -34,6 +34,7 @@ void* GetNVRTCDsoHandle(); ...@@ -34,6 +34,7 @@ void* GetNVRTCDsoHandle();
void* GetCUDADsoHandle(); void* GetCUDADsoHandle();
void* GetWarpCTCDsoHandle(); void* GetWarpCTCDsoHandle();
void* GetNCCLDsoHandle(); void* GetNCCLDsoHandle();
void* GetHCCLDsoHandle();
void* GetTensorRtDsoHandle(); void* GetTensorRtDsoHandle();
void* GetMKLMLDsoHandle(); void* GetMKLMLDsoHandle();
void* GetOpDsoHandle(const std::string& dso_name); void* GetOpDsoHandle(const std::string& dso_name);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/dynload/hccl.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag hccl_dso_flag;
void *hccl_dso_handle;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
#if HCCL_VERSION_CODE >= 2212
HCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP)
#endif
#if HCCL_VERSION_CODE >= 2703
HCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP)
#endif
} // namespace dynload
} // namespace platform
} // namespace paddle
#endif
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_ASCEND_CL
#include <hccl/hccl.h>
#include <hccl/hccl_types.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#define HCOM_GROUP_PREFIX "HCOM_GROUP_"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag hccl_dso_flag;
extern void* hccl_dso_handle;
#define DECLARE_DYNAMIC_LOAD_HCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using HCCL_func = decltype(&::__name); \
std::call_once(hccl_dso_flag, []() { \
hccl_dso_handle = paddle::platform::dynload::GetHCCLDsoHandle(); \
}); \
static void* p_##__name = dlsym(hccl_dso_handle, #__name); \
return reinterpret_cast<HCCL_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define HCCL_RAND_ROUTINE_EACH(__macro) \
__macro(HcclReduceScatter); \
__macro(HcclCommDestroy); \
__macro(HcclAllReduce); \
__macro(HcclCommInitRootInfo); \
__macro(HcclGetRootInfo); \
__macro(HcclBroadcast); \
__macro(HcclCommInitClusterInfo); \
__macro(HcclAllGather);
HCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HCCL_WRAP)
#if HCCL_VERSION_CODE >= 2212
#define HCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(HCCLBroadcast);
HCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_HCCL_WRAP)
#endif
#if HCCL_VERSION_CODE >= 2703
#define HCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
__macro(HCCLSend); \
__macro(HCCLRecv);
HCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_HCCL_WRAP)
#endif
} // namespace dynload
} // namespace platform
} // namespace paddle
#endif
...@@ -47,6 +47,7 @@ limitations under the License. */ ...@@ -47,6 +47,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
#include "acl/acl.h" #include "acl/acl.h"
#include "hccl/hccl_types.h"
#endif // PADDLE_WITH_ASCEND_CL #endif // PADDLE_WITH_ASCEND_CL
#include <fstream> #include <fstream>
...@@ -1220,6 +1221,7 @@ struct NPUStatusType {}; ...@@ -1220,6 +1221,7 @@ struct NPUStatusType {};
} }
DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE); DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE);
DEFINE_NPU_STATUS_TYPE(HcclResult, HCCL_SUCCESS);
} // namespace details } // namespace details
inline std::string build_npu_error_msg(aclError stat) { inline std::string build_npu_error_msg(aclError stat) {
...@@ -1228,6 +1230,12 @@ inline std::string build_npu_error_msg(aclError stat) { ...@@ -1228,6 +1230,12 @@ inline std::string build_npu_error_msg(aclError stat) {
return sout.str(); return sout.str();
} }
inline std::string build_npu_error_msg(HcclResult stat) {
std::ostringstream sout;
sout << " HCCL error, the error code is : " << stat << ". ";
return sout.str();
}
#define PADDLE_ENFORCE_NPU_SUCCESS(COND) \ #define PADDLE_ENFORCE_NPU_SUCCESS(COND) \
do { \ do { \
auto __cond__ = (COND); \ auto __cond__ = (COND); \
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#if defined(PADDLE_WITH_HCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_ASCEND_CL)
#include <stdio.h>
#include <memory>
#include <string>
#include <thread> // NOLINT
#include <typeindex>
#include <unordered_map>
#include <vector>
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/dynload/hccl.h"
#endif
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#define HCCL_ID_VARNAME "HCCLID"
namespace paddle {
namespace platform {
inline HcclDataType ToHCCLDataType(framework::proto::VarType::Type type) {
if (type == framework::proto::VarType::FP32) {
return HCCL_DATA_TYPE_FP32;
} else if (type == framework::proto::VarType::FP16) {
return HCCL_DATA_TYPE_FP16;
} else if (type == framework::proto::VarType::INT32) {
return HCCL_DATA_TYPE_INT32;
} else if (type == framework::proto::VarType::INT8) {
return HCCL_DATA_TYPE_INT8;
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"This datatype in hccl is not supported."));
}
}
// NOTE(minqiyang): according to the ncclGroupEnd documentations:
// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html,
// ncclGroupEnd will wait for all communicators to be initialized, which will
// cause blocking problem when a runtime_error was thrown, so try only guard
// HCCL actions when use it.
// class HCCLGroupGuard {
// public:
// static std::mutex &HCCLMutex() {
// static std::mutex mtx;
// return mtx;
// }
// inline HCCLGroupGuard() {
// HCCLMutex().lock();
// PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart());
// }
// inline ~HCCLGroupGuard() PADDLE_MAY_THROW {
// PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd());
// HCCLMutex().unlock();
// }
// };
struct HCCLContext {
std::unique_ptr<NPUDeviceContext> ctx_;
HcclComm comm_;
explicit HCCLContext(int dev_id)
: ctx_(new NPUDeviceContext(NPUPlace(dev_id))), comm_{nullptr} {}
aclrtStream stream() const { return ctx_->stream(); }
HcclComm comm() const { return comm_; }
int device_id() const {
return BOOST_GET_CONST(platform::NPUPlace, ctx_->GetPlace()).device;
}
};
struct HCCLContextMap {
std::unordered_map<int, HCCLContext> contexts_;
std::vector<int> order_;
explicit HCCLContextMap(const std::vector<platform::Place> &places,
HcclRootInfo *hccl_id = nullptr,
size_t num_trainers = 1, size_t trainer_id = 0) {
PADDLE_ENFORCE_EQ(!places.empty(), true,
platform::errors::InvalidArgument(
"The HCCL place should not be empty."));
order_.reserve(places.size());
for (auto &p : places) {
int dev_id = BOOST_GET_CONST(NPUPlace, p).device;
order_.emplace_back(dev_id);
contexts_.emplace(dev_id, HCCLContext(dev_id));
}
PADDLE_ENFORCE_EQ(
order_.size(), contexts_.size(),
platform::errors::Unavailable("HCCL Context Map does not support "
"contain two or more same device."));
std::unique_ptr<HcclComm[]> comms(new HcclComm[order_.size()]);
// if num_trainers == 1, should create a new nccl id for local comms.
if (num_trainers == 1 && hccl_id == nullptr) {
// we do not know how to tackle this situation under hccl
// std::lock_guard<std::mutex> guard(HCCLGroupGuard::HCCLMutex());
// PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::ncclCommInitAll(
// comms.get(), static_cast<int>(order_.size()), order_.data()));
} else {
PADDLE_ENFORCE_NOT_NULL(hccl_id, platform::errors::InvalidArgument(
"The HCCL id should not be null."));
{
int nranks = num_trainers * order_.size();
// HCCLGroupGuard gurad;
for (size_t i = 0; i < order_.size(); ++i) {
int gpu_id = order_[i];
int rank;
if (order_.size() > 1) {
rank = trainer_id * order_.size() + i;
} else {
rank = trainer_id;
}
VLOG(1) << "init hccl rank:" << rank << ", nranks:" << nranks
<< ", gpu_id:" << gpu_id << ", dev_id:" << order_[i];
aclrtSetDevice(gpu_id);
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclCommInitRootInfo(
nranks, hccl_id, rank, comms.get() + i));
}
}
}
int i = 0;
for (auto &dev_id : order_) {
contexts_.at(dev_id).comm_ = comms[i++];
}
}
HCCLContextMap(const HCCLContextMap &other) = delete;
HCCLContextMap &operator=(const HCCLContextMap &other) = delete;
NPUDeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); }
NPUDeviceContext *DevCtx(platform::Place p) const {
return DevCtx(BOOST_GET_CONST(NPUPlace, p).device);
}
const HCCLContext &at(platform::Place p) const {
return this->at(BOOST_GET_CONST(NPUPlace, p).device);
}
const HCCLContext &at(int dev_id) const { return contexts_.at(dev_id); }
void WaitAll() {
for (auto &p : contexts_) {
p.second.ctx_->Wait();
}
}
};
inline std::string GetFlatHCCLVarName(size_t pos) {
if (pos == 0) {
return HCCL_ID_VARNAME;
}
return string::Sprintf("%s_%d", HCCL_ID_VARNAME, static_cast<int>(pos));
}
inline std::string GetHierarchicalExterHCCLVarName(size_t pos) {
return string::Sprintf("Hierarchical_exter_%s_%d", HCCL_ID_VARNAME,
static_cast<int>(pos));
}
inline std::string GetHierarchicalInterHCCLVarName(size_t pos) {
return string::Sprintf("Hierarchical_inter_%s_%d", HCCL_ID_VARNAME,
static_cast<int>(pos));
}
class HCCLCommunicator {
public:
HCCLCommunicator() {}
virtual ~HCCLCommunicator() PADDLE_MAY_THROW {}
HCCLContextMap *DefaultFlatCtx() const {
if (flat_ctxs_.size() == 0) {
return nullptr;
}
return flat_ctxs_[0].get();
}
std::vector<std::unique_ptr<HCCLContextMap>> *GetFlatCtxs() {
return &flat_ctxs_;
}
HCCLContextMap *GetFlatCtx(size_t run_order) const {
return flat_ctxs_[run_order % flat_ctxs_.size()].get();
}
HCCLContextMap *GetRunEnvHCCLCtx(size_t run_order,
bool use_hierarchical_allreduce) const {
if (!use_hierarchical_allreduce) {
return GetFlatCtx(run_order);
}
return GetHierarchicalInterCtx(run_order);
}
/*
When nccl inits nccl comm using ncclCommInitAll, it meets error when
allreduce ophandle and sync_batch_norm_op use ncclallreduce parallelly. So
create a new nccl comm for sync_batch_norm_op. And these codes should be
polished with a unified nccl management.
*/
HCCLContextMap *GetSyncBatchNormCtx(
framework::Scope *scope, const std::vector<platform::Place> &places) {
auto *hccl_id_var = scope->FindVar(HCCL_ID_VARNAME);
if (hccl_id_var != nullptr) {
return DefaultFlatCtx();
}
if (sync_batch_norm_ctx_.get() == nullptr) {
sync_batch_norm_ctx_.reset(new HCCLContextMap(places));
}
return sync_batch_norm_ctx_.get();
}
void InitFlatCtxs(const std::vector<platform::Place> &places,
const std::vector<HcclRootInfo *> &hccl_ids,
size_t trainers_num, size_t trainer_id) {
if (hccl_ids.size() == 0) {
auto ptr = new platform::HCCLContextMap(places);
VLOG(1) << "init local trainer";
flat_ctxs_.emplace_back(ptr);
} else {
for (size_t i = 0; i < hccl_ids.size(); i++) {
auto ptr = new platform::HCCLContextMap(places, hccl_ids[i],
trainers_num, trainer_id);
VLOG(1) << "init trainer_id:" << trainer_id << ", comm no:" << i;
flat_ctxs_.emplace_back(ptr);
}
}
// as Executor have no way to use ncclComm created by ParallelExecutor,
// we assign all flatten contexts to HCCLCommContext to fix.
int nranks = static_cast<int>(trainers_num * places.size());
int nrings = static_cast<int>(flat_ctxs_.size());
for (int ring_id = 0; ring_id < nrings; ++ring_id) {
for (size_t p = 0; p < places.size(); ++p) {
int rank = trainer_id * places.size() + p;
int dev_id = BOOST_GET_CONST(NPUPlace, places[p]).device;
auto &ctx = flat_ctxs_[ring_id]->contexts_.at(dev_id);
HCCLCommContext::Instance().AssignHCCLComm(ctx.comm_, nranks, rank,
dev_id, ring_id);
}
}
}
void InitHierarchicalCtxs(const std::vector<platform::Place> &places,
const std::vector<HcclRootInfo *> &inter_hccl_ids,
const std::vector<HcclRootInfo *> &exter_hccl_ids,
size_t trainers_num, size_t trainer_id,
size_t inter_trainers_num,
size_t exter_trainers_num) {
PADDLE_ENFORCE_EQ(
trainers_num, inter_trainers_num * exter_trainers_num,
platform::errors::InvalidArgument(
"trainers_num:%llu != inter_trainers_num:%llu * "
"exter_trainers_num:%llu",
trainers_num, inter_trainers_num, exter_trainers_num));
PADDLE_ENFORCE_GT(
inter_trainers_num, 1,
platform::errors::InvalidArgument(
"The inter_trainers_num:%llu should be larger than 1.",
inter_trainers_num));
int inter_trainer_id = trainer_id % inter_trainers_num;
for (size_t i = 0; i < inter_hccl_ids.size(); i++) {
VLOG(1) << "init inter_trainer_id:" << inter_trainer_id
<< ", comm no:" << i;
auto local = new HCCLContextMap(places, inter_hccl_ids[i],
inter_trainers_num, inter_trainer_id);
h_inter_ctxs_.emplace_back(local);
}
int exter_trainer_id = -1;
if (trainer_id % inter_trainers_num == 0) {
exter_trainer_id = trainer_id / inter_trainers_num;
}
if (exter_trainer_id >= 0) {
for (size_t i = 0; i < exter_hccl_ids.size(); i++) {
auto ex = new HCCLContextMap(places, exter_hccl_ids[i],
exter_trainers_num, exter_trainer_id);
VLOG(1) << "init exter_trainer_id:" << exter_trainer_id
<< ", comm no:" << i;
h_exter_ctxs_.emplace_back(ex);
}
}
}
bool NeedExterAllReduce() const { return h_exter_ctxs_.size() > 0; }
HCCLContextMap *GetHierarchicalInterCtx(size_t run_order) const {
PADDLE_ENFORCE_GT(h_inter_ctxs_.size(), 0,
platform::errors::InvalidArgument(
"Hierarchical ctxs should be initialized firstly!"));
return h_inter_ctxs_[run_order % h_inter_ctxs_.size()].get();
}
HCCLContextMap *GetHierarchicalExterCtx(size_t run_order) const {
PADDLE_ENFORCE_GT(h_exter_ctxs_.size(), 0,
platform::errors::InvalidArgument(
"Hierarchical ctxs should be initialized firstly!"));
return h_exter_ctxs_[run_order % h_exter_ctxs_.size()].get();
}
std::vector<std::unique_ptr<HCCLContextMap>> *GetHierarchicalInterCtxs() {
return &h_inter_ctxs_;
}
std::vector<std::unique_ptr<HCCLContextMap>> *GetHierarchicalExterCtxs() {
return &h_exter_ctxs_;
}
protected:
// Support multi nccl comm on default nccl ring while HCCLContextMap can't.
std::vector<std::unique_ptr<HCCLContextMap>> flat_ctxs_;
// h_inter_ctxs_ and h_exter_ctxs_ are for 2d allreduce.
// And h_exter_ctxs_ can support multi comm too.
std::vector<std::unique_ptr<HCCLContextMap>> h_inter_ctxs_;
std::vector<std::unique_ptr<HCCLContextMap>> h_exter_ctxs_;
// just used for sync_batch_norm op.
std::unique_ptr<HCCLContextMap> sync_batch_norm_ctx_;
};
} // namespace platform
} // namespace paddle
#endif
...@@ -58,10 +58,10 @@ set(PYBIND_SRCS ...@@ -58,10 +58,10 @@ set(PYBIND_SRCS
compatible.cc compatible.cc
generator_py.cc) generator_py.cc)
if(WITH_ASCEND) if(WITH_ASCEND OR WITH_ASCEND_CL)
set(PYBIND_DEPS ${PYBIND_DEPS} ascend_wrapper) set(PYBIND_DEPS ${PYBIND_DEPS} ascend_wrapper)
set(PYBIND_SRCS ${PYBIND_SRCS} ascend_wrapper_py.cc) set(PYBIND_SRCS ${PYBIND_SRCS} ascend_wrapper_py.cc)
endif(WITH_ASCEND) endif()
if(WITH_GLOO) if(WITH_GLOO)
set(PYBIND_DEPS ${PYBIND_DEPS} gloo_context) set(PYBIND_DEPS ${PYBIND_DEPS} gloo_context)
...@@ -86,7 +86,11 @@ endif() ...@@ -86,7 +86,11 @@ endif()
if(WITH_PYTHON) if(WITH_PYTHON)
# generate op pybind functions automatically for dygraph. # generate op pybind functions automatically for dygraph.
set(OP_FUNCTION_GENERETOR_DEPS pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag) if (WITH_ASCEND_CL)
set(OP_FUNCTION_GENERETOR_DEPS pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag ascend_wrapper)
else()
set(OP_FUNCTION_GENERETOR_DEPS pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag)
endif()
list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB}) list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB})
list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS}) list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS})
...@@ -100,6 +104,7 @@ if(WITH_PYTHON) ...@@ -100,6 +104,7 @@ if(WITH_PYTHON)
add_executable(op_function_generator op_function_generator.cc) add_executable(op_function_generator op_function_generator.cc)
target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS}) target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS})
get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(op_function_generator ${os_dependency_modules}) target_link_libraries(op_function_generator ${os_dependency_modules})
if(WITH_ROCM) if(WITH_ROCM)
...@@ -153,9 +158,9 @@ if(WITH_PYTHON) ...@@ -153,9 +158,9 @@ if(WITH_PYTHON)
) )
endif() endif()
else(WIN32) else(WIN32)
# If there are no *.so in /usr/lib or LD_LIBRARY_PATH, # If there are no *.so in /usr/lib or LD_LIBRARY_PATH,
# copy these *.so to current directory and append current directory to # copy these *.so to current directory and append current directory to
# LD_LIBRARY_PATH. This is different with Windows platformm, which search # LD_LIBRARY_PATH. This is different with Windows platformm, which search
# *.dll in current directory automatically. # *.dll in current directory automatically.
add_custom_command(TARGET op_function_generator add_custom_command(TARGET op_function_generator
POST_BUILD POST_BUILD
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef PADDLE_WITH_ASCEND #ifdef PADDLE_WITH_ASCEND_CL
#include <fcntl.h> #include <fcntl.h>
#ifdef _POSIX_C_SOURCE #ifdef _POSIX_C_SOURCE
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#pragma once #pragma once
#ifdef PADDLE_WITH_ASCEND #ifdef PADDLE_WITH_ASCEND_CL
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
#include "pybind11/stl.h" #include "pybind11/stl.h"
......
...@@ -26,7 +26,7 @@ ...@@ -26,7 +26,7 @@
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/pybind/pybind.h"
#include "paddle/fluid/string/string_helper.h" #include "paddle/fluid/string/string_helper.h"
#ifdef PADDLE_WITH_ASCEND #ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/framework/fleet/ascend_wrapper.h" #include "paddle/fluid/framework/fleet/ascend_wrapper.h"
#endif #endif
...@@ -182,16 +182,16 @@ const char* OUT_DUPLICABLE_INITIALIZER_TEMPLATE = R"({"%s", ConstructDuplicableO ...@@ -182,16 +182,16 @@ const char* OUT_DUPLICABLE_INITIALIZER_TEMPLATE = R"({"%s", ConstructDuplicableO
const char* INPUT_INITIALIZER_TEMPLATE = R"({"%s", {%s}})"; const char* INPUT_INITIALIZER_TEMPLATE = R"({"%s", {%s}})";
const char* INPUT_LIST_INITIALIZER_TEMPLATE = R"({"%s", %s})"; const char* INPUT_LIST_INITIALIZER_TEMPLATE = R"({"%s", %s})";
const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL = R"( const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL = R"(
if (%s != nullptr) { if (%s != nullptr) {
ins["%s"] = {%s}; ins["%s"] = {%s};
} }
)"; )";
const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST = R"( const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST = R"(
if (%s.size() != 0) { if (%s.size() != 0) {
ins["%s"] = %s; ins["%s"] = %s;
} }
)"; )";
const char* OUTPUT_INITIALIZER_TEMPLATE_WITH_NULL = R"( const char* OUTPUT_INITIALIZER_TEMPLATE_WITH_NULL = R"(
...@@ -264,8 +264,8 @@ R"( ...@@ -264,8 +264,8 @@ R"(
imperative::NameVarBaseMap ins = %s; imperative::NameVarBaseMap ins = %s;
%s %s
tracer->TraceOp("%s", ins, outs, attrs, {%s}); tracer->TraceOp("%s", ins, outs, attrs, {%s});
return %s; return %s;
} }
})"; })";
const char* PYBIND_ITEM_TEMPLATE = R"( %s.def("%s", &%s);)"; const char* PYBIND_ITEM_TEMPLATE = R"( %s.def("%s", &%s);)";
...@@ -350,7 +350,7 @@ std::string GenerateOpFunctionsBody( ...@@ -350,7 +350,7 @@ std::string GenerateOpFunctionsBody(
} }
ins_initializer += "}"; ins_initializer += "}";
if (input_args.back() == ',') { if (!input_args.empty() && input_args.back() == ',') {
input_args.pop_back(); input_args.pop_back();
} }
...@@ -364,6 +364,7 @@ std::string GenerateOpFunctionsBody( ...@@ -364,6 +364,7 @@ std::string GenerateOpFunctionsBody(
int outs_num = 0; int outs_num = 0;
for (auto& output : op_proto->outputs()) { for (auto& output : op_proto->outputs()) {
auto& out_name = output.name(); auto& out_name = output.name();
// skip those dispensable oututs // skip those dispensable oututs
if (output.dispensable() && !FindOutsMap(op_type, out_name)) { if (output.dispensable() && !FindOutsMap(op_type, out_name)) {
continue; continue;
...@@ -459,7 +460,7 @@ std::string GenerateOpFunctionsBody( ...@@ -459,7 +460,7 @@ std::string GenerateOpFunctionsBody(
return_str.pop_back(); return_str.pop_back();
} }
outs_initializer += "}"; outs_initializer += "}";
if (inplace_mapping_str.back() == ',') { if (!inplace_mapping_str.empty() && inplace_mapping_str.back() == ',') {
inplace_mapping_str.pop_back(); inplace_mapping_str.pop_back();
} }
if (!use_inplace_strategy && FindViewOpMap(op_type)) { if (!use_inplace_strategy && FindViewOpMap(op_type)) {
...@@ -567,7 +568,7 @@ int main(int argc, char* argv[]) { ...@@ -567,7 +568,7 @@ int main(int argc, char* argv[]) {
return -1; return -1;
} }
#ifdef PADDLE_WITH_ASCEND #ifdef PADDLE_WITH_ASCEND_CL
auto ascend_ptr = paddle::framework::AscendInstance::GetInstance(); auto ascend_ptr = paddle::framework::AscendInstance::GetInstance();
ascend_ptr->InitGEForUT(); ascend_ptr->InitGEForUT();
#endif #endif
...@@ -602,8 +603,9 @@ int main(int argc, char* argv[]) { ...@@ -602,8 +603,9 @@ int main(int argc, char* argv[]) {
out.close(); out.close();
#ifdef PADDLE_WITH_ASCEND #ifdef PADDLE_WITH_ASCEND_CL
ge::GEFinalize(); ge::GEFinalize();
#endif #endif
return 0; return 0;
} }
...@@ -63,7 +63,6 @@ def _get_ascend_rankfile(rank_table_file_path): ...@@ -63,7 +63,6 @@ def _get_ascend_rankfile(rank_table_file_path):
Returns: Returns:
node_ips: node ip list node_ips: node ip list
device_count: number of npu per machine device_count: number of npu per machine
""" """
json_data = None json_data = None
with open(rank_table_file_path) as json_file: with open(rank_table_file_path) as json_file:
......
...@@ -163,6 +163,33 @@ class CollectiveHelper(object): ...@@ -163,6 +163,33 @@ class CollectiveHelper(object):
'ring_id': ring_id, 'ring_id': ring_id,
OP_ROLE_KEY: OpRole.Forward OP_ROLE_KEY: OpRole.Forward
}) })
elif core.is_compiled_with_npu():
hccl_id_var = block.create_var(
name=unique_name.generate('hccl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW)
endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)}
block.append_op(
type='c_gen_hccl_id',
inputs={},
outputs={'Out': hccl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints,
OP_ROLE_KEY: OpRole.Forward
})
block.append_op(
type='c_comm_init_hccl',
inputs={'X': hccl_id_var},
outputs={},
attrs={
'rank': rank,
'ring_id': ring_id,
'device_id': int(os.getenv("FLAGS_selected_npus")),
'rank_ids': nranks,
OP_ROLE_KEY: OpRole.Forward
})
else: else:
raise ValueError( raise ValueError(
"comm_id must be generated in paddlepaddle-xpu or paddlepaddle-xpu." "comm_id must be generated in paddlepaddle-xpu or paddlepaddle-xpu."
......
...@@ -2254,7 +2254,7 @@ class Operator(object): ...@@ -2254,7 +2254,7 @@ class Operator(object):
'gen_bkcl_id', 'c_gen_bkcl_id', 'gen_nccl_id', 'c_gen_nccl_id', 'gen_bkcl_id', 'c_gen_bkcl_id', 'gen_nccl_id', 'c_gen_nccl_id',
'c_comm_init', 'c_sync_calc_stream', 'c_sync_comm_stream', 'c_comm_init', 'c_sync_calc_stream', 'c_sync_comm_stream',
'queue_generator', 'dequeue', 'enqueue', 'heter_listen_and_serv', 'queue_generator', 'dequeue', 'enqueue', 'heter_listen_and_serv',
'c_wait_comm', 'c_wait_compute' 'c_wait_comm', 'c_wait_compute', 'c_gen_hccl_id', 'c_comm_init_hccl'
} }
def __init__(self, def __init__(self,
......
...@@ -105,30 +105,58 @@ class Collective(object): ...@@ -105,30 +105,58 @@ class Collective(object):
wait_server_ready(other_endpoints) wait_server_ready(other_endpoints)
block = program.global_block() block = program.global_block()
nccl_id_var = block.create_var( if core.is_compiled_with_npu():
name=unique_name.generate('nccl_id'), hccl_id_var = block.create_var(
persistable=True, name=unique_name.generate('hccl_id'),
type=core.VarDesc.VarType.RAW) persistable=True,
block.append_op( type=core.VarDesc.VarType.RAW)
type='c_gen_nccl_id', endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)}
inputs={}, block.append_op(
outputs={'Out': nccl_id_var}, type='c_gen_hccl_id',
attrs={ inputs={},
'rank': rank, outputs={'Out': hccl_id_var},
'endpoint': current_endpoint, attrs={
'other_endpoints': other_endpoints, 'rank': rank,
self.op_role_key: OpRole.Forward 'endpoint': current_endpoint,
}) 'other_endpoints': other_endpoints,
block.append_op( self.op_role_key: OpRole.Forward
type='c_comm_init', })
inputs={'X': nccl_id_var}, block.append_op(
outputs={}, type='c_comm_init_hccl',
attrs={ inputs={'X': hccl_id_var},
'nranks': nranks, outputs={},
'rank': rank, attrs={
'ring_id': ring_id, 'rank': rank,
self.op_role_key: OpRole.Forward 'ring_id': ring_id,
}) 'device_id': int(os.getenv("FLAGS_selected_npus")),
'rank_ids': nranks,
self.op_role_key: OpRole.Forward
})
else:
nccl_id_var = block.create_var(
name=unique_name.generate('nccl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW)
block.append_op(
type='c_gen_nccl_id',
inputs={},
outputs={'Out': nccl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints,
self.op_role_key: OpRole.Forward
})
block.append_op(
type='c_comm_init',
inputs={'X': nccl_id_var},
outputs={},
attrs={
'nranks': nranks,
'rank': rank,
'ring_id': ring_id,
self.op_role_key: OpRole.Forward
})
def _broadcast_params(self): def _broadcast_params(self):
block = self.startup_program.global_block() block = self.startup_program.global_block()
......
...@@ -136,30 +136,56 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint, ...@@ -136,30 +136,56 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint,
if rank == 0 and wait_port: if rank == 0 and wait_port:
wait_server_ready(other_endpoints) wait_server_ready(other_endpoints)
block = program.global_block() block = program.global_block()
nccl_id_var = block.create_var( if core.is_compiled_with_cuda():
name=fluid.unique_name.generate('nccl_id'), nccl_id_var = block.create_var(
persistable=True, name=fluid.unique_name.generate('nccl_id'),
type=fluid.core.VarDesc.VarType.RAW) persistable=True,
type=fluid.core.VarDesc.VarType.RAW)
block.append_op(
type='c_gen_nccl_id', block.append_op(
inputs={}, type='c_gen_nccl_id',
outputs={'Out': nccl_id_var}, inputs={},
attrs={ outputs={'Out': nccl_id_var},
'rank': rank, attrs={
'endpoint': current_endpoint, 'rank': rank,
'other_endpoints': other_endpoints 'endpoint': current_endpoint,
}) 'other_endpoints': other_endpoints
})
block.append_op(
type='c_comm_init', block.append_op(
inputs={'X': nccl_id_var}, type='c_comm_init',
outputs={}, inputs={'X': nccl_id_var},
attrs={ outputs={},
'nranks': nranks, attrs={
'rank': rank, 'nranks': nranks,
'ring_id': 0, 'rank': rank,
}) 'ring_id': 0,
})
elif core.is_compiled_with_npu():
hccl_id_var = block.create_var(
name=unique_name.generate('hccl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW)
endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)}
block.append_op(
type='c_gen_hccl_id',
inputs={},
outputs={'Out': hccl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints
})
block.append_op(
type='c_comm_init_hccl',
inputs={'X': hccl_id_var},
outputs={},
attrs={
'rank': rank,
'ring_id': 0,
'device_id': int(os.getenv("FLAGS_selected_npus")),
'rank_ids': nranks
})
def prepare_distributed_context(place=None): def prepare_distributed_context(place=None):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册