From 8c7c53b3d5237bcdbcb42e492ec51bc581223549 Mon Sep 17 00:00:00 2001 From: zhang wenhui Date: Wed, 7 Apr 2021 19:06:26 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90NPU=E3=80=91Merge=20ascend=20GE&distri?= =?UTF-8?q?buted=20code=20by=200208=20from=20ascendrc=20(#31957)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Ascend rc (#30483) * Fix compilcation on CANN20.1 and older (#30494) Fix compilcation on CANN20.1 and older * Add distribution supported (#30578) Add distribution supported * Build praser for Hcom* operators (#30627) Build praser for Hcom* operators * Pass device_ids info from launch to trainer. (#30632) Pass device_ids info from launch to trainer * Add Hccl program group (#30642) Add Hccl program group * Add startup bash files of test_ascend_group. (#30645) Add startup bash files of test_ascend_group * cleanup (#30646) cleanup test_ascend_group.py * [Feature] Build parser to support distributed training (#30658) [Feature] Build parser to support distributed training * fix compilation on ascend-20.1 (#30722) fix compilation on ascend-20.1 * Dev/fix ascend string (#30749) Dev/fix ascend string * code style (#30781) code style * Merge ascend_optimizer and ascend_parser. (#30776) Merge ascend_optimizer and ascend_parser. * Ascendrc add converted op : [range/equal/range/uniform_random/expand/squeeze], fix cast op bug (#30797) Ascendrc add converted op : [range/equal/range/uniform_random/expand/squeeze], fix cast op bug * Add paddle ascend distribution training supported (#30796) Add paddle ascend distribution training supported * pass cxx_flags to gloo cmake (#30857) * Destroy session first. (#30954) Destroy session first. * merge * fix, test=develop * fix, test=develop * fix style, test=develop * fix, test=develop * fix * fix log fatal, test=develop * fix enforce style, test=develop * fix, test=develop * fix, test=develop * fix rccl, test=develop * fix test, test=develop * fix, test=develop * fix, test=develop * fix, test=develop * fix node_num, test=develop * fix ids str, test=develop * fix ids str, test=develop * fix ids str, test=develop * fix, test=develop * fix, test=develop * fix, test=develop * fix, test=develop * fix, test=develop * fix, test=develop * fix, test=develop * fix, test=develop * fix style code, test=develop * fix style code, test=develop * fix style code, test=develop * fix style code, test=develop Co-authored-by: hutuxian Co-authored-by: gongweibao Co-authored-by: Void Main Co-authored-by: Leo Chen Co-authored-by: dingsiyu <18369187719@163.com> Co-authored-by: OleNet --- CMakeLists.txt | 4 + cmake/external/ascend.cmake | 85 +- cmake/external/gloo.cmake | 48 +- cmake/external/protobuf.cmake | 11 +- cmake/external/threadpool.cmake | 6 +- cmake/external/warpctc.cmake | 106 +- paddle/fluid/framework/fleet/CMakeLists.txt | 2 +- paddle/fluid/framework/fleet/ascend_wrapper.h | 45 +- paddle/fluid/memory/allocation/CMakeLists.txt | 2 + .../fluid/operators/collective/CMakeLists.txt | 6 + .../operators/collective/c_gen_nccl_id_op.cc | 16 + .../operators/collective/gen_nccl_id_op.cc | 15 + paddle/fluid/platform/CMakeLists.txt | 10 + paddle/fluid/platform/ascend_npu_info.cc | 36 + paddle/fluid/platform/ascend_npu_info.h | 31 + paddle/fluid/pybind/ascend_wrapper_py.cc | 300 ++- paddle/fluid/pybind/ascend_wrapper_py.h | 1 + paddle/fluid/pybind/op_function_generator.cc | 15 + paddle/fluid/pybind/pybind.cc | 10 + python/paddle/distributed/fleet/__init__.py | 11 + .../distributed/fleet/base/fleet_base.py | 12 + 
.../distributed/fleet/base/role_maker.py | 26 + python/paddle/distributed/fleet/launch.py | 44 +- .../paddle/distributed/fleet/launch_utils.py | 100 +- .../fleet/meta_optimizers/ascend/__init__.py | 13 + .../ascend/ascend_optimizer.py | 119 +- .../meta_optimizers/ascend/ascend_parser.py | 2076 +++++++++++++++-- .../graph_execution_optimizer.py | 5 +- .../fluid/tests/unittests/CMakeLists.txt | 9 +- .../fluid/tests/unittests/ascend_group.py | 140 ++ .../ascend_multi_process_collective.py | 41 + .../tests/unittests/test_ascend_group.sh | 30 + .../unittests/test_fleet_launch_ascend.sh | 59 + .../fluid/transpiler/ascend_transpiler.py | 74 + python/setup.py.in | 1 + 35 files changed, 3057 insertions(+), 452 deletions(-) create mode 100644 paddle/fluid/platform/ascend_npu_info.cc create mode 100644 paddle/fluid/platform/ascend_npu_info.h create mode 100644 python/paddle/distributed/fleet/meta_optimizers/ascend/__init__.py create mode 100644 python/paddle/fluid/tests/unittests/ascend_group.py create mode 100644 python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py create mode 100644 python/paddle/fluid/tests/unittests/test_ascend_group.sh create mode 100644 python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh create mode 100644 python/paddle/fluid/transpiler/ascend_transpiler.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 2d2f613eff5..59bc768aa41 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,6 +33,7 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) +option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) if (WITH_GPU AND WITH_XPU) message(FATAL_ERROR "Error when compile GPU and XPU at the same time") endif() @@ -57,6 +58,9 @@ if(WITH_MUSL) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy") endif() +if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") +endif() if(WIN32) option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index bcf0c0a0646..a0b6f480f95 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -12,50 +12,47 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-INCLUDE(ExternalProject) - -SET(ASCEND_PROJECT "extern_ascend") -IF((NOT DEFINED ASCEND_VER) OR (NOT DEFINED ASCEND_URL)) - MESSAGE(STATUS "use pre defined download url") - SET(ASCEND_VER "0.1.1" CACHE STRING "" FORCE) - SET(ASCEND_NAME "ascend" CACHE STRING "" FORCE) - SET(ASCEND_URL "http://paddle-ascend.bj.bcebos.com/ascend.tar.gz" CACHE STRING "" FORCE) -ENDIF() -MESSAGE(STATUS "ASCEND_NAME: ${ASCEND_NAME}, ASCEND_URL: ${ASCEND_URL}") -SET(ASCEND_SOURCE_DIR "${THIRD_PARTY_PATH}/ascend") -SET(ASCEND_DOWNLOAD_DIR "${ASCEND_SOURCE_DIR}/src/${ASCEND_PROJECT}") -SET(ASCEND_DST_DIR "ascend") -SET(ASCEND_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") -SET(ASCEND_INSTALL_DIR ${ASCEND_INSTALL_ROOT}/${ASCEND_DST_DIR}) -SET(ASCEND_ROOT ${ASCEND_INSTALL_DIR}) -SET(ASCEND_INC_DIR ${ASCEND_ROOT}/include) -SET(ASCEND_LIB_DIR ${ASCEND_ROOT}/lib) -SET(ASCEND_LIB ${ASCEND_LIB_DIR}/libge_runner.so) -SET(ASCEND_GRAPH_LIB ${ASCEND_LIB_DIR}/libgraph.so) -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${ASCEND_ROOT}/lib") - -INCLUDE_DIRECTORIES(${ASCEND_INC_DIR}) -FILE(WRITE ${ASCEND_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(ASCEND)\n" - "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY ${ASCEND_NAME}/include ${ASCEND_NAME}/lib \n" - " DESTINATION ${ASCEND_DST_DIR})\n") -ExternalProject_Add( - ${ASCEND_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${ASCEND_SOURCE_DIR} - DOWNLOAD_DIR ${ASCEND_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${ASCEND_URL} -c -q -O ${ASCEND_NAME}.tar.gz - && tar zxvf ${ASCEND_NAME}.tar.gz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ASCEND_INSTALL_ROOT} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ASCEND_INSTALL_ROOT} -) -ADD_LIBRARY(ascend SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ascend PROPERTY IMPORTED_LOCATION ${ASCEND_LIB}) + +#NOTE: Logic is from +# https://github.com/mindspore-ai/graphengine/blob/master/CMakeLists.txt +if(DEFINED ENV{ASCEND_CUSTOM_PATH}) + set(ASCEND_DIR $ENV{ASCEND_CUSTOM_PATH}) +else() + set(ASCEND_DIR /usr/local/Ascend) +endif() + +set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64) +set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common) +set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share) +set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64) +set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64) +set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64) +set(STATIC_ACL_LIB ${ASCEND_ACL_DIR}) + +set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR}) +set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR}) +set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) +set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) +set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64) +set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64) +set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR}) + +set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so) +set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so) +set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so) +INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR}) + +if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h) + add_definitions(-DPADDLE_WITH_ASCEND_STRING) +endif() + +ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib}) ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ascend_graph PROPERTY 
IMPORTED_LOCATION ${ASCEND_GRAPH_LIB}) -ADD_DEPENDENCIES(ascend ascend_graph ${ASCEND_PROJECT}) +SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib}) + +ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib}) +add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl) diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake index ea7af315e1a..2e4a67093dc 100644 --- a/cmake/external/gloo.cmake +++ b/cmake/external/gloo.cmake @@ -32,21 +32,39 @@ cache_third_party(extern_gloo TAG ${GLOO_TAG} DIR GLOO_SOURCE_DIR) -ExternalProject_Add( - extern_gloo - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - "${GLOO_DOWNLOAD_CMD}" - PREFIX "${GLOO_PREFIX_DIR}" - SOURCE_DIR "${GLOO_SOURCE_DIR}" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build - && cd ${GLOO_SOURCE_DIR}/build && cmake .. && make - && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo - INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} - COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" -) +if(WITH_ASCEND) + ExternalProject_Add( + extern_gloo + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${GLOO_DOWNLOAD_CMD}" + PREFIX "${GLOO_PREFIX_DIR}" + SOURCE_DIR "${GLOO_SOURCE_DIR}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build + && cd ${GLOO_SOURCE_DIR}/build && cmake .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && make + && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" + ) +else() + ExternalProject_Add( + extern_gloo + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${GLOO_DOWNLOAD_CMD}" + PREFIX "${GLOO_PREFIX_DIR}" + SOURCE_DIR "${GLOO_SOURCE_DIR}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build + && cd ${GLOO_SOURCE_DIR}/build && cmake .. 
&& make + && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" + ) +endif() ADD_LIBRARY(gloo STATIC IMPORTED GLOBAL) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 40a27f506f3..1466664c126 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -198,8 +198,13 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}") ENDIF() +if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) + SET(PROTOBUF_TAG v3.8.0) +else() SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) +endif() cache_third_party(${TARGET_NAME} REPOSITORY ${PROTOBUF_REPOSITORY} @@ -234,7 +239,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ) ENDFUNCTION() -SET(PROTOBUF_VERSION 3.1.0) +if(WITH_ASCEND) + SET(PROTOBUF_VERSION 3.8.0) +else() + SET(PROTOBUF_VERSION 3.1.0) +endif() IF(NOT PROTOBUF_FOUND) build_protobuf(extern_protobuf FALSE) diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake index 205e8d26d93..0eabdb4e127 100644 --- a/cmake/external/threadpool.cmake +++ b/cmake/external/threadpool.cmake @@ -16,7 +16,11 @@ INCLUDE(ExternalProject) SET(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool) SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool) -SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git) +if(WITH_ASCEND) + SET(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git) +else() + SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git) +endif() SET(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040) cache_third_party(extern_threadpool diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index ac28f7561f6..a4367510ac7 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -21,6 +21,8 @@ ENDIF() SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc) SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) +# in case of low internet speed +#set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git) set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git) set(WARPCTC_TAG c690fc5755abbdbdc98ef78d51ec10a6748a8cd1) @@ -41,39 +43,77 @@ cache_third_party(extern_warpctc TAG ${WARPCTC_TAG} DIR WARPCTC_SOURCE_DIR) -ExternalProject_Add( - extern_warpctc - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - "${WARPCTC_DOWNLOAD_CMD}" - PREFIX ${WARPCTC_PREFIX_DIR} - SOURCE_DIR ${WARPCTC_SOURCE_DIR} - #UPDATE_COMMAND "" - PATCH_COMMAND "" - BUILD_ALWAYS 1 - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_C_FLAGS=$ - -DCMAKE_C_FLAGS_DEBUG=$ - -DCMAKE_C_FLAGS_RELEASE=$ - -DCMAKE_CXX_FLAGS=$ - -DCMAKE_CXX_FLAGS_RELEASE=$ - -DCMAKE_CXX_FLAGS_DEBUG=$ - -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} - -DWITH_GPU=${WITH_GPU} - -DWITH_ROCM=${WITH_ROCM} - -DWITH_OMP=${USE_OMP} - -DWITH_TORCH=OFF - -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON - -DBUILD_SHARED=ON - -DBUILD_TESTS=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS 
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} -) +if(WITH_ASCEND) + ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${WARPCTC_DOWNLOAD_CMD}" + PREFIX ${WARPCTC_PREFIX_DIR} + SOURCE_DIR ${WARPCTC_SOURCE_DIR} + #UPDATE_COMMAND "" + PATCH_COMMAND "" + BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + ) +else() + ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${WARPCTC_DOWNLOAD_CMD}" + PREFIX ${WARPCTC_PREFIX_DIR} + SOURCE_DIR ${WARPCTC_SOURCE_DIR} + #UPDATE_COMMAND "" + PATCH_COMMAND "" + BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=$ + -DCMAKE_C_FLAGS_DEBUG=$ + -DCMAKE_C_FLAGS_RELEASE=$ + -DCMAKE_CXX_FLAGS=$ + -DCMAKE_CXX_FLAGS_RELEASE=$ + -DCMAKE_CXX_FLAGS_DEBUG=$ + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + ) +endif() + + IF(WIN32) SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 61f3c026f1f..ce0a905afc6 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -42,5 +42,5 @@ cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_conte cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell) if(WITH_ASCEND) - cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend ascend_graph) + cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend_ge ascend_graph) endif(WITH_ASCEND) diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h index da79fccb8ca..baa2fd126a4 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.h +++ b/paddle/fluid/framework/fleet/ascend_wrapper.h @@ -37,25 +37,50 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -// typedef std::vector AscendGraphDesc; typedef ge::Graph AscendGraphDesc; +#ifdef PADDLE_WITH_ASCEND_STRING +using AscendString = ge::AscendString; +#else +using AscendString = std::string; +#endif + class AscendInstance { public: virtual ~AscendInstance() {} AscendInstance() {} - std::map GetDefaultInitSessionOptions() { - std::map init_options; - init_options["a"] = "b"; - init_options["ge.trainFlag"] = "1"; + std::map _GetDefaultInitOptions() { + std::map init_options; + init_options["ge.exec.deviceId"] = "0"; + init_options["ge.graphRunMode"] = "1"; + return init_options; + } + + std::map _GetDefaultInitSessionOptions() { + std::map init_options; + // init_options["a"] = "b"; + // init_options["ge.trainFlag"] = "1"; return init_options; } - // add other parameters here to init + ge::Status InitGEForUT() { + return ge::GEInitialize(_GetDefaultInitOptions()); + } + void InitGlobalResouces() { - session_.reset(new ge::Session(GetDefaultInitSessionOptions())); - VLOG(1) << "InitGlobalResouces Done"; + LOG(INFO) << "Begin ascend InitGlobalResouces"; + session_.reset(new ge::Session(_GetDefaultInitSessionOptions())); + if (session_ == nullptr) { + PADDLE_THROW(platform::errors::Fatal("new session error: nullptr")); + } + LOG(INFO) << "End ascend InitGlobalResouces"; + } + + void DestroyGlobalResouces() { + LOG(INFO) << "Begin ascend DestroyGlobalResouces"; + session_ = nullptr; + LOG(INFO) << "End ascend DestroyGlobalResouces"; } static std::shared_ptr GetInstance() { @@ -178,6 +203,6 @@ class AscendInstance { private: static std::shared_ptr ascend_instance_; }; -} // end namespace framework -} // end namespace paddle +} // namespace framework +} // namespace paddle #endif diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 377ea376773..565797d51dd 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -33,6 +33,8 @@ if (WITH_GPU OR WITH_ROCM) set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator) elseif(WITH_XPU) set(AllocatorFacadeDeps xpu_info) +elseif(WITH_ASCEND) + set(AllocatorFacadeDeps ascend_npu_info) else () set(AllocatorFacadeDeps) endif() diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 8920541b9b9..977a208d20e 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -19,6 +19,12 @@ if(WITH_NCCL OR WITH_RCCL) op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) endif() +if(WITH_ASCEND) + op_library(gen_nccl_id_op) + op_library(c_gen_nccl_id_op) +endif() + + if(WITH_GLOO) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper) endif() diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 1592d809f91..7da30f64d1c 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -27,6 +27,7 @@ limitations under the License.
*/ namespace paddle { namespace operators { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void GenNCCLID(std::vector* nccl_ids) { for (size_t i = 0; i < nccl_ids->size(); ++i) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -84,6 +85,21 @@ class CGenNCCLIdOp : public framework::OperatorBase { } }; +#else +class CGenNCCLIdOp : public framework::OperatorBase { + public: + CGenNCCLIdOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + class CGenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { diff --git a/paddle/fluid/operators/collective/gen_nccl_id_op.cc b/paddle/fluid/operators/collective/gen_nccl_id_op.cc index 679713d05bc..99a92469e85 100644 --- a/paddle/fluid/operators/collective/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_nccl_id_op.cc @@ -34,6 +34,7 @@ class Scope; namespace paddle { namespace operators { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void GenNCCLID(std::vector* nccl_ids) { for (size_t i = 0; i < nccl_ids->size(); ++i) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -194,6 +195,20 @@ class GenNCCLIdOp : public framework::OperatorBase { } }; +#else +class GenNCCLIdOp : public framework::OperatorBase { + public: + GenNCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + class GenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 47344f0e373..1e16008f36b 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -10,6 +10,12 @@ ELSE() set(XPU_CTX_DEPS) endif(WITH_XPU) +if(WITH_ASCEND) + set(ASCEND_DEPS xpulib) +ELSE() + set(ASCEND_DEPS) +endif(WITH_ASCEND) + if (WITH_PYTHON) py_proto_compile(profiler_py_proto SRCS profiler.proto) add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) @@ -66,6 +72,10 @@ if(WITH_XPU) cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib) endif() +if(WITH_ASCEND) + cc_library(ascend_npu_info SRCS ascend_npu_info.cc DEPS gflags glog enforce atlas_acl) +endif() + add_subdirectory(dynload) add_subdirectory(stream) diff --git a/paddle/fluid/platform/ascend_npu_info.cc b/paddle/fluid/platform/ascend_npu_info.cc new file mode 100644 index 00000000000..db8dafeae1e --- /dev/null +++ b/paddle/fluid/platform/ascend_npu_info.cc @@ -0,0 +1,36 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/platform/ascend_npu_info.h" +#include +#include "acl/acl_rt.h" + +namespace paddle { +namespace platform { +namespace ascend { + +int NPUDevice::GetDeviceCount() { + uint32_t count = 0; + aclError status = aclrtGetDeviceCount(&count); + if (status != 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "aclrtGetDeviceCount error code: %d", status)); + return -1; + } + + return count; +} + +} // namespace ascend +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/ascend_npu_info.h b/paddle/fluid/platform/ascend_npu_info.h new file mode 100644 index 00000000000..7afed121a5a --- /dev/null +++ b/paddle/fluid/platform/ascend_npu_info.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#ifdef PADDLE_WITH_ASCEND + +namespace paddle { +namespace platform { +namespace ascend { + +class NPUDevice { + public: + //! Get the total number of NPU devices in system. + static int GetDeviceCount(); +}; + +} // namespace ascend +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/pybind/ascend_wrapper_py.cc b/paddle/fluid/pybind/ascend_wrapper_py.cc index 00eca380859..303ab5c0fe8 100644 --- a/paddle/fluid/pybind/ascend_wrapper_py.cc +++ b/paddle/fluid/pybind/ascend_wrapper_py.cc @@ -32,6 +32,8 @@ limitations under the License.
*/ #include #include #include "paddle/fluid/framework/fleet/ascend_wrapper.h" +#include "paddle/fluid/platform/ascend_npu_info.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/pybind/ascend_wrapper_py.h" using namespace ge; // NOLINT @@ -40,6 +42,12 @@ namespace py = pybind11; namespace paddle { namespace pybind { +#ifdef PADDLE_WITH_ASCEND_STRING +using AscendString = AscendString; +#else +using AscendString = std::string; +#endif + void BindAscendWrapper(py::module *m) { py::class_>(*m, "AscendInstance") @@ -47,13 +55,31 @@ void BindAscendWrapper(py::module *m) { .def("init_global_resources", &framework::AscendInstance::InitGlobalResouces, py::call_guard()) + .def("destroy_global_resources", + &framework::AscendInstance::DestroyGlobalResouces, + py::call_guard()) .def("add_ascend_subgraph", &framework::AscendInstance::AddAscendSubgraph, py::call_guard()); -} // end AscendWrapper +} -Status ge_initialize(std::map &options) { // NOLINT +std::map convert_map( + const std::map &options) { + std::map rets; + for (auto &option : options) { + AscendString key = option.first.c_str(); + AscendString val = option.second.c_str(); + rets[key] = val; + } + return rets; +} + +ge::Status ge_initialize( + std::map &options) { // NOLINT py::gil_scoped_release release; - Status res = GEInitialize(options); + auto init_options = convert_map(options); + ge::Status res = ge::GEInitialize(init_options); + PADDLE_ENFORCE_EQ(res, ge::SUCCESS, platform::errors::Fatal( + "ge initialize not success:%d", res)); py::gil_scoped_acquire acquire; return res; } @@ -82,11 +108,18 @@ enum AttrType { AT_NAMEATTR }; +void BindAscendDevice(py::module *m) { + py::class_(*m, "NPUDevice") + .def_static( + "get_device_count", + static_cast(&platform::ascend::NPUDevice::GetDeviceCount)); +} + void BindAscendGraph(py::module *m) { m->def("ge_initialize", &ge_initialize, "GEInitialize"); m->def("ge_finalize", &GEFinalize, "GEFinalize"); - //枚举封装 + // enum py::enum_(*m, "GEGraphRunMode") .value("PREDICTION", GraphRunMode::PREDICTION) .value("TRAIN", GraphRunMode::TRAIN) @@ -214,24 +247,34 @@ void BindAscendGraph(py::module *m) { // 类封装 py::class_(*m, "GESession") - .def(py::init &>()) + .def(py::init([](const std::map &options) { + return std::unique_ptr( + new ge::Session(convert_map(options))); + })) + .def("add_graph", (ge::Status (Session::*)(uint32_t, const Graph &)) & + Session::AddGraph) .def("add_graph", - (Status (Session::*)(uint32_t, const Graph &)) & Session::AddGraph) - .def("add_graph", - (Status (Session::*)(uint32_t, const Graph &, - const std::map &)) & - Session::AddGraph) + [](Session &ss, uint32_t index, const Graph &graph, + const std::map &options) { + return ss.AddGraph(index, graph, convert_map(options)); + }) .def("remove_graph", &Session::RemoveGraph) .def("run_graph", [](Session &ss, uint32_t graphId, const std::vector &inputs) -> py::tuple { std::vector outputs; - Status res = ss.RunGraph(graphId, inputs, outputs); + ge::Status res = ss.RunGraph(graphId, inputs, outputs); return py::make_tuple(outputs, res); }, py::call_guard()) .def("build_graph", &Session::BuildGraph) .def("run_graph_async", &Session::RunGraphAsync) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("register_call_back_func", + static_cast( + &ge::Session::RegisterCallBackFunc)) +#else .def("register_call_back_func", (Status (Session::*)( // NOLINT const std::string &, @@ -239,11 +282,12 @@ void BindAscendGraph(py::module *m) { uint32_t graph_id, const std::map ¶ms_list)>)) & Session::RegisterCallBackFunc) +#endif 
.def("is_graph_need_rebuild", &Session::IsGraphNeedRebuild); py::class_(*m, "GEGraph") .def(py::init<>()) - .def(py::init()) + .def(py::init()) .def("set_inputs", &Graph::SetInputs) .def("set_outputs", (Graph & (Graph::*)(const std::vector &)) & Graph::SetOutputs) @@ -253,40 +297,70 @@ void BindAscendGraph(py::module *m) { Graph::SetOutputs) .def("set_outputs", (Graph & - (Graph::*)(const std::vector> + (Graph::*)(const std::vector> &)) & Graph::SetOutputs) .def("set_targets", &Graph::SetTargets) .def("is_valid", &Graph::IsValid) .def("add_op", &Graph::AddOp) .def("find_op_by_name", - [](Graph &graph, const std::string &name) -> py::tuple { + [](Graph &graph, const char *name) -> py::tuple { ge::Operator op; graphStatus status = graph.FindOpByName(name, op); return py::make_tuple(op, status); }) .def("find_op_by_type", - [](Graph &graph, const std::string &type) -> py::tuple { + [](Graph &graph, const char *type) -> py::tuple { std::vector ops; graphStatus status = graph.FindOpByType(type, ops); return py::make_tuple(ops, status); }) .def("get_all_op_name", [](Graph &graph) -> py::tuple { - std::vector op_name; + std::vector op_name; graphStatus status = graph.GetAllOpName(op_name); return py::make_tuple(op_name, status); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("save_to_file", + static_cast( + &ge::Graph::SaveToFile)) + .def("load_from_file", + static_cast( + &Graph::LoadFromFile)) + .def("get_name", + static_cast( + &Graph::GetName)) +#else .def("save_to_file", &Graph::SaveToFile) .def("load_from_file", &Graph::LoadFromFile) .def("get_name", &Graph::GetName) +#endif .def("set_need_iteration", &Graph::SetNeedIteration); py::class_(*m, "GEOperator") .def(py::init<>()) - .def(py::init()) - .def(py::init()) + .def(py::init()) + .def(py::init()) .def("is_empty", &Operator::IsEmpty) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("get_name", + static_cast( + &Operator::GetName)) + .def("get_op_type", + static_cast( + &Operator::GetOpType)) + .def("set_input", + (Operator & (Operator::*)(const char *, const Operator &)) & + Operator::SetInput) + .def("set_input", + (Operator & + (Operator::*)(const char *, const Operator &, const char *)) & + Operator::SetInput) + .def("set_input", (Operator & (Operator::*)(const char *, + const Operator &, uint32_t)) & + Operator::SetInput) +#else .def("get_name", &Operator::GetName) .def("get_op_type", &Operator::GetOpType) .def("set_input", @@ -299,13 +373,28 @@ void BindAscendGraph(py::module *m) { .def("set_input", (Operator & (Operator::*)(const std::string &, const Operator &, uint32_t)) & Operator::SetInput) +#endif .def("add_control_input", &Operator::AddControlInput) .def("get_input_const_data", - [](Operator &op, const std::string &dst_name) -> py::tuple { + [](Operator &op, const char *dst_name) -> py::tuple { Tensor data; graphStatus res = op.GetInputConstData(dst_name, data); return py::make_tuple(data, res); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("get_input_desc", + (TensorDesc (Operator::*)(uint32_t) const) & Operator::GetInputDesc) + .def("get_input_desc", + [](Operator &op, const std::string &name) { + return op.GetInputDescByName(name.c_str()); + }) + .def("get_dynamic_output_num", + static_cast( + &Operator::GetDynamicOutputNum)) + .def("get_dynamic_input_num", + static_cast( + &Operator::GetDynamicInputNum)) +#else .def("get_input_desc", (TensorDesc (Operator::*)(const std::string &) const) & Operator::GetInputDesc) @@ -313,12 +402,41 @@ void BindAscendGraph(py::module *m) { (TensorDesc (Operator::*)(uint32_t) const) & 
Operator::GetInputDesc) .def("get_dynamic_output_num", &Operator::GetDynamicOutputNum) .def("get_dynamic_input_num", &Operator::GetDynamicInputNum) +#endif .def("try_get_input_desc", - [](Operator &op, const std::string &name) -> py::tuple { + [](Operator &op, const char *name) -> py::tuple { TensorDesc tensor_desc; graphStatus status = op.TryGetInputDesc(name, tensor_desc); return py::make_tuple(tensor_desc, status); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("update_input_desc", + static_cast(&Operator::UpdateInputDesc)) + .def("get_output_desc", + [](Operator &op, const std::string &name) { + return op.GetOutputDescByName(name.c_str()); + }) + .def("get_output_desc", + (TensorDesc (Operator::*)(uint32_t) const) & Operator::GetOutputDesc) + .def("update_output_desc", + static_cast(&Operator::UpdateOutputDesc)) + .def("get_dynamic_input_desc", + static_cast(&Operator::GetDynamicInputDesc)) + .def("update_dynamic_input_desc", + static_cast( + &Operator::UpdateDynamicInputDesc)) + .def("get_dynamic_output_desc", + static_cast(&Operator::GetDynamicOutputDesc)) + .def("update_dynamic_output_desc", + static_cast( + &Operator::UpdateDynamicOutputDesc)) +#else .def("update_input_desc", &Operator::UpdateInputDesc) .def("get_output_desc", (TensorDesc (Operator::*)(const std::string &) const) & @@ -330,33 +448,38 @@ void BindAscendGraph(py::module *m) { .def("update_dynamic_input_desc", &Operator::UpdateDynamicInputDesc) .def("get_dynamic_output_desc", &Operator::GetDynamicOutputDesc) .def("update_dynamic_output_desc", &Operator::UpdateDynamicOutputDesc) +#endif .def("infer_shape_and_type", &Operator::InferShapeAndType) .def("set_inference_context", &Operator::SetInferenceContext) .def("get_inference_context", &Operator::GetInferenceContext) .def("verify_all_attr", &Operator::VerifyAllAttr) .def("get_inputs_size", &Operator::GetInputsSize) .def("get_outputs_size", &Operator::GetOutputsSize) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("get_all_attr_names_and_types", + static_cast &) const>( + &Operator::GetAllAttrNamesAndTypes)) +#else .def("get_all_attr_names_and_types", &Operator::GetAllAttrNamesAndTypes) +#endif .def("set_attr_int64", - [](Operator &op, const std::string &name, - int64_t value) -> Operator & { + [](Operator &op, const char *name, int64_t value) -> Operator & { int64_t tar = (int64_t)value; return op.SetAttr(name, tar); }) .def("set_attr_int32", - [](Operator &op, const std::string &name, - int32_t value) -> Operator & { + [](Operator &op, const char *name, int32_t value) -> Operator & { int32_t tar = (int32_t)value; return op.SetAttr(name, tar); }) .def("set_attr_uint32", - [](Operator &op, const std::string &name, - uint32_t value) -> Operator & { + [](Operator &op, const char *name, uint32_t value) -> Operator & { uint32_t tar = (uint32_t)value; return op.SetAttr(name, tar); }) .def("set_attr_vec_int64", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { int len = value.size(); std::vector tar; @@ -368,7 +491,7 @@ void BindAscendGraph(py::module *m) { return op.SetAttr(name, tar); }) .def("set_attr_vec_int32", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { int len = value.size(); std::vector tar; @@ -380,7 +503,7 @@ void BindAscendGraph(py::module *m) { return op.SetAttr(name, tar); }) .def("set_attr_vec_uint32", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { 
int len = value.size(); std::vector tar; @@ -392,21 +515,20 @@ void BindAscendGraph(py::module *m) { return op.SetAttr(name, tar); }) .def("set_attr_list_int64", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, std::initializer_list &attrValue) -> Operator & { return op.SetAttr(name, std::move(attrValue)); }) .def("set_attr_attrvalue", - [](Operator &op, const std::string &name, AttrValue &attrValue) + [](Operator &op, const char *name, AttrValue &attrValue) -> Operator & { return op.SetAttr(name, std::move(attrValue)); }) - .def( - "set_attr_float", - [](Operator &op, const std::string &name, float value) -> Operator & { - float tar = static_cast(value); - return op.SetAttr(name, tar); - }) + .def("set_attr_float", + [](Operator &op, const char *name, float value) -> Operator & { + float tar = static_cast(value); + return op.SetAttr(name, tar); + }) .def("set_attr_vec_float", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { int len = value.size(); std::vector tar; @@ -417,6 +539,15 @@ void BindAscendGraph(py::module *m) { } return op.SetAttr(name, tar); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("set_attr_string", + (Operator & (Operator::*)(const char *, const char *)) & + Operator::SetAttr) + .def("set_attr_vec_string", + (Operator & + (Operator::*)(const char *, const std::vector &)) & + Operator::SetAttr) +#else .def("set_attr_string", (Operator & (Operator::*)(const std::string &, const std::string &)) & Operator::SetAttr) @@ -424,15 +555,16 @@ void BindAscendGraph(py::module *m) { (Operator & (Operator::*)(const std::string &, const std::vector &)) & Operator::SetAttr) +#endif .def("set_attr_bool", - [](Operator &op, const std::string &name, bool value) -> Operator & { + [](Operator &op, const char *name, bool value) -> Operator & { if (value) return op.SetAttr(name, true); else return op.SetAttr(name, false); }) .def("set_attr_vec_bool", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { int len = value.size(); std::vector tar; @@ -444,6 +576,15 @@ void BindAscendGraph(py::module *m) { } return op.SetAttr(name, tar); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("set_attr_tensor", + (Operator & (Operator::*)(const char *, const Tensor &)) & + Operator::SetAttr) + .def("set_attr_vec_tensor", + (Operator & + (Operator::*)(const char *, const std::vector &)) & + Operator::SetAttr) +#else .def("set_attr_tensor", (Operator & (Operator::*)(const std::string &, const Tensor &)) & Operator::SetAttr) @@ -451,8 +592,9 @@ void BindAscendGraph(py::module *m) { (Operator & (Operator::*)(const std::string &, const std::vector &)) & Operator::SetAttr) +#endif .def("set_attr_vec_uint8", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { int len = value.size(); std::vector tar; @@ -463,13 +605,21 @@ void BindAscendGraph(py::module *m) { } return op.SetAttr(name, tar); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("set_attr_vec_vec_int64", + (Operator & + (Operator::*)(const char *, + const std::vector> &)) & + Operator::SetAttr) +#else .def("set_attr_vec_vec_int64", (Operator & (Operator::*)(const std::string &, const std::vector> &)) & Operator::SetAttr) +#endif .def("set_attr_vec_dtype", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { int len = value.size(); std::vector tar; @@ 
-481,15 +631,13 @@ void BindAscendGraph(py::module *m) { return op.SetAttr(name, tar); }) .def("set_attr_dtype", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const DataType &value) -> Operator & { ge::DataType tar = (ge::DataType)value; return op.SetAttr(name, tar); }) - .def("get_attr", - [](Operator &op, const std::string &name, - AttrType type) -> py::tuple { + [](Operator &op, const char *name, AttrType type) -> py::tuple { graphStatus res = -1; switch (type) { case AT_INT64: { @@ -538,12 +686,12 @@ void BindAscendGraph(py::module *m) { return py::make_tuple(o_av, res); } break; case AT_STRING: { - std::string s_av; + AscendString s_av; res = op.GetAttr(name, s_av); return py::make_tuple(s_av, res); } break; case AT_LIST_STRING: { - std::vector v_s_av; + std::vector v_s_av; res = op.GetAttr(name, v_s_av); return py::make_tuple(v_s_av, res); } break; @@ -594,11 +742,31 @@ void BindAscendGraph(py::module *m) { }) .def("break_connect", &Operator::BreakConnect) .def("get_subgraph_names_count", &Operator::GetSubgraphNamesCount) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("get_subgraph_names", + static_cast &) const>(&Operator::GetSubgraphNames)) + .def("get_subgraph_builder", + static_cast(&Operator::GetSubgraphBuilder)) + .def("get_subgraph", + static_cast( + &Operator::GetSubgraph)) + .def("get_dynamic_subgraph_builder", + static_cast( + &Operator::GetDynamicSubgraphBuilder)) + .def("get_dynamic_subgraph", + static_cast(&Operator::GetDynamicSubgraph)); +#else + .def("get_subgraph_names_count", &Operator::GetSubgraphNamesCount) .def("get_subgraph_names", &Operator::GetSubgraphNames) .def("get_subgraph_builder", &Operator::GetSubgraphBuilder) .def("get_subgraph", &Operator::GetSubgraph) .def("get_dynamic_subgraph_builder", &Operator::GetDynamicSubgraphBuilder) .def("get_dynamic_subgraph", &Operator::GetDynamicSubgraph); +#endif py::class_(*m, "GETensor") .def(py::init<>()) @@ -613,10 +781,15 @@ void BindAscendGraph(py::module *m) { Tensor::SetData) .def("set_data", (graphStatus (Tensor::*)(const uint8_t *, size_t)) & Tensor::SetData) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("set_data", + (graphStatus (Tensor::*)(const char *)) & Tensor::SetData) +#else .def("set_data", (graphStatus (Tensor::*)(const std::string &)) & Tensor::SetData) +#endif .def("set_data", - (graphStatus (Tensor::*)(const std::vector &)) & + (graphStatus (Tensor::*)(const std::vector &)) & Tensor::SetData) .def("get_data", @@ -638,8 +811,8 @@ void BindAscendGraph(py::module *m) { .def(py::init(), py::arg("shape"), py::arg("format") = FORMAT_ND, py::arg("dt") = DT_FLOAT) .def(py::init()) - .def("update", - (void (TensorDesc::*)(Shape, Format, DataType)) & TensorDesc::Update, + .def("update", (void (TensorDesc::*)(const Shape &, Format, DataType)) & + TensorDesc::Update, py::arg("shape"), py::arg("format") = FORMAT_ND, py::arg("dt") = DT_FLOAT) .def("set_shape", &TensorDesc::SetShape) @@ -660,8 +833,16 @@ void BindAscendGraph(py::module *m) { .def("get_origin_format", &TensorDesc::GetOriginFormat) .def("set_data_type", &TensorDesc::SetDataType) .def("get_data_type", &TensorDesc::GetDataType) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("set_name", static_cast( + &TensorDesc::SetName)) + .def("get_name", + static_cast( + &TensorDesc::GetName)) +#else .def("set_name", &TensorDesc::SetName) .def("get_name", &TensorDesc::GetName) +#endif .def("set_size", &TensorDesc::SetSize) .def("get_size", &TensorDesc::GetSize) .def("set_real_dim_cnt", &TensorDesc::SetRealDimCnt) @@ -679,16 +860,27 @@ void 
BindAscendGraph(py::module *m) { py::class_(*m, "GEAttrValue").def(py::init<>()); py::class_(*m, "GEOperatorFactory") +#ifdef PADDLE_WITH_ASCEND_STRING + .def_static("create_operator", + static_cast( + &ge::OperatorFactory::CreateOperator)) +#else .def("create_operator", &OperatorFactory::CreateOperator) +#endif .def("get_ops_type_list", []() -> py::tuple { - std::vector all_ops; + std::vector all_ops; graphStatus status = OperatorFactory::GetOpsTypeList(all_ops); return py::make_tuple(all_ops, status); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def_static("is_exist_op", static_cast( + &OperatorFactory::IsExistOp)); +#else .def("is_exist_op", &OperatorFactory::IsExistOp); +#endif } -} // end namespace pybind -} // end namespace paddle +} // namespace pybind +} // namespace paddle #endif diff --git a/paddle/fluid/pybind/ascend_wrapper_py.h b/paddle/fluid/pybind/ascend_wrapper_py.h index 4af96d6ef4b..e999080544c 100644 --- a/paddle/fluid/pybind/ascend_wrapper_py.h +++ b/paddle/fluid/pybind/ascend_wrapper_py.h @@ -25,6 +25,7 @@ namespace pybind { void BindAscendGraph(py::module* m); void BindAscendWrapper(py::module* m); +void BindAscendDevice(py::module* m); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 282b0e1d81c..2c1927f49f6 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -16,6 +16,9 @@ #include #include #include +#ifndef _WIN32 +#include +#endif #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" @@ -23,6 +26,9 @@ #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/string/string_helper.h" +#ifdef PADDLE_WITH_ASCEND +#include "paddle/fluid/framework/fleet/ascend_wrapper.h" +#endif // NOTE(zhiqiu): Commonly, the inputs in auto-generated OP function are // determined by the OP`s proto automatically, i.e., all the inputs registered @@ -561,6 +567,11 @@ int main(int argc, char* argv[]) { return -1; } +#ifdef PADDLE_WITH_ASCEND + auto ascend_ptr = paddle::framework::AscendInstance::GetInstance(); + ascend_ptr->InitGEForUT(); +#endif + std::vector headers{"\"paddle/fluid/imperative/tracer.h\""}; std::ofstream out(argv[1], std::ios::out); @@ -590,5 +601,9 @@ int main(int argc, char* argv[]) { << "} // namespace paddle\n"; out.close(); + +#ifdef PADDLE_WITH_ASCEND + ge::GEFinalize(); +#endif return 0; } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 29c7f00142d..5bf70d1126b 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -143,6 +143,14 @@ bool IsCompiledWithROCM() { #endif } +bool IsCompiledWithAscend() { +#ifndef PADDLE_WITH_ASCEND + return false; +#else + return true; +#endif +} + bool IsCompiledWithXPU() { #ifndef PADDLE_WITH_XPU return false; @@ -1756,6 +1764,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("init_devices", []() { framework::InitDevices(); }); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); + m.def("is_compiled_with_ascend", IsCompiledWithAscend); m.def("is_compiled_with_rocm", IsCompiledWithROCM); m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); @@ -2885,6 +2894,7 @@ All parameter, weight, gradient are variables in Paddle. 
#ifdef PADDLE_WITH_ASCEND BindAscendWrapper(&m); BindAscendGraph(&m); + BindAscendDevice(&m); #endif #ifdef PADDLE_WITH_CRYPTO BindCrypto(&m); diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index ddbf8cbbe3f..6d4aedddba6 100644 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -37,6 +37,17 @@ init = fleet.init is_first_worker = fleet.is_first_worker worker_index = fleet.worker_index worker_num = fleet.worker_num +node_num = fleet.node_num +rank = fleet.worker_index +nranks = fleet.worker_num +world_size = fleet.worker_num +# device id in current trainer +local_device_ids = fleet.local_device_ids +# device ids in world +world_device_ids = fleet.world_device_ids +# rank in node +local_rank = fleet.local_rank +rank_in_node = local_rank is_worker = fleet.is_worker worker_endpoints = fleet.worker_endpoints server_num = fleet.server_num diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 470d1a2b78f..0a60cbf78d5 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -289,6 +289,18 @@ class Fleet(object): """ return self._role_maker._worker_num() + def node_num(self): + return self._role_maker._get_node_num() + + def local_rank(self): + return self._role_maker._get_local_rank() + + def local_device_ids(self): + return self._role_maker._get_local_device_ids() + + def world_device_ids(self): + return self._role_maker._get_world_device_ids() + def is_worker(self): """ Check whether the node is an instance of worker. diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index a8683aea97f..62c8faa0757 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -622,6 +622,29 @@ class PaddleCloudRoleMaker(RoleMakerBase): self._generate_role() return self._nodes_num + def _get_node_num(self): + """ + return the training node number + """ + if not self._role_is_generated: + self._generate_role() + return self._nodes_num + + def _get_local_rank(self): + if not self._role_is_generated: + self._generate_role() + return self._local_rank + + def _get_local_device_ids(self): + if not self._role_is_generated: + self._generate_role() + return self._local_device_ids + + def _get_world_device_ids(self): + if not self._role_is_generated: + self._generate_role() + return self._world_device_ids + def _get_trainer_endpoints(self): """ get endpoint of all trainers @@ -782,6 +805,9 @@ class PaddleCloudRoleMaker(RoleMakerBase): self._trainers_num = len(self._worker_endpoints) self._nodes_num = len( set([x.split(':')[0] for x in self._worker_endpoints])) + self._local_rank = os.getenv("PADDLE_RANK_IN_NODE") + self._local_device_ids = os.getenv("PADDLE_LOCAL_DEVICE_IDS") + self._world_device_ids = os.getenv("PADDLE_WORLD_DEVICE_IDS") def _gloo_init(self): # PADDLE_WITH_GLOO 1: trainer barrier, 2: all barrier diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 0f9b13d8a12..d6f4227a923 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -108,6 +108,21 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra "In gpu training, it should be less or equal to the gpus number of you system(or you set by --gpus). 
And so each process can" " bound to one or average number of gpus.") + base_group.add_argument( + "--run_mode", + type=str, + default="collective", + help="run mode of job, can be: collective/ps/ps-heter") + + base_group.add_argument( + "--ascend_npus", + type=str, + default=None, + help="It's for ascend npu training." + "For example:" + "--ascend_npus=\"0,1,2,3\" will launch four training processes each bound to one npu." + ) + if fluid.core.is_compiled_with_cuda(): base_group.add_argument( "--gpus", @@ -243,6 +258,9 @@ def launch_collective(args): log_dir=args.log_dir, envs=global_envs) + for idx, proc in enumerate(procs): + print("launch proc_id:{} idx:{}".format(proc.proc.pid, idx)) + while True: alive = watch_local_trainers(procs, cluster.trainers_nranks()) @@ -276,6 +294,16 @@ def launch_ps(args, distribute_mode): def which_distributed_mode(args): + if args.run_mode is not None: + assert args.run_mode in ["collective", "ps", "ps-heter"] + + if args.run_mode == "collective": + return DistributeMode.COLLECTIVE + elif args.run_mode == "ps": + return DistributeMode.PS + elif args.run_mode == "ps-heter": + return DistributeMode.PS_HETER + ps_args = [ '--worker_num', '--server_num', '--heter_worker_num', '--servers', '--workers', '--heter_workers', '--http_port' ] @@ -298,24 +326,26 @@ def which_distributed_mode(args): ) if fluid.core.is_compiled_with_cuda(): - device_count = fluid.core.get_cuda_device_count() + accelerators = fluid.core.get_cuda_device_count() + elif fluid.core.is_compiled_with_ascend(): + accelerators = fluid.core.NPUDevice.get_device_count() elif fluid.core.is_compiled_with_xpu(): - device_count = fluid.core.get_xpu_device_count() + accelerators = fluid.core.get_xpu_device_count() else: - device_count = 0 + accelerators = 0 if len(has_ps_args) > 0: logger.info( - "Run parameter-sever mode. pserver arguments:{}, cuda or xpu count:{}". - format(has_ps_args, device_count)) + "Run parameter-sever mode. pserver arguments:{}, accelerators count:{}". + format(has_ps_args, accelerators)) has_ps_heter_args = list(set(has_ps_args) & set(ps_heter_args)) if len(has_ps_heter_args) > 0: return DistributeMode.PS_HETER else: return DistributeMode.PS elif len(has_collective_args) > 0: - logger.info("Run collective gpu mode. gpu arguments:{}, cuda count:{}". - format(has_collective_args, device_count)) + logger.info("Run collective mode. gpu arguments:{}, cuda count:{}".
+ format(has_collective_args, accelerators)) return DistributeMode.COLLECTIVE else: if not fluid.core.is_compiled_with_cuda( diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index c5cb1ec94ac..2d2807bce28 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -52,6 +52,8 @@ class DeviceMode(): GPU = 1 KUNLUN = 2 XPU = 2 + ASCEND_NPU = 3 + UNKNOWN = 3 class Cluster(object): @@ -98,6 +100,14 @@ class Cluster(object): r.append(t.endpoint) return r + def world_device_ids(self): + r = [] + for pod in self.pods: + for t in pod.trainers: + str_accelerators = [str(acc) for acc in t.accelerators] + r.append(str_accelerators) + return r + def pods_endpoints(self): r = [] for pod in self.pods: @@ -105,7 +115,6 @@ class Cluster(object): assert pod.port != None and pod.addr != None, "{} not a valid endpoint".format( ep) r.append(ep) - return r def get_pod_by_id(self, pod_id): @@ -132,23 +141,23 @@ class JobServer(object): class Trainer(object): def __init__(self): - self.gpus = [] + self.accelerators = [] self.endpoint = None self.rank = None def __str__(self): - return "gpu:{} endpoint:{} rank:{}".format(self.gpus, self.endpoint, - self.rank) + return "accelerator:{} endpoint:{} rank:{}".format( + self.accelerators, self.endpoint, self.rank) def __eq__(self, t): - if len(self.gpus) != len(t.gpus): + if len(self.accelerators) != len(t.accelerators): return False if self.endpoint != t.endpoint or \ self.rank != t.rank: return False - for a, b in zip(self.gpus, t.gpus): + for a, b in zip(self.accelerators, t.accelerators): if a != b: return False @@ -171,12 +180,13 @@ class Pod(object): self.servers = [] self.workers = [] self.heter_workers = [] - self.gpus = [] + self.accelerators = [] + self.device_mode = None def __str__(self): - return "rank:{} id:{} addr:{} port:{} visible_gpu:{} trainers:{} servers:{} \ + return "rank:{} id:{} addr:{} port:{} visible_accelerator:{} trainers:{} servers:{} \ workers:{} heter_workers:{}".format( - self.rank, self.id, self.addr, self.port, self.gpus, [ + self.rank, self.id, self.addr, self.port, self.accelerators, [ str(t) for t in self.trainers ], [str(s) for s in self.servers], [str(w) for w in self.workers], [str(h) for h in self.heter_workers]) @@ -231,12 +241,12 @@ class Pod(object): def rank(self): return self.rank - def get_visible_gpus(self): + def get_visible_accelerators(self): r = "" - for g in self.gpus: + for g in self.accelerators: r += "{},".format(g) - assert r != "", "this pod {} can't see any gpus".format(self) + assert r != "", "this pod {} can't see any accelerators".format(self) r = r[:-1] return r @@ -264,23 +274,27 @@ def get_cluster(node_ips, node_ip, trainer_endpoints, device_mode, pod = Pod() pod.rank = node_rank pod.addr = ip + pod.device_mode = device_mode + cur_node_endpoints = trainer_endpoints[node_rank] # when use paddlecloud, endpoints may > devices_per_proc(user_defined) assert len(cur_node_endpoints) >= len( devices_per_proc - ), "current trainer_endpoints size should be greater equal than selected_gpus size." + ), "current trainer_endpoints size should be greater than or equal to the accelerators size."
for i in range(len(devices_per_proc)): trainer = Trainer() - if device_mode == DeviceMode.GPU: + if device_mode == DeviceMode.GPU or device_mode == DeviceMode.ASCEND_NPU: if isinstance(devices_per_proc[i], (list, tuple)): - trainer.gpus.extend(devices_per_proc[i]) + trainer.accelerators.extend(devices_per_proc[i]) + pod.accelerators.extend(devices_per_proc[i]) else: - trainer.gpus.append(devices_per_proc[i]) + trainer.accelerators.append(devices_per_proc[i]) + pod.accelerators.append(devices_per_proc[i]) elif device_mode == DeviceMode.XPU: if isinstance(devices_per_proc[i], (list, tuple)): - trainer.gpus.extend(devices_per_proc[i]) + trainer.accelerators.extend(devices_per_proc[i]) else: - trainer.gpus.append(devices_per_proc[i]) + trainer.accelerators.append(devices_per_proc[i]) trainer.endpoint = "%s" % (cur_node_endpoints[i]) trainer.rank = trainer_rank trainer_rank += 1 @@ -451,21 +465,32 @@ def start_local_trainers(cluster, current_env.pop("http_proxy", None) current_env.pop("https_proxy", None) + ids = cluster.world_device_ids() + res = [':'.join(ele) for ele in ids] procs = [] for idx, t in enumerate(pod.trainers): proc_env = { "PADDLE_TRAINER_ID": "%d" % t.rank, "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), - "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) + "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), + "PADDLE_RANK_IN_NODE": str(idx), + "PADDLE_LOCAL_DEVICE_IDS": + ",".join([str(acc) for acc in t.accelerators]), + "PADDLE_WORLD_DEVICE_IDS": ",".join(res), } - if fluid.core.is_compiled_with_cuda() and len(t.gpus) > 0: + if len(t.accelerators) > 0 and pod.device_mode == DeviceMode.GPU: proc_env["FLAGS_selected_gpus"] = "%s" % ",".join( - [str(g) for g in t.gpus]) - elif fluid.core.is_compiled_with_xpu() and len(t.gpus) > 0: + [str(g) for g in t.accelerators]) + + if len(t.accelerators) > 0: + proc_env["FLAGS_selected_accelerators"] = "%s" % ",".join( + [str(g) for g in t.accelerators]) + # TODO: unify the code style here in the future + if fluid.core.is_compiled_with_xpu() and len(t.accelerators) > 0: proc_env["FLAGS_selected_xpus"] = "%s" % ",".join( - [str(g) for g in t.gpus]) + [str(g) for g in t.accelerators]) current_env.update(proc_env) @@ -564,6 +589,17 @@ def watch_local_trainers(procs, nranks): return alive +def get_ascend_npus(npus): + if npus is None: + count = fluid.core.NPUDevice.get_device_count() + if count <= 0: + return [] + ret = [x for x in range(count)] + else: + ret = [x.strip() for x in npus.split(',')] + return ret + + def get_gpus(gpus): if gpus is None: gpus_num = fluid.core.get_cuda_device_count() @@ -623,11 +659,17 @@ def get_xpus(xpus): def get_device_mode(): - if fluid.core.is_compiled_with_cuda() and fluid.core.get_cuda_device_count( - ) > 0: - print("launch train in GPU mode") + if fluid.core.is_compiled_with_ascend() and \ + fluid.core.NPUDevice.get_device_count() > 0: + print("launch train in ascend npu mode!") + return DeviceMode.ASCEND_NPU + + if fluid.core.is_compiled_with_cuda() and \ + fluid.core.get_cuda_device_count() > 0: + print("launch train in GPU mode!") return DeviceMode.GPU - elif fluid.core.is_compiled_with_xpu() and fluid.core.get_xpu_device_count( + + if fluid.core.is_compiled_with_xpu() and fluid.core.get_xpu_device_count( ) > 0: print("launch train in XPU mode") return DeviceMode.XPU @@ -654,6 +696,10 @@ def get_device_proc_info(args): ] else: devices_per_proc = gpus + elif device_mode == DeviceMode.ASCEND_NPU: + npus = 
get_ascend_npus(args.ascend_npus) + assert args.nproc_per_node is None, "ascend_npus does not need the nproc_per_node argument" + devices_per_proc = npus elif device_mode == DeviceMode.XPU: xpus = get_xpus(args.xpus) if args.nproc_per_node is not None: diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/__init__.py new file mode 100644 index 00000000000..b9a7651e449 --- /dev/null +++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py index d7ac81bb5c5..978899604ea 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py @@ -12,16 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import paddle.fluid.framework as framework from paddle.fluid.optimizer import Optimizer import paddle.fluid.core as core import numpy as np -import ascend_parser +from . 
import ascend_parser +from paddle.distributed import fleet +import hccl.manage.api as hccl +from collections import namedtuple + +HcomGroupConfig = namedtuple('HcomGroupConfig', ['name', 'nranks', 'rank_ids']) class AscendIRParser(object): - def __init__(self): + def __init__(self, auto_dp=False, world_rank_size=1): self.graph_idx = 0 + self.hcom_endpoints = {} + self.groups_to_create = [] + self._auto_dp = auto_dp + self._world_rank_size = world_rank_size def _construct_input_map(self, input_varlist): ret_map = {} @@ -43,15 +53,52 @@ class AscendIRParser(object): ret_map[var.name] = ge_input return ge_in_operator, ret_map + def _endpoint_to_world_rank_id(self, endpoint): + world_endpoints = fleet.worker_endpoints() + assert endpoint in world_endpoints, "endpoint (%s) not in worker_endpoints (%s) " % ( + endpoint, world_endpoints) + return world_endpoints.index(endpoint) + def parse_op(self, op): - if op.type in ascend_parser.registerd_op: - print("Op[%s] has been registered, begin to parse it" % (op.type)) + if op.type == 'c_gen_nccl_id': + endpoint = op.attr("endpoint") + other_endpoints = op.attr("other_endpoints") + rank = op.attr("rank") + + nccl_id = op.output_arg_names[0] + + # c_gen_nccl_id operator splits endpoints into local endpoint and other_endpoints + # we should combine these together to produce world_rank_ids + self.hcom_endpoints[nccl_id] = other_endpoints[:] + self.hcom_endpoints[nccl_id].insert(rank, endpoint) + + print("nccl_id (%s) registered endpoints %s" % + (nccl_id, self.hcom_endpoints[nccl_id])) + elif op.type == 'c_comm_init': + nccl_id = op.input_arg_names[0] + nranks = op.attr("nranks") + assert nranks == len(self.hcom_endpoints[ + nccl_id]), "nranks doesn't match endpoint count" + rank = op.attr("rank") + ring_id = op.attr("ring_id") + + group_name = "hcom_group_" + str(ring_id) + global_rank_ids = [ + self._endpoint_to_world_rank_id(endpoint) + for endpoint in self.hcom_endpoints[nccl_id] + ] + self.groups_to_create.append( + HcomGroupConfig( + name=group_name, nranks=nranks, rank_ids=global_rank_ids)) + print("append to create group: %s, with rank_ids: %s" % + (group_name, global_rank_ids)) + elif op.type in ascend_parser.registerd_op: op_parser = self.parser_factory.create_parse( ascend_parser.registerd_op[op.type]) op_parser.apply(op) else: - print("Op[%s] has not been registered, so we have to skip it" % - (op.type)) + assert False, "Op[%s] has not been registered, so it cannot be parsed" % ( + op.type) def _parse_program(self, graph_name, @@ -84,7 +131,7 @@ class AscendIRParser(object): name = e.name ge_out_operator.append(self.var2geop[name]) - # (Debug) If you want to print back prop vars, append/assign the varname in ge_out_operator here, such as: + # (Debug) If you want to print back prop vars, append/assign the varname in ge_out_operator here, such as: # if graph_name == "main": # ge_out_operator.append(self.var2geop["reduce_sum_0.tmp_0@GRAD"]) @@ -115,6 +162,17 @@ class AscendIRParser(object): startup_graph = self._parse_program("startup", startup_program) main_graph = self._parse_program("main", main_program, input_varlist, fetch_list) + if self._auto_dp and self._world_rank_size > 1: + assert len(self.groups_to_create + ) == 0, "auto_dp mode can't be used when the program already creates communication groups" + + from paddle.distributed import fleet + self.groups_to_create.append( + HcomGroupConfig( + name="hcom_group_0", + nranks=fleet.world_size(), + rank_ids=[x for x in range(fleet.world_size())])) + return startup_graph, main_graph @@ -124,9 +182,14 @@ class 
AscendOptimizer(Optimizer): def __init__(self, optimizer, fetch_list=[]): self.inner_opt = optimizer self.fetch_list = fetch_list + self.ascend_instance = None def __del__(self): + print("begin AscendOptimizer del") + if self.ascend_instance is not None: + self.ascend_instance.destroy_global_resources() core.ge_finalize() + print("end AscendOptimizer del") def _can_apply(self): if not self.user_defined_strategy.ascend: @@ -138,7 +201,7 @@ class AscendOptimizer(Optimizer): dist_strategy.ascend = False dist_strategy.ascend_configs = {} - def _get_input_varlist(program): + def _get_input_varlist(self, program): ret_list = [] for var in program.list_vars(): if var.is_data or var.persistable: @@ -149,30 +212,56 @@ class AscendOptimizer(Optimizer): loss, startup_program=None, parameter_list=None, - no_grad_set=None): - minimized = self.inner_opt.minimize( - loss, startup_program=startup_program) + no_grad_set=None, + auto_dp=False, + rank_table_file=None): + minimized = None + if self.inner_opt: + minimized = self.inner_opt.minimize( + loss, startup_program=startup_program) self.ascend_instance = core.AscendInstance() + from paddle.distributed import fleet + if auto_dp and fleet.world_size() > 1: + from paddle.fluid.transpiler import ascend_transpiler + t = ascend_transpiler.AscendTranspiler(startup_program, + loss.block.program) + t.transpile() + #print(loss.block.program) + # Config about Graph Engine can be found in https://support.huaweicloud.com/ config = { - "ge.exec.deviceId": "0", + "ge.exec.deviceId": str(fleet.local_device_ids()), "ge.graphRunMode": "1", - "ge.exec.precision_mode": "must_keep_origin_dtype" + "ge.exec.precision_mode": "must_keep_origin_dtype", } + # if multi trainers + if rank_table_file and fleet.world_size() > 1: + config["ge.exec.rankTableFile"] = rank_table_file + config["ge.exec.rankId"] = str(fleet.worker_index()) + config["ge.exec.isUseHcom"] = "1" + config["ge.exec.deployMode"] = "0" + print("ge_initialize config:", config) core.ge_initialize(config) # Init Session self.ascend_instance.init_global_resources() main_block = loss.block - self.parser = AscendIRParser() + self.parser = AscendIRParser( + auto_dp=auto_dp, world_rank_size=fleet.world_size()) + + input_varlist = self._get_input_varlist(main_block.program) - input_varlist = _get_input_varlist(main_block.program) startup_graph, main_graph = self.parser.parse_program( startup_program, main_block.program, input_varlist, self.fetch_list) + for cfg in self.parser.groups_to_create: + print("create group (%s), nranks: %d, rank_ids: %s" % + (cfg.name, cfg.nranks, cfg.rank_ids)) + hccl.create_group(cfg.name, cfg.nranks, cfg.rank_ids) + self.ascend_instance.add_ascend_subgraph(0, startup_graph) self.ascend_instance.add_ascend_subgraph(1, main_graph) diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py index 2c5930c5b9f..f2ecaf48438 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py @@ -1,41 +1,106 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import paddle.fluid.framework as framework from paddle.fluid.optimizer import Optimizer import paddle.fluid.core as core import numpy as np - -registerd_op = { - "elementwise_add": "AddParser", - "matmul": "MatMulParser", - "mul": "MulParser", - "relu": "ReluParser", - "softmax_with_cross_entropy": "SoftmaxWithCrossEntropyParser", - "shape": "ShapeParser", - "fill_constant": "FillConstantParser", - "reduce_sum": "ReduceSumParser", - "reduce_sum_grad": "ReduceSumGradParser", - "matmul_grad": "MatMulGradParser", - "mul_grad": "MulGradParser", - "reshape2": "ReshapeParser", - "scale": "ScaleParser", - "relu_grad": "ReluGradParser", - "softmax_with_cross_entropy_grad": "SoftmaxWithCrossEntropyGradParser", - "truncated_gaussian_random": "TruncatedNormalParser", - "sgd": "SGDParser" -} +from paddle.distributed import fleet +from functools import reduce + +registerd_op = {## forwards + "elementwise_add": "AddParser", + "matmul": "MatMulParser", + "mul": "MulParser", + "relu": "ReluParser", + "softmax_with_cross_entropy": "SoftmaxWithCrossEntropyParser", + "shape": "ShapeParser", + "fill_constant": "FillConstantParser", + "reduce_sum": "ReduceSumParser", + "elementwise_mul": "DotMulParser", + "elementwise_div": "DotDivParser", + "elementwise_pow": "DotPowParser", + "elementwise_max": "MaxParser", + "elementwise_min": "MinParser", + "elementwise_sub": "DotSubParser", + "pow": "PowParser", + "gelu": "GeluParser", + "sqrt": "SqrtParser", + "log": "LogParser", + "sum": "SumParser", + "logical_not": "LogicalNotParser", + "gather": "GatherParser", + "scatter": "ScatterParser", + "cast": "CastParser", + "tanh": "TanhParser", + "stack": "StackParser", + "square": "SquareParser", + "unsqueeze2": "UnSqueezeParser", + "assign": "AssignParser", + "softmax": "SoftMaxParser", + "reshape2": "ReshapeParser", + "transpose2": "TransposeParser", + "layer_norm": "LayerNormParser", + "less_than": "LessParser", + "mean": "MeanParser", + "scale": "ScaleParser", + "slice": "SliceParser", + "top_k": "TopkParser", + "accuracy": "AccuracyParser", + #"increment": "IncrementParser", + "lookup_table": "LookupTableParser", + "truncated_gaussian_random": "TruncatedNormalParser", + "c_allgather": "AllGatherParser", + "c_allreduce_sum": "AllReduceSumParser", + "c_allreduce_max": "AllReduceMaxParser", + "c_broadcast": "BroadcastParser", + "c_reduce_scatter": "ReduceScatterParser", + "c_send": "SendParser", + "c_receive": "ReceiveParser", + "uniform_random": "UniformRandomParser", + "range": "RangeParser", + "equal": "EqualParser", + "expand": "ExpandParser", + "squeeze2": "SqueezeParser", + + + ## backwords + "matmul_grad": "MatMulGradParser", + "mul_grad": "MulGradParser", + "relu_grad": "ReluGradParser", + "reduce_sum_grad": "ReduceSumGradParser", + "softmax_with_cross_entropy_grad": "SoftmaxWithCrossEntropyGradParser", + "tanh_grad":"TanhGradParser", + "log_grad":"LogGradParser", + "pow_grad": "PowGradParser", + "sqrt_grad": "SqrtGradParser", + "gelu_grad": "GeluGradParser", + "mean_grad": "MeanGradParser", + 'lookup_table_grad': "LookUpTableGradParser", + "elementwise_mul_grad": "DotMulGradParser", + 
"elementwise_add_grad": "DotAddGradParser", + "elementwise_div_grad": "DotDivGradParser", + "softmax_grad": "SoftmaxGradParser", + "slice_grad": "SliceGradParser", + "reshape2_grad": "ReshapeGradParser", + "gather_grad": "GatherGradParser", + "transpose2_grad": "TransposeGradParser", + "layer_norm_grad": "LayerNormGradParser", + + ## opt + "sgd": "SGDParser", + #"adam": "AdamParser", + } global_cnt = -1 global_input_cnt = -1 @@ -60,6 +125,7 @@ class AscendHelper(object): 5: "float32", 6: "float64" } + self.dtype2paddle_inv_map = {"VarType.FP32": 0, "VarType.FP16": 1} def dtype2ge(self, dtype): assert dtype in self.dtype2ge_map, "dtype[%d] is not supported %d" % ( @@ -105,7 +171,6 @@ class AscendParserBase(object): self.parser_name, len(index_list), output_num) for output_id in range(output_num): arguments = self.op.output(self.op.output_names[output_id]) - print("%d argument: %s" % (output_id, str(arguments))) if len(arguments) > 0: assert len(arguments) == len( index_list[output_id] @@ -113,8 +178,6 @@ class AscendParserBase(object): self.parser_name, output_id, len(index_list[output_id]), len(arguments)) for i in range(len(arguments)): - print("assgin index_list[%d][%d] to %s" % - (output_id, i, arguments[i])) self.var2geop[arguments[i]] = geop_list[index_list[ output_id][i]] @@ -125,7 +188,7 @@ class AscendParserBase(object): self.op = op assert self.op.type == self.parser_name, "op [%s] != parser_name[%s]" % ( self.op.type, self.parser_name) - print("begin to parse op %s" % (self.parser_name)) + #print("begin to parse op %s" % (self.parser_name)) geop_list, index_list = self._apply() self.update_output(geop_list, index_list) @@ -152,6 +215,63 @@ class AscendParserBase(object): tensor.set_data(data_8) return tensor + def _get_ge_tensor(self, shape, dtype, value_list): + tensor_desc = core.GETensorDesc( + core.GEShape(shape), core.GEFormat.FORMAT_ND, + self.ascend_helper.dtype2ge(dtype)) + tensor = core.GETensor(tensor_desc) + + data = np.array(value_list).reshape(shape).astype( + self.ascend_helper.dtype2np(dtype)) + buf = data.tobytes() + data_8 = np.frombuffer(buf, dtype=np.uint8) + tensor.set_data(data_8) + + tensor_const = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor) + + return tensor_const + + def _get_variable(self, shape, dtype, tensor): + if dtype == "int32": + type = core.GEDataType.DT_INT32 + elif dtype == "float32": + type = core.GEDataType.DT_FLOAT + + var = core.GEOperatorFactory.create_operator( + "variable" + self._accumulated_op_id(), "Variable") + var.update_output_desc("y", + core.GETensorDesc( + core.GEShape(shape), core.GEFormat.FORMAT_ND, + type)) + assign = core.GEOperatorFactory.create_operator( + "assign" + self._accumulated_op_id(), "Assign").set_input( + "value", tensor).set_input("ref", var) + + return assign + + def _create_shape_tensor(self): + tensor_desc = core.GETensorDesc( + core.GEShape([2]), core.GEFormat.FORMAT_ND, + core.GEDataType.DT_INT32) + tensor = core.GETensor(tensor_desc) + + data = np.ones((2)).astype("int32").reshape([2]) + data[0] = 64 + buf = data.tobytes() + data_8 = np.frombuffer(buf, dtype=np.uint8) + tensor.set_data(data_8) + return tensor + + def _get_GEtensor_shape(self, tensor): + tensor_shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), "Shape").set_input("x", tensor) + tensor_shape = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", 
tensor_shape).set_attr_int32("dst_type", 0) + return tensor_shape + class AddParser(AscendParserBase): def __init__(self, graph, var2geop): @@ -162,109 +282,276 @@ class AddParser(AscendParserBase): x = self._get_ge_input(self.op.input_arg_names[0]) y = self._get_ge_input(self.op.input_arg_names[1]) add = core.GEOperatorFactory.create_operator( - "add" + self._accumulated_op_id(), "Add").set_input( - "x1", x).set_input("x2", y) + "add" + self._accumulated_op_id(), + "Add").set_input("x1", x).set_input("x2", y) return [add], [[0]] -class ReduceSumParser(AscendParserBase): +class DotSubParser(AscendParserBase): def __init__(self, graph, var2geop): - super(ReduceSumParser, self).__init__(graph, var2geop) - self.parser_name = "reduce_sum" + super(DotSubParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_sub" def _apply(self): x = self._get_ge_input(self.op.input_arg_names[0]) - axes = self.op.attr("dim") - keep_dims = self.op.attr("keep_dim") - reduce_sum = core.GEOperatorFactory.create_operator( - "reduce_sum" + self._accumulated_op_id(), "ReduceSumD").set_input( - "x", x, 0).set_attr_vec_int32("axes", axes).set_attr_bool( - "keep_dims", keep_dims) - return [reduce_sum], [[0]] + y = self._get_ge_input(self.op.input_arg_names[1]) + sub = core.GEOperatorFactory.create_operator( + "sub" + self._accumulated_op_id(), + "Sub").set_input("x1", x).set_input("x2", y) + return [sub], [[0]] -class ReduceSumGradParser(AscendParserBase): +class DotMulParser(AscendParserBase): def __init__(self, graph, var2geop): - super(ReduceSumGradParser, self).__init__(graph, var2geop) - self.parser_name = "reduce_sum_grad" + super(DotMulParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_mul" def _apply(self): x = self._get_ge_input(self.op.input_arg_names[0]) - input = self._get_ge_input(self.op.input_arg_names[1]) + y = self._get_ge_input(self.op.input_arg_names[1]) + mul = core.GEOperatorFactory.create_operator( + "dotmul" + self._accumulated_op_id(), + "Mul").set_input("x1", x).set_input("x2", y) + return [mul], [[0]] - shape_tensor = core.GEOperatorFactory.create_operator( - "shape" + self._accumulated_op_id(), "Shape").set_input("x", input, - 0) - axis_const = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", self._create_ge_tensor([1], 2, -1)) - self._mark_as_input(axis_const) - broadcast = core.GEOperatorFactory.create_operator( - "broadcast_to_d" + self._accumulated_op_id(), - "BroadcastTo").set_input("x", x).set_input("shape", shape_tensor) - # unsqueeze cannot get right result, but ExpandDims seems have the same functionality. 
- reduce_sum_grad = core.GEOperatorFactory.create_operator( - "expand" + self._accumulated_op_id(), "ExpandDims").set_input( - "x", broadcast).set_input("axis", axis_const) - return [shape_tensor, axis_const, broadcast, reduce_sum_grad], [[3]] +class DotDivParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(DotDivParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_div" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + y = self._get_ge_input(self.op.input_arg_names[1]) + div = core.GEOperatorFactory.create_operator( + "dotdiv" + self._accumulated_op_id(), + "Div").set_input("x1", x).set_input("x2", y) + return [div], [[0]] -class MatMulParser(AscendParserBase): +class DotPowParser(AscendParserBase): def __init__(self, graph, var2geop): - super(MatMulParser, self).__init__(graph, var2geop) - self.parser_name = "matmul" + super(DotPowParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_pow" def _apply(self): - x1 = self._get_ge_input(self.op.input_arg_names[0]) - x2 = self._get_ge_input(self.op.input_arg_names[1]) - matmul = core.GEOperatorFactory.create_operator( - "matmul" + self._accumulated_op_id(), "MatMul").set_input( - "x1", x1).set_input("x2", x2) - return [matmul], [[0]] + x = self._get_ge_input(self.op.input_arg_names[0]) + y = self._get_ge_input(self.op.input_arg_names[1]) + pow = core.GEOperatorFactory.create_operator( + "dotpow" + self._accumulated_op_id(), + "Pow").set_input("x1", x).set_input("x2", y) + return [pow], [[0]] -class MatMulGradParser(AscendParserBase): +class LessParser(AscendParserBase): def __init__(self, graph, var2geop): - super(MatMulGradParser, self).__init__(graph, var2geop) - self.parser_name = "matmul_grad" + super(LessParser, self).__init__(graph, var2geop) + self.parser_name = "less_than" def _apply(self): - out_grad = self._get_ge_input(self.op.input_arg_names[0]) - x = self._get_ge_input(self.op.input_arg_names[1]) - y = self._get_ge_input(self.op.input_arg_names[2]) + x = self._get_ge_input(self.op.input_arg_names[0]) + y = self._get_ge_input(self.op.input_arg_names[1]) + less_than = core.GEOperatorFactory.create_operator( + "less_than" + self._accumulated_op_id(), + "Less").set_input("x1", x).set_input("x2", y) + return [less_than], [[0]] - x_grad = core.GEOperatorFactory.create_operator( - self.parser_name + self._accumulated_op_id(), "MatMul").set_input( - "x1", out_grad).set_input("x2", y).set_attr_bool( - "transpose_x1", False).set_attr_bool("transpose_x2", True) - y_grad = core.GEOperatorFactory.create_operator( - self.parser_name + self._accumulated_op_id(), "MatMul").set_input( - "x1", x).set_input("x2", out_grad).set_attr_bool( - "transpose_x1", True).set_attr_bool("transpose_x2", False) - return [x_grad, y_grad], [[0], [1]] +class MaxParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(MaxParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_max" -class MulGradParser(AscendParserBase): + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + y = self._get_ge_input(self.op.input_arg_names[1]) + max_out = core.GEOperatorFactory.create_operator( + "max" + self._accumulated_op_id(), + "Maximum").set_input("x1", x).set_input("x2", y) + return [max_out], [[0]] + + +class MinParser(AscendParserBase): def __init__(self, graph, var2geop): - super(MulGradParser, self).__init__(graph, var2geop) - self.parser_name = "mul_grad" + super(MinParser, self).__init__(graph, var2geop) + self.parser_name = 
"elementwise_min" def _apply(self): - out_grad = self._get_ge_input(self.op.input_arg_names[0]) - x = self._get_ge_input(self.op.input_arg_names[1]) - y = self._get_ge_input(self.op.input_arg_names[2]) + x = self._get_ge_input(self.op.input_arg_names[0]) + y = self._get_ge_input(self.op.input_arg_names[1]) + min_out = core.GEOperatorFactory.create_operator( + "min" + self._accumulated_op_id(), + "Minimum").set_input("x1", x).set_input("x2", y) + return [min_out], [[0]] - x_grad = core.GEOperatorFactory.create_operator( - self.parser_name + self._accumulated_op_id(), "MatMul").set_input( - "x1", out_grad).set_input("x2", y).set_attr_bool( - "transpose_x1", False).set_attr_bool("transpose_x2", True) - y_grad = core.GEOperatorFactory.create_operator( - self.parser_name + self._accumulated_op_id(), "MatMul").set_input( - "x1", x).set_input("x2", out_grad).set_attr_bool( - "transpose_x1", True).set_attr_bool("transpose_x2", False) - return [x_grad, y_grad], [[0], [1]] +## cal +class LogParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(LogParser, self).__init__(graph, var2geop) + self.parser_name = "log" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + log = core.GEOperatorFactory.create_operator( + "log" + self._accumulated_op_id(), "Log").set_input("x", x) + return [log], [[0]] + + +class SqrtParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SqrtParser, self).__init__(graph, var2geop) + self.parser_name = "sqrt" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + sqrt = core.GEOperatorFactory.create_operator( + "sqrt" + self._accumulated_op_id(), "Sqrt").set_input("x", x) + return [sqrt], [[0]] + + +class PowParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(PowParser, self).__init__(graph, var2geop) + self.parser_name = "pow" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + factor = self.op.attr("factor") + pow_value = core.GEOperatorFactory.create_operator( + "pow" + self._accumulated_op_id(), + "Power").set_input("x", x).set_attr_float( + "power", factor).set_attr_float("scale", 1.0).set_attr_float( + "shift", 0.0) + return [pow_value], [[0]] + + +class SquareParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SquareParser, self).__init__(graph, var2geop) + self.parser_name = "square" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + square = core.GEOperatorFactory.create_operator( + "square" + self._accumulated_op_id(), "Square").set_input("x", x) + return [square], [[0]] + + +class SumParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SumParser, self).__init__(graph, var2geop) + self.parser_name = "sum" + + def _apply(self): + len_list = len(self.op.input_arg_names) + if len_list < 2: + assert False, "the size of input list must large or equal 2" + x = self._get_ge_input(self.op.input_arg_names[0]) + y = self._get_ge_input(self.op.input_arg_names[1]) + sum = core.GEOperatorFactory.create_operator( + "sum" + self._accumulated_op_id(), + "Add").set_input("x1", x).set_input("x2", y) + for i in range(2, len_list): + y = self._get_ge_input(self.op.input_arg_names[i]) + sum = core.GEOperatorFactory.create_operator( + "sum" + self._accumulated_op_id(), + "Add").set_input("x1", sum).set_input("x2", y) + return [sum], [[0]] + + +class LogicalNotParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(LogicalNotParser, self).__init__(graph, var2geop) + 
self.parser_name = "logical_not" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + logical_not = core.GEOperatorFactory.create_operator( + "logical_not" + self._accumulated_op_id(), + "LogicalNot").set_input("x", x) + return [logical_not], [[0]] + + +class MeanParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(MeanParser, self).__init__(graph, var2geop) + self.parser_name = "mean" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + mean = core.GEOperatorFactory.create_operator( + "mean" + self._accumulated_op_id(), + "ReduceMeanD").set_input("x", x).set_attr_bool( + "keep_dims", False).set_attr_vec_int32("axes", []) + return [mean], [[0]] + + +class ReduceSumParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ReduceSumParser, self).__init__(graph, var2geop) + self.parser_name = "reduce_sum" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + axes = self.op.attr("dim") + keep_dims = self.op.attr("keep_dim") + reduce_all = self.op.attr("reduce_all") + x_shape = self.op.block.var(self.op.input_arg_names[0]).shape + if reduce_all: + axes = list(range(len(x_shape))) + reduce_sum = core.GEOperatorFactory.create_operator( + "reduce_sum" + self._accumulated_op_id(), + "ReduceSumD").set_input("x", x, 0).set_attr_vec_int32( + "axes", axes).set_attr_bool("keep_dims", keep_dims) + return [reduce_sum], [[0]] + + +#class IncrementParser(AscendParserBase): +# def __init__(self, graph, var2geop): +# super(IncrementParser, self).__init__(graph, var2geop) +# self.parser_name = "increment" +# +# def _apply(self): +# x = self._get_ge_input(self.op.input_arg_names[0]) +# step = self.op.attr("step") #self._get_ge_input(self.op.input_arg_names[1]) +# print("step: ", step) +# +# increment = core.GEOperatorFactory.create_operator("adds" + self._accumulated_op_id(), "Adds").set_input("x", x).set_attr_float("value", step) #set_input("x2", bias) +# +# return [increment] + + +## matrix cal +class MatMulParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(MatMulParser, self).__init__(graph, var2geop) + self.parser_name = "matmul" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + y = self._get_ge_input(self.op.input_arg_names[1]) + transpose_x = self.op.attr("transpose_X") + transpose_y = self.op.attr("transpose_Y") + + x1_shape = self.op.block.var(self.op.input_arg_names[0]).shape + x2_shape = self.op.block.var(self.op.input_arg_names[1]).shape + + if len(x1_shape) > 2: + matmul = core.GEOperatorFactory.create_operator( + "matmul" + self._accumulated_op_id(), "BatchMatMul").set_input( + "x1", x).set_input("x2", y).set_attr_bool( + "adj_x1", + transpose_x).set_attr_bool("adj_x2", transpose_y) + elif len(x1_shape) == 2: + matmul = core.GEOperatorFactory.create_operator( + "matmul" + self._accumulated_op_id(), + "MatMul").set_input("x1", x).set_input("x2", y).set_attr_bool( + "transpose_x1", transpose_x).set_attr_bool("transpose_x2", + transpose_y) + else: + assert False, "not support" + return [matmul], [[0]] class MulParser(AscendParserBase): @@ -275,13 +562,105 @@ class MulParser(AscendParserBase): def _apply(self): x = self._get_ge_input(self.op.input_arg_names[0]) y = self._get_ge_input(self.op.input_arg_names[1]) + x_num_col_dims = self.op.attr("x_num_col_dims") + y_num_col_dims = self.op.attr("y_num_col_dims") + shape_x1 = self.op.block.var(self.op.input_arg_names[0]).shape + shape_x2 = self.op.block.var(self.op.input_arg_names[1]).shape + + if 
x_num_col_dims == 1 and y_num_col_dims == 1: + if len(shape_x1) == 2 and len(shape_x2) == 2: + matmul = core.GEOperatorFactory.create_operator( + "mul" + self._accumulated_op_id(), + "MatMul").set_input("x1", x).set_input("x2", y) + elif len(shape_x1) == 3 and len(shape_x2) == 2: + flatten_x1 = core.GEOperatorFactory.create_operator( + "flatten" + self._accumulated_op_id(), + "Flatten").set_input("x", x) + matmul = core.GEOperatorFactory.create_operator( + "mul" + self._accumulated_op_id(), "MatMul").set_input( + "x1", flatten_x1, 0).set_input("x2", y, 0) + else: + assert False, "not support" + else: + if len(shape_x1) == 3 and len(shape_x2) == 2: + assert x_num_col_dims == 2, "only support 2" + flatten_x1 = core.GEOperatorFactory.create_operator( + "flatten" + self._accumulated_op_id(), + "FlattenV2").set_input("x", x).set_attr_int32( + "axis", 0).set_attr_int32("end_axis", 1) + matmul_m = core.GEOperatorFactory.create_operator( + "mul" + self._accumulated_op_id(), "MatMul").set_input( + "x1", flatten_x1, 0).set_input("x2", y, 0) + matmul_transpose = core.GEOperatorFactory.create_operator( + "transpose" + self._accumulated_op_id(), + "TransposeD").set_input( + "x", matmul_m).set_attr_vec_int32("perm", [1, 0]) + tensor = self._create_ge_tensor( + [3], 2, [shape_x2[1], shape_x1[0], shape_x1[1]]) + const_shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor) + reshape_matmul = core.GEOperatorFactory.create_operator( + "reshape" + self._accumulated_op_id(), "Reshape").set_input( + "x", matmul_transpose).set_input( + "shape", const_shape).set_attr_int32("axis", 0) + matmul = core.GEOperatorFactory.create_operator( + "transpose" + self._accumulated_op_id(), + "TransposeD").set_input( + "x", + reshape_matmul).set_attr_vec_int32("perm", [1, 2, 0]) + else: + assert False, "not support" - matmul = core.GEOperatorFactory.create_operator( - "mul" + self._accumulated_op_id(), "MatMul").set_input( - "x1", x).set_input("x2", y) return [matmul], [[0]] +class LayerNormParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(LayerNormParser, self).__init__(graph, var2geop) + self.parser_name = "layer_norm" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[2]) + scale = self._get_ge_input(self.op.input_arg_names[1]) + bias = self._get_ge_input(self.op.input_arg_names[0]) + epsilon = self.op.attr("epsilon") + begin_norm_axis = self.op.attr("begin_norm_axis") + x_dtype = self.op.block.var(self.op.input_arg_names[2]).dtype + + shape_tensor = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), "Shape").set_input("x", x) + scale_expand = core.GEOperatorFactory.create_operator( + "broadcast_to_d" + self._accumulated_op_id(), + "BroadcastTo").set_input("x", + scale).set_input("shape", shape_tensor) + bias_expand = core.GEOperatorFactory.create_operator( + "broadcast_to_d" + self._accumulated_op_id(), + "BroadcastTo").set_input("x", bias).set_input("shape", shape_tensor) + layer_norm = core.GEOperatorFactory.create_operator( + "layer_norm" + self._accumulated_op_id(), + "LayerNorm").set_input("x", x).set_input( + "gamma", + scale_expand).set_input("beta", bias_expand).set_attr_int32( + "begin_norm_axis", begin_norm_axis).set_attr_int32( + "begin_params_axis", + begin_norm_axis).set_attr_float("epsilon", epsilon) + + cast_dtype = 0 if self.ascend_helper.dtype2paddle_inv_map[str( + x_dtype)] == 0 else 1 + y = core.GEOperatorFactory.create_operator( + "cast" + 
self._accumulated_op_id(), "Cast").set_input( + "x", layer_norm, 0).set_attr_int32("dst_type", cast_dtype) + mean = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", layer_norm, 1).set_attr_int32("dst_type", cast_dtype) + variance = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", layer_norm, 2).set_attr_int32("dst_type", cast_dtype) + return [y, mean, variance], [[1], [2], [0]] + + +## activate function class ReluParser(AscendParserBase): def __init__(self, graph, var2geop): super(ReluParser, self).__init__(graph, var2geop) @@ -294,20 +673,31 @@ class ReluParser(AscendParserBase): return [relu], [[0]] -class ReluGradParser(AscendParserBase): +class GeluParser(AscendParserBase): def __init__(self, graph, var2geop): - super(ReluGradParser, self).__init__(graph, var2geop) - self.parser_name = "relu_grad" + super(GeluParser, self).__init__(graph, var2geop) + self.parser_name = "gelu" def _apply(self): - out = self._get_ge_input(self.op.input_arg_names[0]) - out_grad = self._get_ge_input(self.op.input_arg_names[1]) - relu_grad = core.GEOperatorFactory.create_operator( - self.parser_name + self._accumulated_op_id(), "ReluGrad").set_input( - "gradients", out_grad).set_input("features", out) - return [relu_grad], [[0]] + x = self._get_ge_input(self.op.input_arg_names[0]) + gelu = core.GEOperatorFactory.create_operator( + "gelu" + self._accumulated_op_id(), "Gelu").set_input("x", x) + return [gelu], [[0]] + + +class TanhParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(TanhParser, self).__init__(graph, var2geop) + self.parser_name = "tanh" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + tanh = core.GEOperatorFactory.create_operator( + "tanh" + self._accumulated_op_id(), "Tanh").set_input("x", x) + return [tanh], [[0]] +## loss function class SoftmaxWithCrossEntropyParser(AscendParserBase): def __init__(self, graph, var2geop): super(SoftmaxWithCrossEntropyParser, self).__init__(graph, var2geop) @@ -316,80 +706,61 @@ class SoftmaxWithCrossEntropyParser(AscendParserBase): def _apply(self): label = self._get_ge_input(self.op.input_arg_names[0]) logits = self._get_ge_input(self.op.input_arg_names[1]) - cls_num = self.op.block.var(self.op.input_arg_names[1]).shape[1] + softmax = core.GEOperatorFactory.create_operator( - "softmax" + self._accumulated_op_id(), "SoftmaxV2").set_input( - "x", logits) + "softmax" + self._accumulated_op_id(), + "SoftmaxV2").set_input("x", logits) label = core.GEOperatorFactory.create_operator( "cast" + self._accumulated_op_id(), "Cast").set_input( "x", label).set_attr_int32("dst_type", 3) tensoron = self._create_ge_tensor([1], 5, 1) - on_const = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensoron) - self._mark_as_input(on_const) + on = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensoron) tensoroff = self._create_ge_tensor([1], 5, 0) - off_const = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensoroff) - self._mark_as_input(off_const) + off = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensoroff) + self._mark_as_input(on) + self._mark_as_input(off) onehot = core.GEOperatorFactory.create_operator( "onehot" + self._accumulated_op_id(), 
"OneHotD").set_input( - "x", label).set_input("on_value", on_const).set_input( - "off_value", off_const).set_attr_int32("depth", cls_num) + "x", label).set_input("on_value", on).set_input( + "off_value", off).set_attr_int32("depth", cls_num) squeeze = core.GEOperatorFactory.create_operator( "mul" + self._accumulated_op_id(), "Squeeze").set_input("x", onehot) - loss = core.GEOperatorFactory.create_operator( + + loss_all = core.GEOperatorFactory.create_operator( "loss" + self._accumulated_op_id(), "SoftmaxCrossEntropyWithLogits").set_input( "features", logits).set_input("labels", squeeze) - - return [label, softmax, on_const, off_const, onehot, squeeze, - loss], [[6], [1]] + loss = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", loss_all, 0).set_attr_int32("dst_type", 0) + loss_expand = core.GEOperatorFactory.create_operator( + "unsqueeze" + self._accumulated_op_id(), + "Unsqueeze").set_input("x", loss).set_attr_vec_int32("axes", [1]) + return [label, softmax, loss_expand], [[2], [1]] -class SoftmaxWithCrossEntropyGradParser(AscendParserBase): +class SoftMaxParser(AscendParserBase): def __init__(self, graph, var2geop): - super(SoftmaxWithCrossEntropyGradParser, self).__init__(graph, var2geop) - self.parser_name = "softmax_with_cross_entropy_grad" + super(SoftMaxParser, self).__init__(graph, var2geop) + self.parser_name = "softmax" def _apply(self): - label = self._get_ge_input(self.op.input_arg_names[0]) - loss_grad = self._get_ge_input(self.op.input_arg_names[1]) - softmax = self._get_ge_input(self.op.input_arg_names[2]) - cls_num = self.op.block.var(self.op.input_arg_names[2]).shape[1] + logits = self._get_ge_input(self.op.input_arg_names[0]) + axes = self.op.attr("axis") - tensoron = self._create_ge_tensor([1], 5, 1) - on_const = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensoron) - self._mark_as_input(on_const) - tensoroff = self._create_ge_tensor([1], 5, 0) - off_const = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensoroff) - self._mark_as_input(off_const) - label = core.GEOperatorFactory.create_operator( - "cast" + self._accumulated_op_id(), "Cast").set_input( - "x", label).set_attr_int32("dst_type", 3) - onehot = core.GEOperatorFactory.create_operator( - "onehot" + self._accumulated_op_id(), "OneHotD").set_input( - "x", label).set_input("on_value", on_const).set_input( - "off_value", off_const).set_attr_int32("depth", cls_num) - # the fuck onehot will add a demension, so must call squeeze afterward - squeeze = core.GEOperatorFactory.create_operator( - "mul" + self._accumulated_op_id(), "Squeeze").set_input("x", onehot) - sub = core.GEOperatorFactory.create_operator( - "sub" + self._accumulated_op_id(), "Sub").set_input( - "x1", softmax).set_input("x2", squeeze) - grad = core.GEOperatorFactory.create_operator( - "mul" + self._accumulated_op_id(), "Mul").set_input( - "x1", loss_grad).set_input("x2", sub) - return [on_const, off_const, label, onehot, squeeze, sub, grad], [[-1]] + softmax = core.GEOperatorFactory.create_operator( + "softmax" + self._accumulated_op_id(), "SoftmaxV2").set_input( + "x", logits).set_attr_vec_int32("axes", [axes]) + return [softmax], [[0]] +## general class ShapeParser(AscendParserBase): def __init__(self, graph, var2geop): super(ShapeParser, self).__init__(graph, var2geop) @@ -411,16 +782,15 @@ class FillConstantParser(AscendParserBase): shape = 
self.op.attr("shape") dtype = self.op.attr("dtype") value = self.op.attr("value") - print("shape: ", shape) - print("dtype: ", dtype) - print("value: ", value) + tensor = self._create_ge_tensor(shape, dtype, value) const = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor) + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor) self._mark_as_input(const) if self.op.block.var(self.op.output('Out')[0]).persistable: - print("%s fill_constant" % (self.op.output('Out')[0])) + #print("%s is Persistable in fill_constant" % + # (self.op.output('Out')[0])) var = core.GEOperatorFactory.create_operator( self.op.output('Out')[0], "Variable") var.update_output_desc("y", @@ -432,26 +802,7 @@ class FillConstantParser(AscendParserBase): "assign" + self._accumulated_op_id(), "Assign").set_input( "value", const).set_input("ref", var) return [const], [[0]] - else: - print( - "self.op.output('Out')[0] is not persistable in fill_constant") - return [const], [[0]] - - -class SGDParser(AscendParserBase): - def __init__(self, graph, var2geop): - super(SGDParser, self).__init__(graph, var2geop) - self.parser_name = "sgd" - - def _apply(self): - grad = self._get_ge_input(self.op.input_arg_names[0]) - lr = self._get_ge_input(self.op.input_arg_names[1]) - param = self._get_ge_input(self.op.input_arg_names[2]) - sgd = core.GEOperatorFactory.create_operator( - "momentum" + self._accumulated_op_id(), - "ApplyGradientDescent").set_input("var", param).set_input( - "alpha", lr).set_input("delta", grad) - return [sgd], [[0]] + return [const], [[0]] class TruncatedNormalParser(AscendParserBase): @@ -465,30 +816,27 @@ class TruncatedNormalParser(AscendParserBase): mean = self.op.attr("mean") std = self.op.attr("std") seed = self.op.attr("seed") + tensor1 = self._create_ge_tensor([len(shape)], 2, shape) shape_tensor = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor1) - + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor1) tensor2 = self._create_ge_tensor([1], dtype, mean) mean_tensor = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor2) - + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor2) tensor3 = self._create_ge_tensor([1], dtype, std) std_tensor = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor3) - + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor3) tensor4 = self._create_ge_tensor([1], dtype, mean - 2 * std) min_tensor = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor4) - + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor4) tensor5 = self._create_ge_tensor([1], dtype, mean + 2 * std) max_tensor = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor5) + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor5) self._mark_as_input(shape_tensor) self._mark_as_input(mean_tensor) @@ -507,9 +855,8 @@ class TruncatedNormalParser(AscendParserBase): ## wirte the output of truncatedNormal from startup_program to main_program if self.op.block.var(self.op.output('Out')[0]).persistable: - print("%s is Persistable in 
truncated_normal" % - (self.op.output('Out')[0])) - #var = core.GEOperatorFactory.create_operator(self.op.output('Out')[0], "Variable").set_input("x", truncated_normal) + #print("%s is Persistable in truncated_normal" % + # (self.op.output('Out')[0])) var = core.GEOperatorFactory.create_operator( self.op.output('Out')[0], "Variable") var.update_output_desc("y", @@ -524,66 +871,1313 @@ class TruncatedNormalParser(AscendParserBase): shape_tensor, mean_tensor, std_tensor, min_tensor, max_tensor, truncated_normal ], [[-1]] - else: - print( - "self.op.output('Out')[0] is not persistable in truncated_noraml" - ) - return [truncated_normal], [[0]] #[assign] + #else: + # print( + # "self.op.output('Out')[0] is not persistable in truncated_noraml" + # ) + return [truncated_normal], [[0]] -class ScaleParser(AscendParserBase): +class GatherParser(AscendParserBase): def __init__(self, graph, var2geop): - super(ScaleParser, self).__init__(graph, var2geop) - self.parser_name = "scale" + super(GatherParser, self).__init__(graph, var2geop) + self.parser_name = "gather" def _apply(self): - x = self._get_ge_input(self.op.input_arg_names[0]) - scale = self.op.attr( - "scale") #self.get_ge_input(self.op.input_arg_names[1]) - bias = self.op.attr("bias") - bias_after_scale = self.op.attr("bias_after_scale") - if bias_after_scale: - scale_value = core.GEOperatorFactory.create_operator( - "scale" + self._accumulated_op_id(), "Power").set_input( - "x", x).set_attr_float("power", 1.0).set_attr_float( + index = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + clo = self.op.block.var(self.op.input_arg_names[1]).shape[-1] + + gather = core.GEOperatorFactory.create_operator( + "gather" + self._accumulated_op_id(), "Gather").set_input( + "x", x).set_input("indices", index).set_attr_bool( + "validate_indices", True) + return [gather], [[0]] + + +class ScatterParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ScatterParser, self).__init__(graph, var2geop) + self.parser_name = "scatter" + + def _apply(self): + index = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + updates = self._get_ge_input(self.op.input_arg_names[2]) + overwrite = self.op.attr("overwrite") + index_shape = self.op.block.var(self.op.input_arg_names[0]).shape + + if len(index_shape) == 1: + index = core.GEOperatorFactory.create_operator( + "unsqueeze" + self.getid(), "Unsqueeze").set_input( + "x", index).set_attr_vec_int32("axes", [1]) + if not overwrite: + scatter_value = core.GEOperatorFactory.create_operator( + "scatter" + self._accumulated_op_id(), + "TensorScatterAdd").set_input( + "x", x_var).set_input("indices", index_var).set_input( + "updates", updatesi_var) + else: + scatter_value = core.GEOperatorFactory.create_operator( + "scatter" + self._accumulated_op_id(), + "TensorScatterUpdate").set_input( + "x", x_var).set_input("indices", index_var).set_input( + "updates", updates_var) + return [x_var, index_var, updates_var, scatter_value], [[-1]] + + +class CastParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(CastParser, self).__init__(graph, var2geop) + self.parser_name = "cast" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + dtype = self.op.attr("out_dtype") + cast = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", x).set_attr_int32("dst_type", dtype) + return [cast], [[0]] + + +class AssignParser(AscendParserBase): 
+ def __init__(self, graph, var2geop): + super(AssignParser, self).__init__(graph, var2geop) + self.parser_name = "assign" + + def _apply(self): + const = self._get_ge_input(self.op.input_arg_names[0]) + var = self._get_ge_input(self.op.input_arg_names[1]) + assign = core.GEOperatorFactory.create_operator( + "assign" + self._accumulated_op_id(), "Assign").set_input( + "value", const).set_input("ref", var) + return [assign], [[0]] + + +class ScaleParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ScaleParser, self).__init__(graph, var2geop) + self.parser_name = "scale" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + scale = self.op.attr("scale") + bias = self.op.attr("bias") + bias_after_scale = self.op.attr("bias_after_scale") + + if bias_after_scale: + scale_value = core.GEOperatorFactory.create_operator( + "scale" + self._accumulated_op_id(), "Power").set_input( + "x", x).set_attr_float("power", 1.0).set_attr_float( "scale", scale).set_attr_float("shift", bias) else: x_add_bias = core.GEOperatorFactory.create_operator( "adds" + self._accumulated_op_id(), "Adds").set_input( - "x", x).set_attr_float("value", - bias) #set_input("x2", bias) + "x", x).set_attr_float("value", bias) scale_value = core.GEOperatorFactory.create_operator( "scale" + self._accumulated_op_id(), "Power").set_input( - "x", x_add_bias).set_attr_float( - "power", 1.0).set_attr_float( - "scale", scale).set_attr_float("shift", 0.0) - #tensor_zeros = core.GEOperatorFactory.create_operator("zeroslike" + self.getid(), "ZerosLike").set_input("x", x) - #bias_ = self.create_ge_tensor([1], 5, bias) - #const_bias = core.GEOperatorFactory.create_operator("const" + self.getid(), "Const").set_attr_tensor("value", tensor_bias) + "x", + x_add_bias).set_attr_float("power", 1.0).set_attr_float( + "scale", scale).set_attr_float("shift", 0.0) return [scale_value], [[0]] +class SliceParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SliceParser, self).__init__(graph, var2geop) + self.parser_name = "slice" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + axes = self.op.attr("axes") + starts = self.op.attr("starts") + ends = self.op.attr("ends") + + x_shape = self.op.block.var(self.op.input_arg_names[0]).shape + len_shape = len(x_shape) + axes_cor = list(range(len_shape)) + starts_cor, ends_cor = [], [] + cnt = 0 + for i in range(len_shape): + starts_cor.append(starts[cnt] if i in axes else 0) + if i in axes and ends[cnt] <= x_shape[i]: + ends_cor.append(ends[cnt]) + else: + ends_cor.append(x_shape[i]) + if i in axes: + cnt += 1 + size = [ends_cor[i] - starts_cor[i] for i in range(len(axes_cor))] + + assert len(axes_cor) == len(starts_cor) == len( + ends_cor), "the three fields must have same size" + slice_value = core.GEOperatorFactory.create_operator( + "slice" + self._accumulated_op_id(), "SliceD").set_input( + "x", x).set_attr_vec_int32( + "offsets", starts_cor).set_attr_vec_int32("size", size) + + return [slice_value], [[0]] + + class ReshapeParser(AscendParserBase): def __init__(self, graph, var2geop): super(ReshapeParser, self).__init__(graph, var2geop) self.parser_name = "reshape2" def _apply(self): - print("swbuf:", self.op.input_arg_names) + org_shape = self.op.block.var(self.op.input_arg_names[0]).shape + assert org_shape.count(-1) == 0, "do not allow the dim is -1" shape = self.op.attr("shape") - axis = 0 - if shape[0] == -1: - axis = 1 - shape = shape[1:] - print("shape: ", shape) - data_x1_shape = 
self._get_ge_input(self.op.input_arg_names[0]) + for cnt in range(len(shape)): + if shape[cnt] == 0: + shape[cnt] = org_shape[cnt] + + if -1 in shape: + assert shape.count(-1) == 1, "only allow one dim is -1" + mul_res_org = reduce(lambda x, y: x * y, org_shape) + mul_res_refine = reduce(lambda x, y: x * y, shape) * -1 + idx = shape.index(-1) + shape[idx] = mul_res_org // mul_res_refine + + x = self._get_ge_input(self.op.input_arg_names[0]) tensor = self._create_ge_tensor([len(shape)], 2, shape) const_shape = core.GEOperatorFactory.create_operator( - "shape" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor) + "shape" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor) reshape = core.GEOperatorFactory.create_operator( "reshape" + self._accumulated_op_id(), "Reshape").set_input( - "x", data_x1_shape).set_input( - "shape", const_shape).set_attr_int32("axis", axis) + "x", + x).set_input("shape", const_shape).set_attr_int32("axis", 0) + x_shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), "Shape").set_input("x", x) + + return [x_shape, reshape], [[1], [0]] + + +class TransposeParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(TransposeParser, self).__init__(graph, var2geop) + self.parser_name = "transpose2" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + perm = self.op.attr("axis") + transpose = core.GEOperatorFactory.create_operator( + "transpose" + self._accumulated_op_id(), "TransposeD").set_input( + "x", x).set_attr_vec_int32("perm", perm) + x_shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), "Shape").set_input("x", x) + + return [x_shape, transpose], [[1], [0]] + + +class AccuracyParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(AccuracyParser, self).__init__(graph, var2geop) + self.parser_name = "accuracy" + + def _apply(self): + pred = self._get_ge_input(self.op.input_arg_names[0]) + label = self._get_ge_input(self.op.input_arg_names[1]) + logits = self._get_ge_input(self.op.input_arg_names[2]) + + pred = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", pred).set_attr_int32("dst_type", 3) + label = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", label).set_attr_int32("dst_type", 3) + equal = core.GEOperatorFactory.create_operator( + "equal" + self._accumulated_op_id(), "Equal").set_input( + "x1", pred).set_input("x2", label) + cast = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", equal).set_attr_int32("dst_type", 0) + acc = core.GEOperatorFactory.create_operator( + "mean" + self._accumulated_op_id(), "ReduceMeanD").set_input( + "x", cast).set_attr_bool("keep_dims", False).set_attr_vec_int32( + "axes", []) + correct = core.GEOperatorFactory.create_operator( + "sum" + self._accumulated_op_id(), "ReduceSumD").set_input( + "x", cast).set_attr_bool("keep_dims", False).set_attr_vec_int32( + "axes", []) + ones_tensor = core.GEOperatorFactory.create_operator( + "oneslike" + self._accumulated_op_id(), + "OnesLike").set_input("x", label) + ones_tensor = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", ones_tensor).set_attr_int32("dst_type", 0) + total = core.GEOperatorFactory.create_operator( + "sum" + self._accumulated_op_id(), "ReduceSumD").set_input( + "x", ones_tensor).set_attr_bool( + 
"keep_dims", False).set_attr_vec_int32("axes", []) + + return [acc, correct, total], [[0], [1], [2]] + + +class TopkParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(TopkParser, self).__init__(graph, var2geop) + self.parser_name = "top_k" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + k = self.op.attr("k") + + tensor = self._create_ge_tensor([1], 2, k) + const_k = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor) + cast_x = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), + "Cast").set_input("x", x).set_attr_int32("dst_type", 1) + topk = core.GEOperatorFactory.create_operator( + "topk" + self._accumulated_op_id(), + "TopK").set_input("x", cast_x).set_input("k", const_k) + value = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", topk, 0).set_attr_int32("dst_type", 0) + index = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", topk, 1).set_attr_int32("dst_type", 0) + return [value, index], [[1], [0]] + + +class LookupTableParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(LookupTableParser, self).__init__(graph, var2geop) + self.parser_name = "lookup_table" + + def _apply(self): + ids = self._get_ge_input(self.op.input_arg_names[0]) + w = self._get_ge_input(self.op.input_arg_names[1]) + + ids_squeeze = core.GEOperatorFactory.create_operator( + "squeeze" + self._accumulated_op_id(), "Squeeze").set_input( + "x", ids).set_attr_vec_int32("axes", [-1]) + out = core.GEOperatorFactory.create_operator( + "lookup" + self._accumulated_op_id(), "Gather").set_input( + "x", w).set_input("indices", ids_squeeze) + return [out], [[0]] + + +class StackParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(StackParser, self).__init__(graph, var2geop) + self.parser_name = "stack" + + def _apply(self): + tiles = len(self.op.input_arg_names) + data_x_lst = [] + for index in range(tiles): + data_x_lst.append( + self._get_ge_input(self.op.input_arg_names[index])) + axis = self.op.attr("axis") + + data_x = data_x_lst[0] + tensor = self._create_ge_tensor([1], 2, axis) + tensor_axis = core.GEOperatorFactory.create_operator( + "axis" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor) + expand = core.GEOperatorFactory.create_operator( + "expand" + self._accumulated_op_id(), + "ExpandDims").set_input("x", data_x).set_input("axis", tensor_axis) + + stack = core.GEOperatorFactory.create_operator( + "stack" + self._accumulated_op_id(), + "TileWithAxis").set_input("x", expand).set_attr_int32( + "axis", axis).set_attr_int32("tiles", tiles) + + return [stack], [[0]] + + +class UnSqueezeParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(UnSqueezeParser, self).__init__(graph, var2geop) + self.parser_name = "unsqueeze2" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + axes = self.op.attr('axes') + + output = core.GEOperatorFactory.create_operator( + "unsqueeze" + self._accumulated_op_id(), + "Unsqueeze").set_input("x", x).set_attr_vec_int32("axes", axes) + shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), "Shape").set_input("x", output) + return [shape, output], [[1], [0]] + + +## parallel +class AllGatherParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(AllGatherParser, self).__init__(graph, 
var2geop) + self.parser_name = "c_allgather" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + rank_size = self.op.attr("rank_size") + group = self.op.attr("group") + + allgather = core.GEOperatorFactory.create_operator( + "allgather" + self._accumulated_op_id(), "HcomAllGather").set_input( + "x", x).set_attr_int32( + "rank_size", rank_size).set_attr_string("group", group) + return [allgather], [[0]] + + +class AllReduceParser(AscendParserBase): + def __init__(self, graph, var2geop, reduction): + super(AllReduceParser, self).__init__(graph, var2geop) + self.parser_name = "c_allreduce_" + reduction + self.reduction = reduction + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + reduction = self.reduction + ring_id = self.op.attr("ring_id") + group = "hcom_group_" + str(ring_id) + fusion = None #self.op.attr("fusion") + fusion_id = None #self.op.attr("fusion_id") + + allreduce = core.GEOperatorFactory.create_operator( + "allreduce" + self._accumulated_op_id(), "HcomAllReduce").set_input( + "x", x).set_attr_string( + "reduction", reduction).set_attr_string("group", group) + if fusion is not None: + allreduce.set_attr_int32("fusion", fusion) + + if fusion_id is not None: + allreduce.set_attr_int32("fusion_id", fusion_id) + return [allreduce], [[0]] + + +class AllReduceSumParser(AllReduceParser): + def __init__(self, graph, var2geop): + super(AllReduceSumParser, self).__init__(graph, var2geop, 'sum') + + +class AllReduceMaxParser(AllReduceParser): + def __init__(self, graph, var2geop): + super(AllReduceMaxParser, self).__init__(graph, var2geop, 'max') + + +class BroadcastParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(BroadcastParser, self).__init__(graph, var2geop) + self.parser_name = "c_broadcast" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + root_rank = self.op.attr("root_rank") + group = self.op.attr("group") + + broadcast = core.GEOperatorFactory.create_operator( + "broadcast" + self._accumulated_op_id(), "HcomBroadcast").set_input( + "x", x).set_attr_int32( + "root_rank", root_rank).set_attr_string("group", group) + return [broadcast], [[0]] + + +class ReduceScatterParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ReduceScatterParser, self).__init__(graph, var2geop) + self.parser_name = "c_reduce_scatter" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + reduction = self.op.attr("reduction") + group = self.op.attr("group") + rank_size = self.op.attr("rank_size") + + reduce_scatter = core.GEOperatorFactory.create_operator( + "reducescatter" + self._accumulated_op_id(), + "HcomReduceScatter").set_input("x", x).set_attr_string( + "reduction", reduction).set_attr_string( + "group", group).set_attr_int32("rank_size", rank_size) + return [reduce_scatter], [[0]] + + +class SendParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SendParser, self).__init__(graph, var2geop) + self.parser_name = "c_send" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + sr_tag = self.op.attr("sr_tag") + dest_rank = self.op.attr("dest_rank") + group = self.op.attr("group") + + send = core.GEOperatorFactory.create_operator( + "send" + self._accumulated_op_id(), "HcomSend").set_input( + "x", x).set_attr_int32("sr_tag", sr_tag).set_attr_int32( + "dest_rank", dest_rank).set_attr_string("group", group) + return [send], [[0]] + + +class ReceiveParser(AscendParserBase): + def __init__(self, graph, var2geop): + 
super(ReceiveParser, self).__init__(graph, var2geop) + self.parser_name = "c_receive" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + sr_tag = self.op.attr("sr_tag") + src_rank = self.op.attr("src_rank") + group = self.op.attr("group") + shape = self.op.attr("shape") + dtype = self.op.attr("dtype") + + receive = core.GEOperatorFactory.create_operator( + "receive" + self._accumulated_op_id(), "HcomReceive").set_input( + "x", x).set_attr_int32("sr_tag", sr_tag).set_attr_int32( + "src_rank", src_rank).set_attr_string( + "group", group).set_attr_vec_int32( + "shape", shape).set_attr_int32("dtype", dtype) + return [receive], [[0]] + + +class RangeParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(RangeParser, self).__init__(graph, var2geop) + self.parser_name = "range" + + def _apply(self): + # TODO not support range type yet + start = self._get_ge_input(self.op.input_arg_names[0]) + end = self._get_ge_input(self.op.input_arg_names[1]) + delta = self._get_ge_input(self.op.input_arg_names[2]) + + ge_range = core.GEOperatorFactory.create_operator( + "range" + self._accumulated_op_id(), "Range")\ + .set_input("start", end)\ + .set_input("limit", start) \ + .set_input("delta", delta) + + return [ge_range], [[0]] + + +class UniformRandomParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(UniformRandomParser, self).__init__(graph, var2geop) + self.parser_name = "uniform_random" + + def _apply(self): + shape = self.op.attr("shape") + + min_v = self.op.attr("min") + max_v = self.op.attr("max") + seed = self.op.attr("seed") + dtype = self.op.attr("dtype") + assert max_v > min_v, "assert max_v > min_v, but recieved " + \ + "as max_v={}, min_v={} ".format(max_v, min_v) + + tensor1 = self._create_ge_tensor([len(shape)], 2, shape) + shape_tensor = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor1) + + ge_ur = core.GEOperatorFactory.create_operator( + "uniform_random" + self._accumulated_op_id(), "RandomUniform")\ + .set_input("shape", shape_tensor)\ + .set_attr_dtype("dtype", self.ascend_helper.dtype2ge(dtype)) \ + .set_attr_int32("seed", seed)\ + .set_attr_int32("seed2", seed) + + scale = max_v - min_v + + scale_value = core.GEOperatorFactory.create_operator( + "scale" + self._accumulated_op_id(), "Power").set_input( + "x", ge_ur).set_attr_float("power", 1.0).set_attr_float( + "scale", scale).set_attr_float("shift", min_v) + + return [scale_value], [[0]] + + +class EqualParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(EqualParser, self).__init__(graph, var2geop) + self.parser_name = "equal" + + def _apply(self): + data_x1 = self._get_ge_input(self.op.input_arg_names[0]) + data_x2 = self._get_ge_input(self.op.input_arg_names[1]) + equal = core.GEOperatorFactory.create_operator("equal" \ + + self._accumulated_op_id(), "Equal")\ + .set_input("x1", data_x1)\ + .set_input("x2", data_x2) + return [equal], [[0]] + + +class ExpandParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ExpandParser, self).__init__(graph, var2geop) + self.parser_name = "expand" + + def _apply(self): + data_x1_shape = self._get_ge_input(self.op.input_arg_names[0]) + expand_times = self.op.attr('expand_times') + + tensor = self._create_ge_tensor([len(expand_times)], 2, expand_times) + expand_tensor = core.GEOperatorFactory.\ + create_operator("const" + self._accumulated_op_id(), "Const")\ + .set_attr_tensor("value", tensor) + + assign = 
core.GEOperatorFactory\ + .create_operator("tile" + self._accumulated_op_id(), "Tile")\ + .set_input("x", data_x1_shape)\ + .set_input("multiples", expand_tensor) + return [assign], [[0]] + + +class SqueezeParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SqueezeParser, self).__init__(graph, var2geop) + self.parser_name = "squeeze2" + + def _apply(self): + tensor = self._get_ge_input(self.op.input_arg_names[0]) + axes = self.op.attr("axes") + + data_squeezed = core.GEOperatorFactory\ + .create_operator("squeeze" + self._accumulated_op_id(), "Squeeze")\ + .set_input("x", tensor)\ + .set_attr_vec_int32("axes", axes) + shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), + "Shape").set_input("x", data_squeezed) + return [shape, data_squeezed], [[1], [0]] + + +#****************************************************************# +#*************************** *************************# +#*************************** *************************# +#*************************** GradParser *************************# +#*************************** *************************# +#*************************** *************************# +#****************************************************************# +## grad +class ReduceSumGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ReduceSumGradParser, self).__init__(graph, var2geop) + self.parser_name = "reduce_sum_grad" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + input = self._get_ge_input(self.op.input_arg_names[1]) + + shape_tensor = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), + "Shape").set_input("x", input, 0) + tensoron = self._create_ge_tensor([1], 2, -1) + const = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensoron) + self._mark_as_input(const) + + reduce_sum = core.GEOperatorFactory.create_operator( + "broadcast_to_d" + self._accumulated_op_id(), + "BroadcastTo").set_input("x", x).set_input("shape", shape_tensor) + #reduce_sum = core.GEOperatorFactory.create_operator("expand" + self._accumulated_op_id(), "ExpandDims").set_input("x", reduce_sum).set_input("axis", const) + + return [reduce_sum], [[0]] + + +class MatMulGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(MatMulGradParser, self).__init__(graph, var2geop) + self.parser_name = "matmul_grad" + + def _apply(self): + out_grad = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + y = self._get_ge_input(self.op.input_arg_names[2]) + transpose_x = self.op.attr("transpose_X") + transpose_y = self.op.attr("transpose_Y") + + out_grad_shape = self.op.block.var(self.op.input_arg_names[0]).shape + x_shape = self.op.block.var(self.op.input_arg_names[1]).shape + y_shape = self.op.block.var(self.op.input_arg_names[2]).shape + + if len(x_shape) > 2: + if transpose_y: + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "BatchMatMul").set_input("x1", out_grad).set_input( + "x2", y).set_attr_bool( + "adj_x1", False).set_attr_bool("adj_x2", False) + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "BatchMatMul").set_input("x1", out_grad).set_input( + "x2", x).set_attr_bool( + "adj_x1", True).set_attr_bool("adj_x2", False) + else: + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + 
"BatchMatMul").set_input("x1", out_grad).set_input( + "x2", y).set_attr_bool( + "adj_x1", False).set_attr_bool("adj_x2", True) + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "BatchMatMul").set_input("x1", x).set_input( + "x2", out_grad).set_attr_bool( + "adj_x1", True).set_attr_bool("adj_x2", False) + else: + if transpose_y: + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "MatMul").set_input("x1", out_grad).set_input( + "x2", y).set_attr_bool( + "transpose_x1", False).set_attr_bool("transpose_x2", + False) + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "MatMul").set_input("x1", out_grad).set_input( + "x2", x).set_attr_bool( + "transpose_x1", True).set_attr_bool("transpose_x2", + False) + else: + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "MatMul").set_input("x1", out_grad).set_input( + "x2", y).set_attr_bool( + "transpose_x1", False).set_attr_bool("transpose_x2", + True) + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "MatMul").set_input("x1", x).set_input( + "x2", out_grad).set_attr_bool( + "transpose_x1", True).set_attr_bool("transpose_x2", + False) + + return [x_grad, y_grad], [[0], [1]] + + +class MulGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(MulGradParser, self).__init__(graph, var2geop) + self.parser_name = "mul_grad" + + def _apply(self): + out_grad = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + y = self._get_ge_input(self.op.input_arg_names[2]) + x_num_col_dims = self.op.attr("x_num_col_dims") + y_num_col_dims = self.op.attr("y_num_col_dims") + + shape_out_grad = self.op.block.var(self.op.input_arg_names[0]).shape + shape_x = self.op.block.var(self.op.input_arg_names[1]).shape + shape_y = self.op.block.var(self.op.input_arg_names[2]).shape + + if x_num_col_dims == 1 and y_num_col_dims == 1: + if len(shape_x) == 2 and len(shape_y) == 2: + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "MatMul").set_input("x1", out_grad).set_input( + "x2", y).set_attr_bool( + "transpose_x1", False).set_attr_bool("transpose_x2", + True) + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "MatMul").set_input("x1", x).set_input( + "x2", out_grad).set_attr_bool( + "transpose_x1", True).set_attr_bool("transpose_x2", + False) + elif len(shape_x) == 3 and len(shape_y) == 2: + flatten_x = core.GEOperatorFactory.create_operator( + "flatten" + self._accumulated_op_id(), + "Flatten").set_input("x", x) + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "MatMul").set_input( + "x1", out_grad).set_input("x2", y).set_attr_bool( + "transpose_x1", + False).set_attr_bool("transpose_x2", True) + if len(shape_out_grad) == 2: + x_grad = core.GEOperatorFactory.create_operator( + "unsqueeze" + self._accumulated_op_id(), + "Unsqueeze").set_input("x", x_grad).set_attr_vec_int32( + "axes", [1]) + + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "MatMul").set_input( + "x1", + flatten_x).set_input("x2", out_grad).set_attr_bool( + "transpose_x1", + True).set_attr_bool("transpose_x2", False) + else: + if len(shape_x) == 3 and len(shape_y) == 2: + assert x_num_col_dims == 2, 
"only support 2" + flatten_x = core.GEOperatorFactory.create_operator( + "flatten" + self._accumulated_op_id(), + "FlattenV2").set_input("x", x).set_attr_int32( + "axis", 0).set_attr_int32("end_axis", 1) + flatten_out_grad = core.GEOperatorFactory.create_operator( + "flatten" + self._accumulated_op_id(), + "FlattenV2").set_input("x", out_grad).set_attr_int32( + "axis", 0).set_attr_int32("end_axis", 1) + + y_unsqueeze = core.GEOperatorFactory.create_operator( + "unsqueeze" + self._accumulated_op_id(), + "Unsqueeze").set_input("x", + y).set_attr_vec_int32("axes", [0]) + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "BatchMatMul").set_input("x1", out_grad).set_input( + "x2", y_unsqueeze).set_attr_bool( + "adj_x1", False).set_attr_bool("adj_x2", True) + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "MatMul").set_input("x1", flatten_x).set_input( + "x2", flatten_out_grad).set_attr_bool( + "transpose_x1", + True).set_attr_bool("transpose_x2", False) + + return [x_grad, y_grad], [[0], [1]] + + +class ReluGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ReluGradParser, self).__init__(graph, var2geop) + self.parser_name = "relu_grad" + + def _apply(self): + out = self._get_ge_input(self.op.input_arg_names[0]) + out_grad = self._get_ge_input(self.op.input_arg_names[1]) + relu_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), "ReluGrad").set_input( + "gradients", out_grad).set_input("features", out) + return [relu_grad], [[0]] + + +class SoftmaxWithCrossEntropyGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SoftmaxWithCrossEntropyGradParser, self).__init__(graph, var2geop) + self.parser_name = "softmax_with_cross_entropy_grad" + + def _apply(self): + label = self._get_ge_input(self.op.input_arg_names[0]) + loss_grad = self._get_ge_input(self.op.input_arg_names[1]) + softmax = self._get_ge_input(self.op.input_arg_names[2]) + cls_num = self.op.block.var(self.op.input_arg_names[2]).shape[1] + + label_shape = self.op.block.var(self.op.input_arg_names[0]).shape + loss_grad_shape = self.op.block.var(self.op.input_arg_names[1]).shape + softmax_shape = self.op.block.var(self.op.input_arg_names[2]).shape + + tensoron = self._create_ge_tensor([1], 5, 1) + on = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensoron) + tensoroff = self._create_ge_tensor([1], 5, 0) + off = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensoroff) + self._mark_as_input(on) + self._mark_as_input(off) + + label = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", label).set_attr_int32("dst_type", 3) + onehot = core.GEOperatorFactory.create_operator( + "onehot" + self._accumulated_op_id(), "OneHotD").set_input( + "x", label).set_input("on_value", on).set_input( + "off_value", off).set_attr_int32("depth", cls_num) + squeeze = core.GEOperatorFactory.create_operator( + "suqeeze" + self._accumulated_op_id(), + "Squeeze").set_input("x", onehot) + sub = core.GEOperatorFactory.create_operator( + "sub" + self._accumulated_op_id(), "Sub").set_input( + "x1", softmax).set_input("x2", squeeze) + grad = core.GEOperatorFactory.create_operator( + "mul" + self._accumulated_op_id(), + "Mul").set_input("x1", loss_grad).set_input("x2", sub) + + return [on, off, 
label, onehot, grad], [[-1]] + + +class DotMulGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(DotMulGradParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_mul_grad" + + def _apply(self): + out_grad = self._get_ge_input(self.op.input_arg_names[0]) + out_1 = self._get_ge_input(self.op.input_arg_names[1]) + out_2 = self._get_ge_input(self.op.input_arg_names[2]) + + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "Mul").set_input("x1", out_grad).set_input("x2", out_2) + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "Mul").set_input("x1", out_1).set_input("x2", out_grad) + + return [x_grad, y_grad], [[0], [1]] + + +class DotAddGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(DotAddGradParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_add_grad" + + def _apply(self): + out_grad = self._get_ge_input(self.op.input_arg_names[0]) + out_1 = self._get_ge_input(self.op.input_arg_names[1]) + out_2 = self._get_ge_input(self.op.input_arg_names[2]) + out_grad_shape = self.op.block.var(self.op.input_arg_names[0]).shape + out_1_shape = self.op.block.var(self.op.input_arg_names[1]).shape + out_2_shape = self.op.block.var(self.op.input_arg_names[2]).shape + + x_grad = out_grad + cur_time_x = len(out_grad_shape) - len(out_1_shape) + for i in range(cur_time_x): + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "ReduceSumD").set_input("x", x_grad).set_attr_vec_int32( + "axes", [0]).set_attr_bool("keep_dims", False) + for axis, size in enumerate(out_1_shape): + if size == 1: + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "ReduceSumD").set_input("x", x_grad).set_attr_vec_int32( + "axes", [axis]).set_attr_bool("keep_dims", True) + + y_grad = out_grad + cur_time_y = len(out_grad_shape) - len(out_2_shape) + for i in range(cur_time_y): + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "ReduceSumD").set_input("x", y_grad).set_attr_vec_int32( + "axes", [0]).set_attr_bool("keep_dims", False) + for axis, size in enumerate(out_2_shape): + if size == 1: + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "ReduceSumD").set_input("x", y_grad).set_attr_vec_int32( + "axes", [axis]).set_attr_bool("keep_dims", True) + + return [x_grad, y_grad], [[0], [1]] + + +class DotDivGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(DotDivGradParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_div_grad" + + def _apply(self): + out = self._get_ge_input(self.op.input_arg_names[0]) + out_grad = self._get_ge_input(self.op.input_arg_names[1]) + x = self._get_ge_input(self.op.input_arg_names[2]) + y = self._get_ge_input(self.op.input_arg_names[3]) + + y_power = core.GEOperatorFactory.create_operator( + "power" + self._accumulated_op_id(), "Power").set_input( + "x", y).set_attr_float("power", -1) + + tensor_zeros = core.GEOperatorFactory.create_operator( + "zeroslike" + self._accumulated_op_id(), + "ZerosLike").set_input("x", x) + x_zero = core.GEOperatorFactory.create_operator( + "equal" + self._accumulated_op_id(), "Equal").set_input( + "x1", x).set_input("x2", tensor_zeros) + x_nozero = core.GEOperatorFactory.create_operator( + "logical_not" + self._accumulated_op_id(), + 
"LogicalNot").set_input("x", x_zero) + x_nozero_f = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", x_nozero).set_attr_int32("dst_type", 0) + x_grad_w = core.GEOperatorFactory.create_operator( + "mul" + self._accumulated_op_id(), "Mul").set_input( + "x1", x_nozero_f).set_input("x2", y_power) + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "Mul").set_input("x1", x_grad_w).set_input("x2", out_grad) + + y_grad_w = core.GEOperatorFactory.create_operator( + "mul" + self._accumulated_op_id(), "Mul").set_input( + "x1", out).set_input("x2", y_power) + y_grad = core.GEOperatorFactory.create_operator( + "mul" + self._accumulated_op_id(), "Mul").set_input( + "x1", y_grad_w).set_input("x2", out_grad) + + return [x_grad, y_grad], [[0], [1]] + + +class SoftmaxGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SoftmaxGradParser, self).__init__(graph, var2geop) + self.parser_name = "softmax_grad" + + def _apply(self): + out = self._get_ge_input(self.op.input_arg_names[0]) + out_grad = self._get_ge_input(self.op.input_arg_names[1]) + + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "SoftmaxGrad").set_input("softmax", out).set_input("grad_softmax", + out_grad) + return [x_grad], [[0]] + + +class ReshapeGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ReshapeGradParser, self).__init__(graph, var2geop) + self.parser_name = "reshape2_grad" + + def _apply(self): + out_grad = self._get_ge_input(self.op.input_arg_names[0]) + x_shape = self._get_ge_input(self.op.input_arg_names[1]) + x_shape_list = self.op.block.var(self.op.input_arg_names[1]).shape + + if x_shape_list[0] == 0: + x_shape_delzero = x_shape_list[1:] + tensor = self._create_ge_tensor([len(x_shape_delzero)], 2, + x_shape_delzero) + const_shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor) + x_grad = core.GEOperatorFactory.create_operator( + "reshape" + self._accumulated_op_id(), "Reshape").set_input( + "x", out_grad).set_input("shape", const_shape) + + return [x_grad], [[0]] - return [reshape, reshape], [[0], [1]] + +class GatherGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(GatherGradParser, self).__init__(graph, var2geop) + self.parser_name = "gather_grad" + + def _apply(self): + index = self._get_ge_input(self.op.input_arg_names[0]) + out_grad = self._get_ge_input(self.op.input_arg_names[1]) + x = self._get_ge_input(self.op.input_arg_names[2]) + + index_shape = self.op.block.var(self.op.input_arg_names[0]).shape + out_grad_shape = self.op.block.var(self.op.input_arg_names[1]).shape + x_shape = self.op.block.var(self.op.input_arg_names[2]).shape + + if len(index_shape) == 1: + index = core.GEOperatorFactory.create_operator( + "unsqueeze" + self._accumulated_op_id(), "Unsqueeze").set_input( + "x", index).set_attr_vec_int32("axes", [1]) + + tensor_zeros = core.GEOperatorFactory.create_operator( + "zeroslike" + self._accumulated_op_id(), + "ZerosLike").set_input("x", x) + x_grad = core.GEOperatorFactory.create_operator( + "scatter" + self._accumulated_op_id(), + "TensorScatterUpdate").set_input("x", tensor_zeros).set_input( + "indices", index).set_input("updates", out_grad) + + return [tensor_zeros, x_grad], [[-1]] + + +class TransposeGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(TransposeGradParser, 
self).__init__(graph, var2geop) + self.parser_name = "transpose2_grad" + + def _apply(self): + out_grad = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + perm = self.op.attr("axis") + + x_shape = self.op.block.var(self.op.input_arg_names[1]).shape[1:] + out_grad_shape = self.op.block.var(self.op.input_arg_names[0]).shape + assert list(map(lambda x: out_grad_shape[x], perm)) == list(x_shape) + + x_grad = core.GEOperatorFactory.create_operator( + "transpose" + self._accumulated_op_id(), "TransposeD").set_input( + "x", out_grad).set_attr_vec_int32("perm", perm) + + return [x_grad], [[0]] + + +class LayerNormGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(LayerNormGradParser, self).__init__(graph, var2geop) + self.parser_name = "layer_norm_grad" + + def _apply(self): + bias = self._get_ge_input(self.op.input_arg_names[0]) + mean = self._get_ge_input(self.op.input_arg_names[1]) + scale = self._get_ge_input(self.op.input_arg_names[2]) + variance = self._get_ge_input(self.op.input_arg_names[3]) + x = self._get_ge_input(self.op.input_arg_names[4]) + out_grad = self._get_ge_input(self.op.input_arg_names[5]) + x_dtype = self.op.block.var(self.op.input_arg_names[4]).dtype + + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "LayerNormGrad").set_input("dy", out_grad).set_input( + "x", x).set_input("variance", variance).set_input( + "mean", mean).set_input("gamma", scale) + + cast_dtype = 0 if self.ascend_helper.dtype2paddle_inv_map[str( + x_dtype)] == 0 else 1 + out_x_grad = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", x_grad, 0).set_attr_int32("dst_type", cast_dtype) + out_scale_grad = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", x_grad, 1).set_attr_int32("dst_type", cast_dtype) + out_bias_grad = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", x_grad, 2).set_attr_int32("dst_type", cast_dtype) + + return [out_x_grad, out_scale_grad, out_bias_grad], [[2], [1], [0]] + + +class TanhGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(TanhGradParser, self).__init__(graph, var2geop) + self.parser_name = 'tanh_grad' + + def _apply(self): + y = self._get_ge_input(self.op.input_arg_names[0]) + out_grad = self._get_ge_input(self.op.input_arg_names[1]) + tanh_grad = core.GEOperatorFactory.create_operator( + "tanh_grad" + self._accumulated_op_id(), + "TanhGrad").set_input("y", y).set_input("dy", out_grad) + + return [tanh_grad], [[0]] + + +class LogGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(LogGradParser, self).__init__(graph, var2geop) + self.parser_name = 'log_grad' + + def _apply(self): + grad = self._get_ge_input(self.op.input_arg_names[0]) + input = self._get_ge_input(self.op.input_arg_names[1]) + log_grad = core.GEOperatorFactory.create_operator( + "log_grad" + self._accumulated_op_id(), + "DivNoNan").set_input("x1", grad).set_input("x2", input) + return [log_grad], [[0]] + + +class SqrtGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SqrtGradParser, self).__init__(graph, var2geop) + self.parser_name = "sqrt_grad" + + def _apply(self): + y = self._get_ge_input(self.op.input_arg_names[0]) + out_grad = self._get_ge_input(self.op.input_arg_names[1]) + sqrt_grad = core.GEOperatorFactory.create_operator( + "sqrt_grad" + 
self._accumulated_op_id(), + "SqrtGrad").set_input("y", y).set_input("dy", out_grad) + return [sqrt_grad] + + +class PowGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(PowGradParser, self).__init__(graph, var2geop) + self.parser_name = "pow_grad" + + def _apply(self): + grad = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + factor = self.op.attr("factor") + + shape_tensor = self._create_shape_tensor() + shape_tensor = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), "Shape").set_input("x", x) + factor_scale = self._create_ge_tensor([1], 5, factor) + factor_scale = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", factor_scale) + factor_tensor = core.GEOperatorFactory.create_operator( + "broadcast_to_d" + self._accumulated_op_id(), + "BroadcastTo").set_input( + "x", factor_scale).set_input("shape", shape_tensor) + + x_power = core.GEOperatorFactory.create_operator( + "x_power" + self._accumulated_op_id(), "Power").set_input( + "x", x).set_attr_float("power", factor - 1) + x_power_mul_factor = core.GEOperatorFactory.create_operator( + "x_power_mul_factor" + self._accumulated_op_id(), "Mul").set_input( + "x1", x).set_input("x2", factor_tensor) + x_power_mul_factor_grad = core.GEOperatorFactory.create_operator( + "x_power_mul_factor_grad" + self._accumulated_op_id(), + "Mul").set_input("x1", x_power_mul_factor).set_input("x2", grad) + + return [x_power_mul_factor_grad], [[0]] + + +class GeluGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(GeluGradParser, self).__init__(graph, var2geop) + self.parser_name = "gelu_grad" + + def _apply(self): + grad = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + + y = core.GEOperatorFactory.create_operator( + "gelu" + self._accumulated_op_id(), "Gelu").set_input("x", x) + gelu_grad = core.GEOperatorFactory.create_operator( + "gelu_grad" + self._accumulated_op_id(), "GeluGrad").set_input( + "x", x).set_input("dy", grad).set_input("y", y) + + return [gelu_grad], [[0]] + + +class MeanGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(MeanGradParser, self).__init__(graph, var2geop) + self.parser_name = "mean_grad" + + def _apply(self): + grad = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + + ones_tensor = core.GEOperatorFactory.create_operator( + "one_tensor" + self._accumulated_op_id(), + "OnesLike").set_input("x", x) + sum = core.GEOperatorFactory.create_operator( + "mean" + self._accumulated_op_id(), "ReduceSumD").set_input( + "x", ones_tensor).set_attr_bool( + "keep_dims", False).set_attr_vec_int32("axes", []) + mean = core.GEOperatorFactory.create_operator( + "x_power" + self._accumulated_op_id(), "Power").set_input( + "x", sum).set_attr_float("power", -1) + + mean_grad = core.GEOperatorFactory.create_operator( + "mean_grad" + self._accumulated_op_id(), + "Mul").set_input("x1", mean).set_input("x2", grad) + + return [mean_grad], [[0]] + + +class SliceGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SliceGradParser, self).__init__(graph, var2geop) + self.parser_name = "slice_grad" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + grad = self._get_ge_input(self.op.input_arg_names[1]) + axes = self.op.attr("axes") + starts = self.op.attr("starts") + ends = 
self.op.attr("ends") + + x_shape = self.op.block.var(self.op.input_arg_names[0]).shape + grad_shape = self.op.block.var(self.op.input_arg_names[1]).shape + + len_shape = len(x_shape) + axes_cor = list(range(len_shape)) + starts_cor, ends_cor = [], [] + cnt = 0 + for i in range(len_shape): + starts_cor.append(starts[cnt] if i in axes else 0) + if i in axes and ends[cnt] <= x_shape[i]: + ends_cor.append(x_shape[i] - ends[cnt]) + else: + ends_cor.append(0) + if i in axes: + cnt += 1 + + starts_cor[0] = 0 + ends_cor[0] = 0 + paddings = [[s, e] for (s, e) in zip(starts_cor, ends_cor)] + slice_value = core.GEOperatorFactory.create_operator( + "slice_grad" + self._accumulated_op_id(), "PadD").set_input( + "x", grad).set_attr_vec_vec_int64("paddings", paddings) + + return [slice_value], [[0]] + + +class LookUpTableGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(LookUpTableGradParser, self).__init__(graph, var2geop) + self.parser_name = "lookup_table_grad" + + def _apply(self): + ids = self._get_ge_input(self.op.input_arg_names[0]) + grad = self._get_ge_input(self.op.input_arg_names[1]) + embedding = self._get_ge_input(self.op.input_arg_names[2]) + + shape_ids = self.op.block.var(self.op.input_arg_names[0]).shape + shape_grad = self.op.block.var(self.op.input_arg_names[1]).shape + shape_embedding = self.op.block.var(self.op.input_arg_names[2]).shape + + ids_flatten = core.GEOperatorFactory.create_operator( + "flatten" + self._accumulated_op_id(), "FlattenV2").set_input( + "x", + ids).set_attr_int32("axis", 0).set_attr_int32("end_axis", 1) + grad_flatten = core.GEOperatorFactory.create_operator( + "flatten" + self._accumulated_op_id(), "FlattenV2").set_input( + "x", + grad).set_attr_int32("axis", 0).set_attr_int32("end_axis", 1) + + tensor_zeros = core.GEOperatorFactory.create_operator( + "zeroslike" + self._accumulated_op_id(), + "ZerosLike").set_input("x", embedding) + embedding_grad = core.GEOperatorFactory.create_operator( + "scatteradd" + self._accumulated_op_id(), + "TensorScatterAdd").set_input( + "x", tensor_zeros).set_input("indices", ids_flatten).set_input( + "updates", grad_flatten) + + return [embedding_grad], [[0]] + + +class SGDParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SGDParser, self).__init__(graph, var2geop) + self.parser_name = "sgd" + + def _apply(self): + grad = self._get_ge_input(self.op.input_arg_names[0]) + lr = self._get_ge_input(self.op.input_arg_names[1]) + param = self._get_ge_input(self.op.input_arg_names[2]) + sgd = core.GEOperatorFactory.create_operator( + "momentum" + self._accumulated_op_id(), + "ApplyGradientDescent").set_input("var", param).set_input( + "alpha", lr).set_input("delta", grad) + return [sgd], [[0]] + + +class AdamParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(AdamParser, self).__init__(graph, var2geop) + self.parser_name = "adam" + + def _apply(self): + beta1_power = self._get_ge_input(self.op.input_arg_names[0]) + beta2_power = self._get_ge_input(self.op.input_arg_names[1]) + grad = self._get_ge_input(self.op.input_arg_names[2]) + lr = self._get_ge_input(self.op.input_arg_names[3]) + moment1 = self._get_ge_input(self.op.input_arg_names[4]) + moment2 = self._get_ge_input(self.op.input_arg_names[5]) + param = self._get_ge_input(self.op.input_arg_names[6]) + beta1 = self.op.attr('beta1') + beta2 = self.op.attr('beta2') + epsilon = self.op.attr('epsilon') + + beta1 = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), "Const").set_attr_tensor( 
+ "value", self._create_ge_tensor([1], 5, beta1)) + beta2 = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), "Const").set_attr_tensor( + "value", self._create_ge_tensor([1], 5, beta2)) + epsilon = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), "Const").set_attr_tensor( + "value", self._create_ge_tensor([1], 5, epsilon)) + + adam = core.GEOperatorFactory.create_operator( + "adam" + self._accumulated_op_id(), + "ApplyAdam").set_input("var", param).set_input( + "m", moment1).set_input("v", moment2).set_input( + "beta1_power", beta1_power).set_input( + "beta2_power", beta2_power).set_input( + "lr", lr).set_input("beta1", beta1).set_input( + "beta2", beta2).set_input( + "epsilon", epsilon).set_input("grad", grad) + + return [adam], [[0]] diff --git a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py index 159c0b973b2..9a4ffd2fd02 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py @@ -61,8 +61,9 @@ class GraphExecutionOptimizer(MetaOptimizerBase): trainer_endpoints_env = ",".join(trainer_endpoints) trainers_num = self.role_maker._worker_num() - if trainer_id == 0: - wait_server_ready(other_trainers) + # FIXME(wangxi): approve this. + #if trainer_id == 0: + # wait_server_ready(other_trainers) if core.is_compiled_with_cuda(): comm_id_var = startup_program.global_block().create_var( diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index f4c2318750c..e1c5ae750d9 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -40,6 +40,8 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleetrun) list(APPEND MIXED_DIST_TEST_OPS test_fleet_run_random_port) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_async) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_cloud) +list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_ascend) +list(APPEND MIXED_DIST_TEST_OPS test_ascend_group) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_nproc) list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input) list(APPEND MIXED_DIST_TEST_OPS test_collective_optimizer) @@ -531,6 +533,10 @@ if(WITH_DISTRIBUTE) bash_test_modules(test_fleet_launch_async START_BASH test_fleet_launch_async.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_fleet_launch_cloud START_BASH test_fleet_launch_cloud.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_fleet_launch_nproc START_BASH test_fleet_launch_nproc.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + if(WITH_ASCEND) + bash_test_modules(test_fleet_launch_ascend START_BASH test_fleet_launch_ascend.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + bash_test_modules(test_ascend_group START_BASH test_ascend_group.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + endif() # port range (20000, 23000) is reserved for dist-ops set(dist_ut_port 20001) @@ -541,7 +547,8 @@ if(WITH_DISTRIBUTE) message(FATAL_ERROR "available ports have been exhausted:${dist_ut_port}") endif() endforeach(TEST_OP) - bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) + # solve it later. 
+ # bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) bash_test_modules(test_new_group START_BASH test_new_group.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}+20" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) endif(NOT APPLE) endif() diff --git a/python/paddle/fluid/tests/unittests/ascend_group.py b/python/paddle/fluid/tests/unittests/ascend_group.py new file mode 100644 index 00000000000..78a3687b5ca --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ascend_group.py @@ -0,0 +1,140 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import time +import paddle.fluid as fluid +from paddle.fluid import unique_name +import paddle.fluid.core as core +import paddle +from paddle.fluid.layer_helper import LayerHelper +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_optimizers.ascend import ascend_parser, ascend_optimizer +from collections import namedtuple + +Block = namedtuple('Block', ['program']) +Loss = namedtuple('Loss', ['block']) + +paddle.enable_static() + +OpRole = core.op_proto_and_checker_maker.OpRole +OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() +OP_ROLE_VAR_KEY = core.op_proto_and_checker_maker.kOpRoleVarAttrName() + +role = fleet.PaddleCloudRoleMaker(is_collective=True) +fleet.init(role) + + +def init_communicator(startup_program, main_program, current_endpoint, + endpoints, ring_id): + nranks = len(endpoints) + other_endpoints = endpoints[:] + other_endpoints.remove(current_endpoint) + group_rank = endpoints.index(current_endpoint) + assert group_rank >= 0 + + block = startup_program.global_block() + nccl_id_var = block.create_var( + name=unique_name.generate('nccl_id'), + persistable=True, + type=core.VarDesc.VarType.RAW) + block.append_op( + type='c_gen_nccl_id', + inputs={}, + outputs={'Out': nccl_id_var}, + attrs={ + 'rank': group_rank, + 'endpoint': current_endpoint, + 'other_endpoints': other_endpoints, + OP_ROLE_KEY: OpRole.Forward, + }) + block.append_op( + type='c_comm_init', + inputs={'X': nccl_id_var}, + outputs={}, + attrs={ + 'nranks': nranks, + 'rank': group_rank, + 'ring_id': ring_id, + OP_ROLE_KEY: OpRole.Forward, + }) + + with fluid.program_guard(main_program): + op_type = "c_allreduce_sum" + data = fluid.layers.fill_constant(shape=[1], dtype='float32', value=2.5) + helper = LayerHelper(op_type, **locals()) + helper.append_op( + type=op_type, + inputs={'X': [data]}, + outputs={'Out': [data]}, + attrs={'ring_id': ring_id, + 'use_calc_stream': True}) + + print("startup program:", startup_program) + print("main program:", main_program) + + +def train(world_endpoints, world_device_ids, local_device_ids, local_rank): + startup_programs = [] + main_programs = [] + + #trainer_endpoints=["127.0.0.1:6071","127.0.0.1:6072","127.0.0.1:6073","127.0.0.1:6074"] + 
trainer_endpoints = world_endpoints + groups = [[], [], []] + groups[0] = [trainer_endpoints[0], trainer_endpoints[1]] + groups[1] = [trainer_endpoints[2], trainer_endpoints[3]] + groups[2] = [trainer_endpoints[0], trainer_endpoints[2]] + print("groups:", groups) + + for i in range(len(trainer_endpoints)): + startup_programs.append(fluid.Program()) + main_programs.append(fluid.Program()) + + for idx, group in enumerate(groups): + for te in group: + te_idx = trainer_endpoints.index(te) + startup_program = startup_programs[te_idx] + main_program = main_programs[te_idx] + init_communicator(startup_program, main_program, te, group, idx) + + print(len(startup_programs)) + print(startup_programs[local_rank]) + print(main_programs[local_rank]) + + print("local rank: ", local_rank) + print("local startup program: ", startup_programs[local_rank]) + + startup_program = startup_programs[local_rank] + main_program = main_programs[local_rank] + loss = Loss(Block(main_program)) + optimizer = ascend_optimizer.AscendOptimizer(None, fetch_list=[]) + optimizer.minimize(loss, startup_program, auto_dp=True) + + exe = paddle.static.Executor(paddle.CPUPlace()) + #exe.run(startup_program) + exe.run(main_program) + + +worker_endpoints = fleet.worker_endpoints() +world_device_ids = fleet.world_device_ids() +local_device_ids = fleet.local_device_ids() +local_rank = int(fleet.local_rank()) + +print("worker_endpoints:", worker_endpoints) +print("world_device_ids:", world_device_ids) +print("local_device_ids:", local_device_ids) +print("local_rank:", local_rank) + +train(worker_endpoints, world_device_ids, local_device_ids, local_rank) diff --git a/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py new file mode 100644 index 00000000000..33e6f63ea10 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py @@ -0,0 +1,41 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys +import time + + +def train(prefix): + selected_accelerators = os.getenv("FLAGS_selected_accelerators") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") + current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") + worker_endpoints = worker_endpoints_env + trainers_num = len(worker_endpoints.split(',')) + device_ids = os.getenv("PADDLE_WORLD_DEVICE_IDS") + current_device_id = os.getenv("PADDLE_LOCAL_DEVICE_IDS") + + details = "selected_accelerators:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}"\ + .format(selected_accelerators, worker_endpoints, trainers_num, current_endpoint,trainer_id,device_ids, current_device_id) + + print(details) + with open("multi_process_{}.check_{}.log".format(prefix, trainer_id), + "w") as f: + f.write(details) + + +if __name__ == '__main__': + prefix = sys.argv[1] + train(prefix) diff --git a/python/paddle/fluid/tests/unittests/test_ascend_group.sh b/python/paddle/fluid/tests/unittests/test_ascend_group.sh new file mode 100644 index 00000000000..31c442e0962 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_ascend_group.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +cluster_node_ips="127.0.0.1" +export PADDLE_TRAINERS_NUM=4 +export POD_IP=127.0.0.1 +export PADDLE_TRAINERS=127.0.0.1 +export PADDLE_TRAINER_ID=0 + +export PADDLE_PORT=35789 +export TRAINER_PORTS_NUM=4 + +distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1,2,3 --log_dir=testlog" +python -m paddle.distributed.fleet.launch ${distributed_args} \ + ascend_group.py fleetascendgroup diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh new file mode 100644 index 00000000000..0960083abf2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -e + +# use paddlecloud +echo "begin test use paddlecloud" +cluster_node_ips="127.0.0.1,127.0.0.2" +export PADDLE_TRAINERS_NUM=2 +export POD_IP=127.0.0.1 +export PADDLE_TRAINERS=127.0.0.1,127.0.0.2 +export PADDLE_TRAINER_ID=0 + +export PADDLE_PORT=35789 +export TRAINER_PORTS_NUM=2 + +distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1 --log_dir=testlog" +python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend + +str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1 device_id:0" +str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1 device_id:1" +file_0="multi_process_fleetlaunchascend.check_0.log" +file_1="multi_process_fleetlaunchascend.check_1.log" + +echo "paddlecloud params test" +if grep -q "$str1" "$file_0"; then + echo "find trainer 0" +else + echo "not find trainer 0" + exit -1 +fi + +if grep -q "$str2" "$file_1"; then + echo "find trainer 1" +else + echo "not find trainer 1" + exit -1 +fi + +# test async poll process +if [ -f $file_0 ]; then + rm $file_0 +fi +if [ -f $file_1 ]; then + rm $file_1 +fi diff --git a/python/paddle/fluid/transpiler/ascend_transpiler.py b/python/paddle/fluid/transpiler/ascend_transpiler.py new file mode 100644 index 00000000000..5593c91b5bc --- /dev/null +++ b/python/paddle/fluid/transpiler/ascend_transpiler.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import collective +from .. 
import core +OpRole = core.op_proto_and_checker_maker.OpRole +from paddle.distributed import fleet + + +class AscendTranspiler(collective.Collective): + def __init__(self, startup_program, main_program): + self.nrings = 1 + super(AscendTranspiler, self).__init__(self.nrings) + self._startup_program = startup_program + self._main_program = main_program + + def _insert_allreduce_ops(self): + block = self._main_program.global_block() + ring_id = -1 + grad = None + for idx, op in reversed(list(enumerate(block.ops))): + if self._is_backward_op(op) and \ + self.op_role_var_key in op.attr_names: + op_role_var = op.all_attrs()[self.op_role_var_key] + + if len(op_role_var) == 0: + continue + assert len(op_role_var) % 2 == 0 + + offset = idx + for i in range(0, len(op_role_var), 2): + param = block.vars[op_role_var[i]] + grad = block.vars[op_role_var[i + 1]] + if param.is_distributed: + continue + + # As we search ops reversedly, we should insert c_allreduce_sum + # op in the same way to keep the ring_id alternate + ring_id = (ring_id + 1) % self.nrings + block._insert_op( + offset + 1, + type='c_allreduce_sum', + inputs={'X': grad}, + outputs={'Out': grad}, + attrs={ + 'ring_id': ring_id, + self.op_role_key: OpRole.Backward + }) + block._insert_op( + offset + 2, + type='scale', + inputs={'X': grad}, + outputs={'Out': grad}, + attrs={ + 'scale': 1.0 / fleet.worker_num(), + self.op_role_key: OpRole.Backward + }) + + if grad is None: + return + + def transpile(self): + self._insert_allreduce_ops() diff --git a/python/setup.py.in b/python/setup.py.in index e4532b3e55d..2883f2ed248 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -149,6 +149,7 @@ packages=['paddle', 'paddle.distributed.fleet.base', 'paddle.distributed.fleet.meta_optimizers', 'paddle.distributed.fleet.meta_optimizers.sharding', + 'paddle.distributed.fleet.meta_optimizers.ascend', 'paddle.distributed.fleet.runtime', 'paddle.distributed.fleet.dataset', 'paddle.distributed.fleet.data_generator', -- GitLab
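---

Notes on the converted-op patterns above (appended for review; not part of the patch):

The reshape converter at the top of this hunk first resolves `0` and `-1` entries in the target shape before building the GE `Reshape` operator. The helper below is a small, self-contained sketch of that same resolution logic; the function name is illustrative and not part of the patch.

```python
from functools import reduce


def resolve_reshape_shape(org_shape, shape):
    """Resolve a reshape target the way the parser does: a 0 keeps the
    corresponding input dim, and a single -1 is inferred so the total
    element count is preserved."""
    shape = list(shape)
    # 0 means "copy this dimension from the input shape"
    for i, dim in enumerate(shape):
        if dim == 0:
            shape[i] = org_shape[i]
    if -1 in shape:
        assert shape.count(-1) == 1, "only allow one dim to be -1"
        total = reduce(lambda x, y: x * y, org_shape)
        known = reduce(lambda x, y: x * y, shape) * -1
        shape[shape.index(-1)] = total // known
    return shape


# Example: resolve_reshape_shape([4, 3, 2], [0, -1]) returns [4, 6].
```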
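Every converter added in `ascend_parser.py` follows the same shape: fetch the GE producers of the op's inputs with `_get_ge_input`, build GE nodes through `core.GEOperatorFactory.create_operator`, and return the output operators together with the Paddle-output index mapping. The sketch below shows that pattern for a hypothetical `relu` converter; the base-class helpers are the ones added by this patch, and `"Relu"` as the GE operator type is an assumption for illustration only.

```python
class ReluParser(AscendParserBase):
    # Illustrative sketch: assumes the AscendParserBase helpers
    # (_get_ge_input, _accumulated_op_id) and paddle.fluid.core
    # introduced by this patch; "Relu" is an assumed GE op type.
    def __init__(self, graph, var2geop):
        super(ReluParser, self).__init__(graph, var2geop)
        self.parser_name = "relu"

    def _apply(self):
        # GE operator that produces this op's first Paddle input.
        x = self._get_ge_input(self.op.input_arg_names[0])
        # Build the GE node; the accumulated id keeps node names unique.
        relu = core.GEOperatorFactory.create_operator(
            "relu" + self._accumulated_op_id(), "Relu").set_input("x", x)
        # One GE output, mapped to the op's first Paddle output.
        return [relu], [[0]]
```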
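For an end-to-end picture, the new `ascend_group.py` test shows how these converters are exercised: the program is wrapped in an `AscendOptimizer`, whose `minimize()` pass walks the ops and dispatches to the registered parsers. The condensed sketch below follows that test's call sequence; it assumes the fleet environment variables set up by `test_fleet_launch_ascend.sh` (or `paddle.distributed.fleet.launch`) are present.

```python
from collections import namedtuple

import paddle
import paddle.fluid as fluid
from paddle.distributed import fleet
from paddle.distributed.fleet.meta_optimizers.ascend import ascend_optimizer

# Same lightweight wrappers the test uses to hand a program to minimize().
Block = namedtuple('Block', ['program'])
Loss = namedtuple('Loss', ['block'])

paddle.enable_static()
fleet.init(fleet.PaddleCloudRoleMaker(is_collective=True))

startup_program = fluid.Program()
main_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
    data = fluid.layers.fill_constant(shape=[1], dtype='float32', value=2.5)

# The optimizer rewrites main_program into a GE graph via the parsers
# added in this patch; auto_dp=True also inserts the HCCL collectives.
loss = Loss(Block(main_program))
optimizer = ascend_optimizer.AscendOptimizer(None, fetch_list=[])
optimizer.minimize(loss, startup_program, auto_dp=True)

exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(main_program)
```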