From 6dd52c5b255f7399188818a29991f5d375cd175d Mon Sep 17 00:00:00 2001 From: hutuxian Date: Fri, 15 Jan 2021 14:50:46 +0800 Subject: [PATCH] Ascend rc (#30483) --- CMakeLists.txt | 5 + cmake/configure.cmake | 4 + cmake/external/ascend.cmake | 61 ++ cmake/external/cryptopp.cmake | 4 +- cmake/external/dlpack.cmake | 2 +- cmake/external/gflags.cmake | 6 +- cmake/external/glog.cmake | 6 +- cmake/external/grpc.cmake | 2 +- cmake/external/openblas.cmake | 2 +- cmake/external/protobuf.cmake | 8 +- cmake/external/pybind11.cmake | 4 +- cmake/external/threadpool.cmake | 2 +- cmake/external/warpctc.cmake | 5 +- cmake/external/xbyak.cmake | 2 +- cmake/external/xxhash.cmake | 2 +- cmake/external/zlib.cmake | 4 +- cmake/third_party.cmake | 5 + paddle/fluid/framework/fleet/CMakeLists.txt | 4 + .../fluid/framework/fleet/ascend_wrapper.cc | 22 + paddle/fluid/framework/fleet/ascend_wrapper.h | 183 +++++ paddle/fluid/operators/CMakeLists.txt | 3 + paddle/fluid/operators/ascend_trigger_op.cc | 52 ++ paddle/fluid/operators/ascend_trigger_op.h | 46 ++ paddle/fluid/pybind/CMakeLists.txt | 5 + paddle/fluid/pybind/ascend_wrapper_py.cc | 694 ++++++++++++++++++ paddle/fluid/pybind/ascend_wrapper_py.h | 31 + paddle/fluid/pybind/pybind.cc | 7 + .../ascend/ascend_optimizer.py | 179 +++++ .../meta_optimizers/ascend/ascend_parser.py | 529 +++++++++++++ .../tests/unittests/test_ascend_trigger.py | 49 ++ 30 files changed, 1904 insertions(+), 24 deletions(-) create mode 100644 cmake/external/ascend.cmake create mode 100644 paddle/fluid/framework/fleet/ascend_wrapper.cc create mode 100644 paddle/fluid/framework/fleet/ascend_wrapper.h create mode 100644 paddle/fluid/operators/ascend_trigger_op.cc create mode 100644 paddle/fluid/operators/ascend_trigger_op.h create mode 100644 paddle/fluid/pybind/ascend_wrapper_py.cc create mode 100644 paddle/fluid/pybind/ascend_wrapper_py.h create mode 100644 python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py create mode 100644 python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py create mode 100644 python/paddle/fluid/tests/unittests/test_ascend_trigger.py diff --git a/CMakeLists.txt b/CMakeLists.txt index a58640d942..d919dc5ac0 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,9 +31,13 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) +option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) if (WITH_GPU AND WITH_XPU) message(FATAL_ERROR "Error when compile GPU and XPU at the same time") endif() +if (WITH_GPU AND WITH_ASCEND) + message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time") +endif() # cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them. if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15)) message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. 
" @@ -322,6 +326,7 @@ set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") if(ON_INFER) # you can trun off the paddle fluid and inference lib by set ON_INFER=OFF diff --git a/cmake/configure.cmake b/cmake/configure.cmake index aeec7da2e6..fc1e72ba3f 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -78,6 +78,10 @@ if(WITH_BOX_PS) add_definitions(-DPADDLE_WITH_BOX_PS) endif() +if(WITH_ASCEND) + add_definitions(-DPADDLE_WITH_ASCEND) +endif() + if(WITH_XPU) message(STATUS "Compile with XPU!") add_definitions(-DPADDLE_WITH_XPU) diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake new file mode 100644 index 0000000000..bcf0c0a064 --- /dev/null +++ b/cmake/external/ascend.cmake @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(ASCEND_PROJECT "extern_ascend") +IF((NOT DEFINED ASCEND_VER) OR (NOT DEFINED ASCEND_URL)) + MESSAGE(STATUS "use pre defined download url") + SET(ASCEND_VER "0.1.1" CACHE STRING "" FORCE) + SET(ASCEND_NAME "ascend" CACHE STRING "" FORCE) + SET(ASCEND_URL "http://paddle-ascend.bj.bcebos.com/ascend.tar.gz" CACHE STRING "" FORCE) +ENDIF() +MESSAGE(STATUS "ASCEND_NAME: ${ASCEND_NAME}, ASCEND_URL: ${ASCEND_URL}") +SET(ASCEND_SOURCE_DIR "${THIRD_PARTY_PATH}/ascend") +SET(ASCEND_DOWNLOAD_DIR "${ASCEND_SOURCE_DIR}/src/${ASCEND_PROJECT}") +SET(ASCEND_DST_DIR "ascend") +SET(ASCEND_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(ASCEND_INSTALL_DIR ${ASCEND_INSTALL_ROOT}/${ASCEND_DST_DIR}) +SET(ASCEND_ROOT ${ASCEND_INSTALL_DIR}) +SET(ASCEND_INC_DIR ${ASCEND_ROOT}/include) +SET(ASCEND_LIB_DIR ${ASCEND_ROOT}/lib) +SET(ASCEND_LIB ${ASCEND_LIB_DIR}/libge_runner.so) +SET(ASCEND_GRAPH_LIB ${ASCEND_LIB_DIR}/libgraph.so) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${ASCEND_ROOT}/lib") + +INCLUDE_DIRECTORIES(${ASCEND_INC_DIR}) +FILE(WRITE ${ASCEND_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(ASCEND)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${ASCEND_NAME}/include ${ASCEND_NAME}/lib \n" + " DESTINATION ${ASCEND_DST_DIR})\n") +ExternalProject_Add( + ${ASCEND_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${ASCEND_SOURCE_DIR} + DOWNLOAD_DIR ${ASCEND_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${ASCEND_URL} -c -q -O ${ASCEND_NAME}.tar.gz + && tar zxvf ${ASCEND_NAME}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ASCEND_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ASCEND_INSTALL_ROOT} +) +ADD_LIBRARY(ascend SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET ascend PROPERTY IMPORTED_LOCATION ${ASCEND_LIB}) + +ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${ASCEND_GRAPH_LIB}) +ADD_DEPENDENCIES(ascend 
ascend_graph ${ASCEND_PROJECT}) + diff --git a/cmake/external/cryptopp.cmake b/cmake/external/cryptopp.cmake index 3176e2a665..a9e1a4d67b 100644 --- a/cmake/external/cryptopp.cmake +++ b/cmake/external/cryptopp.cmake @@ -17,7 +17,7 @@ INCLUDE(ExternalProject) SET(CRYPTOPP_PREFIX_DIR ${THIRD_PARTY_PATH}/cryptopp) SET(CRYPTOPP_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cryptopp) SET(CRYPTOPP_INCLUDE_DIR "${CRYPTOPP_INSTALL_DIR}/include" CACHE PATH "cryptopp include directory." FORCE) -SET(CRYPTOPP_REPOSITORY ${GIT_URL}/weidai11/cryptopp.git) +SET(CRYPTOPP_REPOSITORY https://gitee.com/tianjianhe/cryptopp.git) SET(CRYPTOPP_TAG CRYPTOPP_8_2_0) IF(WIN32) @@ -33,7 +33,7 @@ set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS} -DCMAKE_INSTALL_LIBDIR=${CRYPTOPP_INSTALL_DIR}/lib -DCMAKE_INSTALL_PREFIX=${CRYPTOPP_INSTALL_DIR} -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0" -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake index 87db181d95..fa6f8e8d4c 100644 --- a/cmake/external/dlpack.cmake +++ b/cmake/external/dlpack.cmake @@ -17,7 +17,7 @@ include(ExternalProject) set(DLPACK_PREFIX_DIR ${THIRD_PARTY_PATH}/dlpack) set(DLPACK_SOURCE_DIR ${THIRD_PARTY_PATH}/dlpack/src/extern_dlpack) -set(DLPACK_REPOSITORY ${GIT_URL}/dmlc/dlpack.git) +set(DLPACK_REPOSITORY https://gitee.com/tianjianhe/dlpack.git) set(DLPACK_TAG v0.2) cache_third_party(extern_dlpack diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 34f5d7e2be..8ee0c4cdcd 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -18,8 +18,8 @@ SET(GFLAGS_PREFIX_DIR ${THIRD_PARTY_PATH}/gflags) SET(GFLAGS_SOURCE_DIR ${THIRD_PARTY_PATH}/gflags/src/extern_gflags) SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags) SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE) -set(GFLAGS_REPOSITORY ${GIT_URL}/gflags/gflags.git) -set(GFLAGS_TAG "v2.2.2") +set(GFLAGS_REPOSITORY https://gitee.com/tianjianhe/gflags.git) +set(GFLAGS_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a) IF(WIN32) set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) ELSE(WIN32) @@ -48,7 +48,7 @@ ExternalProject_Add( INSTALL_COMMAND ${INSTALL_COMMAND} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0" -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 05b98e2b56..64410e99bd 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -18,8 +18,8 @@ SET(GLOG_PREFIX_DIR ${THIRD_PARTY_PATH}/glog) SET(GLOG_SOURCE_DIR ${THIRD_PARTY_PATH}/glog/src/extern_glog) SET(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog) SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE) -SET(GLOG_REPOSITORY ${GIT_URL}/google/glog.git) -SET(GLOG_TAG v0.4.0) +SET(GLOG_REPOSITORY https://gitee.com/tianjianhe/glog.git) +SET(GLOG_TAG v0.3.5) IF(WIN32) SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/glog.lib" CACHE FILEPATH "glog library." 
FORCE) @@ -47,7 +47,7 @@ ExternalProject_Add( SOURCE_DIR ${GLOG_SOURCE_DIR} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS} + "-DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0" -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index 536e95c1dc..bd2f4d11ed 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -28,7 +28,7 @@ IF(APPLE) SET(GRPC_INSTALL_CMD make prefix=${GRPC_INSTALL_DIR} install) ELSE() SET(GRPC_CFLAGS "-Wno-error -std=c11 ${CLFAGS}") - SET(GRPC_CXXFLAGS "-Wno-error -std=c++11 ${CXXFLAGS}") + SET(GRPC_CXXFLAGS "-Wno-error -std=c++11 ${CXXFLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") SET(BUILD_CMD make CFLAGS=${GRPC_CFLAGS} CXXFLAGS=${GRPC_CXXFLAGS} HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin) SET(GRPC_INSTALL_CMD make prefix=${GRPC_INSTALL_DIR} install CFLAGS=${GRPC_CFLAGS} CXXFLAGS=${GRPC_CXXFLAGS}) ENDIF() diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 19ba6d15c5..f459bbfd47 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -17,7 +17,7 @@ INCLUDE(ExternalProject) SET(CBLAS_PREFIX_DIR ${THIRD_PARTY_PATH}/openblas) SET(CBLAS_SOURCE_DIR ${THIRD_PARTY_PATH}/openblas/src/extern_openblas) SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas) -SET(CBLAS_REPOSITORY ${GIT_URL}/xianyi/OpenBLAS.git) +SET(CBLAS_REPOSITORY https://gitee.com/tianjianhe/OpenBLAS.git) SET(CBLAS_TAG v0.3.7) if(WITH_MIPS) SET(CBLAS_TAG v0.3.13) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 905c17b930..dd0de0d086 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -183,7 +183,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0" "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" "-Dprotobuf_WITH_ZLIB=ON" @@ -198,8 +198,8 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}") ENDIF() - SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) - SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) + SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) + SET(PROTOBUF_TAG v3.8.0) cache_third_party(${TARGET_NAME} REPOSITORY ${PROTOBUF_REPOSITORY} @@ -234,7 +234,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ) ENDFUNCTION() -SET(PROTOBUF_VERSION 3.1.0) +# SET(PROTOBUF_VERSION 3.1.0) IF(NOT PROTOBUF_FOUND) build_protobuf(extern_protobuf FALSE) diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index 69bd68c277..c6be74811d 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -16,8 +16,8 @@ include(ExternalProject) set(PYBIND_PREFIX_DIR ${THIRD_PARTY_PATH}/pybind) set(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind/src/extern_pybind) -SET(PYBIND_REPOSITORY ${GIT_URL}/pybind/pybind11.git) -SET(PYBIND_TAG v2.4.3) +SET(PYBIND_REPOSITORY https://gitee.com/tianjianhe/pybind11.git) +SET(PYBIND_TAG v2.6.0) cache_third_party(extern_pybind REPOSITORY 
${PYBIND_REPOSITORY} diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake index 205e8d26d9..6c7ff3d6d7 100644 --- a/cmake/external/threadpool.cmake +++ b/cmake/external/threadpool.cmake @@ -16,7 +16,7 @@ INCLUDE(ExternalProject) SET(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool) SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool) -SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git) +SET(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git) SET(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040) cache_third_party(extern_threadpool diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 0ee3e2116a..e5d79cf558 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -17,8 +17,9 @@ INCLUDE(ExternalProject) SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc) SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) -set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git) +set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git) set(WARPCTC_TAG 95a461eddeabd51099ef059dcfada1117eb1bfb8) +# set(WARPCTC_TAG bc29dcfff07ced1c7a19a4ecee48e5ad583cef8e) SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE) @@ -52,7 +53,7 @@ ExternalProject_Add( -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0" -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake index 6627c4eed1..c4c04c98bc 100644 --- a/cmake/external/xbyak.cmake +++ b/cmake/external/xbyak.cmake @@ -19,7 +19,7 @@ set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak) SET(XBYAK_SOURCE_DIR ${THIRD_PARTY_PATH}/xbyak/src/extern_xbyak) set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak) set(XBYAK_INC_DIR ${XBYAK_INSTALL_ROOT}/include) -set(XBYAK_REPOSITORY ${GIT_URL}/herumi/xbyak.git) +set(XBYAK_REPOSITORY https://gitee.com/tianjianhe/xbyak.git) set(XBYAK_TAG v5.661) # Jul 26th include_directories(${XBYAK_INC_DIR}) diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index bdd7df190f..4033237b9e 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -18,7 +18,7 @@ set(XXHASH_PREFIX_DIR ${THIRD_PARTY_PATH}/xxhash) set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash/src/extern_xxhash) set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash) set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include") -set(XXHASH_REPOSITORY ${GIT_URL}/Cyan4973/xxHash.git) +set(XXHASH_REPOSITORY https://gitee.com/tianjianhe/xxHash.git) set(XXHASH_TAG v0.6.5) cache_third_party(extern_xxhash diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index 4464787a0c..334fe5c355 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -19,7 +19,7 @@ SET(ZLIB_SOURCE_DIR ${THIRD_PARTY_PATH}/zlib/src/extern_zlib) SET(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib) SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE FILEPATH "zlib root directory." FORCE) SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." 
FORCE) -set(ZLIB_REPOSITORY ${GIT_URL}/madler/zlib.git) +set(ZLIB_REPOSITORY https://gitee.com/tianjianhe/zlib.git) set(ZLIB_TAG v1.2.8) INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers. @@ -41,7 +41,7 @@ ExternalProject_Add( CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0" -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR} -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 84020f57f1..d576a299b8 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -274,6 +274,11 @@ if(WITH_BOX_PS) list(APPEND third_party_deps extern_box_ps) endif(WITH_BOX_PS) +if(WITH_ASCEND) + include(external/ascend) + list(APPEND third_party_deps extern_ascend) +endif (WITH_ASCEND) + if (WITH_PSCORE) include(external/snappy) list(APPEND third_party_deps extern_snappy) diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index c774a58e05..4d0cfb6297 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -31,3 +31,7 @@ endif(WITH_GLOO) cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context heter_service_proto) cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell) + +if(WITH_ASCEND) + cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend ascend_graph) +endif(WITH_ASCEND) diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.cc b/paddle/fluid/framework/fleet/ascend_wrapper.cc new file mode 100644 index 0000000000..d1b2f51f70 --- /dev/null +++ b/paddle/fluid/framework/fleet/ascend_wrapper.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_ASCEND +#include "paddle/fluid/framework/fleet/ascend_wrapper.h" +namespace paddle { +namespace framework { +std::shared_ptr AscendInstance::ascend_instance_ = nullptr; +} // end namespace framework +} // end namespace paddle +#endif diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h new file mode 100644 index 0000000000..da79fccb8c --- /dev/null +++ b/paddle/fluid/framework/fleet/ascend_wrapper.h @@ -0,0 +1,183 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef PADDLE_WITH_ASCEND +#include + +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/timer.h" + +#include "ge/ge_api.h" +#include "ge/ge_api_types.h" +#include "graph/attr_value.h" +#include "graph/tensor.h" +#include "graph/types.h" + +namespace paddle { +namespace framework { + +// typedef std::vector AscendGraphDesc; +typedef ge::Graph AscendGraphDesc; + +class AscendInstance { + public: + virtual ~AscendInstance() {} + AscendInstance() {} + + std::map GetDefaultInitSessionOptions() { + std::map init_options; + init_options["a"] = "b"; + init_options["ge.trainFlag"] = "1"; + return init_options; + } + + // add other parameters here to init + void InitGlobalResouces() { + session_.reset(new ge::Session(GetDefaultInitSessionOptions())); + VLOG(1) << "InitGlobalResouces Done"; + } + + static std::shared_ptr GetInstance() { + if (nullptr == ascend_instance_) { + ascend_instance_.reset(new paddle::framework::AscendInstance()); + VLOG(1) << "Initialize AscendInstance Done"; + } + return ascend_instance_; + } + + void AddAscendSubgraph(int graph_idx, const AscendGraphDesc &graph) { + ge::Status status = session_->AddGraph(graph_idx, graph); + PADDLE_ENFORCE_EQ(status, ge::SUCCESS, + paddle::platform::errors::PreconditionNotMet( + "Calling addGraph of graph engine failed, please " + "check Ascend Log.")); + VLOG(1) << "AddAscendSubgraph " << graph_idx << " Done"; + } + + ge::DataType VarTypeToGeType(proto::VarType::Type type) { + if (type == proto::VarType::FP16) { + return ge::DataType::DT_FLOAT16; + } else if (type == proto::VarType::FP32) { + return ge::DataType::DT_FLOAT; + } else if (type == proto::VarType::FP64) { + return ge::DataType::DT_DOUBLE; + } else if (type == proto::VarType::INT32) { + return ge::DataType::DT_INT32; + } else if (type == proto::VarType::INT64) { + return ge::DataType::DT_INT64; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Not support %s as tensor type.", DataTypeToString(type))); + } + } + int GeTypeSize(proto::VarType::Type type) { + if (type == proto::VarType::FP16) { + return 2; + } else if (type == proto::VarType::FP32) { + return 4; + } else if (type == proto::VarType::FP64) { + return 8; + } else if (type == proto::VarType::INT32) { + return 4; + } else if (type == proto::VarType::INT64) { + return 8; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Not support %s as tensor type.", DataTypeToString(type))); + } + } + ge::Tensor ConvertToGeTensor(const Tensor *tensor) { + auto numel = tensor->numel(); + std::vector vec_dim; + auto dimen = arity(tensor->dims()); + for (auto i = 0; i < dimen; ++i) { + vec_dim.push_back(tensor->dims()[i]); + } + // For Debug + // VLOG(1) << "input numel: " << numel << ", dimen is " << vec_dim.size() << + // ", and shape is"; + // for (const auto e : vec_dim) { + // VLOG(0) << e; + // } + + ge::Shape shape(vec_dim); + ge::TensorDesc tensor_desc(shape, ge::Format::FORMAT_ND, + VarTypeToGeType(tensor->type())); + tensor_desc.SetRealDimCnt(vec_dim.size()); + + const uint8_t *data = + reinterpret_cast(tensor->data()); + std::vector dst(numel * GeTypeSize(tensor->type())); + memcpy(dst.data(), data, GeTypeSize(tensor->type()) * numel); + ge::Tensor ge_tensor(tensor_desc, dst); + 
return ge_tensor; + } + + void RunAscendSubgraph(int graph_idx, + const std::vector &inputs, + std::vector *outputs) { + VLOG(1) << "Ascend Graph[" << graph_idx << "] is about to run."; + // Convert paddle Tensor to GE Tensor + std::vector ge_inputs; + for (const auto &e : inputs) { + ge_inputs.push_back(ConvertToGeTensor(e)); + } + + // Run Graph + std::vector ge_outputs; + ge::Status status = session_->RunGraph(graph_idx, ge_inputs, ge_outputs); + PADDLE_ENFORCE_EQ(status, ge::SUCCESS, + paddle::platform::errors::PreconditionNotMet( + "Calling RunGraph of graph engine failed, please " + "check Ascend Log.")); + VLOG(1) << "Run Ascend Graph[" << graph_idx << "] Done"; + + // change tensor back, note all tensor's type computed in GE is uint8 + for (size_t i = 0; i < ge_outputs.size(); ++i) { + const uint8_t *ret_data = ge_outputs[i].GetData(); + size_t size = ge_outputs[i].GetSize(); + VLOG(1) << "GE Tensor size of the " << i << "th output var is " << size; + auto *dst = (*outputs)[i]->mutable_data({(int64_t)size}, + platform::CPUPlace()); + memcpy(dst, ret_data, size); + + // Following for debug: + // VLOG(0) << "output for " << i << " var: "; + // float *tmp = reinterpret_cast(dst); + // for (size_t j = 0; j < size / 4; ++j) { + // printf("%f ", tmp[j]); + // } + // printf("\n"); + } + } + + protected: + std::shared_ptr session_; + + private: + static std::shared_ptr ascend_instance_; +}; +} // end namespace framework +} // end namespace paddle +#endif diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 28741ce947..f46320acf1 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -115,6 +115,9 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} tensor_formatter) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} op_version_registry) +if (WITH_ASCEND) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} ascend_wrapper) +endif() # FIXME(typhoonzero): operator deps may not needed. # op_library(lod_tensor_to_array_op DEPS lod_rank_table_op) diff --git a/paddle/fluid/operators/ascend_trigger_op.cc b/paddle/fluid/operators/ascend_trigger_op.cc new file mode 100644 index 0000000000..b699ceec87 --- /dev/null +++ b/paddle/fluid/operators/ascend_trigger_op.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
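+
+// ascend_trigger_op runs a previously added Ascend (GE) subgraph: the tensors
+// in "FeedList" are fed to the graph selected by the "graph_idx" attribute and
+// the results are written back into the "FetchList" outputs (see
+// AscendTriggerCPUKernel in ascend_trigger_op.h).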
+ +#include "paddle/fluid/operators/ascend_trigger_op.h" + +namespace paddle { +namespace operators { + +class AscendTriggerOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.device_context()); + } +}; + +class AscendTriggerOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("FeedList", "FeedList of Ascend SubGraph").AsDuplicable(); + AddOutput("FetchList", "FetchList of Ascend SubGraph").AsDuplicable(); + AddAttr("graph_idx", "(int, the graph index").SetDefault(-1); + AddComment(R"DOC( +Trigger Ascend SubGraph + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(ascend_trigger, ops::AscendTriggerOp, + ops::AscendTriggerOpMaker); +REGISTER_OP_CPU_KERNEL(ascend_trigger, ops::AscendTriggerCPUKernel) diff --git a/paddle/fluid/operators/ascend_trigger_op.h b/paddle/fluid/operators/ascend_trigger_op.h new file mode 100644 index 0000000000..eaa79da2ba --- /dev/null +++ b/paddle/fluid/operators/ascend_trigger_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
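+
+// Defines AscendTriggerCPUKernel: when built WITH_ASCEND it forwards the
+// "FeedList"/"FetchList" tensors to AscendInstance::RunAscendSubgraph for the
+// graph given by "graph_idx"; without WITH_ASCEND it throws a
+// PreconditionNotMet error.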
+ +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#ifdef PADDLE_WITH_ASCEND +#include "paddle/fluid/framework/fleet/ascend_wrapper.h" +#include "paddle/fluid/framework/tensor.h" +#endif + +namespace paddle { +namespace operators { + +template +class AscendTriggerCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { +#ifdef PADDLE_WITH_ASCEND + auto ascend_ptr = paddle::framework::AscendInstance::GetInstance(); + auto graph_idx = ctx.Attr("graph_idx"); + VLOG(4) << "AscendTrigger Kernel, begin to run graph: " << graph_idx; + auto inputs = ctx.MultiInput("FeedList"); + auto outputs = ctx.MultiOutput("FetchList"); + ascend_ptr->RunAscendSubgraph(graph_idx, inputs, &outputs); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Please compile WITH_ASCEND option to enable ascend_trigger op")); +#endif + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 1e4bf43f62..0f52d7344c 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -39,6 +39,11 @@ set(PYBIND_SRCS compatible.cc generator_py.cc) +if(WITH_ASCEND) + set(PYBIND_DEPS ${PYBIND_DEPS} ascend_wrapper) + set(PYBIND_SRCS ${PYBIND_SRCS} ascend_wrapper_py.cc) +endif(WITH_ASCEND) + if(WITH_GLOO) set(PYBIND_DEPS ${PYBIND_DEPS} gloo_context) set(PYBIND_SRCS ${PYBIND_SRCS} gloo_context_py.cc) diff --git a/paddle/fluid/pybind/ascend_wrapper_py.cc b/paddle/fluid/pybind/ascend_wrapper_py.cc new file mode 100644 index 0000000000..00eca38085 --- /dev/null +++ b/paddle/fluid/pybind/ascend_wrapper_py.cc @@ -0,0 +1,694 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_ASCEND +#include + +#ifdef _POSIX_C_SOURCE +#undef _POSIX_C_SOURCE +#endif + +#ifdef _XOPEN_SOURCE +#undef _XOPEN_SOURCE +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/fleet/ascend_wrapper.h" +#include "paddle/fluid/pybind/ascend_wrapper_py.h" + +using namespace ge; // NOLINT +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +void BindAscendWrapper(py::module *m) { + py::class_>(*m, "AscendInstance") + .def(py::init([]() { return framework::AscendInstance::GetInstance(); })) + .def("init_global_resources", + &framework::AscendInstance::InitGlobalResouces, + py::call_guard()) + .def("add_ascend_subgraph", &framework::AscendInstance::AddAscendSubgraph, + py::call_guard()); +} // end AscendWrapper + +Status ge_initialize(std::map &options) { // NOLINT + py::gil_scoped_release release; + Status res = GEInitialize(options); + py::gil_scoped_acquire acquire; + return res; +} + +enum AttrType { + AT_INT64 = 0, + AT_INT32, + AT_UINT32, + AT_LIST_INT64, + AT_LIST_INT32, + AT_LIST_UINT32, + AT_FLOAT, + AT_LIST_FLOAT, + AT_ATTR_VALUE, + AT_STRING, + AT_LIST_STRING, + AT_BOOL, + AT_LIST_BOOL, + AT_TENSOR, + AT_LIST_TENSOR, + AT_LIST_UINT8, + AT_LIST_LIST_INT64, + AT_LIST_DT, + AT_DT, + AT_LIST_NAMEATTR, + AT_NAMEATTR +}; + +void BindAscendGraph(py::module *m) { + m->def("ge_initialize", &ge_initialize, "GEInitialize"); + m->def("ge_finalize", &GEFinalize, "GEFinalize"); + + //枚举封装 + py::enum_(*m, "GEGraphRunMode") + .value("PREDICTION", GraphRunMode::PREDICTION) + .value("TRAIN", GraphRunMode::TRAIN) + .export_values(); + + py::enum_(*m, "GEDataType") + .value("DT_FLOAT", DataType::DT_FLOAT) + .value("DT_FLOAT16", DataType::DT_FLOAT16) + .value("DT_INT8", DataType::DT_INT8) + .value("DT_INT16", DataType::DT_INT16) + .value("DT_UINT16", DataType::DT_UINT16) + .value("DT_UINT8", DataType::DT_UINT8) + .value("DT_INT32", DataType::DT_INT32) + .value("DT_INT64", DataType::DT_INT64) + .value("DT_UINT32", DataType::DT_UINT32) + .value("DT_UINT64", DataType::DT_UINT64) + .value("DT_BOOL", DataType::DT_BOOL) + .value("DT_DOUBLE", DataType::DT_DOUBLE) + .value("DT_STRING", DataType::DT_STRING) + .value("DT_DUAL_SUB_INT8", DataType::DT_DUAL_SUB_INT8) + .value("DT_DUAL_SUB_UINT8", DataType::DT_DUAL_SUB_UINT8) + .value("DT_COMPLEX64", DataType::DT_COMPLEX64) + .value("DT_COMPLEX128", DataType::DT_COMPLEX128) + .value("DT_QINT8", DataType::DT_QINT8) + .value("DT_QINT16", DataType::DT_QINT16) + .value("DT_QINT32", DataType::DT_QINT32) + .value("DT_QUINT8", DataType::DT_QUINT8) + .value("DT_QUINT16", DataType::DT_QUINT16) + .value("DT_RESOURCE", DataType::DT_RESOURCE) + .value("DT_STRING_REF", DataType::DT_STRING_REF) + .value("DT_DUAL", DataType::DT_DUAL) + .value("DT_UNDEFINED", DataType::DT_UNDEFINED) + .export_values(); + + py::enum_(*m, "GEFormat") + .value("FORMAT_NCHW", Format::FORMAT_NCHW) + .value("FORMAT_NHWC", Format::FORMAT_NHWC) + .value("FORMAT_ND", Format::FORMAT_ND) + .value("FORMAT_NC1HWC0", Format::FORMAT_NC1HWC0) + .value("FORMAT_FRACTAL_Z", Format::FORMAT_FRACTAL_Z) + .value("FORMAT_NC1C0HWPAD", Format::FORMAT_NC1C0HWPAD) + .value("FORMAT_NHWC1C0", Format::FORMAT_NHWC1C0) + .value("FORMAT_FSR_NCHW", Format::FORMAT_FSR_NCHW) + .value("FORMAT_FRACTAL_DECONV", Format::FORMAT_FRACTAL_DECONV) + .value("FORMAT_C1HWNC0", Format::FORMAT_C1HWNC0) + .value("FORMAT_FRACTAL_DECONV_TRANSPOSE", + Format::FORMAT_FRACTAL_DECONV_TRANSPOSE) + .value("FORMAT_FRACTAL_DECONV_SP_STRIDE_TRANS", + 
Format::FORMAT_FRACTAL_DECONV_SP_STRIDE_TRANS) + .value("FORMAT_NC1HWC0_C04", Format::FORMAT_NC1HWC0_C04) + .value("FORMAT_FRACTAL_Z_C04", Format::FORMAT_FRACTAL_Z_C04) + .value("FORMAT_CHWN", Format::FORMAT_CHWN) + .value("FORMAT_FRACTAL_DECONV_SP_STRIDE8_TRANS", + Format::FORMAT_FRACTAL_DECONV_SP_STRIDE8_TRANS) + .value("FORMAT_HWCN", Format::FORMAT_HWCN) + .value("FORMAT_NC1KHKWHWC0", Format::FORMAT_NC1KHKWHWC0) + .value("FORMAT_BN_WEIGHT", Format::FORMAT_BN_WEIGHT) + .value("FORMAT_FILTER_HWCK", Format::FORMAT_FILTER_HWCK) + .value("FORMAT_HASHTABLE_LOOKUP_LOOKUPS", + Format::FORMAT_HASHTABLE_LOOKUP_LOOKUPS) + .value("FORMAT_HASHTABLE_LOOKUP_KEYS", + Format::FORMAT_HASHTABLE_LOOKUP_KEYS) + .value("FORMAT_HASHTABLE_LOOKUP_VALUE", + Format::FORMAT_HASHTABLE_LOOKUP_VALUE) + .value("FORMAT_HASHTABLE_LOOKUP_OUTPUT", + Format::FORMAT_HASHTABLE_LOOKUP_OUTPUT) + .value("FORMAT_HASHTABLE_LOOKUP_HITS", + Format::FORMAT_HASHTABLE_LOOKUP_HITS) + .value("FORMAT_C1HWNCoC0", Format::FORMAT_C1HWNCoC0) + .value("FORMAT_MD", Format::FORMAT_MD) + .value("FORMAT_NDHWC", Format::FORMAT_NDHWC) + .value("FORMAT_FRACTAL_ZZ", Format::FORMAT_FRACTAL_ZZ) + .value("FORMAT_FRACTAL_NZ", Format::FORMAT_FRACTAL_NZ) + .value("FORMAT_NCDHW", Format::FORMAT_NCDHW) + .value("FORMAT_DHWCN", Format::FORMAT_DHWCN) + .value("FORMAT_NDC1HWC0", Format::FORMAT_NDC1HWC0) + .value("FORMAT_FRACTAL_Z_3D", Format::FORMAT_FRACTAL_Z_3D) + .value("FORMAT_CN", Format::FORMAT_CN) + .value("FORMAT_NC", Format::FORMAT_NC) + .value("FORMAT_DHWNC", Format::FORMAT_DHWNC) + .value("FORMAT_FRACTAL_Z_3D_TRANSPOSE", + Format::FORMAT_FRACTAL_Z_3D_TRANSPOSE) + .value("FORMAT_FRACTAL_ZN_LSTM", Format::FORMAT_FRACTAL_ZN_LSTM) + .value("FORMAT_FRACTAL_Z_G", Format::FORMAT_FRACTAL_Z_G) + .value("FORMAT_RESERVED", Format::FORMAT_RESERVED) + .value("FORMAT_ALL", Format::FORMAT_ALL) + .value("FORMAT_NULL", Format::FORMAT_NULL) + .export_values(); + + py::enum_(*m, "GEUnknowShapeOpType") + .value("DEPEND_IN_SHAPE", UnknowShapeOpType::DEPEND_IN_SHAPE) + .value("DEPEND_CONST_VALUE", UnknowShapeOpType::DEPEND_CONST_VALUE) + .value("DEPEND_SHAPE_RANGE", UnknowShapeOpType::DEPEND_SHAPE_RANGE) + .value("DEPEND_COMPUTE", UnknowShapeOpType::DEPEND_COMPUTE) + .export_values(); + + py::enum_(*m, "GEDeviceType") + .value("NPU", DeviceType::NPU) + .value("CPU", DeviceType::CPU) + .export_values(); + + py::enum_(*m, "GEAttrType") + .value("AT_INT64", AttrType::AT_INT64) + .value("AT_INT32", AttrType::AT_INT32) + .value("AT_UINT32", AttrType::AT_UINT32) + .value("AT_LIST_INT64", AttrType::AT_LIST_INT64) + .value("AT_LIST_INT32", AttrType::AT_LIST_INT32) + .value("AT_LIST_UINT32", AttrType::AT_LIST_UINT32) + .value("AT_FLOAT", AttrType::AT_FLOAT) + .value("AT_LIST_FLOAT", AttrType::AT_LIST_FLOAT) + .value("AT_ATTR_VALUE", AttrType::AT_ATTR_VALUE) + .value("AT_STRING", AttrType::AT_STRING) + .value("AT_LIST_STRING", AttrType::AT_LIST_STRING) + .value("AT_BOOL", AttrType::AT_BOOL) + .value("AT_LIST_BOOL", AttrType::AT_LIST_BOOL) + .value("AT_TENSOR", AttrType::AT_TENSOR) + .value("AT_LIST_TENSOR", AttrType::AT_LIST_TENSOR) + .value("AT_LIST_UINT8", AttrType::AT_LIST_UINT8) + .value("AT_LIST_LIST_INT64", AttrType::AT_LIST_LIST_INT64) + .value("AT_LIST_DT", AttrType::AT_LIST_DT) + .value("AT_DT", AttrType::AT_DT) + .value("AT_LIST_NAMEATTR", AttrType::AT_LIST_NAMEATTR) + .value("AT_NAMEATTR", AttrType::AT_NAMEATTR) + .export_values(); + + // 类封装 + py::class_(*m, "GESession") + .def(py::init &>()) + .def("add_graph", + (Status (Session::*)(uint32_t, const Graph &)) & 
Session::AddGraph) + .def("add_graph", + (Status (Session::*)(uint32_t, const Graph &, + const std::map &)) & + Session::AddGraph) + .def("remove_graph", &Session::RemoveGraph) + .def("run_graph", + [](Session &ss, uint32_t graphId, + const std::vector &inputs) -> py::tuple { + std::vector outputs; + Status res = ss.RunGraph(graphId, inputs, outputs); + return py::make_tuple(outputs, res); + }, + py::call_guard()) + .def("build_graph", &Session::BuildGraph) + .def("run_graph_async", &Session::RunGraphAsync) + .def("register_call_back_func", + (Status (Session::*)( // NOLINT + const std::string &, + std::function ¶ms_list)>)) & + Session::RegisterCallBackFunc) + .def("is_graph_need_rebuild", &Session::IsGraphNeedRebuild); + + py::class_(*m, "GEGraph") + .def(py::init<>()) + .def(py::init()) + .def("set_inputs", &Graph::SetInputs) + .def("set_outputs", (Graph & (Graph::*)(const std::vector &)) & + Graph::SetOutputs) + .def("set_outputs", + (Graph & (Graph::*)(const std::vector< + std::pair>> &)) & + Graph::SetOutputs) + .def("set_outputs", + (Graph & + (Graph::*)(const std::vector> + &)) & + Graph::SetOutputs) + .def("set_targets", &Graph::SetTargets) + .def("is_valid", &Graph::IsValid) + .def("add_op", &Graph::AddOp) + .def("find_op_by_name", + [](Graph &graph, const std::string &name) -> py::tuple { + ge::Operator op; + graphStatus status = graph.FindOpByName(name, op); + return py::make_tuple(op, status); + }) + .def("find_op_by_type", + [](Graph &graph, const std::string &type) -> py::tuple { + std::vector ops; + graphStatus status = graph.FindOpByType(type, ops); + return py::make_tuple(ops, status); + }) + .def("get_all_op_name", + [](Graph &graph) -> py::tuple { + std::vector op_name; + graphStatus status = graph.GetAllOpName(op_name); + return py::make_tuple(op_name, status); + }) + .def("save_to_file", &Graph::SaveToFile) + .def("load_from_file", &Graph::LoadFromFile) + .def("get_name", &Graph::GetName) + .def("set_need_iteration", &Graph::SetNeedIteration); + + py::class_(*m, "GEOperator") + .def(py::init<>()) + .def(py::init()) + .def(py::init()) + .def("is_empty", &Operator::IsEmpty) + .def("get_name", &Operator::GetName) + .def("get_op_type", &Operator::GetOpType) + .def("set_input", + (Operator & (Operator::*)(const std::string &, const Operator &)) & + Operator::SetInput) + .def("set_input", + (Operator & (Operator::*)(const std::string &, const Operator &, + const std::string &)) & + Operator::SetInput) + .def("set_input", (Operator & (Operator::*)(const std::string &, + const Operator &, uint32_t)) & + Operator::SetInput) + .def("add_control_input", &Operator::AddControlInput) + .def("get_input_const_data", + [](Operator &op, const std::string &dst_name) -> py::tuple { + Tensor data; + graphStatus res = op.GetInputConstData(dst_name, data); + return py::make_tuple(data, res); + }) + .def("get_input_desc", + (TensorDesc (Operator::*)(const std::string &) const) & + Operator::GetInputDesc) + .def("get_input_desc", + (TensorDesc (Operator::*)(uint32_t) const) & Operator::GetInputDesc) + .def("get_dynamic_output_num", &Operator::GetDynamicOutputNum) + .def("get_dynamic_input_num", &Operator::GetDynamicInputNum) + .def("try_get_input_desc", + [](Operator &op, const std::string &name) -> py::tuple { + TensorDesc tensor_desc; + graphStatus status = op.TryGetInputDesc(name, tensor_desc); + return py::make_tuple(tensor_desc, status); + }) + .def("update_input_desc", &Operator::UpdateInputDesc) + .def("get_output_desc", + (TensorDesc (Operator::*)(const std::string &) const) & + 
Operator::GetOutputDesc) + .def("get_output_desc", + (TensorDesc (Operator::*)(uint32_t) const) & Operator::GetOutputDesc) + .def("update_output_desc", &Operator::UpdateOutputDesc) + .def("get_dynamic_input_desc", &Operator::GetDynamicInputDesc) + .def("update_dynamic_input_desc", &Operator::UpdateDynamicInputDesc) + .def("get_dynamic_output_desc", &Operator::GetDynamicOutputDesc) + .def("update_dynamic_output_desc", &Operator::UpdateDynamicOutputDesc) + .def("infer_shape_and_type", &Operator::InferShapeAndType) + .def("set_inference_context", &Operator::SetInferenceContext) + .def("get_inference_context", &Operator::GetInferenceContext) + .def("verify_all_attr", &Operator::VerifyAllAttr) + .def("get_inputs_size", &Operator::GetInputsSize) + .def("get_outputs_size", &Operator::GetOutputsSize) + .def("get_all_attr_names_and_types", &Operator::GetAllAttrNamesAndTypes) + .def("set_attr_int64", + [](Operator &op, const std::string &name, + int64_t value) -> Operator & { + int64_t tar = (int64_t)value; + return op.SetAttr(name, tar); + }) + .def("set_attr_int32", + [](Operator &op, const std::string &name, + int32_t value) -> Operator & { + int32_t tar = (int32_t)value; + return op.SetAttr(name, tar); + }) + .def("set_attr_uint32", + [](Operator &op, const std::string &name, + uint32_t value) -> Operator & { + uint32_t tar = (uint32_t)value; + return op.SetAttr(name, tar); + }) + .def("set_attr_vec_int64", + [](Operator &op, const std::string &name, + const std::vector &value) -> Operator & { + int len = value.size(); + std::vector tar; + int64_t tmp; + for (int i = 0; i < len; i++) { + tmp = (int64_t)value[i]; + tar.push_back(tmp); + } + return op.SetAttr(name, tar); + }) + .def("set_attr_vec_int32", + [](Operator &op, const std::string &name, + const std::vector &value) -> Operator & { + int len = value.size(); + std::vector tar; + int32_t tmp; + for (int i = 0; i < len; i++) { + tmp = (int32_t)value[i]; + tar.push_back(tmp); + } + return op.SetAttr(name, tar); + }) + .def("set_attr_vec_uint32", + [](Operator &op, const std::string &name, + const std::vector &value) -> Operator & { + int len = value.size(); + std::vector tar; + uint32_t tmp; + for (int i = 0; i < len; i++) { + tmp = (uint32_t)value[i]; + tar.push_back(tmp); + } + return op.SetAttr(name, tar); + }) + .def("set_attr_list_int64", + [](Operator &op, const std::string &name, + std::initializer_list &attrValue) -> Operator & { + return op.SetAttr(name, std::move(attrValue)); + }) + .def("set_attr_attrvalue", + [](Operator &op, const std::string &name, AttrValue &attrValue) + -> Operator & { return op.SetAttr(name, std::move(attrValue)); }) + .def( + "set_attr_float", + [](Operator &op, const std::string &name, float value) -> Operator & { + float tar = static_cast(value); + return op.SetAttr(name, tar); + }) + .def("set_attr_vec_float", + [](Operator &op, const std::string &name, + const std::vector &value) -> Operator & { + int len = value.size(); + std::vector tar; + float tmp; + for (int i = 0; i < len; i++) { + tmp = static_cast(value[i]); + tar.push_back(tmp); + } + return op.SetAttr(name, tar); + }) + .def("set_attr_string", (Operator & (Operator::*)(const std::string &, + const std::string &)) & + Operator::SetAttr) + .def("set_attr_vec_string", + (Operator & (Operator::*)(const std::string &, + const std::vector &)) & + Operator::SetAttr) + .def("set_attr_bool", + [](Operator &op, const std::string &name, bool value) -> Operator & { + if (value) + return op.SetAttr(name, true); + else + return op.SetAttr(name, false); + }) 
+ .def("set_attr_vec_bool", + [](Operator &op, const std::string &name, + const std::vector &value) -> Operator & { + int len = value.size(); + std::vector tar; + for (int i = 0; i < len; i++) { + if (value[i]) + tar.push_back(true); + else + tar.push_back(false); + } + return op.SetAttr(name, tar); + }) + .def("set_attr_tensor", + (Operator & (Operator::*)(const std::string &, const Tensor &)) & + Operator::SetAttr) + .def("set_attr_vec_tensor", + (Operator & + (Operator::*)(const std::string &, const std::vector &)) & + Operator::SetAttr) + .def("set_attr_vec_uint8", + [](Operator &op, const std::string &name, + const std::vector &value) -> Operator & { + int len = value.size(); + std::vector tar; + uint8_t tmp; + for (int i = 0; i < len; i++) { + tmp = (uint8_t)value[i]; + tar.push_back(tmp); + } + return op.SetAttr(name, tar); + }) + .def("set_attr_vec_vec_int64", + (Operator & + (Operator::*)(const std::string &, + const std::vector> &)) & + Operator::SetAttr) + .def("set_attr_vec_dtype", + [](Operator &op, const std::string &name, + const std::vector &value) -> Operator & { + int len = value.size(); + std::vector tar; + ge::DataType tmp; + for (int i = 0; i < len; i++) { + tmp = (ge::DataType)value[i]; + tar.push_back(tmp); + } + return op.SetAttr(name, tar); + }) + .def("set_attr_dtype", + [](Operator &op, const std::string &name, + const DataType &value) -> Operator & { + ge::DataType tar = (ge::DataType)value; + return op.SetAttr(name, tar); + }) + + .def("get_attr", + [](Operator &op, const std::string &name, + AttrType type) -> py::tuple { + graphStatus res = -1; + switch (type) { + case AT_INT64: { + int64_t i_64_av; + res = op.GetAttr(name, i_64_av); + return py::make_tuple(i_64_av, res); + } break; + case AT_INT32: { + int32_t i_32_av; + res = op.GetAttr(name, i_32_av); + return py::make_tuple(i_32_av, res); + } break; + case AT_UINT32: { + uint32_t ui_32_av; + res = op.GetAttr(name, ui_32_av); + return py::make_tuple(ui_32_av, res); + } break; + case AT_LIST_INT64: { + std::vector v_i_64_av; + res = op.GetAttr(name, v_i_64_av); + return py::make_tuple(v_i_64_av, res); + } break; + case AT_LIST_INT32: { + std::vector v_i_32_av; + res = op.GetAttr(name, v_i_32_av); + return py::make_tuple(v_i_32_av, res); + } break; + case AT_LIST_UINT32: { + std::vector v_ui_32_av; + res = op.GetAttr(name, v_ui_32_av); + return py::make_tuple(v_ui_32_av, res); + } break; + case AT_FLOAT: { + float f_av; + res = op.GetAttr(name, f_av); + return py::make_tuple(f_av, res); + } break; + case AT_LIST_FLOAT: { + std::vector v_f_av; + res = op.GetAttr(name, v_f_av); + return py::make_tuple(v_f_av, res); + } break; + case AT_ATTR_VALUE: { + AttrValue o_av; + res = op.GetAttr(name, o_av); + return py::make_tuple(o_av, res); + } break; + case AT_STRING: { + std::string s_av; + res = op.GetAttr(name, s_av); + return py::make_tuple(s_av, res); + } break; + case AT_LIST_STRING: { + std::vector v_s_av; + res = op.GetAttr(name, v_s_av); + return py::make_tuple(v_s_av, res); + } break; + case AT_BOOL: { + bool b_av; + res = op.GetAttr(name, b_av); + return py::make_tuple(b_av, res); + } break; + case AT_LIST_BOOL: { + std::vector v_b_av; + res = op.GetAttr(name, v_b_av); + return py::make_tuple(v_b_av, res); + } break; + case AT_TENSOR: { + Tensor t_av; + res = op.GetAttr(name, t_av); + return py::make_tuple(t_av, res); + } break; + case AT_LIST_TENSOR: { + std::vector v_t_av; + res = op.GetAttr(name, v_t_av); + return py::make_tuple(v_t_av, res); + } break; + case AT_LIST_UINT8: { + std::vector v_ui_8_av; + 
res = op.GetAttr(name, v_ui_8_av); + return py::make_tuple(v_ui_8_av, res); + } break; + case AT_LIST_LIST_INT64: { + std::vector> v_v_i_64_av; + res = op.GetAttr(name, v_v_i_64_av); + return py::make_tuple(v_v_i_64_av, res); + } break; + case AT_DT: { + ge::DataType dt_av; + res = op.GetAttr(name, dt_av); + return py::make_tuple(dt_av, res); + } break; + case AT_LIST_DT: { + std::vector v_dt_av; + res = op.GetAttr(name, v_dt_av); + return py::make_tuple(v_dt_av, res); + } break; + default: + return py::make_tuple(0, res); + break; + } + }) + .def("break_connect", &Operator::BreakConnect) + .def("get_subgraph_names_count", &Operator::GetSubgraphNamesCount) + .def("get_subgraph_names", &Operator::GetSubgraphNames) + .def("get_subgraph_builder", &Operator::GetSubgraphBuilder) + .def("get_subgraph", &Operator::GetSubgraph) + .def("get_dynamic_subgraph_builder", &Operator::GetDynamicSubgraphBuilder) + .def("get_dynamic_subgraph", &Operator::GetDynamicSubgraph); + + py::class_(*m, "GETensor") + .def(py::init<>()) + .def(py::init()) + .def(py::init &>()) + .def(py::init()) + .def("set_tensor_desc", &Tensor::SetTensorDesc) + .def("get_tensor_desc", &Tensor::GetTensorDesc) + // .def("set_data", (graphStatus(Tensor::*)(std::vector &&)) & + // Tensor::SetData) + .def("set_data", (graphStatus (Tensor::*)(const std::vector &)) & + Tensor::SetData) + .def("set_data", + (graphStatus (Tensor::*)(const uint8_t *, size_t)) & Tensor::SetData) + .def("set_data", + (graphStatus (Tensor::*)(const std::string &)) & Tensor::SetData) + .def("set_data", + (graphStatus (Tensor::*)(const std::vector &)) & + Tensor::SetData) + + .def("get_data", + [](Tensor &ts) -> py::list { + py::list v_data; + uint8_t *data = ts.GetData(); + size_t size = ts.GetSize(); + for (size_t i = 0; i < size; ++i) { + v_data.append(data[i]); + } + return v_data; + }) + .def("get_size", &Tensor::GetSize) + .def("is_valid", &Tensor::IsValid) + .def("clone", &Tensor::Clone); + + py::class_(*m, "GETensorDesc") + .def(py::init<>()) + .def(py::init(), py::arg("shape"), + py::arg("format") = FORMAT_ND, py::arg("dt") = DT_FLOAT) + .def(py::init()) + .def("update", + (void (TensorDesc::*)(Shape, Format, DataType)) & TensorDesc::Update, + py::arg("shape"), py::arg("format") = FORMAT_ND, + py::arg("dt") = DT_FLOAT) + .def("set_shape", &TensorDesc::SetShape) + .def("get_shape", &TensorDesc::GetShape) + .def("set_unknown_dim_num_shape", &TensorDesc::SetUnknownDimNumShape) + .def("set_shape_range", &TensorDesc::SetShapeRange) + .def("get_shape_range", + [](TensorDesc &tensorDesc) -> py::tuple { + std::vector> range; + graphStatus status = tensorDesc.GetShapeRange(range); + return py::make_tuple(range, status); + }) + .def("set_format", &TensorDesc::SetFormat) + .def("get_format", &TensorDesc::GetFormat) + .def("get_origin_shape", &TensorDesc::GetOriginShape) + .def("set_origin_shape", &TensorDesc::SetOriginShape) + .def("set_origin_format", &TensorDesc::SetOriginFormat) + .def("get_origin_format", &TensorDesc::GetOriginFormat) + .def("set_data_type", &TensorDesc::SetDataType) + .def("get_data_type", &TensorDesc::GetDataType) + .def("set_name", &TensorDesc::SetName) + .def("get_name", &TensorDesc::GetName) + .def("set_size", &TensorDesc::SetSize) + .def("get_size", &TensorDesc::GetSize) + .def("set_real_dim_cnt", &TensorDesc::SetRealDimCnt) + .def("get_real_dim_cnt", &TensorDesc::GetRealDimCnt); + + py::class_(*m, "GEShape") + .def(py::init<>()) + .def(py::init &>()) + .def("get_dim_num", &Shape::GetDimNum) + .def("set_dim", &Shape::SetDim) + 
.def("get_dim", &Shape::GetDim) + .def("get_dims", &Shape::GetDims) + .def("get_shape_size", &Shape::GetShapeSize); + + py::class_(*m, "GEAttrValue").def(py::init<>()); + + py::class_(*m, "GEOperatorFactory") + .def("create_operator", &OperatorFactory::CreateOperator) + .def("get_ops_type_list", + []() -> py::tuple { + std::vector all_ops; + graphStatus status = OperatorFactory::GetOpsTypeList(all_ops); + return py::make_tuple(all_ops, status); + }) + .def("is_exist_op", &OperatorFactory::IsExistOp); +} + +} // end namespace pybind +} // end namespace paddle +#endif diff --git a/paddle/fluid/pybind/ascend_wrapper_py.h b/paddle/fluid/pybind/ascend_wrapper_py.h new file mode 100644 index 0000000000..4af96d6ef4 --- /dev/null +++ b/paddle/fluid/pybind/ascend_wrapper_py.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_ASCEND +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +void BindAscendGraph(py::module* m); +void BindAscendWrapper(py::module* m); + +} // namespace pybind +} // namespace paddle +#endif diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index b66dd17bbc..72b3c9645b 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -65,6 +65,9 @@ limitations under the License. */ #include "paddle/fluid/platform/monitor.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" +#ifdef PADDLE_WITH_ASCEND +#include "paddle/fluid/pybind/ascend_wrapper_py.h" +#endif #include "paddle/fluid/pybind/box_helper_py.h" #include "paddle/fluid/pybind/compatible.h" #include "paddle/fluid/pybind/const_value.h" @@ -2837,6 +2840,10 @@ All parameter, weight, gradient are variables in Paddle. BindCompatible(&m); BindDataset(&m); BindGenerator(&m); +#ifdef PADDLE_WITH_ASCEND + BindAscendWrapper(&m); + BindAscendGraph(&m); +#endif #ifdef PADDLE_WITH_CRYPTO BindCrypto(&m); #endif diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py new file mode 100644 index 0000000000..d7ac81bb5c --- /dev/null +++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py @@ -0,0 +1,179 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
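+
+# AscendIRParser translates a Paddle Program into GE graphs: it maps every
+# input/parameter variable to a GE operator, dispatches each op to a parser
+# registered in ascend_parser, and finally replaces the original ops with a
+# single ascend_trigger op. AscendOptimizer wraps a basic optimizer,
+# initializes the GE session and registers the startup/main graphs on it.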
+ +import paddle.fluid.framework as framework +from paddle.fluid.optimizer import Optimizer +import paddle.fluid.core as core +import numpy as np +import ascend_parser + + +class AscendIRParser(object): + def __init__(self): + self.graph_idx = 0 + + def _construct_input_map(self, input_varlist): + ret_map = {} + ge_in_operator = [] + for id, var in enumerate(input_varlist): + if var.is_data: # input data + ge_input = core.GEOperatorFactory.create_operator( + var.name, "Data").set_attr_int32("index", id) + ret_map[var.name] = ge_input + ge_in_operator.append(ge_input) + else: # param, learning ... + ge_input = core.GEOperatorFactory.create_operator(var.name, + "Variable") + ge_input.update_output_desc("y", + core.GETensorDesc( + core.GEShape(var.shape), + core.GEFormat.FORMAT_ND, + core.GEDataType.DT_FLOAT)) + ret_map[var.name] = ge_input + return ge_in_operator, ret_map + + def parse_op(self, op): + if op.type in ascend_parser.registerd_op: + print("Op[%s] has been registered, begin to parse it" % (op.type)) + op_parser = self.parser_factory.create_parse( + ascend_parser.registerd_op[op.type]) + op_parser.apply(op) + else: + print("Op[%s] has not been registered, so we have to skip it" % + (op.type)) + + def _parse_program(self, + graph_name, + program, + input_varlist=[], + fetch_list=[]): + begin_graph_idx = self.graph_idx + ge_in_operator = [] + ge_out_operator = [] + self.var2geop = {} + + block = program.global_block() + if len(block.ops) == 0: + print("There is no ops in program %s" % (graph_name)) + return [] + + graph = core.GEGraph(graph_name) + + ge_in_operator, self.var2geop = self._construct_input_map(input_varlist) + + self.parser_factory = ascend_parser.AscendParserFactory(graph, + self.var2geop) + for i, curop in list(enumerate(block.ops)): + self.parse_op(curop) + + # Set fetch_var for GE + for e in fetch_list: + name = e + if not isinstance(e, str): + name = e.name + ge_out_operator.append(self.var2geop[name]) + + # (Debug) If you want to print back prop vars, append/assign the varname in ge_out_operator here, such as: + # if graph_name == "main": + # ge_out_operator.append(self.var2geop["reduce_sum_0.tmp_0@GRAD"]) + + # Add ops that may be input of a graph, such as const. 
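+ # These are registered in var2geop under a "geinput." prefix
+ # (see AscendParserBase._mark_as_input), so collect them as graph inputs too.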
+
+        for varname, geop in self.var2geop.items():
+            if varname.startswith("geinput"):
+                ge_in_operator.append(geop)
+
+        graph.set_inputs(ge_in_operator).set_outputs(ge_out_operator)
+
+        # Remove the ops of the original program
+        op_num = len(block.ops)
+        for i in range(op_num - 1, -1, -1):
+            block._remove_op(i)
+
+        input_varlist = [var for var in input_varlist if var.is_data]
+
+        block.append_op(
+            type="ascend_trigger",
+            inputs={"FeedList": input_varlist},
+            outputs={"FetchList": fetch_list},
+            attrs={'graph_idx': self.graph_idx})
+        self.graph_idx += 1
+        return graph
+
+    def parse_program(self, startup_program, main_program, input_varlist,
+                      fetch_list):
+        startup_graph = self._parse_program("startup", startup_program)
+        main_graph = self._parse_program("main", main_program, input_varlist,
+                                         fetch_list)
+        return startup_graph, main_graph
+
+
+# AscendOptimizer is a wrapper around a basic optimizer for now.
+# It will become part of the fleet meta_optimizers in the future.
+class AscendOptimizer(Optimizer):
+    def __init__(self, optimizer, fetch_list=[]):
+        self.inner_opt = optimizer
+        self.fetch_list = fetch_list
+
+    def __del__(self):
+        core.ge_finalize()
+
+    def _can_apply(self):
+        if not self.user_defined_strategy.ascend:
+            return False
+        # TODO(hutuxian): add other checks here
+        return True
+
+    def _disable_strategy(self, dist_strategy):
+        dist_strategy.ascend = False
+        dist_strategy.ascend_configs = {}
+
+    def _get_input_varlist(self, program):
+        ret_list = []
+        for var in program.list_vars():
+            if var.is_data or var.persistable:
+                ret_list.append(var)
+        return ret_list
+
+    def minimize(self,
+                 loss,
+                 startup_program=None,
+                 parameter_list=None,
+                 no_grad_set=None):
+        minimized = self.inner_opt.minimize(
+            loss, startup_program=startup_program)
+
+        self.ascend_instance = core.AscendInstance()
+
+        # Configuration options for the Graph Engine are documented at https://support.huaweicloud.com/
+        config = {
+            "ge.exec.deviceId": "0",
+            "ge.graphRunMode": "1",
+            "ge.exec.precision_mode": "must_keep_origin_dtype"
+        }
+        core.ge_initialize(config)
+
+        # Initialize the GE session
+        self.ascend_instance.init_global_resources()
+
+        main_block = loss.block
+        self.parser = AscendIRParser()
+
+        input_varlist = self._get_input_varlist(main_block.program)
+        startup_graph, main_graph = self.parser.parse_program(
+            startup_program, main_block.program, input_varlist, self.fetch_list)
+
+        self.ascend_instance.add_ascend_subgraph(0, startup_graph)
+        self.ascend_instance.add_ascend_subgraph(1, main_graph)
+
+        return minimized
diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py
new file mode 100644
index 0000000000..b497b5eecd
--- /dev/null
+++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py
@@ -0,0 +1,529 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
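Editor's note: a hedged usage sketch for the AscendOptimizer defined in ascend_optimizer.py above. It wraps a basic fluid optimizer, and minimize() rewrites the startup/main programs into ascend_trigger ops while registering the corresponding GE subgraphs. The network and names are illustrative; actually running it needs an Ascend device, a WITH_ASCEND build, and the module path shown being importable.

# Illustrative sketch only, not part of this patch. Assumes a WITH_ASCEND build,
# an available Ascend device, and that the import path below is valid.
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.distributed.fleet.meta_optimizers.ascend.ascend_optimizer import \
    AscendOptimizer

paddle.enable_static()
main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.data(name='x', shape=[8, 4], dtype='float32')
    out = fluid.layers.fc(input=x, size=2)   # lowers to mul/elementwise_add, both registered
    loss = fluid.layers.reduce_sum(out)      # reduce_sum also has a parser

    # minimize() replaces the ops of both programs with ascend_trigger ops and
    # registers the "startup" and "main" GE graphs with core.AscendInstance.
    opt = AscendOptimizer(fluid.optimizer.SGD(learning_rate=0.01),
                          fetch_list=[loss.name])
    opt.minimize(loss, startup_program=startup_prog)

exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(startup_prog)  # graph_idx 0: the "startup" GE subgraph
exe.run(main_prog, feed={'x': np.random.rand(8, 4).astype('float32')})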
+
+import paddle.fluid.framework as framework
+from paddle.fluid.optimizer import Optimizer
+import paddle.fluid.core as core
+import numpy as np
+
+registerd_op = {
+    "elementwise_add": "AddParser",
+    "matmul": "MatMulParser",
+    "mul": "MulParser",
+    "relu": "ReluParser",
+    "softmax_with_cross_entropy": "SoftmaxWithCrossEntropyParser",
+    "shape": "ShapeParser",
+    "fill_constant": "FillConstantParser",
+    "reduce_sum": "ReduceSumParser",
+    "reduce_sum_grad": "ReduceSumGradParser",
+    "matmul_grad": "MatMulGradParser",
+    "mul_grad": "MulGradParser",
+    "relu_grad": "ReluGradParser",
+    "softmax_with_cross_entropy_grad": "SoftmaxWithCrossEntropyGradParser",
+    "truncated_gaussian_random": "TruncatedNormalParser",
+    "sgd": "SGDParser"
+}
+global_cnt = -1
+global_input_cnt = -1
+
+
+class AscendHelper(object):
+    def __init__(self):
+        self.dtype2ge_map = {
+            0: core.GEDataType.DT_BOOL,
+            1: core.GEDataType.DT_INT16,
+            2: core.GEDataType.DT_INT32,
+            3: core.GEDataType.DT_INT64,
+            4: core.GEDataType.DT_FLOAT16,
+            5: core.GEDataType.DT_FLOAT,
+            6: core.GEDataType.DT_DOUBLE
+        }
+        self.dtype2np_map = {
+            0: "bool",
+            1: "int16",
+            2: "int32",
+            3: "int64",
+            4: "float16",
+            5: "float32",
+            6: "float64"
+        }
+
+    def dtype2ge(self, dtype):
+        assert dtype in self.dtype2ge_map, "dtype[%d] is not supported" % (
+            dtype)
+        return self.dtype2ge_map[dtype]
+
+    def dtype2np(self, index):
+        assert index in self.dtype2np_map, "index[%d] is not supported" % (
+            index)
+        return self.dtype2np_map[index]
+
+
+class AscendParserFactory(object):
+    def __init__(self, graph, var2geop):
+        self.graph = graph
+        self.var2geop = var2geop
+
+    def create_parse(self, parser_class):
+        try:
+            parser = globals()[parser_class](self.graph, self.var2geop)
+            return parser
+        except:
+            raise ValueError("parser class %s does not exist" % parser_class)
+
+
+class AscendParserBase(object):
+    def __init__(self, graph, var2geop):
+        self.graph = graph
+        self.var2geop = var2geop
+        self.op = None
+        self.ascend_helper = AscendHelper()
+
+    def _get_ge_input(self, input_var_name):
+        assert input_var_name in self.var2geop, "var %s was not created before" % (
+            input_var_name)
+        return self.var2geop[input_var_name]
+
+    def update_output(self, geop_list, index_list):
+        output_num = len(self.op.output_names)
+        assert output_num == len(
+            index_list
+        ), "Parser[%s]'s output number[%d] is not equal to parameters number[%d]" % (
+            self.parser_name, len(index_list), output_num)
+        for output_id in range(output_num):
+            arguments = self.op.output(self.op.output_names[output_id])
+            print("%d argument: %s" % (output_id, str(arguments)))
+            if len(arguments) > 0:
+                assert len(arguments) == len(
+                    index_list[output_id]
+                ), "Parser[%s]'s %dth argument number[%d] is not equal to paddle's number[%d]" % (
+                    self.parser_name, output_id, len(index_list[output_id]),
+                    len(arguments))
+                for i in range(len(arguments)):
+                    print("assign index_list[%d][%d] to %s" %
+                          (output_id, i, arguments[i]))
+                    self.var2geop[arguments[i]] = geop_list[index_list[
+                        output_id][i]]
+
+        for geop in geop_list:
+            self.graph.add_op(geop)
+
+    def apply(self, op):
+        self.op = op
+        assert self.op.type == self.parser_name, "op [%s] != parser_name[%s]" % (
+            self.op.type, self.parser_name)
+        print("begin to parse op %s" % (self.parser_name))
+        geop_list, index_list = self._apply()
+        self.update_output(geop_list, index_list)
+
+    def _mark_as_input(self, ge_tensor):
+        global global_input_cnt
+        global_input_cnt += 1
+        self.var2geop["geinput."
+ str(global_input_cnt)] = ge_tensor + + def _accumulated_op_id(self): + global global_cnt + global_cnt += 1 + return "." + str(global_cnt) + + def _create_ge_tensor(self, shape, dtype, value): + tensor_desc = core.GETensorDesc( + core.GEShape(shape), core.GEFormat.FORMAT_ND, + self.ascend_helper.dtype2ge(dtype)) + tensor = core.GETensor(tensor_desc) + + data = (value * np.ones(( + shape))).reshape(shape).astype(self.ascend_helper.dtype2np(dtype)) + buf = data.tobytes() + data_8 = np.frombuffer(buf, dtype=np.uint8) + tensor.set_data(data_8) + return tensor + + +class AddParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(AddParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_add" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + y = self._get_ge_input(self.op.input_arg_names[1]) + add = core.GEOperatorFactory.create_operator( + "add" + self._accumulated_op_id(), "Add").set_input( + "x1", x).set_input("x2", y) + return [add], [[0]] + + +class ReduceSumParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ReduceSumParser, self).__init__(graph, var2geop) + self.parser_name = "reduce_sum" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + axes = self.op.attr("dim") + keep_dims = self.op.attr("keep_dim") + reduce_sum = core.GEOperatorFactory.create_operator( + "reduce_sum" + self._accumulated_op_id(), "ReduceSumD").set_input( + "x", x, 0).set_attr_vec_int32("axes", axes).set_attr_bool( + "keep_dims", keep_dims) + return [reduce_sum], [[0]] + + +class ReduceSumGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ReduceSumGradParser, self).__init__(graph, var2geop) + self.parser_name = "reduce_sum_grad" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + input = self._get_ge_input(self.op.input_arg_names[1]) + + shape_tensor = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), "Shape").set_input("x", input, + 0) + axis_const = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), "Const").set_attr_tensor( + "value", self._create_ge_tensor([1], 2, -1)) + self._mark_as_input(axis_const) + + broadcast = core.GEOperatorFactory.create_operator( + "broadcast_to_d" + self._accumulated_op_id(), + "BroadcastTo").set_input("x", x).set_input("shape", shape_tensor) + # unsqueeze cannot get right result, but ExpandDims seems have the same functionality. 
+ reduce_sum_grad = core.GEOperatorFactory.create_operator( + "expand" + self._accumulated_op_id(), "ExpandDims").set_input( + "x", broadcast).set_input("axis", axis_const) + return [shape_tensor, axis_const, broadcast, reduce_sum_grad], [[3]] + + +class MatMulParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(MatMulParser, self).__init__(graph, var2geop) + self.parser_name = "matmul" + + def _apply(self): + x1 = self._get_ge_input(self.op.input_arg_names[0]) + x2 = self._get_ge_input(self.op.input_arg_names[1]) + matmul = core.GEOperatorFactory.create_operator( + "matmul" + self._accumulated_op_id(), "MatMul").set_input( + "x1", x1).set_input("x2", x2) + return [matmul], [[0]] + + +class MatMulGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(MatMulGradParser, self).__init__(graph, var2geop) + self.parser_name = "matmul_grad" + + def _apply(self): + out_grad = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + y = self._get_ge_input(self.op.input_arg_names[2]) + + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), "MatMul").set_input( + "x1", out_grad).set_input("x2", y).set_attr_bool( + "transpose_x1", False).set_attr_bool("transpose_x2", True) + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), "MatMul").set_input( + "x1", x).set_input("x2", out_grad).set_attr_bool( + "transpose_x1", True).set_attr_bool("transpose_x2", False) + return [x_grad, y_grad], [[0], [1]] + + +class MulGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(MulGradParser, self).__init__(graph, var2geop) + self.parser_name = "mul_grad" + + def _apply(self): + out_grad = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + y = self._get_ge_input(self.op.input_arg_names[2]) + + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), "MatMul").set_input( + "x1", out_grad).set_input("x2", y).set_attr_bool( + "transpose_x1", False).set_attr_bool("transpose_x2", True) + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), "MatMul").set_input( + "x1", x).set_input("x2", out_grad).set_attr_bool( + "transpose_x1", True).set_attr_bool("transpose_x2", False) + + return [x_grad, y_grad], [[0], [1]] + + +class MulParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(MulParser, self).__init__(graph, var2geop) + self.parser_name = "mul" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + y = self._get_ge_input(self.op.input_arg_names[1]) + + matmul = core.GEOperatorFactory.create_operator( + "mul" + self._accumulated_op_id(), "MatMul").set_input( + "x1", x).set_input("x2", y) + return [matmul], [[0]] + + +class ReluParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ReluParser, self).__init__(graph, var2geop) + self.parser_name = "relu" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + relu = core.GEOperatorFactory.create_operator( + "relu" + self._accumulated_op_id(), "Relu").set_input("x", x) + return [relu], [[0]] + + +class ReluGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ReluGradParser, self).__init__(graph, var2geop) + self.parser_name = "relu_grad" + + def _apply(self): + out = self._get_ge_input(self.op.input_arg_names[0]) + out_grad = 
self._get_ge_input(self.op.input_arg_names[1])
+        relu_grad = core.GEOperatorFactory.create_operator(
+            self.parser_name + self._accumulated_op_id(), "ReluGrad").set_input(
+                "gradients", out_grad).set_input("features", out)
+        return [relu_grad], [[0]]
+
+
+class SoftmaxWithCrossEntropyParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(SoftmaxWithCrossEntropyParser, self).__init__(graph, var2geop)
+        self.parser_name = "softmax_with_cross_entropy"
+
+    def _apply(self):
+        label = self._get_ge_input(self.op.input_arg_names[0])
+        logits = self._get_ge_input(self.op.input_arg_names[1])
+
+        cls_num = self.op.block.var(self.op.input_arg_names[1]).shape[1]
+        softmax = core.GEOperatorFactory.create_operator(
+            "softmax" + self._accumulated_op_id(), "SoftmaxV2").set_input(
+                "x", logits)
+        label = core.GEOperatorFactory.create_operator(
+            "cast" + self._accumulated_op_id(), "Cast").set_input(
+                "x", label).set_attr_int32("dst_type", 3)
+
+        tensoron = self._create_ge_tensor([1], 5, 1)
+        on_const = core.GEOperatorFactory.create_operator(
+            "const" + self._accumulated_op_id(), "Const").set_attr_tensor(
+                "value", tensoron)
+        self._mark_as_input(on_const)
+        tensoroff = self._create_ge_tensor([1], 5, 0)
+        off_const = core.GEOperatorFactory.create_operator(
+            "const" + self._accumulated_op_id(), "Const").set_attr_tensor(
+                "value", tensoroff)
+        self._mark_as_input(off_const)
+        onehot = core.GEOperatorFactory.create_operator(
+            "onehot" + self._accumulated_op_id(), "OneHotD").set_input(
+                "x", label).set_input("on_value", on_const).set_input(
+                    "off_value", off_const).set_attr_int32("depth", cls_num)
+        squeeze = core.GEOperatorFactory.create_operator(
+            "mul" + self._accumulated_op_id(), "Squeeze").set_input("x", onehot)
+        loss = core.GEOperatorFactory.create_operator(
+            "loss" + self._accumulated_op_id(),
+            "SoftmaxCrossEntropyWithLogits").set_input(
+                "features", logits).set_input("labels", squeeze)
+
+        return [label, softmax, on_const, off_const, onehot, squeeze,
+                loss], [[6], [1]]
+
+
+class SoftmaxWithCrossEntropyGradParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(SoftmaxWithCrossEntropyGradParser, self).__init__(graph, var2geop)
+        self.parser_name = "softmax_with_cross_entropy_grad"
+
+    def _apply(self):
+        label = self._get_ge_input(self.op.input_arg_names[0])
+        loss_grad = self._get_ge_input(self.op.input_arg_names[1])
+        softmax = self._get_ge_input(self.op.input_arg_names[2])
+        cls_num = self.op.block.var(self.op.input_arg_names[2]).shape[1]
+
+        tensoron = self._create_ge_tensor([1], 5, 1)
+        on_const = core.GEOperatorFactory.create_operator(
+            "const" + self._accumulated_op_id(), "Const").set_attr_tensor(
+                "value", tensoron)
+        self._mark_as_input(on_const)
+        tensoroff = self._create_ge_tensor([1], 5, 0)
+        off_const = core.GEOperatorFactory.create_operator(
+            "const" + self._accumulated_op_id(), "Const").set_attr_tensor(
+                "value", tensoroff)
+        self._mark_as_input(off_const)
+        label = core.GEOperatorFactory.create_operator(
+            "cast" + self._accumulated_op_id(), "Cast").set_input(
+                "x", label).set_attr_int32("dst_type", 3)
+        onehot = core.GEOperatorFactory.create_operator(
+            "onehot" + self._accumulated_op_id(), "OneHotD").set_input(
+                "x", label).set_input("on_value", on_const).set_input(
+                    "off_value", off_const).set_attr_int32("depth", cls_num)
+        # OneHotD adds an extra dimension, so a Squeeze must be applied afterwards
+        squeeze = core.GEOperatorFactory.create_operator(
+            "mul" + self._accumulated_op_id(), "Squeeze").set_input("x", onehot)
+        sub = 
core.GEOperatorFactory.create_operator( + "sub" + self._accumulated_op_id(), "Sub").set_input( + "x1", softmax).set_input("x2", squeeze) + grad = core.GEOperatorFactory.create_operator( + "mul" + self._accumulated_op_id(), "Mul").set_input( + "x1", loss_grad).set_input("x2", sub) + return [on_const, off_const, label, onehot, squeeze, sub, grad], [[-1]] + + +class ShapeParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ShapeParser, self).__init__(graph, var2geop) + self.parser_name = "shape" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), "Shape").set_input("x", x) + return [shape], [[0]] + + +class FillConstantParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(FillConstantParser, self).__init__(graph, var2geop) + self.parser_name = "fill_constant" + + def _apply(self): + shape = self.op.attr("shape") + dtype = self.op.attr("dtype") + value = self.op.attr("value") + print("shape: ", shape) + print("dtype: ", dtype) + print("value: ", value) + tensor = self._create_ge_tensor(shape, dtype, value) + const = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), "Const").set_attr_tensor( + "value", tensor) + self._mark_as_input(const) + if self.op.block.var(self.op.output('Out')[0]).persistable: + print("%s fill_constant" % (self.op.output('Out')[0])) + var = core.GEOperatorFactory.create_operator( + self.op.output('Out')[0], "Variable") + var.update_output_desc("y", + core.GETensorDesc( + core.GEShape(shape), + core.GEFormat.FORMAT_ND, + core.GEDataType.DT_FLOAT)) + assign = core.GEOperatorFactory.create_operator( + "assign" + self._accumulated_op_id(), "Assign").set_input( + "value", const).set_input("ref", var) + return [const], [[0]] + else: + print( + "self.op.output('Out')[0] is not persistable in fill_constant") + return [const], [[0]] + + +class SGDParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SGDParser, self).__init__(graph, var2geop) + self.parser_name = "sgd" + + def _apply(self): + grad = self._get_ge_input(self.op.input_arg_names[0]) + lr = self._get_ge_input(self.op.input_arg_names[1]) + param = self._get_ge_input(self.op.input_arg_names[2]) + sgd = core.GEOperatorFactory.create_operator( + "momentum" + self._accumulated_op_id(), + "ApplyGradientDescent").set_input("var", param).set_input( + "alpha", lr).set_input("delta", grad) + return [sgd], [[0]] + + +class TruncatedNormalParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(TruncatedNormalParser, self).__init__(graph, var2geop) + self.parser_name = "truncated_gaussian_random" + + def _apply(self): + shape = self.op.attr("shape") + dtype = self.op.attr("dtype") + mean = self.op.attr("mean") + std = self.op.attr("std") + seed = self.op.attr("seed") + tensor1 = self._create_ge_tensor([len(shape)], 2, shape) + shape_tensor = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), "Const").set_attr_tensor( + "value", tensor1) + + tensor2 = self._create_ge_tensor([1], dtype, mean) + mean_tensor = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), "Const").set_attr_tensor( + "value", tensor2) + + tensor3 = self._create_ge_tensor([1], dtype, std) + std_tensor = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), "Const").set_attr_tensor( + "value", tensor3) + + tensor4 = self._create_ge_tensor([1], dtype, mean - 2 * std) + 
min_tensor = core.GEOperatorFactory.create_operator(
+            "const" + self._accumulated_op_id(), "Const").set_attr_tensor(
+                "value", tensor4)
+
+        tensor5 = self._create_ge_tensor([1], dtype, mean + 2 * std)
+        max_tensor = core.GEOperatorFactory.create_operator(
+            "const" + self._accumulated_op_id(), "Const").set_attr_tensor(
+                "value", tensor5)
+
+        self._mark_as_input(shape_tensor)
+        self._mark_as_input(mean_tensor)
+        self._mark_as_input(std_tensor)
+        self._mark_as_input(min_tensor)
+        self._mark_as_input(max_tensor)
+
+        truncated_normal = core.GEOperatorFactory.create_operator(
+            "truncated_normal" + self._accumulated_op_id(),
+            "ParameterizedTruncatedNormal").set_input(
+                "shape", shape_tensor).set_input(
+                    "means", mean_tensor).set_input(
+                        "stdevs", std_tensor).set_input(
+                            "min", min_tensor).set_input(
+                                "max", max_tensor).set_attr_int32("seed", 0)
+
+        ## write the output of truncated_normal from startup_program to main_program
+        if self.op.block.var(self.op.output('Out')[0]).persistable:
+            print("%s is persistable in truncated_normal" %
+                  (self.op.output('Out')[0]))
+            #var = core.GEOperatorFactory.create_operator(self.op.output('Out')[0], "Variable").set_input("x", truncated_normal)
+            var = core.GEOperatorFactory.create_operator(
+                self.op.output('Out')[0], "Variable")
+            var.update_output_desc("y",
+                                   core.GETensorDesc(
+                                       core.GEShape(shape),
+                                       core.GEFormat.FORMAT_ND,
+                                       core.GEDataType.DT_FLOAT))
+            assign = core.GEOperatorFactory.create_operator(
+                "assign" + self._accumulated_op_id(), "Assign").set_input(
+                    "value", truncated_normal).set_input("ref", var)
+            return [
+                shape_tensor, mean_tensor, std_tensor, min_tensor, max_tensor,
+                truncated_normal
+            ], [[-1]]
+        else:
+            print(
+                "self.op.output('Out')[0] is not persistable in truncated_normal"
+            )
+            return [truncated_normal], [[0]] #[assign]
diff --git a/python/paddle/fluid/tests/unittests/test_ascend_trigger.py b/python/paddle/fluid/tests/unittests/test_ascend_trigger.py
new file mode 100644
index 0000000000..644b550bc4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_ascend_trigger.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
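Editor's note: the parsers in ascend_parser.py above all follow one pattern: fetch GE inputs with _get_ge_input, create GE operators through core.GEOperatorFactory, and return (geop_list, index_list) so that update_output can map each Paddle output argument back to a GE node. A hedged sketch of wiring in one more op is shown below; it assumes GE exposes a "Tanh" operator taking a single "x" input, which should be verified against the GE operator reference before use.

# Illustrative sketch only, not part of this patch. Assumes the names below are
# importable from ascend_parser.py and that GE provides a "Tanh" operator.
import paddle.fluid.core as core
import paddle.distributed.fleet.meta_optimizers.ascend.ascend_parser as ascend_parser
from paddle.distributed.fleet.meta_optimizers.ascend.ascend_parser import (
    AscendParserBase, registerd_op)


class TanhParser(AscendParserBase):
    def __init__(self, graph, var2geop):
        super(TanhParser, self).__init__(graph, var2geop)
        self.parser_name = "tanh"

    def _apply(self):
        x = self._get_ge_input(self.op.input_arg_names[0])
        tanh = core.GEOperatorFactory.create_operator(
            "tanh" + self._accumulated_op_id(), "Tanh").set_input("x", x)
        # Paddle's single output argument maps to node 0 of the returned list.
        return [tanh], [[0]]


# create_parse() resolves parser classes via globals() inside ascend_parser, so
# an out-of-tree parser must also be injected into that module's namespace.
registerd_op["tanh"] = "TanhParser"
ascend_parser.TanhParser = TanhParser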
+ +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import unittest + + +class TestAscendTriggerOP(unittest.TestCase): + """ TestCases for ascend_trigger op""" + + def test_ascend_trigger_op(self): + paddle.enable_static() + program = fluid.Program() + block = program.global_block() + with fluid.program_guard(program): + x = fluid.data(name='x', shape=[1], dtype='int64', lod_level=0) + y = fluid.data(name='y', shape=[1], dtype='int64', lod_level=0) + block.append_op( + type="ascend_trigger", + inputs={"FeedList": [x]}, + outputs={"FetchList": [y]}, + attrs={'graph_idx': 0}) + + exe = paddle.static.Executor(paddle.CPUPlace()) + try: + exe.run(program) + except RuntimeError as e: + pass + except: + self.assertTrue(False) + + paddle.disable_static() + + +if __name__ == '__main__': + unittest.main() -- GitLab