diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 31dfc816be68a99422e68bb43cfead9823cd9954..b856bb51f4d56fd67a206130fae5afa7bb37aa0f 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -211,6 +211,9 @@ cc_library( if(WITH_GPU) target_link_libraries(var_type_traits dynload_cuda) endif() +if(WITH_XPU) + target_link_libraries(var_type_traits dynload_xpti) +endif() # every source file that includes "dnnl.h" must depends on mkldnn # or, the first one should depends on mkldnn diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index a5cf7587c7ce0acaf431ccb6ab34a856fb446f57..4d7f496aaa42d003d945c55e1c28449b490db8ea 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -68,6 +68,10 @@ if(WITH_GPU OR WITH_ROCM) set(GPU_CTX_DEPS dynload_cuda dynamic_loader) endif() +if(WITH_XPU) + set(XPU_CTX_DEPS dynload_xpti dynamic_loader) +endif() + if(WITH_IPU) set(IPU_CTX_DEPS ipu_info) else() @@ -277,6 +281,17 @@ elseif(WITH_ROCM) stats op_proto_maker shape_inference) +elseif(WITH_XPU) + cc_library( + profiler + SRCS profiler.cc + DEPS phi + enforce + dynload_xpti + new_profiler + stats + op_proto_maker + shape_inference) else() cc_library( profiler diff --git a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt index 2f09e25de274de7923a6f159a514a63b5d48bc8e..f9e9659fa9f4cc1c40758c849440215615df5835 100644 --- a/paddle/fluid/platform/device/xpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt @@ -14,11 +14,23 @@ set(XPU_CTX_DEPS cc_library( xpu_info SRCS xpu_info.cc - DEPS glog enforce xpulib device_context place phi) + DEPS glog + enforce + xpulib + device_context + place + phi + dynload_xpti) cc_library( xpu_op_list SRCS xpu_op_list.cc - DEPS glog enforce xpulib device_context op_kernel_type phi) + DEPS glog + enforce + xpulib + device_context + 
op_kernel_type + phi + dynload_xpti) cc_library( xpu_resource_pool SRCS xpu_resource_pool.cc diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 976223be354d6ad0d9834768d26a370773740be4..4cb3bfdb3adaefedfef33e2fe5c738a3174dd459 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -72,6 +72,12 @@ else() SRCS warpctc.cc DEPS dynamic_loader warpctc phi) endif() +if(WITH_XPU) + cc_library( + dynload_xpti + SRCS xpti.cc + DEPS dynamic_loader phi_dynload_xpti) +endif() # TODO(TJ): add iomp, mkldnn? diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index afa689a3f904d3eeb870dc025ec065f39f9607c6..40f69a87f37f867826ca2c153a76bdebdaae90e6 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -75,6 +75,8 @@ void* GetCusparseLtDsoHandle() { return phi::dynload::GetCusparseLtDsoHandle(); } +void* GetXPTIDsoHandle() { return phi::dynload::GetXPTIDsoHandle(); } + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 10b985e0b204429a20118ebb5d2f70ba8eba2f54..93a19645a0a34e5de96f4cbacacb0b9c8fcc8379 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -46,6 +46,7 @@ void* GetCUFFTDsoHandle(); void* GetMKLRTDsoHandle(); void* GetROCFFTDsoHandle(); void* GetCusparseLtDsoHandle(); +void* GetXPTIDsoHandle(); void SetPaddleLibPath(const std::string&); } // namespace dynload diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index df5b9818d69629b4972735c040d95c5926d05b49..85eba90ec6166fda521a8352f3fdf88ef6a3e073 100644 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ 
b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -6,6 +6,10 @@ cc_library( cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog) +cc_library( + xpu_tracer + SRCS xpu_tracer.cc + DEPS enforce glog) add_subdirectory(custom_device) cc_library( event_node @@ -32,7 +36,12 @@ cc_library( cc_library( new_profiler SRCS profiler.cc - DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind + DEPS host_tracer + cuda_tracer + xpu_tracer + profiler_utils + cpu_utilization + event_bind custom_tracer) cc_test( test_event_node diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index 67d7fe95e3198bdd23fe679c348c02b0120bb058..e0a91629a10d6517e3620d640714762db2f8e0ab 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -31,6 +31,7 @@ #include "paddle/fluid/platform/profiler/host_tracer.h" #include "paddle/fluid/platform/profiler/trace_event_collector.h" #include "paddle/fluid/platform/profiler/utils.h" +#include "paddle/fluid/platform/profiler/xpu_tracer.h" #ifdef PADDLE_WITH_CUSTOM_DEVICE #include "paddle/phi/backends/device_manager.h" #endif @@ -53,6 +54,10 @@ void SynchronizeDevice() { phi::DeviceManager::SynchronizeDevice(place); } #endif +#ifdef PADDLE_WITH_XPU + // TODO(zhangxiaoci) XPU does not support device sync yet; + // KL3 might support it. +#endif } std::atomic<bool> Profiler::alive_{false}; @@ -82,6 +87,14 @@ bool Profiler::IsCnpapiSupported() { return supported; } +bool Profiler::IsXPTISupported() { + bool supported = false; +#ifdef PADDLE_WITH_XPTI + supported = true; +#endif + return supported; +} + Profiler::Profiler(const ProfilerOptions& options, const std::vector<std::string>& custom_device_types) { options_ = options; @@ -99,6 +112,9 @@ Profiler::Profiler(const ProfilerOptions& options, tracers_.emplace_back(&CustomTracer::GetInstance(dev_type), false); } } + if (trace_switch.test(kProfileXPUOptionBit)) { + 
tracers_.emplace_back(&XPUTracer::GetInstance(), false); + } } Profiler::~Profiler() { alive_.store(false); } diff --git a/paddle/fluid/platform/profiler/profiler.h b/paddle/fluid/platform/profiler/profiler.h index b486e7543d96ccdcacbf2be18eb9b68532fbb558..28cf7a2d3857224c6c76b854585676154f1f135c 100644 --- a/paddle/fluid/platform/profiler/profiler.h +++ b/paddle/fluid/platform/profiler/profiler.h @@ -34,12 +34,13 @@ namespace platform { static constexpr uint32_t kProfileCPUOptionBit = 0; static constexpr uint32_t kProfileGPUOptionBit = 1; +static constexpr uint32_t kProfileXPUOptionBit = 2; static constexpr uint32_t kProfileCustomDeviceOptionBit = 3; void SynchronizeDevice(); struct ProfilerOptions { - uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu + uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu, bit 2: xpu uint32_t trace_level = FLAGS_host_trace_level; }; @@ -57,6 +58,8 @@ class Profiler { static bool IsCnpapiSupported(); + static bool IsXPTISupported(); + void Prepare(); void Start(); diff --git a/paddle/fluid/platform/profiler/xpu_tracer.cc b/paddle/fluid/platform/profiler/xpu_tracer.cc index 5e687b9c7457f7dc199dfef03b7351a53f5d6569..a45dc5635f4c51a64ac65b5c4f8c5c1b748f2419 100644 --- a/paddle/fluid/platform/profiler/xpu_tracer.cc +++ b/paddle/fluid/platform/profiler/xpu_tracer.cc @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+// #include "paddle/fluid/platform/profiler/xpu_tracer.h" @@ -23,6 +24,7 @@ #include "paddle/phi/backends/device_manager.h" #endif +#ifdef PADDLE_WITH_XPTI #define XPTI_CALL(call) \ do { \ XPTIResult _status = call; \ @@ -31,6 +33,7 @@ exit(-1); \ } \ } while (0) +#endif // PADDLE_WITH_XPTI namespace paddle { namespace platform { @@ -40,7 +43,7 @@ void XPUTracer::PrepareTracing() { state_ == TracerState::UNINITED || state_ == TracerState::STOPED, true, platform::errors::PreconditionNotMet("XPUTracer must be UNINITED")); -#ifdef PADDLE_WITH_XPU +#ifdef PADDLE_WITH_XPTI XPTI_CALL(dynload::xptiActivityEnable()); VLOG(3) << "enable xpti activity"; #endif @@ -52,7 +55,7 @@ void XPUTracer::StartTracing() { state_ == TracerState::READY, true, platform::errors::PreconditionNotMet("Tracer must be READY or STOPPED")); -#ifdef PADDLE_WITH_XPU +#ifdef PADDLE_WITH_XPTI XPTI_CALL(dynload::xptiStartTracing()); #endif tracing_start_ns_ = PosixInNsec(); @@ -64,7 +67,7 @@ void XPUTracer::StopTracing() { state_, TracerState::STARTED, platform::errors::PreconditionNotMet("Tracer must be STARTED")); -#ifdef PADDLE_WITH_XPU +#ifdef PADDLE_WITH_XPTI XPTI_CALL(dynload::xptiStopTracing()); XPTI_CALL(dynload::xptiActivityDisable()); VLOG(3) << "disable xpti activity"; @@ -72,7 +75,7 @@ void XPUTracer::StopTracing() { state_ = TracerState::STOPED; } -#ifdef PADDLE_WITH_XPU +#ifdef PADDLE_WITH_XPTI void AddApiRecord(const baidu::xpu::xpti::XPTIEventApi* api, uint64_t start_ns, TraceEventCollector* collector) { @@ -158,7 +161,7 @@ void XPUTracer::CollectTraceData(TraceEventCollector* collector) { state_, TracerState::STOPED, platform::errors::PreconditionNotMet("Tracer must be STOPED")); -#ifdef PADDLE_WITH_XPU +#ifdef PADDLE_WITH_XPTI XPTI_CALL(dynload::xptiActivityFlushAll()); baidu::xpu::xpti::XPTIEvent* record = nullptr; while (true) { diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 
21516f3ced3401cd5af10ace776b5c524c994cf1..f1b553a3db08146c9c283b7564677a363a00328e 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -86,6 +86,10 @@ if(WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) endif() +if(WITH_XPU) + set(PYBIND_DEPS ${PYBIND_DEPS} dynload_xpti) +endif() + if(WITH_XPU_BKCL) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) set(PYBIND_DEPS ${PYBIND_DEPS} bkcl_context) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 1c2914ee7ab9c05fc8b70f2324addb239efe8d2c..2016bd47b0aed1dcbd46d8c6bebb7b23e95c8018 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2408,6 +2408,7 @@ All parameter, weight, gradient are variables in Paddle. .def("is_cupti_supported", &paddle::platform::Profiler::IsCuptiSupported) .def("is_cnpapi_supported", &paddle::platform::Profiler::IsCnpapiSupported) + .def("is_xpti_supported", &paddle::platform::Profiler::IsXPTISupported) .def("prepare", [](paddle::platform::Profiler *profiler) { platform::EnableHostEventRecorder(); diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 3ec479398a27b70c0d6c7fd05fc5ed06d60b73b4..1c916682cf7b1cd2ee10e8ead5006e29d7b26b8c 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -22,6 +22,7 @@ if(WITH_XPU) list(APPEND BACKENDS_SRCS xpu/xpu_context.cc xpu/xpu_info.cc) list(APPEND BACKENDS_SRCS xpu/xpu_op_list.cc xpu/xpu1_op_list.cc xpu/xpu2_op_list.cc xpu/xpu_l3_strategy.cc) + list(APPEND BACKENDS_DEPS phi_dynload_xpti) endif() if(WITH_MKLDNN) diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index 568c54cb342431e29cd3303e8dc6bcbb5fa79dca..838b623ae7b38133bb0b81f940c333207f1debdd 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -73,6 +73,13 @@ if(WITH_MKLML) endif() endif() +if(WITH_XPU) + cc_library( + 
phi_dynload_xpti + SRCS xpti.cc + DEPS phi) +endif() + if(WITH_FLASHATTN) list(APPEND DYNLOAD_COMMON_SRCS flashattn.cc) endif() diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 85248360361ea778861d368bd47d4427594747a6..354ff5b7dc855359d4f6f244e86e4deebdf0176f 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -95,6 +95,10 @@ PHI_DEFINE_string(rccl_dir, "dlopen will search rccl from LD_LIBRARY_PATH"); #endif +#ifdef PADDLE_WITH_XPU +PHI_DEFINE_string(xpti_dir, "", "Specify path for loading libxpti.so."); +#endif + namespace phi { namespace dynload { @@ -601,5 +605,13 @@ void* GetCusparseLtDsoHandle() { #endif } +void* GetXPTIDsoHandle() { +#ifdef PADDLE_WITH_XPTI + return GetDsoHandleFromSearchPath(FLAGS_xpti_dir, "libxpti.so"); +#else + return nullptr; +#endif +} + } // namespace dynload } // namespace phi diff --git a/paddle/phi/backends/dynload/dynamic_loader.h b/paddle/phi/backends/dynload/dynamic_loader.h index e248696e9e68982f4a03e5afc28aba9c67088b8d..6ddeb1386410f0e92ccd11660f08067e411f3fab 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.h +++ b/paddle/phi/backends/dynload/dynamic_loader.h @@ -47,6 +47,7 @@ void* GetCUFFTDsoHandle(); void* GetMKLRTDsoHandle(); void* GetROCFFTDsoHandle(); void* GetCusparseLtDsoHandle(); +void* GetXPTIDsoHandle(); void SetPaddleLibPath(const std::string&); diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index 065721a2747508dc6462b1fe4de8409cda1d4714..af50a0f27d0631e0e2723cc4503326dcda737d6d 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -98,16 +98,19 @@ class ProfilerState(Enum): class ProfilerTarget(Enum): r""" - ProfilerTarget is used to specify target device for :ref:`profiling ` . Only CPU and GPU are supported currently. + ProfilerTarget is used to specify target device for :ref:`profiling ` . 
Only CPU, GPU and XPU are supported currently. The meaning of each ProfilerState is as following - **ProfilerTarget.CPU** : Profile events on CPU. - **ProfilerTarget.GPU** : Profile events on GPU. + + - **ProfilerTarget.XPU** : Profile events on XPU. """ CPU = 0 GPU = 1 + XPU = 2 CUSTOM_DEVICE = 3 @@ -334,6 +337,12 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]: ProfilerTarget.CPU, ProfilerTarget.CUSTOM_DEVICE, ] + if _Profiler.is_xpti_supported(): + return [ + ProfilerTarget.CPU, + ProfilerTarget.XPU, + ProfilerTarget.CUSTOM_DEVICE, + ] return [ProfilerTarget.CPU, ProfilerTarget.CUSTOM_DEVICE] @@ -342,7 +351,7 @@ class Profiler: Profiler context manager, user interface to manage profiling process to start, stop, export profiling data and print summary table. Args: - targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU ` and :ref:`ProfilerTarget.GPU ` . + targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU ` , :ref:`ProfilerTarget.GPU ` and :ref:`ProfilerTarget.XPU ` . scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as parameter and return the corresponding :ref:`ProfilerState `. This callable object can be generated by :ref:`make_scheduler ` function. If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, which means profiling range [start_batch, end_batch). 
@@ -495,6 +504,8 @@ class Profiler: profileoption.trace_switch |= 1 if ProfilerTarget.GPU in self.targets: profileoption.trace_switch |= 1 << 1 + if ProfilerTarget.XPU in self.targets: + profileoption.trace_switch |= 1 << 2 if ProfilerTarget.CUSTOM_DEVICE in self.targets: profileoption.trace_switch |= 1 << 3 if not custom_device_types: diff --git a/python/setup.py.in b/python/setup.py.in index 1fe7264c715605b15f5c9babeea5df43458d0186..2fa1a2c9280226014d1f70943761c1c110f6f563 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -716,6 +716,10 @@ if '${WITH_XPU_XFT}' == 'ON': shutil.copy('${XPU_XFT_LIB}', libs_path) package_data['paddle.libs']+=['${XPU_XFT_LIB_NAME}'] +if '${WITH_XPTI}' == 'ON': + shutil.copy('${XPU_XPTI_LIB}', libs_path) + package_data['paddle.libs']+=['${XPU_XPTI_LIB_NAME}'] + # remove unused paddle/libs/__init__.py if os.path.isfile(libs_path+'/__init__.py'): os.remove(libs_path+'/__init__.py') diff --git a/setup.py b/setup.py index de97ae234a2a37d264a424d97b24ea49467f3178..8b40699fee6d6d4b1e22736ef72a665075237e92 100644 --- a/setup.py +++ b/setup.py @@ -1188,6 +1188,11 @@ def get_package_data_and_package_dir(): if env_dict.get("WITH_XPU_XFT") == 'ON': shutil.copy(env_dict.get("XPU_XFT_LIB"), libs_path) package_data['paddle.libs'] += [env_dict.get("XPU_XFT_LIB_NAME")] + + if env_dict.get("WITH_XPTI") == 'ON': + shutil.copy(env_dict.get("XPU_XPTI_LIB"), libs_path) + package_data['paddle.libs'] += [env_dict.get("XPU_XPTI_LIB_NAME")] + # remove unused paddle/libs/__init__.py if os.path.isfile(libs_path + '/__init__.py'): os.remove(libs_path + '/__init__.py')