From 9c3f4b13dda3601fa6bb93f7b7c962b4f28e6923 Mon Sep 17 00:00:00 2001 From: XiaociZhang Date: Mon, 26 Jun 2023 11:50:41 +0800 Subject: [PATCH] [XPU] support xpu runtime profiler: follow up (#54690) * [XPU] support xpu runtime profiler: follow up * fix compile issue --- paddle/fluid/framework/CMakeLists.txt | 3 +++ paddle/fluid/platform/CMakeLists.txt | 15 +++++++++++++++ paddle/fluid/platform/device/xpu/CMakeLists.txt | 16 ++++++++++++++-- paddle/fluid/platform/dynload/CMakeLists.txt | 6 ++++++ paddle/fluid/platform/dynload/dynamic_loader.cc | 2 ++ paddle/fluid/platform/dynload/dynamic_loader.h | 1 + paddle/fluid/platform/profiler/CMakeLists.txt | 11 ++++++++++- paddle/fluid/platform/profiler/profiler.cc | 16 ++++++++++++++++ paddle/fluid/platform/profiler/profiler.h | 5 ++++- paddle/fluid/platform/profiler/xpu_tracer.cc | 13 ++++++++----- paddle/fluid/pybind/CMakeLists.txt | 4 ++++ paddle/fluid/pybind/pybind.cc | 1 + paddle/phi/backends/CMakeLists.txt | 1 + paddle/phi/backends/dynload/CMakeLists.txt | 7 +++++++ paddle/phi/backends/dynload/dynamic_loader.cc | 12 ++++++++++++ paddle/phi/backends/dynload/dynamic_loader.h | 1 + python/paddle/profiler/profiler.py | 15 +++++++++++++-- python/setup.py.in | 4 ++++ setup.py | 5 +++++ 19 files changed, 127 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 31dfc816be6..b856bb51f4d 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -211,6 +211,9 @@ cc_library( if(WITH_GPU) target_link_libraries(var_type_traits dynload_cuda) endif() +if(WITH_XPU) + target_link_libraries(var_type_traits dynload_xpti) +endif() # every source file that includes "dnnl.h" must depends on mkldnn # or, the first one should depends on mkldnn diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index a5cf7587c7c..4d7f496aaa4 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ 
b/paddle/fluid/platform/CMakeLists.txt @@ -68,6 +68,10 @@ if(WITH_GPU OR WITH_ROCM) set(GPU_CTX_DEPS dynload_cuda dynamic_loader) endif() +if(WITH_XPU) + set(XPU_CTX_DEPS dynload_xpti dynamic_loader) +endif() + if(WITH_IPU) set(IPU_CTX_DEPS ipu_info) else() @@ -277,6 +281,17 @@ elseif(WITH_ROCM) stats op_proto_maker shape_inference) +elseif(WITH_XPU) + cc_library( + profiler + SRCS profiler.cc + DEPS phi + enforce + dynload_xpti + new_profiler + stats + op_proto_maker + shape_inference) else() cc_library( profiler diff --git a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt index 2f09e25de27..f9e9659fa9f 100644 --- a/paddle/fluid/platform/device/xpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt @@ -14,11 +14,23 @@ set(XPU_CTX_DEPS cc_library( xpu_info SRCS xpu_info.cc - DEPS glog enforce xpulib device_context place phi) + DEPS glog + enforce + xpulib + device_context + place + phi + dynload_xpti) cc_library( xpu_op_list SRCS xpu_op_list.cc - DEPS glog enforce xpulib device_context op_kernel_type phi) + DEPS glog + enforce + xpulib + device_context + op_kernel_type + phi + dynload_xpti) cc_library( xpu_resource_pool SRCS xpu_resource_pool.cc diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 976223be354..4cb3bfdb3ad 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -72,6 +72,12 @@ else() SRCS warpctc.cc DEPS dynamic_loader warpctc phi) endif() +if(WITH_XPU) + cc_library( + dynload_xpti + SRCS xpti.cc + DEPS dynamic_loader phi_dynload_xpti) +endif() # TODO(TJ): add iomp, mkldnn? 
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index afa689a3f90..40f69a87f37 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -75,6 +75,8 @@ void* GetCusparseLtDsoHandle() { return phi::dynload::GetCusparseLtDsoHandle(); } +void* GetXPTIDsoHandle() { return phi::dynload::GetXPTIDsoHandle(); } + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 10b985e0b20..93a19645a0a 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -46,6 +46,7 @@ void* GetCUFFTDsoHandle(); void* GetMKLRTDsoHandle(); void* GetROCFFTDsoHandle(); void* GetCusparseLtDsoHandle(); +void* GetXPTIDsoHandle(); void SetPaddleLibPath(const std::string&); } // namespace dynload diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index df5b9818d69..85eba90ec61 100644 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -6,6 +6,10 @@ cc_library( cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog) +cc_library( + xpu_tracer + SRCS xpu_tracer.cc + DEPS enforce glog) add_subdirectory(custom_device) cc_library( event_node @@ -32,7 +36,12 @@ cc_library( cc_library( new_profiler SRCS profiler.cc - DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind + DEPS host_tracer + cuda_tracer + xpu_tracer + profiler_utils + cpu_utilization + event_bind custom_tracer) cc_test( test_event_node diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index 67d7fe95e31..e0a91629a10 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -31,6 +31,7 
@@ #include "paddle/fluid/platform/profiler/host_tracer.h" #include "paddle/fluid/platform/profiler/trace_event_collector.h" #include "paddle/fluid/platform/profiler/utils.h" +#include "paddle/fluid/platform/profiler/xpu_tracer.h" #ifdef PADDLE_WITH_CUSTOM_DEVICE #include "paddle/phi/backends/device_manager.h" #endif @@ -53,6 +54,10 @@ void SynchronizeDevice() { phi::DeviceManager::SynchronizeDevice(place); } #endif +#ifdef PADDLE_WITH_XPU + // TODO(zhangxiaoci): XPU does not support device sync yet; + // KL3 might. +#endif } std::atomic<bool> Profiler::alive_{false}; @@ -82,6 +87,14 @@ bool Profiler::IsCnpapiSupported() { return supported; } +bool Profiler::IsXPTISupported() { + bool supported = false; +#ifdef PADDLE_WITH_XPTI + supported = true; +#endif + return supported; +} + Profiler::Profiler(const ProfilerOptions& options, const std::vector<std::string>& custom_device_types) { options_ = options; @@ -99,6 +112,9 @@ Profiler::Profiler(const ProfilerOptions& options, tracers_.emplace_back(&CustomTracer::GetInstance(dev_type), false); } } + if (trace_switch.test(kProfileXPUOptionBit)) { + tracers_.emplace_back(&XPUTracer::GetInstance(), false); + } } Profiler::~Profiler() { alive_.store(false); } diff --git a/paddle/fluid/platform/profiler/profiler.h b/paddle/fluid/platform/profiler/profiler.h index b486e7543d9..28cf7a2d385 100644 --- a/paddle/fluid/platform/profiler/profiler.h +++ b/paddle/fluid/platform/profiler/profiler.h @@ -34,12 +34,13 @@ namespace platform { static constexpr uint32_t kProfileCPUOptionBit = 0; static constexpr uint32_t kProfileGPUOptionBit = 1; +static constexpr uint32_t kProfileXPUOptionBit = 2; static constexpr uint32_t kProfileCustomDeviceOptionBit = 3; void SynchronizeDevice(); struct ProfilerOptions { - uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu + uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu, bit 2: xpu uint32_t trace_level = FLAGS_host_trace_level; }; @@ -57,6 +58,8 @@ class Profiler { static bool IsCnpapiSupported(); + static bool 
IsXPTISupported(); + void Prepare(); void Start(); diff --git a/paddle/fluid/platform/profiler/xpu_tracer.cc b/paddle/fluid/platform/profiler/xpu_tracer.cc index 5e687b9c745..a45dc5635f4 100644 --- a/paddle/fluid/platform/profiler/xpu_tracer.cc +++ b/paddle/fluid/platform/profiler/xpu_tracer.cc @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +// #include "paddle/fluid/platform/profiler/xpu_tracer.h" @@ -23,6 +24,7 @@ #include "paddle/phi/backends/device_manager.h" #endif +#ifdef PADDLE_WITH_XPTI #define XPTI_CALL(call) \ do { \ XPTIResult _status = call; \ @@ -31,6 +33,7 @@ exit(-1); \ } \ } while (0) +#endif // PADDLE_WITH_XPTI namespace paddle { namespace platform { @@ -40,7 +43,7 @@ void XPUTracer::PrepareTracing() { state_ == TracerState::UNINITED || state_ == TracerState::STOPED, true, platform::errors::PreconditionNotMet("XPUTracer must be UNINITED")); -#ifdef PADDLE_WITH_XPU +#ifdef PADDLE_WITH_XPTI XPTI_CALL(dynload::xptiActivityEnable()); VLOG(3) << "enable xpti activity"; #endif @@ -52,7 +55,7 @@ void XPUTracer::StartTracing() { state_ == TracerState::READY, true, platform::errors::PreconditionNotMet("Tracer must be READY or STOPPED")); -#ifdef PADDLE_WITH_XPU +#ifdef PADDLE_WITH_XPTI XPTI_CALL(dynload::xptiStartTracing()); #endif tracing_start_ns_ = PosixInNsec(); @@ -64,7 +67,7 @@ void XPUTracer::StopTracing() { state_, TracerState::STARTED, platform::errors::PreconditionNotMet("Tracer must be STARTED")); -#ifdef PADDLE_WITH_XPU +#ifdef PADDLE_WITH_XPTI XPTI_CALL(dynload::xptiStopTracing()); XPTI_CALL(dynload::xptiActivityDisable()); VLOG(3) << "disable xpti activity"; @@ -72,7 +75,7 @@ void XPUTracer::StopTracing() { state_ = TracerState::STOPED; } -#ifdef PADDLE_WITH_XPU +#ifdef PADDLE_WITH_XPTI void AddApiRecord(const baidu::xpu::xpti::XPTIEventApi* api, uint64_t start_ns, TraceEventCollector* 
collector) { @@ -158,7 +161,7 @@ void XPUTracer::CollectTraceData(TraceEventCollector* collector) { state_, TracerState::STOPED, platform::errors::PreconditionNotMet("Tracer must be STOPED")); -#ifdef PADDLE_WITH_XPU +#ifdef PADDLE_WITH_XPTI XPTI_CALL(dynload::xptiActivityFlushAll()); baidu::xpu::xpti::XPTIEvent* record = nullptr; while (true) { diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 21516f3ced3..f1b553a3db0 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -86,6 +86,10 @@ if(WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) endif() +if(WITH_XPU) + set(PYBIND_DEPS ${PYBIND_DEPS} dynload_xpti) +endif() + if(WITH_XPU_BKCL) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) set(PYBIND_DEPS ${PYBIND_DEPS} bkcl_context) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 1c2914ee7ab..2016bd47b0a 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2408,6 +2408,7 @@ All parameter, weight, gradient are variables in Paddle. 
.def("is_cupti_supported", &paddle::platform::Profiler::IsCuptiSupported) .def("is_cnpapi_supported", &paddle::platform::Profiler::IsCnpapiSupported) + .def("is_xpti_supported", &paddle::platform::Profiler::IsXPTISupported) .def("prepare", [](paddle::platform::Profiler *profiler) { platform::EnableHostEventRecorder(); diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 3ec479398a2..1c916682cf7 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -22,6 +22,7 @@ if(WITH_XPU) list(APPEND BACKENDS_SRCS xpu/xpu_context.cc xpu/xpu_info.cc) list(APPEND BACKENDS_SRCS xpu/xpu_op_list.cc xpu/xpu1_op_list.cc xpu/xpu2_op_list.cc xpu/xpu_l3_strategy.cc) + list(APPEND BACKENDS_DEPS phi_dynload_xpti) endif() if(WITH_MKLDNN) diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index 568c54cb342..838b623ae7b 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -73,6 +73,13 @@ if(WITH_MKLML) endif() endif() +if(WITH_XPU) + cc_library( + phi_dynload_xpti + SRCS xpti.cc + DEPS phi) +endif() + if(WITH_FLASHATTN) list(APPEND DYNLOAD_COMMON_SRCS flashattn.cc) endif() diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 85248360361..354ff5b7dc8 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -95,6 +95,10 @@ PHI_DEFINE_string(rccl_dir, "dlopen will search rccl from LD_LIBRARY_PATH"); #endif +#ifdef PADDLE_WITH_XPU +DEFINE_string(xpti_dir, "", "Specify path for loading libxpti.so."); +#endif + namespace phi { namespace dynload { @@ -601,5 +605,13 @@ void* GetCusparseLtDsoHandle() { #endif } +void* GetXPTIDsoHandle() { +#ifdef PADDLE_WITH_XPTI + return GetDsoHandleFromSearchPath(FLAGS_xpti_dir, "libxpti.so"); +#else + return nullptr; +#endif +} + } // namespace dynload } // namespace phi 
diff --git a/paddle/phi/backends/dynload/dynamic_loader.h b/paddle/phi/backends/dynload/dynamic_loader.h index e248696e9e6..6ddeb138641 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.h +++ b/paddle/phi/backends/dynload/dynamic_loader.h @@ -47,6 +47,7 @@ void* GetCUFFTDsoHandle(); void* GetMKLRTDsoHandle(); void* GetROCFFTDsoHandle(); void* GetCusparseLtDsoHandle(); +void* GetXPTIDsoHandle(); void SetPaddleLibPath(const std::string&); diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index 065721a2747..af50a0f27d0 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -98,16 +98,19 @@ class ProfilerState(Enum): class ProfilerTarget(Enum): r""" - ProfilerTarget is used to specify target device for :ref:`profiling ` . Only CPU and GPU are supported currently. + ProfilerTarget is used to specify target device for :ref:`profiling ` . Only CPU, GPU and XPU are supported currently. The meaning of each ProfilerState is as following - **ProfilerTarget.CPU** : Profile events on CPU. - **ProfilerTarget.GPU** : Profile events on GPU. + + - **ProfilerTarget.XPU** : Profile events on XPU. """ CPU = 0 GPU = 1 + XPU = 2 CUSTOM_DEVICE = 3 @@ -334,6 +337,12 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]: ProfilerTarget.CPU, ProfilerTarget.CUSTOM_DEVICE, ] + if _Profiler.is_xpti_supported(): + return [ + ProfilerTarget.CPU, + ProfilerTarget.XPU, + ProfilerTarget.CUSTOM_DEVICE, + ] return [ProfilerTarget.CPU, ProfilerTarget.CUSTOM_DEVICE] @@ -342,7 +351,7 @@ class Profiler: Profiler context manager, user interface to manage profiling process to start, stop, export profiling data and print summary table. Args: - targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU ` and :ref:`ProfilerTarget.GPU ` . 
+ targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU ` , :ref:`ProfilerTarget.GPU ` and :ref:`ProfilerTarget.XPU ` . scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as parameter and return the corresponding :ref:`ProfilerState `. This callable object can be generated by :ref:`make_scheduler ` function. If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, which means profiling range [start_batch, end_batch). @@ -495,6 +504,8 @@ class Profiler: profileoption.trace_switch |= 1 if ProfilerTarget.GPU in self.targets: profileoption.trace_switch |= 1 << 1 + if ProfilerTarget.XPU in self.targets: + profileoption.trace_switch |= 1 << 2 if ProfilerTarget.CUSTOM_DEVICE in self.targets: profileoption.trace_switch |= 1 << 3 if not custom_device_types: diff --git a/python/setup.py.in b/python/setup.py.in index 1fe7264c715..2fa1a2c9280 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -716,6 +716,10 @@ if '${WITH_XPU_XFT}' == 'ON': shutil.copy('${XPU_XFT_LIB}', libs_path) package_data['paddle.libs']+=['${XPU_XFT_LIB_NAME}'] +if '${WITH_XPTI}' == 'ON': + shutil.copy('${XPU_XPTI_LIB}', libs_path) + package_data['paddle.libs']+=['${XPU_XPTI_LIB_NAME}'] + # remove unused paddle/libs/__init__.py if os.path.isfile(libs_path+'/__init__.py'): os.remove(libs_path+'/__init__.py') diff --git a/setup.py b/setup.py index de97ae234a2..8b40699fee6 100644 --- a/setup.py +++ b/setup.py @@ -1188,6 +1188,11 @@ def get_package_data_and_package_dir(): if env_dict.get("WITH_XPU_XFT") == 'ON': shutil.copy(env_dict.get("XPU_XFT_LIB"), libs_path) package_data['paddle.libs'] += [env_dict.get("XPU_XFT_LIB_NAME")] + + if env_dict.get("WITH_XPTI") == 'ON': + shutil.copy(env_dict.get("XPU_XPTI_LIB"), libs_path) + 
package_data['paddle.libs'] += [env_dict.get("XPU_XPTI_LIB_NAME")] + # remove unused paddle/libs/__init__.py if os.path.isfile(libs_path + '/__init__.py'): os.remove(libs_path + '/__init__.py') -- GitLab