未验证 提交 9c3f4b13 编写于 作者: X XiaociZhang 提交者: GitHub

[XPU] support xpu runtime profiler: follow up (#54690)

* [XPU] support xpu runtime profiler: follow up

* fix compile issue
上级 ba09621a
......@@ -211,6 +211,9 @@ cc_library(
if(WITH_GPU)
target_link_libraries(var_type_traits dynload_cuda)
endif()
# XPU builds: var_type_traits additionally links against the dynload_xpti library.
if(WITH_XPU)
target_link_libraries(var_type_traits dynload_xpti)
endif()
# every source file that includes "dnnl.h" must depends on mkldnn
# or, the first one should depends on mkldnn
......
......@@ -68,6 +68,10 @@ if(WITH_GPU OR WITH_ROCM)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
endif()
# XPU builds: XPU_CTX_DEPS lists the XPTI dynload library and the dynamic loader.
if(WITH_XPU)
set(XPU_CTX_DEPS dynload_xpti dynamic_loader)
endif()
if(WITH_IPU)
set(IPU_CTX_DEPS ipu_info)
else()
......@@ -277,6 +281,17 @@ elseif(WITH_ROCM)
stats
op_proto_maker
shape_inference)
elseif(WITH_XPU)
cc_library(
profiler
SRCS profiler.cc
DEPS phi
enforce
dynload_xpti
new_profiler
stats
op_proto_maker
shape_inference)
else()
cc_library(
profiler
......
......@@ -14,11 +14,23 @@ set(XPU_CTX_DEPS
cc_library(
xpu_info
SRCS xpu_info.cc
DEPS glog enforce xpulib device_context place phi)
DEPS glog
enforce
xpulib
device_context
place
phi
dynload_xpti)
cc_library(
xpu_op_list
SRCS xpu_op_list.cc
DEPS glog enforce xpulib device_context op_kernel_type phi)
DEPS glog
enforce
xpulib
device_context
op_kernel_type
phi
dynload_xpti)
cc_library(
xpu_resource_pool
SRCS xpu_resource_pool.cc
......
......@@ -72,6 +72,12 @@ else()
SRCS warpctc.cc
DEPS dynamic_loader warpctc phi)
endif()
# XPU-only library built from xpti.cc; it depends on dynamic_loader and
# on phi_dynload_xpti.
if(WITH_XPU)
cc_library(
dynload_xpti
SRCS xpti.cc
DEPS dynamic_loader phi_dynload_xpti)
endif()
# TODO(TJ): add iomp, mkldnn?
......
......@@ -75,6 +75,8 @@ void* GetCusparseLtDsoHandle() {
return phi::dynload::GetCusparseLtDsoHandle();
}
// Thin forwarding wrapper: returns whatever phi::dynload::GetXPTIDsoHandle()
// returns.
void* GetXPTIDsoHandle() {
  void* handle = phi::dynload::GetXPTIDsoHandle();
  return handle;
}
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -46,6 +46,7 @@ void* GetCUFFTDsoHandle();
void* GetMKLRTDsoHandle();
void* GetROCFFTDsoHandle();
void* GetCusparseLtDsoHandle();
void* GetXPTIDsoHandle();
void SetPaddleLibPath(const std::string&);
} // namespace dynload
......
......@@ -6,6 +6,10 @@ cc_library(
cuda_tracer
SRCS cuda_tracer.cc cupti_data_process.cc
DEPS workqueue_utils enforce glog)
# Tracer library built from xpu_tracer.cc; depends on enforce and glog.
cc_library(
xpu_tracer
SRCS xpu_tracer.cc
DEPS enforce glog)
add_subdirectory(custom_device)
cc_library(
event_node
......@@ -32,7 +36,12 @@ cc_library(
cc_library(
new_profiler
SRCS profiler.cc
DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind
DEPS host_tracer
cuda_tracer
xpu_tracer
profiler_utils
cpu_utilization
event_bind
custom_tracer)
cc_test(
test_event_node
......
......@@ -31,6 +31,7 @@
#include "paddle/fluid/platform/profiler/host_tracer.h"
#include "paddle/fluid/platform/profiler/trace_event_collector.h"
#include "paddle/fluid/platform/profiler/utils.h"
#include "paddle/fluid/platform/profiler/xpu_tracer.h"
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/backends/device_manager.h"
#endif
......@@ -53,6 +54,10 @@ void SynchronizeDevice() {
phi::DeviceManager::SynchronizeDevice(place);
}
#endif
#ifdef PADDLE_WITH_XPU
// TODO(zhangxiaoci): XPU does not support device sync yet;
// KL3 might.
#endif
}
std::atomic<bool> Profiler::alive_{false};
......@@ -82,6 +87,14 @@ bool Profiler::IsCnpapiSupported() {
return supported;
}
// Reports whether this binary was compiled with XPTI support, i.e. whether
// PADDLE_WITH_XPTI was defined at compile time.
bool Profiler::IsXPTISupported() {
#ifdef PADDLE_WITH_XPTI
  return true;
#else
  return false;
#endif
}
Profiler::Profiler(const ProfilerOptions& options,
const std::vector<std::string>& custom_device_types) {
options_ = options;
......@@ -99,6 +112,9 @@ Profiler::Profiler(const ProfilerOptions& options,
tracers_.emplace_back(&CustomTracer::GetInstance(dev_type), false);
}
}
if (trace_switch.test(kProfileXPUOptionBit)) {
tracers_.emplace_back(&XPUTracer::GetInstance(), false);
}
}
// Destructor: clears the static alive_ flag.
Profiler::~Profiler() {
  alive_.store(false);
}
......
......@@ -34,12 +34,13 @@ namespace platform {
static constexpr uint32_t kProfileCPUOptionBit = 0;
static constexpr uint32_t kProfileGPUOptionBit = 1;
static constexpr uint32_t kProfileXPUOptionBit = 2;
static constexpr uint32_t kProfileCustomDeviceOptionBit = 3;
void SynchronizeDevice();
struct ProfilerOptions {
uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu
uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu, bit 2: xpu
uint32_t trace_level = FLAGS_host_trace_level;
};
......@@ -57,6 +58,8 @@ class Profiler {
static bool IsCnpapiSupported();
static bool IsXPTISupported();
void Prepare();
void Start();
......
......@@ -11,6 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "paddle/fluid/platform/profiler/xpu_tracer.h"
......@@ -23,6 +24,7 @@
#include "paddle/phi/backends/device_manager.h"
#endif
#ifdef PADDLE_WITH_XPTI
#define XPTI_CALL(call) \
do { \
XPTIResult _status = call; \
......@@ -31,6 +33,7 @@
exit(-1); \
} \
} while (0)
#endif // PADDLE_WITH_XPTI
namespace paddle {
namespace platform {
......@@ -40,7 +43,7 @@ void XPUTracer::PrepareTracing() {
state_ == TracerState::UNINITED || state_ == TracerState::STOPED,
true,
platform::errors::PreconditionNotMet("XPUTracer must be UNINITED"));
#ifdef PADDLE_WITH_XPU
#ifdef PADDLE_WITH_XPTI
XPTI_CALL(dynload::xptiActivityEnable());
VLOG(3) << "enable xpti activity";
#endif
......@@ -52,7 +55,7 @@ void XPUTracer::StartTracing() {
state_ == TracerState::READY,
true,
platform::errors::PreconditionNotMet("Tracer must be READY or STOPPED"));
#ifdef PADDLE_WITH_XPU
#ifdef PADDLE_WITH_XPTI
XPTI_CALL(dynload::xptiStartTracing());
#endif
tracing_start_ns_ = PosixInNsec();
......@@ -64,7 +67,7 @@ void XPUTracer::StopTracing() {
state_,
TracerState::STARTED,
platform::errors::PreconditionNotMet("Tracer must be STARTED"));
#ifdef PADDLE_WITH_XPU
#ifdef PADDLE_WITH_XPTI
XPTI_CALL(dynload::xptiStopTracing());
XPTI_CALL(dynload::xptiActivityDisable());
VLOG(3) << "disable xpti activity";
......@@ -72,7 +75,7 @@ void XPUTracer::StopTracing() {
state_ = TracerState::STOPED;
}
#ifdef PADDLE_WITH_XPU
#ifdef PADDLE_WITH_XPTI
void AddApiRecord(const baidu::xpu::xpti::XPTIEventApi* api,
uint64_t start_ns,
TraceEventCollector* collector) {
......@@ -158,7 +161,7 @@ void XPUTracer::CollectTraceData(TraceEventCollector* collector) {
state_,
TracerState::STOPED,
platform::errors::PreconditionNotMet("Tracer must be STOPED"));
#ifdef PADDLE_WITH_XPU
#ifdef PADDLE_WITH_XPTI
XPTI_CALL(dynload::xptiActivityFlushAll());
baidu::xpu::xpti::XPTIEvent* record = nullptr;
while (true) {
......
......@@ -86,6 +86,10 @@ if(WITH_NCCL OR WITH_RCCL)
set(PYBIND_DEPS ${PYBIND_DEPS} reducer)
endif()
# XPU builds: append dynload_xpti to the pybind dependency list.
if(WITH_XPU)
set(PYBIND_DEPS ${PYBIND_DEPS} dynload_xpti)
endif()
if(WITH_XPU_BKCL)
set(PYBIND_DEPS ${PYBIND_DEPS} reducer)
set(PYBIND_DEPS ${PYBIND_DEPS} bkcl_context)
......
......@@ -2408,6 +2408,7 @@ All parameter, weight, gradient are variables in Paddle.
.def("is_cupti_supported", &paddle::platform::Profiler::IsCuptiSupported)
.def("is_cnpapi_supported",
&paddle::platform::Profiler::IsCnpapiSupported)
.def("is_xpti_supported", &paddle::platform::Profiler::IsXPTISupported)
.def("prepare",
[](paddle::platform::Profiler *profiler) {
platform::EnableHostEventRecorder();
......
......@@ -22,6 +22,7 @@ if(WITH_XPU)
list(APPEND BACKENDS_SRCS xpu/xpu_context.cc xpu/xpu_info.cc)
list(APPEND BACKENDS_SRCS xpu/xpu_op_list.cc xpu/xpu1_op_list.cc
xpu/xpu2_op_list.cc xpu/xpu_l3_strategy.cc)
list(APPEND BACKENDS_DEPS phi_dynload_xpti)
endif()
if(WITH_MKLDNN)
......
......@@ -73,6 +73,13 @@ if(WITH_MKLML)
endif()
endif()
# XPU-only library built from xpti.cc; depends on phi.
if(WITH_XPU)
cc_library(
phi_dynload_xpti
SRCS xpti.cc
DEPS phi)
endif()
if(WITH_FLASHATTN)
list(APPEND DYNLOAD_COMMON_SRCS flashattn.cc)
endif()
......
......@@ -95,6 +95,10 @@ PHI_DEFINE_string(rccl_dir,
"dlopen will search rccl from LD_LIBRARY_PATH");
#endif
#ifdef PADDLE_WITH_XPU
// Path hint used when loading libxpti.so (read as FLAGS_xpti_dir by
// GetXPTIDsoHandle). Declared with PHI_DEFINE_string so it follows the same
// flag-definition convention as the other *_dir flags in this file.
PHI_DEFINE_string(xpti_dir, "", "Specify path for loading libxpti.so.");
#endif
namespace phi {
namespace dynload {
......@@ -601,5 +605,13 @@ void* GetCusparseLtDsoHandle() {
#endif
}
// Returns the handle obtained from GetDsoHandleFromSearchPath for
// "libxpti.so" (using FLAGS_xpti_dir) when built with PADDLE_WITH_XPTI;
// returns nullptr in builds without XPTI.
void* GetXPTIDsoHandle() {
#ifndef PADDLE_WITH_XPTI
  return nullptr;
#else
  return GetDsoHandleFromSearchPath(FLAGS_xpti_dir, "libxpti.so");
#endif
}
} // namespace dynload
} // namespace phi
......@@ -47,6 +47,7 @@ void* GetCUFFTDsoHandle();
void* GetMKLRTDsoHandle();
void* GetROCFFTDsoHandle();
void* GetCusparseLtDsoHandle();
void* GetXPTIDsoHandle();
void SetPaddleLibPath(const std::string&);
......
......@@ -98,16 +98,19 @@ class ProfilerState(Enum):
class ProfilerTarget(Enum):
r"""
ProfilerTarget is used to specify target device for :ref:`profiling <api_paddle_profiler_Profiler>` . Only CPU and GPU are supported currently.
ProfilerTarget is used to specify target device for :ref:`profiling <api_paddle_profiler_Profiler>` . Only CPU, GPU and XPU are supported currently.
The meaning of each ProfilerTarget is as follows
- **ProfilerTarget.CPU** : Profile events on CPU.
- **ProfilerTarget.GPU** : Profile events on GPU.
- **ProfilerTarget.XPU** : Profile events on XPU.
"""
CPU = 0
GPU = 1
XPU = 2
CUSTOM_DEVICE = 3
......@@ -334,6 +337,12 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]:
ProfilerTarget.CPU,
ProfilerTarget.CUSTOM_DEVICE,
]
if _Profiler.is_xpti_supported():
return [
ProfilerTarget.CPU,
ProfilerTarget.XPU,
ProfilerTarget.CUSTOM_DEVICE,
]
return [ProfilerTarget.CPU, ProfilerTarget.CUSTOM_DEVICE]
......@@ -342,7 +351,7 @@ class Profiler:
Profiler context manager, user interface to manage profiling process to start, stop, export profiling data and print summary table.
Args:
targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU <api_paddle_profiler_ProfilerTarget>` and :ref:`ProfilerTarget.GPU <api_paddle_profiler_ProfilerTarget>` .
targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU <api_paddle_profiler_ProfilerTarget>` , :ref:`ProfilerTarget.GPU <api_paddle_profiler_ProfilerTarget>` and :ref:`ProfilerTarget.XPU <api_paddle_profiler_ProfilerTarget>` .
scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as parameter and return the corresponding :ref:`ProfilerState <api_paddle_profiler_ProfilerState>`. This callable object can be generated by :ref:`make_scheduler <api_paddle_profiler_make_scheduler>` function.
If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch,
which means profiling range [start_batch, end_batch).
......@@ -495,6 +504,8 @@ class Profiler:
profileoption.trace_switch |= 1
if ProfilerTarget.GPU in self.targets:
profileoption.trace_switch |= 1 << 1
if ProfilerTarget.XPU in self.targets:
profileoption.trace_switch |= 1 << 2
if ProfilerTarget.CUSTOM_DEVICE in self.targets:
profileoption.trace_switch |= 1 << 3
if not custom_device_types:
......
......@@ -716,6 +716,10 @@ if '${WITH_XPU_XFT}' == 'ON':
shutil.copy('${XPU_XFT_LIB}', libs_path)
package_data['paddle.libs']+=['${XPU_XFT_LIB_NAME}']
# When WITH_XPTI is ON, copy the XPTI library into libs_path and list it in
# the paddle.libs package data.
if '${WITH_XPTI}' == 'ON':
shutil.copy('${XPU_XPTI_LIB}', libs_path)
package_data['paddle.libs']+=['${XPU_XPTI_LIB_NAME}']
# remove unused paddle/libs/__init__.py
if os.path.isfile(libs_path+'/__init__.py'):
os.remove(libs_path+'/__init__.py')
......
......@@ -1188,6 +1188,11 @@ def get_package_data_and_package_dir():
if env_dict.get("WITH_XPU_XFT") == 'ON':
shutil.copy(env_dict.get("XPU_XFT_LIB"), libs_path)
package_data['paddle.libs'] += [env_dict.get("XPU_XFT_LIB_NAME")]
if env_dict.get("WITH_XPTI") == 'ON':
shutil.copy(env_dict.get("XPU_XPTI_LIB"), libs_path)
package_data['paddle.libs'] += [env_dict.get("XPU_XPTI_LIB_NAME")]
# remove unused paddle/libs/__init__.py
if os.path.isfile(libs_path + '/__init__.py'):
os.remove(libs_path + '/__init__.py')
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册