提交 deb04809 编写于 作者: Z ZongwuYang

test=develop

Fix the bug where the profiler cannot trace the NCCL allreduce operator
上级 6224e61f
...@@ -143,7 +143,7 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, ...@@ -143,7 +143,7 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: { case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
auto *kernel = auto *kernel =
reinterpret_cast<const CUpti_ActivityKernel3 *>(record); reinterpret_cast<const CUpti_ActivityKernel3 *>(record);
tracer->AddKernelRecords(kernel->start, kernel->end, tracer->AddKernelRecords(kernel->name, kernel->start, kernel->end,
kernel->deviceId, kernel->streamId, kernel->deviceId, kernel->streamId,
kernel->correlationId); kernel->correlationId);
break; break;
...@@ -224,8 +224,9 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -224,8 +224,9 @@ class DeviceTracerImpl : public DeviceTracer {
stream_id, correlation_id, bytes}); stream_id, correlation_id, bytes});
} }
void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id, void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
int64_t stream_id, uint32_t correlation_id) { int64_t device_id, int64_t stream_id,
uint32_t correlation_id) {
// 0 means timestamp information could not be collected for the kernel. // 0 means timestamp information could not be collected for the kernel.
if (start == 0 || end == 0) { if (start == 0 || end == 0) {
VLOG(30) << correlation_id << " cannot be traced"; VLOG(30) << correlation_id << " cannot be traced";
...@@ -233,7 +234,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -233,7 +234,7 @@ class DeviceTracerImpl : public DeviceTracer {
} }
std::lock_guard<std::mutex> l(trace_mu_); std::lock_guard<std::mutex> l(trace_mu_);
kernel_records_.push_back( kernel_records_.push_back(
KernelRecord{start, end, device_id, stream_id, correlation_id}); KernelRecord{name, start, end, device_id, stream_id, correlation_id});
} }
bool IsEnabled() { bool IsEnabled() {
...@@ -276,13 +277,13 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -276,13 +277,13 @@ class DeviceTracerImpl : public DeviceTracer {
profile_pb.set_start_ns(start_ns_); profile_pb.set_start_ns(start_ns_);
profile_pb.set_end_ns(end_ns_); profile_pb.set_end_ns(end_ns_);
for (const KernelRecord &r : kernel_records_) { for (const KernelRecord &r : kernel_records_) {
if (correlations_.find(r.correlation_id) == correlations_.end()) {
fprintf(stderr, "cannot relate a kernel activity\n");
continue;
}
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
event->set_type(proto::Event::GPUKernel); event->set_type(proto::Event::GPUKernel);
if (correlations_.find(r.correlation_id) != correlations_.end()) {
event->set_name(correlations_.at(r.correlation_id)); event->set_name(correlations_.at(r.correlation_id));
} else {
event->set_name(r.name);
}
event->set_start_ns(r.start_ns); event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns); event->set_end_ns(r.end_ns);
event->set_sub_device_id(r.stream_id); event->set_sub_device_id(r.stream_id);
......
...@@ -39,6 +39,7 @@ inline uint64_t PosixInNsec() { ...@@ -39,6 +39,7 @@ inline uint64_t PosixInNsec() {
class DeviceTracer { class DeviceTracer {
public: public:
struct KernelRecord { struct KernelRecord {
std::string name;
uint64_t start_ns; uint64_t start_ns;
uint64_t end_ns; uint64_t end_ns;
int64_t device_id; int64_t device_id;
...@@ -84,8 +85,9 @@ class DeviceTracer { ...@@ -84,8 +85,9 @@ class DeviceTracer {
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability. // added before for human readability.
virtual void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id, virtual void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
int64_t stream_id, uint32_t correlation_id) = 0; int64_t device_id, int64_t stream_id,
uint32_t correlation_id) = 0;
// Generate a proto after done (Disabled). // Generate a proto after done (Disabled).
virtual proto::Profile GenProfile(const std::string& profile_path) = 0; virtual proto::Profile GenProfile(const std::string& profile_path) = 0;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册