// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto2";

package paddle.platform;

// Category of a trace record. Referenced by HostTraceEventProto.type and
// DeviceTraceEventProto.type.
enum TracerEventTypeProto {
  // Used to mark operator record
  Operator = 0;
  // Used to mark dataloader record
  Dataloader = 1;
  // Used to mark profile step record
  ProfileStep = 2;
  // Used to mark cuda runtime record returned by cupti
  CudaRuntime = 3;
  // Used to mark kernel computation record returned by cupti
  Kernel = 4;
  // Used to mark memcpy record returned by cupti
  Memcpy = 5;
  // Used to mark memset record returned by cupti
  Memset = 6;
  // Used to mark record defined by user
  UserDefined = 7;
  // A flag to denote the number of current types. Not a record category;
  // it must stay last and equal to the count of the values above.
  NumTypes = 8;
}

// Detail information of a kernel record. Carried in
// DeviceTraceEventProto.detail_info for TracerEventType::Kernel.
message KernelEventInfoProto {
  // The X-dimension block size for the kernel.
  required uint32 block_x = 1;
  // The Y-dimension block size for the kernel.
  required uint32 block_y = 2;
  // The Z-dimension block size for the kernel.
  required uint32 block_z = 3;
  // X-dimension of a grid.
  required uint32 grid_x = 4;
  // Y-dimension of a grid.
  required uint32 grid_y = 5;
  // Z-dimension of a grid.
  required uint32 grid_z = 6;
  // The dynamic shared memory reserved for the kernel, in bytes.
  required uint32 dynamic_shared_memory = 7;
  // The static shared memory allocated for the kernel, in bytes.
  required uint32 static_shared_memory = 8;
  // The number of registers required for each thread executing the kernel.
  required uint32 registers_per_thread = 9;
  // The amount of local memory reserved for each thread, in bytes.
  required uint32 local_memory_per_thread = 10;
  // The total amount of local memory reserved for the kernel, in bytes.
  required uint32 local_memory_total = 11;
  // The timestamp when the kernel is queued up in the command buffer, in ns.
  // This timestamp is not collected by default. Use API
  // cuptiActivityEnableLatencyTimestamps() to enable collection.
  required uint64 queued = 12;
  // The timestamp when the command buffer containing the kernel launch is
  // submitted to the GPU, in ns.
  // This timestamp is not collected by default. Use API
  // cuptiActivityEnableLatencyTimestamps() to enable collection.
  required uint64 submitted = 13;
  // The completed timestamp for the kernel execution, in ns.
  required uint64 completed = 14;
}

// Detail information of a memcpy record. Carried in
// DeviceTraceEventProto.detail_info for TracerEventType::Memcpy.
message MemcpyEventInfoProto {
  // The number of bytes transferred by the memory copy.
  required uint64 num_bytes = 1;
  // The kind of the memory copy.
  // Each kind represents the source and destination targets of a memory copy.
  // Targets are host, device, and array. Refer to CUpti_ActivityMemcpyKind
  required string copy_kind = 2;
  // The source memory kind read by the memory copy.
  // Each kind represents the type of the memory accessed by a memory
  // operation/copy. Refer to CUpti_ActivityMemoryKind
  required string src_kind = 3;
  // The destination memory kind written by the memory copy.
  // Refer to CUpti_ActivityMemoryKind
  required string dst_kind = 4;
}

// Detail information of a memset record. Carried in
// DeviceTraceEventProto.detail_info for TracerEventType::Memset.
message MemsetEventInfoProto {
  // The number of bytes being set by the memory set.
  required uint64 num_bytes = 1;
  // The memory kind of the memory set. Refer to CUpti_ActivityMemoryKind
  required string memory_kind = 2;
  // The value being assigned to memory by the memory set.
  required uint32 value = 3;
}

// A single host-side trace record.
message HostTraceEventProto {
  // record name
  required string name = 1;
  // record type, one of TracerEventType
  required TracerEventTypeProto type = 2;
  // start timestamp of the record
  // NOTE(review): unit is presumably nanoseconds given the _ns suffix.
  required uint64 start_ns = 3;
  // end timestamp of the record
  required uint64 end_ns = 4;
  // process id of the record
  required uint64 process_id = 5;
  // thread id of the record
  required uint64 thread_id = 6;
}

// A single cuda runtime api trace record.
message CudaRuntimeTraceEventProto {
  // record name
  required string name = 1;
  // start timestamp of the record
  required uint64 start_ns = 2;
  // end timestamp of the record
  required uint64 end_ns = 3;
  // process id of the record
  required uint64 process_id = 4;
  // thread id of the record
  required uint64 thread_id = 5;
  // correlation id, used for correlating async activities happened on device
  required uint32 correlation_id = 6;
  // callback id, used to identify which cuda runtime api is called
  required uint32 callback_id = 7;
}

// A single device-side trace record.
message DeviceTraceEventProto {
  // record name
  required string name = 1;
  // record type, one of TracerEventType
  required TracerEventTypeProto type = 2;
  // start timestamp of the record
  required uint64 start_ns = 3;
  // end timestamp of the record
  required uint64 end_ns = 4;
  // device id
  required uint64 device_id = 5;
  // context id
  required uint64 context_id = 6;
  // stream id
  required uint64 stream_id = 7;
  // correlation id, used for correlating async activities happened on device
  required uint32 correlation_id = 8;
  // union, specific device record type has different detail information;
  // at most one member is set
  oneof detail_info {
    // used for TracerEventType::Kernel
    KernelEventInfoProto kernel_info = 9;
    // used for TracerEventType::Memcpy
    MemcpyEventInfoProto memcpy_info = 10;
    // used for TracerEventType::Memset
    MemsetEventInfoProto memset_info = 11;
  }
}

// Tree node wrapping one device record. It is a child of
// CudaRuntimeTraceEventNodeProto (see device_nodes there).
message DeviceTraceEventNodeProto {
  // the device record held by this node
  required DeviceTraceEventProto device_event = 1;
}

// Tree node wrapping one cuda runtime record together with its device nodes.
message CudaRuntimeTraceEventNodeProto {
  // the cuda runtime record held by this node
  required CudaRuntimeTraceEventProto runtime_trace_event = 1;
  // device nodes attached to this runtime node
  // NOTE(review): presumably the device activities matched to this runtime
  // call via correlation_id - confirm against the serializer.
  repeated DeviceTraceEventNodeProto device_nodes = 2;
}

// Tree node wrapping one host record together with its runtime nodes.
message HostTraceEventNodeProto {
  // id of this node
  required int64 id = 1;
  // id of the parent node
  // NOTE(review): presumably refers to the `id` of another
  // HostTraceEventNodeProto in the same tree; the value used for a root
  // node is not visible here - confirm against the serializer.
  required int64 parentid = 2;
  // the host record held by this node
  required HostTraceEventProto host_trace_event = 3;
  // cuda runtime nodes attached to this host node
  repeated CudaRuntimeTraceEventNodeProto runtime_nodes = 4;
}

// All host nodes recorded for one thread. Parent/child links between nodes
// are expressed through HostTraceEventNodeProto.id / parentid rather than
// by nesting.
message ThreadNodeTreeProto {
  // thread id the nodes belong to
  required uint64 thread_id = 1;
  // host nodes of this thread
  repeated HostTraceEventNodeProto host_nodes = 2;
}

// Top-level container: one node tree per thread.
message NodeTreesProto {
  // version string
  // NOTE(review): presumably the version of this serialized format; the
  // expected value/format is not visible here.
  required string version = 1;
  // per-thread node trees
  repeated ThreadNodeTreeProto thread_trees = 2;
}