// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto2";
package paddle.platform;

// Category of a trace record. Used as the `type` field of
// HostTraceEventProto and DeviceTraceEventProto.
// The numeric values are part of the serialized format; do not renumber.
enum TracerEventTypeProto {
  // Marks an operator record.
  Operator = 0;
  // Marks a dataloader record.
  Dataloader = 1;
  // Marks a profile step record.
  ProfileStep = 2;
  // Marks a CUDA runtime record returned by CUPTI.
  CudaRuntime = 3;
  // Marks a kernel computation record returned by CUPTI.
  Kernel = 4;
  // Marks a memcpy record returned by CUPTI.
  Memcpy = 5;
  // Marks a memset record returned by CUPTI.
  Memset = 6;
  // Marks a record defined by the user.
  UserDefined = 7;
  // Marks an operator detail record (such as infer shape, compute).
  OperatorInner = 8;
  // Marks the forward process, from the model training or testing perspective.
  Forward = 9;
  // Marks the backward process, from the model training perspective.
  Backward = 10;
  // Marks the optimization process, from the model training perspective.
  Optimization = 11;
  // Marks communication, from the distributed training perspective.
  Communication = 12;
  // Marks a Python API record.
  PythonOp = 13;
  // Marks a Python-level user-defined record.
  PythonUserDefined = 14;
  // Marks an MLU runtime record returned by CNPAPI.
  MluRuntime = 15;
};

// Kind of memory manipulation described by a MemTraceEventProto
// (see its `type` field).
// The numeric values are part of the serialized format; do not renumber.
enum TracerMemEventTypeProto {
  // Marks a memory allocation that is managed by Paddle.
  Allocate = 0;
  // Marks a memory free that is managed by Paddle.
  Free = 1;
  // Marks a reserved memory allocation that is requested from the device.
  ReservedAllocate = 2;
  // Marks a reserved memory free that is released back to the device.
  ReservedFree = 3;
};

// Kernel-specific detail of a device trace record. Carried by
// DeviceTraceEventProto through the `kernel_info` member of its
// `detail_info` oneof. Every field is `required`.
message KernelEventInfoProto {
  // The X-dimension block size for the kernel.
  required uint32 block_x = 1;
  // The Y-dimension block size for the kernel.
  required uint32 block_y = 2;
  // The Z-dimension block size for the kernel.
  required uint32 block_z = 3;
  // The X-dimension grid size for the kernel.
  required uint32 grid_x = 4;
  // The Y-dimension grid size for the kernel.
  required uint32 grid_y = 5;
  // The Z-dimension grid size for the kernel.
  required uint32 grid_z = 6;
  // The dynamic shared memory reserved for the kernel, in bytes.
  required uint32 dynamic_shared_memory = 7;
  // The static shared memory allocated for the kernel, in bytes.
  required uint32 static_shared_memory = 8;
  // The number of registers required for each thread executing the kernel.
  required uint32 registers_per_thread = 9;
  // The amount of local memory reserved for each thread, in bytes.
  required uint32 local_memory_per_thread = 10;
  // The total amount of local memory reserved for the kernel, in bytes.
  required uint32 local_memory_total = 11;
  // The timestamp when the kernel is queued up in the command buffer, in ns.
  // This timestamp is not collected by default. Use API
  // cuptiActivityEnableLatencyTimestamps() to enable collection.
  required uint64 queued = 12;
  // The timestamp when the command buffer containing the kernel launch is
  // submitted to the GPU, in ns.
  // This timestamp is not collected by default. Use API
  // cuptiActivityEnableLatencyTimestamps() to enable collection.
  required uint64 submitted = 13;
  // The completed timestamp for the kernel execution, in ns.
  required uint64 completed = 14;
  // Blocks per SM.
  required float blocks_per_sm = 15;
  // Warps per SM.
  required float warps_per_sm = 16;
  // Theoretical achieved occupancy.
  // NOTE(review): value range (fraction vs. percentage) is not visible here;
  // confirm with the writer.
  required float occupancy = 17;
}

// Memcpy-specific detail of a device trace record. Carried by
// DeviceTraceEventProto through the `memcpy_info` member of its
// `detail_info` oneof. Every field is `required`.
message MemcpyEventInfoProto {
  // The number of bytes transferred by the memory copy.
  required uint64 num_bytes = 1;
  // The kind of the memory copy, stored as a string.
  // Each kind represents the source and destination targets of a memory copy.
  // Targets are host, device, and array. Refer to CUpti_ActivityMemcpyKind.
  required string copy_kind = 2;
  // The source memory kind read by the memory copy, stored as a string.
  // Each kind represents the type of the memory accessed by a memory
  // operation/copy. Refer to CUpti_ActivityMemoryKind.
  required string src_kind = 3;
  // The destination memory kind of the memory copy, stored as a string.
  // Refer to CUpti_ActivityMemoryKind.
  required string dst_kind = 4;
}

// Memset-specific detail of a device trace record. Carried by
// DeviceTraceEventProto through the `memset_info` member of its
// `detail_info` oneof. Every field is `required`.
message MemsetEventInfoProto {
  // The number of bytes being set by the memory set.
  required uint64 num_bytes = 1;
  // The memory kind of the memory set, stored as a string.
  // Refer to CUpti_ActivityMemoryKind.
  required string memory_kind = 2;
  // The value being assigned to memory by the memory set.
  required uint32 value = 3;
}

// A single host-side trace record. Wrapped by HostTraceEventNodeProto
// (field `host_trace_event`). Every field is `required`.
message HostTraceEventProto {
  // record name
  required string name = 1;
  // record type, one of TracerEventTypeProto
  required TracerEventTypeProto type = 2;
  // start timestamp of the record, in ns (per the `_ns` suffix)
  required uint64 start_ns = 3;
  // end timestamp of the record, in ns (per the `_ns` suffix)
  required uint64 end_ns = 4;
  // process id of the record
  required uint64 process_id = 5;
  // thread id of the record
  required uint64 thread_id = 6;
}

// A single memory allocation/free trace record. Wrapped by
// MemTraceEventNodeProto (field `mem_event`). Every field is `required`.
message MemTraceEventProto {
  // timestamp of the record, in ns (per the `_ns` suffix)
  required uint64 timestamp_ns = 1;
  // memory manipulation type, one of TracerMemEventTypeProto
  required TracerMemEventTypeProto type = 2;
  // memory addr of allocation or free
  required uint64 addr = 3;
  // process id of the record
  required uint64 process_id = 4;
  // thread id of the record
  required uint64 thread_id = 5;
  // bytes added by this manipulation: positive for an allocation, negative
  // for a free
  required int64 increase_bytes = 6;
  // place
  // NOTE(review): presumably a textual description of the device/place the
  // memory belongs to; the exact format is not visible here - confirm.
  required string place = 7;
  // current total allocated memory
  required uint64 current_allocated = 8;
  // current total reserved memory
  required uint64 current_reserved = 9;
  // current peak allocated memory
  required uint64 peak_allocated = 10;
  // current peak reserved memory
  required uint64 peak_reserved = 11;
}

// Supplementary information recorded for an operator: input shapes, dtypes
// and call stack. Wrapped by OperatorSupplementEventNodeProto
// (field `op_supplement_event`). Every singular field is `required`.
message OperatorSupplementEventProto {
  // timestamp of the record, in ns (per the `_ns` suffix)
  required uint64 timestamp_ns = 1;
  // op type name
  required string op_type = 2;
  // process id of the record
  required uint64 process_id = 3;
  // thread id of the record
  required uint64 thread_id = 4;
  // input shapes
  // Stored as two parallel repeated fields, `key` and `shape_vecs`.
  // NOTE(review): presumably key[i] names the input whose shapes are in
  // shape_vecs[i]; the pairing is not enforced by the schema - confirm
  // against the writer.
  message input_shape_proto {
    repeated string key = 1;
    // A list of shapes.
    message shape_vector {
      // One shape: a list of dimension sizes.
      message shape { repeated uint64 size = 1; }
      repeated shape shapes = 1;
    }
    repeated shape_vector shape_vecs = 2;
  }
  required input_shape_proto input_shapes = 5;
  // dtypes
  // Stored as two parallel repeated fields, `key` and `dtype_vecs`.
  // NOTE(review): presumably key[i] names the input whose dtypes are in
  // dtype_vecs[i]; the pairing is not enforced by the schema - confirm
  // against the writer.
  message dtype_proto {
    repeated string key = 1;
    // A list of dtype names, stored as strings.
    message dtype_vector { repeated string dtype = 1; }
    repeated dtype_vector dtype_vecs = 2;
  }
  required dtype_proto dtypes = 6;
  // call stack, stored as a single string
  required string callstack = 7;
}

// A single CUDA runtime API trace record. Wrapped by
// CudaRuntimeTraceEventNodeProto (field `runtime_trace_event`).
// Every field is `required`.
message CudaRuntimeTraceEventProto {
  // record name
  required string name = 1;
  // start timestamp of the record, in ns (per the `_ns` suffix)
  required uint64 start_ns = 2;
  // end timestamp of the record, in ns (per the `_ns` suffix)
  required uint64 end_ns = 3;
  // process id of the record
  required uint64 process_id = 4;
  // thread id of the record
  required uint64 thread_id = 5;
  // correlation id, used for correlating async activities that happened on
  // the device (DeviceTraceEventProto carries a `correlation_id` as well)
  required uint32 correlation_id = 6;
  // callback id, used to identify which cuda runtime api is called
  required uint32 callback_id = 7;
}

// A single device-side trace record. Wrapped by DeviceTraceEventNodeProto
// (field `device_event`). The common fields are `required`; the
// type-specific detail is carried by the `detail_info` oneof.
message DeviceTraceEventProto {
  // record name
  required string name = 1;
  // record type, one of TracerEventTypeProto
  required TracerEventTypeProto type = 2;
  // start timestamp of the record, in ns (per the `_ns` suffix)
  required uint64 start_ns = 3;
  // end timestamp of the record, in ns (per the `_ns` suffix)
  required uint64 end_ns = 4;
  // device id
  required uint64 device_id = 5;
  // context id
  required uint64 context_id = 6;
  // stream id
  required uint64 stream_id = 7;
  // correlation id, used for correlating async activities that happened on
  // the device (CudaRuntimeTraceEventProto carries a `correlation_id` as well)
  required uint32 correlation_id = 8;
  // union, specific device record type has different detail information;
  // at most one member of the oneof is set
  oneof detail_info {
    // used for TracerEventTypeProto Kernel
    KernelEventInfoProto kernel_info = 9;
    // used for TracerEventTypeProto Memcpy
    MemcpyEventInfoProto memcpy_info = 10;
    // used for TracerEventTypeProto Memset
    MemsetEventInfoProto memset_info = 11;
  }
}

// Tree node wrapping one OperatorSupplementEventProto. Held in
// HostTraceEventNodeProto.op_supplement_nodes.
message OperatorSupplementEventNodeProto {
  // the wrapped operator supplement record
  required OperatorSupplementEventProto op_supplement_event = 1;
}

// Tree node wrapping one MemTraceEventProto. Held in
// HostTraceEventNodeProto.mem_nodes.
message MemTraceEventNodeProto { required MemTraceEventProto mem_event = 1; }

// Tree node wrapping one DeviceTraceEventProto. Held in
// CudaRuntimeTraceEventNodeProto.device_nodes.
message DeviceTraceEventNodeProto {
  // the wrapped device record
  required DeviceTraceEventProto device_event = 1;
}

// Tree node wrapping one CudaRuntimeTraceEventProto together with its child
// device nodes. Held in HostTraceEventNodeProto.runtime_nodes.
message CudaRuntimeTraceEventNodeProto {
  // the wrapped CUDA runtime record
  required CudaRuntimeTraceEventProto runtime_trace_event = 1;
  // child device nodes of this runtime node
  // NOTE(review): presumably the device activities launched by this runtime
  // call (matched via correlation_id) - confirm against the writer.
  repeated DeviceTraceEventNodeProto device_nodes = 2;
}

// Tree node wrapping one HostTraceEventProto together with its child runtime,
// memory and operator-supplement nodes. Held in
// ThreadNodeTreeProto.host_nodes.
message HostTraceEventNodeProto {
  // id of this node
  // NOTE(review): host nodes are stored as a flat repeated list in
  // ThreadNodeTreeProto, so `id`/`parentid` presumably encode the tree
  // structure; the numbering scheme and the root's parentid value are not
  // visible here - confirm against the writer.
  required int64 id = 1;
  // id of the parent node (see note on `id`)
  required int64 parentid = 2;
  // the wrapped host record
  required HostTraceEventProto host_trace_event = 3;
  // child CUDA runtime nodes
  repeated CudaRuntimeTraceEventNodeProto runtime_nodes = 4;
  // below is added in version 1.0.1
  // child memory event nodes
  repeated MemTraceEventNodeProto mem_nodes = 5;
  // child operator supplement nodes
  repeated OperatorSupplementEventNodeProto op_supplement_nodes = 6;
}

// All host nodes recorded for one thread. Held in
// NodeTreesProto.thread_trees.
message ThreadNodeTreeProto {
  // thread id this tree belongs to
  required uint64 thread_id = 1;
  // host nodes of this thread, stored as a flat list
  repeated HostTraceEventNodeProto host_nodes = 2;
}

// One string key/value entry of extra information. Held as a repeated list
// in NodeTreesProto.extra_info; the schema does not enforce key uniqueness.
message ExtraInfoMap {
  // entry key
  required string key = 1;
  // entry value
  required string value = 2;
}

// Static properties of one device. Held in NodeTreesProto.device_property.
// Every field is `required`.
// NOTE(review): units are not stated in this file; memory sizes are
// presumably in bytes and the fields presumably mirror the CUDA device
// properties of the same names - confirm against the writer.
message DevicePropertyProto {
  // device id
  required uint32 id = 1;
  // device name
  required string name = 2;
  // total global memory of the device
  required uint64 total_global_memory = 3;
  // major part of the compute version (presumably CUDA compute capability)
  required uint32 compute_major = 4;
  // minor part of the compute version (presumably CUDA compute capability)
  required uint32 compute_minor = 5;
  // maximum number of threads per block
  required uint32 max_threads_per_block = 6;
  // maximum number of threads per multiprocessor
  required uint32 max_threads_per_multiprocessor = 7;
  // number of registers per block
  required uint32 regs_per_block = 8;
  // number of registers per multiprocessor
  required uint32 regs_per_multiprocessor = 9;
  // warp size
  required uint32 warp_size = 10;
  // shared memory per block
  required uint64 shared_memory_per_block = 11;
  // shared memory per multiprocessor
  required uint64 shared_memory_per_multiprocessor = 12;
  // number of multiprocessors (SMs)
  required uint32 sm_count = 13;
  // shared memory per block available with opt-in
  required uint64 shared_memory_per_block_optin = 14;
}

// Top-level container: per-thread node trees plus extra key/value information
// and device properties.
message NodeTreesProto {
  // version string of the serialized data
  required string version = 1;
  // span index
  // NOTE(review): the name looks like a misspelling of `span_index`; it is
  // left unchanged because renaming would alter generated accessors and the
  // JSON/text-format key. Its exact meaning is not visible here - confirm.
  required uint32 span_indx = 2;
  // one node tree per thread
  repeated ThreadNodeTreeProto thread_trees = 3;
  // extra key/value information
  repeated ExtraInfoMap extra_info = 4;
  // properties of the devices
  repeated DevicePropertyProto device_property = 5;
}