未验证 提交 6de20581 编写于 作者: C chenjian 提交者: GitHub

Fix operator type record in profiler [cherry-pick PR44582] (#44654)

* fix record event for operator type in new dygraph (#44582)

* fix new dygraph record event for op

* update unit test

* fix file mode
上级 b71833ea
...@@ -71,7 +71,7 @@ PARSE_PYTHON_C_ARGS_TEMPLATE = \ ...@@ -71,7 +71,7 @@ PARSE_PYTHON_C_ARGS_TEMPLATE = \
RECORD_EVENT_TEMPLATE = \ RECORD_EVENT_TEMPLATE = \
" paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::Operator, 1);" "paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::UserDefined, 1);"
RETURN_INPLACE_PYOBJECT_TEMPLATE = \ RETURN_INPLACE_PYOBJECT_TEMPLATE = \
...@@ -253,6 +253,7 @@ NAMESPACE_WRAPPER_TEMPLATE = \ ...@@ -253,6 +253,7 @@ NAMESPACE_WRAPPER_TEMPLATE = \
## Generator Classes ## ## Generator Classes ##
####################### #######################
class PythonCSingleFunctionGenerator(FunctionGeneratorBase): class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
def __init__(self, forward_api_contents, namespace): def __init__(self, forward_api_contents, namespace):
# Members from Parent: # Members from Parent:
#self.namespace #self.namespace
...@@ -327,8 +328,8 @@ class PythonCSingleFunctionGenerator(FunctionGeneratorBase): ...@@ -327,8 +328,8 @@ class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
set_device_str = FUNCTION_SET_DEVICE_TEMPLATE.format(expected_place_str) set_device_str = FUNCTION_SET_DEVICE_TEMPLATE.format(expected_place_str)
# Generate Dygraph Function Call Logic # Generate Dygraph Function Call Logic
num_args = len(forward_inputs_position_map.keys()) + len( num_args = len(
orig_forward_attrs_list) forward_inputs_position_map.keys()) + len(orig_forward_attrs_list)
dygraph_function_call_list = ["" for i in range(num_args)] dygraph_function_call_list = ["" for i in range(num_args)]
for name, (_, pos) in forward_inputs_position_map.items(): for name, (_, pos) in forward_inputs_position_map.items():
dygraph_function_call_list[pos] = f"{name}" dygraph_function_call_list[pos] = f"{name}"
...@@ -441,6 +442,7 @@ class PythonCSingleFunctionGenerator(FunctionGeneratorBase): ...@@ -441,6 +442,7 @@ class PythonCSingleFunctionGenerator(FunctionGeneratorBase):
class PythonCYamlGenerator(YamlGeneratorBase): class PythonCYamlGenerator(YamlGeneratorBase):
def __init__(self, path): def __init__(self, path):
# Parent members: # Parent members:
# self.namespace # self.namespace
...@@ -457,8 +459,8 @@ class PythonCYamlGenerator(YamlGeneratorBase): ...@@ -457,8 +459,8 @@ class PythonCYamlGenerator(YamlGeneratorBase):
forward_api_list = self.forward_api_list forward_api_list = self.forward_api_list
for forward_api_content in forward_api_list: for forward_api_content in forward_api_list:
f_generator = PythonCSingleFunctionGenerator(forward_api_content, f_generator = PythonCSingleFunctionGenerator(
namespace) forward_api_content, namespace)
status = f_generator.run() status = f_generator.run()
if status == True: if status == True:
......
...@@ -30,10 +30,10 @@ ...@@ -30,10 +30,10 @@
namespace egr { namespace egr {
/* /*
* GeneralGrad is a helper class to implement custom grad operations between * GeneralGrad is a helper class to implement custom grad operations between
* outputs and inputs. * outputs and inputs.
* *
* **/ * **/
class GeneralGrad { class GeneralGrad {
public: public:
static GeneralGrad& Instance() { return *general_grad_; } static GeneralGrad& Instance() { return *general_grad_; }
...@@ -64,7 +64,8 @@ class GeneralGrad { ...@@ -64,7 +64,8 @@ class GeneralGrad {
paddle::platform::errors::Fatal( paddle::platform::errors::Fatal(
"There is no grad op for %s:[%d] or it's" "There is no grad op for %s:[%d] or it's"
"stop_gradient=True.", "stop_gradient=True.",
msg, i)); msg,
i));
if (is_no_grad_vars) { if (is_no_grad_vars) {
(no_grad_var_nodes_inputmeta_map)[target_node] = auto_grad_meta; (no_grad_var_nodes_inputmeta_map)[target_node] = auto_grad_meta;
} else { // normal input } else { // normal input
...@@ -248,7 +249,8 @@ class GeneralGrad { ...@@ -248,7 +249,8 @@ class GeneralGrad {
std::vector<paddle::experimental::Tensor> GetResults( std::vector<paddle::experimental::Tensor> GetResults(
const std::vector<paddle::experimental::Tensor>& inputs, const std::vector<paddle::experimental::Tensor>& inputs,
bool allow_unused, bool create_graph) { bool allow_unused,
bool create_graph) {
VLOG(6) << "Running in GetResults"; VLOG(6) << "Running in GetResults";
if (inputs.empty()) return {}; if (inputs.empty()) return {};
...@@ -276,7 +278,8 @@ class GeneralGrad { ...@@ -276,7 +278,8 @@ class GeneralGrad {
tensor_auto_grad_meta->SetStopGradient(!create_graph); tensor_auto_grad_meta->SetStopGradient(!create_graph);
results.emplace_back(iter->second); results.emplace_back(iter->second);
} else { } else {
PADDLE_ENFORCE_EQ(allow_unused, true, PADDLE_ENFORCE_EQ(allow_unused,
true,
paddle::platform::errors::InvalidArgument( paddle::platform::errors::InvalidArgument(
"The %d-th input does not appear in the backward " "The %d-th input does not appear in the backward "
"graph. Please check the input tensor or set " "graph. Please check the input tensor or set "
...@@ -493,7 +496,8 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap( ...@@ -493,7 +496,8 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
void EnforceGradNodeHasInput(GradNodeBase* node) { void EnforceGradNodeHasInput(GradNodeBase* node) {
VLOG(6) << "Running in EnforceGradNodeHasInput"; VLOG(6) << "Running in EnforceGradNodeHasInput";
PADDLE_ENFORCE_NE( PADDLE_ENFORCE_NE(
node->IsTensorWrappersCleared(), true, node->IsTensorWrappersCleared(),
true,
paddle::platform::errors::Fatal( paddle::platform::errors::Fatal(
"The TensorWrappers of %s do not exist. This may be because:\n" "The TensorWrappers of %s do not exist. This may be because:\n"
"You calculate backward twice for the same subgraph without " "You calculate backward twice for the same subgraph without "
...@@ -509,10 +513,13 @@ void DuplicateCheck(const std::vector<paddle::experimental::Tensor>& inputs, ...@@ -509,10 +513,13 @@ void DuplicateCheck(const std::vector<paddle::experimental::Tensor>& inputs,
for (auto in : inputs) { for (auto in : inputs) {
AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(in); AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(in);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
visisted_ins.count(auto_grad_meta), 0, visisted_ins.count(auto_grad_meta),
0,
paddle::platform::errors::AlreadyExists( paddle::platform::errors::AlreadyExists(
"%s contain duplicate tensor %s, please check %s carefully.", msg, "%s contain duplicate tensor %s, please check %s carefully.",
in.name(), msg)); msg,
in.name(),
msg));
visisted_ins.insert(auto_grad_meta); visisted_ins.insert(auto_grad_meta);
} }
} }
...@@ -522,7 +529,8 @@ GeneralGrad* GeneralGrad::general_grad_ = new GeneralGrad(); ...@@ -522,7 +529,8 @@ GeneralGrad* GeneralGrad::general_grad_ = new GeneralGrad();
std::vector<paddle::experimental::Tensor> RunBackward( std::vector<paddle::experimental::Tensor> RunBackward(
const std::vector<paddle::experimental::Tensor>& tensors, // output const std::vector<paddle::experimental::Tensor>& tensors, // output
const std::vector<paddle::experimental::Tensor>& grad_tensors, const std::vector<paddle::experimental::Tensor>& grad_tensors,
bool retain_graph, bool create_graph = false, bool retain_graph,
bool create_graph = false,
const std::vector<paddle::experimental::Tensor>& inputs = {}, const std::vector<paddle::experimental::Tensor>& inputs = {},
bool allow_unused = false, bool allow_unused = false,
const std::vector<paddle::experimental::Tensor>& no_grad_vars = {}) { const std::vector<paddle::experimental::Tensor>& no_grad_vars = {}) {
...@@ -631,8 +639,8 @@ std::vector<paddle::experimental::Tensor> RunBackward( ...@@ -631,8 +639,8 @@ std::vector<paddle::experimental::Tensor> RunBackward(
if (is_general_grad) { if (is_general_grad) {
// Prepare several vital preprocess for GeneralGrad // Prepare several vital preprocess for GeneralGrad
GeneralGrad::Instance().PreparedForGeneralGrad(inputs, no_grad_vars, &queue, GeneralGrad::Instance().PreparedForGeneralGrad(
node_input_buffers_dict); inputs, no_grad_vars, &queue, node_input_buffers_dict);
} }
VLOG(6) << " startup_ops' size is :" << queue.size(); VLOG(6) << " startup_ops' size is :" << queue.size();
...@@ -651,7 +659,8 @@ std::vector<paddle::experimental::Tensor> RunBackward( ...@@ -651,7 +659,8 @@ std::vector<paddle::experimental::Tensor> RunBackward(
paddle::platform::RecordEvent node_record_event( paddle::platform::RecordEvent node_record_event(
std::string((*node).name()) + " grad_node", std::string((*node).name()) + " grad_node",
paddle::platform::TracerEventType::Operator, 1); paddle::platform::TracerEventType::Operator,
1);
if (queue.size() > 1 && node_in_degree_map[node] != 0) { if (queue.size() > 1 && node_in_degree_map[node] != 0) {
queue.pop(); queue.pop();
...@@ -716,7 +725,8 @@ std::vector<paddle::experimental::Tensor> RunBackward( ...@@ -716,7 +725,8 @@ std::vector<paddle::experimental::Tensor> RunBackward(
"Number of edges should be either empty ( for leaf node " "Number of edges should be either empty ( for leaf node "
") or the same as number of output grad tensors, but we " ") or the same as number of output grad tensors, but we "
"got edges size is: %d, grad_output size is: %d", "got edges size is: %d, grad_output size is: %d",
edges.size(), grad_output_tensors.size())); edges.size(),
grad_output_tensors.size()));
for (size_t i = 0; i < edges.size(); i++) { for (size_t i = 0; i < edges.size(); i++) {
for (size_t j = 0; j < edges[i].size(); j++) { for (size_t j = 0; j < edges[i].size(); j++) {
...@@ -739,7 +749,8 @@ std::vector<paddle::experimental::Tensor> RunBackward( ...@@ -739,7 +749,8 @@ std::vector<paddle::experimental::Tensor> RunBackward(
} }
PADDLE_ENFORCE_LT( PADDLE_ENFORCE_LT(
j, grad_output_tensors[i].size(), j,
grad_output_tensors[i].size(),
paddle::platform::errors::Fatal( paddle::platform::errors::Fatal(
"Rank of grad_output_tensors should be less than " "Rank of grad_output_tensors should be less than "
"grad_output_tensors[i].size(), which is: %d. This error may " "grad_output_tensors[i].size(), which is: %d. This error may "
...@@ -771,8 +782,9 @@ std::vector<paddle::experimental::Tensor> RunBackward( ...@@ -771,8 +782,9 @@ std::vector<paddle::experimental::Tensor> RunBackward(
VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first
<< ", rank: " << edge_rank.second; << ", rank: " << edge_rank.second;
node_input_buffers_dict[next_node]->add( node_input_buffers_dict[next_node]->add(edge_rank.first,
edge_rank.first, edge_rank.second, grad_output_tensor, edge_rank.second,
grad_output_tensor,
create_graph); create_graph);
// Update queue // Update queue
...@@ -810,7 +822,7 @@ void Backward( ...@@ -810,7 +822,7 @@ void Backward(
bool retain_graph) { bool retain_graph) {
VLOG(6) << "Run in Backward"; VLOG(6) << "Run in Backward";
paddle::platform::RecordEvent backward_record_event( paddle::platform::RecordEvent backward_record_event(
"backward", paddle::platform::TracerEventType::Operator, 1); "backward", paddle::platform::TracerEventType::UserDefined, 1);
RunBackward(tensors, grad_tensors, retain_graph); RunBackward(tensors, grad_tensors, retain_graph);
phi::autotune::AutoTuneStatus::Instance().Update(); phi::autotune::AutoTuneStatus::Instance().Update();
} }
...@@ -819,14 +831,22 @@ std::vector<paddle::experimental::Tensor> Grad( ...@@ -819,14 +831,22 @@ std::vector<paddle::experimental::Tensor> Grad(
const std::vector<paddle::experimental::Tensor>& tensors, // outputs const std::vector<paddle::experimental::Tensor>& tensors, // outputs
const std::vector<paddle::experimental::Tensor>& inputs, const std::vector<paddle::experimental::Tensor>& inputs,
const std::vector<paddle::experimental::Tensor>& grad_tensors, const std::vector<paddle::experimental::Tensor>& grad_tensors,
bool retain_graph, bool create_graph, bool only_inputs, bool allow_unused, bool retain_graph,
bool create_graph,
bool only_inputs,
bool allow_unused,
const std::vector<paddle::experimental::Tensor>& no_grad_vars) { const std::vector<paddle::experimental::Tensor>& no_grad_vars) {
VLOG(6) << "Run in Grad"; VLOG(6) << "Run in Grad";
DuplicateCheck(inputs, true /* is_input */); DuplicateCheck(inputs, true /* is_input */);
DuplicateCheck(tensors, false /* is_input */); DuplicateCheck(tensors, false /* is_input */);
return RunBackward(tensors, grad_tensors, retain_graph, create_graph, inputs, return RunBackward(tensors,
allow_unused, no_grad_vars); grad_tensors,
retain_graph,
create_graph,
inputs,
allow_unused,
no_grad_vars);
} }
} // namespace egr } // namespace egr
...@@ -588,7 +588,7 @@ void ChromeTracingLogger::StartLog() { ...@@ -588,7 +588,7 @@ void ChromeTracingLogger::StartLog() {
std::string( std::string(
R"JSON( R"JSON(
{ {
"id": %d, "name": "%s", "totalGlobalMem": %u, "id": %d, "name": "%s", "totalGlobalMem": %llu,
"computeMajor": %d, "computeMinor": %d, "computeMajor": %d, "computeMinor": %d,
"maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d, "maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d,
"regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d, "regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d,
...@@ -618,7 +618,7 @@ void ChromeTracingLogger::StartLog() { ...@@ -618,7 +618,7 @@ void ChromeTracingLogger::StartLog() {
std::string( std::string(
R"JSON( R"JSON(
{ {
"id": %d, "name": "%s", "totalGlobalMem": %u, "id": %d, "name": "%s", "totalGlobalMem": %llu,
"computeMajor": %d, "computeMinor": %d, "computeMajor": %d, "computeMinor": %d,
"maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d, "maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d,
"regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d, "regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d,
......
...@@ -19,6 +19,7 @@ import paddle.profiler as profiler ...@@ -19,6 +19,7 @@ import paddle.profiler as profiler
class HostPythonNode: class HostPythonNode:
def __init__(self, name, type, start_ns, end_ns, process_id, thread_id): def __init__(self, name, type, start_ns, end_ns, process_id, thread_id):
self.name = name self.name = name
self.type = type self.type = type
...@@ -32,6 +33,7 @@ class HostPythonNode: ...@@ -32,6 +33,7 @@ class HostPythonNode:
class DevicePythonNode: class DevicePythonNode:
def __init__(self, name, type, start_ns, end_ns, device_id, context_id, def __init__(self, name, type, start_ns, end_ns, device_id, context_id,
stream_id): stream_id):
self.name = name self.name = name
...@@ -44,6 +46,7 @@ class DevicePythonNode: ...@@ -44,6 +46,7 @@ class DevicePythonNode:
class TestProfilerStatistic(unittest.TestCase): class TestProfilerStatistic(unittest.TestCase):
def test_statistic_case1(self): def test_statistic_case1(self):
root_node = HostPythonNode('Root Node', root_node = HostPythonNode('Root Node',
profiler.TracerEventType.UserDefined, 0, profiler.TracerEventType.UserDefined, 0,
...@@ -54,13 +57,15 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -54,13 +57,15 @@ class TestProfilerStatistic(unittest.TestCase):
dataloader_node = HostPythonNode('Dataloader', dataloader_node = HostPythonNode('Dataloader',
profiler.TracerEventType.Dataloader, 5, profiler.TracerEventType.Dataloader, 5,
15, 1000, 1001) 15, 1000, 1001)
mobilenet_node = HostPythonNode( mobilenet_node = HostPythonNode('MobileNet',
'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) profiler.TracerEventType.Forward, 20,
yolonet_node = HostPythonNode( 50, 1000, 1001)
'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001) yolonet_node = HostPythonNode('Yolov3Net',
profiler.TracerEventType.Forward, 50, 110,
1000, 1001)
userdefined_node = HostPythonNode('Communication Time', userdefined_node = HostPythonNode(
profiler.TracerEventType.UserDefined, 'Communication Time', profiler.TracerEventType.PythonUserDefined,
100, 110, 1000, 1001) 100, 110, 1000, 1001)
communication_node = HostPythonNode( communication_node = HostPythonNode(
...@@ -72,8 +77,9 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -72,8 +77,9 @@ class TestProfilerStatistic(unittest.TestCase):
optimization_node = HostPythonNode( optimization_node = HostPythonNode(
'Optimization', profiler.TracerEventType.Optimization, 220, 300, 'Optimization', profiler.TracerEventType.Optimization, 220, 300,
1000, 1001) 1000, 1001)
conv2d_node = HostPythonNode( conv2d_node = HostPythonNode('conv2d',
'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001) profiler.TracerEventType.Operator, 25, 40,
1000, 1001)
sync_batch_norm_node = HostPythonNode('sync_batch_norm', sync_batch_norm_node = HostPythonNode('sync_batch_norm',
profiler.TracerEventType.Operator, profiler.TracerEventType.Operator,
60, 100, 1000, 1001) 60, 100, 1000, 1001)
...@@ -92,10 +98,12 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -92,10 +98,12 @@ class TestProfilerStatistic(unittest.TestCase):
conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy', conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy',
profiler.TracerEventType.CudaRuntime, profiler.TracerEventType.CudaRuntime,
35, 40, 1000, 1001) 35, 40, 1000, 1001)
conv2d_kernel = DevicePythonNode( conv2d_kernel = DevicePythonNode('conv2d_kernel',
'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0) profiler.TracerEventType.Kernel, 35,
conv2d_memcpy = DevicePythonNode( 50, 0, 0, 0)
'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0) conv2d_memcpy = DevicePythonNode('conv2d_memcpy',
profiler.TracerEventType.Memcpy, 50,
60, 0, 0, 0)
sync_batch_norm_infer_shape = HostPythonNode( sync_batch_norm_infer_shape = HostPythonNode(
'sync_batch_norm::infer_shape', 'sync_batch_norm::infer_shape',
profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001) profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001)
...@@ -146,8 +154,8 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -146,8 +154,8 @@ class TestProfilerStatistic(unittest.TestCase):
'Process Cpu Utilization': '1.02', 'Process Cpu Utilization': '1.02',
'System Cpu Utilization': '0.68' 'System Cpu Utilization': '0.68'
} }
statistic_data = profiler.profiler_statistic.StatisticData(thread_tree, statistic_data = profiler.profiler_statistic.StatisticData(
extra_info) thread_tree, extra_info)
time_range_summary = statistic_data.time_range_summary time_range_summary = statistic_data.time_range_summary
event_summary = statistic_data.event_summary event_summary = statistic_data.event_summary
...@@ -180,7 +188,7 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -180,7 +188,7 @@ class TestProfilerStatistic(unittest.TestCase):
0, profiler.TracerEventType.Memcpy), 60) 0, profiler.TracerEventType.Memcpy), 60)
self.assertEqual( self.assertEqual(
time_range_summary.get_cpu_range_sum( time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.UserDefined), 25) profiler.TracerEventType.UserDefined), 15)
self.assertEqual( self.assertEqual(
time_range_summary.get_cpu_range_sum( time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Communication), 5) profiler.TracerEventType.Communication), 5)
...@@ -200,8 +208,9 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -200,8 +208,9 @@ class TestProfilerStatistic(unittest.TestCase):
0) 0)
self.assertEqual( self.assertEqual(
event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15) event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
self.assertEqual(event_summary.memory_manipulation_items['AsyncMemcpy'] self.assertEqual(
.general_gpu_time, 60) event_summary.memory_manipulation_items['AsyncMemcpy'].
general_gpu_time, 60)
print( print(
profiler.profiler_statistic._build_table( profiler.profiler_statistic._build_table(
statistic_data, statistic_data,
...@@ -222,13 +231,15 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -222,13 +231,15 @@ class TestProfilerStatistic(unittest.TestCase):
profiler.TracerEventType.Dataloader, 5, profiler.TracerEventType.Dataloader, 5,
15, 1000, 1001) 15, 1000, 1001)
mobilenet_node = HostPythonNode( mobilenet_node = HostPythonNode('MobileNet',
'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) profiler.TracerEventType.Forward, 20,
yolonet_node = HostPythonNode( 50, 1000, 1001)
'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001) yolonet_node = HostPythonNode('Yolov3Net',
profiler.TracerEventType.Forward, 50, 110,
1000, 1001)
userdefined_node = HostPythonNode('Communication Time', userdefined_node = HostPythonNode(
profiler.TracerEventType.UserDefined, 'Communication Time', profiler.TracerEventType.PythonUserDefined,
100, 110, 1000, 1001) 100, 110, 1000, 1001)
allreduce_launchkernel0 = HostPythonNode( allreduce_launchkernel0 = HostPythonNode(
'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 102, 104, 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 102, 104,
...@@ -263,8 +274,9 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -263,8 +274,9 @@ class TestProfilerStatistic(unittest.TestCase):
optimization_node = HostPythonNode( optimization_node = HostPythonNode(
'Optimization', profiler.TracerEventType.Optimization, 220, 300, 'Optimization', profiler.TracerEventType.Optimization, 220, 300,
1000, 1001) 1000, 1001)
conv2d_node = HostPythonNode( conv2d_node = HostPythonNode('conv2d',
'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001) profiler.TracerEventType.Operator, 25, 40,
1000, 1001)
sync_batch_norm_node = HostPythonNode('sync_batch_norm', sync_batch_norm_node = HostPythonNode('sync_batch_norm',
profiler.TracerEventType.Operator, profiler.TracerEventType.Operator,
60, 100, 1000, 1001) 60, 100, 1000, 1001)
...@@ -283,10 +295,12 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -283,10 +295,12 @@ class TestProfilerStatistic(unittest.TestCase):
conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy', conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy',
profiler.TracerEventType.CudaRuntime, profiler.TracerEventType.CudaRuntime,
35, 40, 1000, 1001) 35, 40, 1000, 1001)
conv2d_kernel = DevicePythonNode( conv2d_kernel = DevicePythonNode('conv2d_kernel',
'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0) profiler.TracerEventType.Kernel, 35,
conv2d_memcpy = DevicePythonNode( 50, 0, 0, 0)
'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0) conv2d_memcpy = DevicePythonNode('conv2d_memcpy',
profiler.TracerEventType.Memcpy, 50,
60, 0, 0, 0)
sync_batch_norm_infer_shape = HostPythonNode( sync_batch_norm_infer_shape = HostPythonNode(
'sync_batch_norm::infer_shape', 'sync_batch_norm::infer_shape',
profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001) profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001)
...@@ -363,8 +377,8 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -363,8 +377,8 @@ class TestProfilerStatistic(unittest.TestCase):
'Process Cpu Utilization': '1.02', 'Process Cpu Utilization': '1.02',
'System Cpu Utilization': '0.68' 'System Cpu Utilization': '0.68'
} }
statistic_data = profiler.profiler_statistic.StatisticData(thread_tree, statistic_data = profiler.profiler_statistic.StatisticData(
extra_info) thread_tree, extra_info)
time_range_summary = statistic_data.time_range_summary time_range_summary = statistic_data.time_range_summary
event_summary = statistic_data.event_summary event_summary = statistic_data.event_summary
distributed_summary = statistic_data.distributed_summary distributed_summary = statistic_data.distributed_summary
...@@ -398,7 +412,7 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -398,7 +412,7 @@ class TestProfilerStatistic(unittest.TestCase):
0, profiler.TracerEventType.Memcpy), 60) 0, profiler.TracerEventType.Memcpy), 60)
self.assertEqual( self.assertEqual(
time_range_summary.get_cpu_range_sum( time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.UserDefined), 25) profiler.TracerEventType.UserDefined), 15)
self.assertEqual( self.assertEqual(
time_range_summary.get_cpu_range_sum( time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Communication), 5) profiler.TracerEventType.Communication), 5)
...@@ -433,8 +447,9 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -433,8 +447,9 @@ class TestProfilerStatistic(unittest.TestCase):
0) 0)
self.assertEqual( self.assertEqual(
event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15) event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
self.assertEqual(event_summary.memory_manipulation_items['AsyncMemcpy'] self.assertEqual(
.general_gpu_time, 60) event_summary.memory_manipulation_items['AsyncMemcpy'].
general_gpu_time, 60)
print( print(
profiler.profiler_statistic._build_table( profiler.profiler_statistic._build_table(
statistic_data, statistic_data,
...@@ -454,8 +469,9 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -454,8 +469,9 @@ class TestProfilerStatistic(unittest.TestCase):
dataloader_node = HostPythonNode('Dataloader', dataloader_node = HostPythonNode('Dataloader',
profiler.TracerEventType.Dataloader, 5, profiler.TracerEventType.Dataloader, 5,
15, 1000, 1001) 15, 1000, 1001)
mobilenet_node = HostPythonNode( mobilenet_node = HostPythonNode('MobileNet',
'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) profiler.TracerEventType.Forward, 20,
50, 1000, 1001)
backward_node = HostPythonNode('Gradient Backward', backward_node = HostPythonNode('Gradient Backward',
profiler.TracerEventType.Backward, 120, profiler.TracerEventType.Backward, 120,
...@@ -463,12 +479,13 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -463,12 +479,13 @@ class TestProfilerStatistic(unittest.TestCase):
optimization_node = HostPythonNode( optimization_node = HostPythonNode(
'Optimization', profiler.TracerEventType.Optimization, 220, 300, 'Optimization', profiler.TracerEventType.Optimization, 220, 300,
1000, 1001) 1000, 1001)
userdefined_node = HostPythonNode('Communication Time', userdefined_node = HostPythonNode(
profiler.TracerEventType.UserDefined, 'Communication Time', profiler.TracerEventType.PythonUserDefined,
60, 70, 1000, 1001) 60, 70, 1000, 1001)
conv2d_node = HostPythonNode( conv2d_node = HostPythonNode('conv2d',
'conv2d', profiler.TracerEventType.Operator, 25, 25, 1000, 1001) profiler.TracerEventType.Operator, 25, 25,
1000, 1001)
conv2d_infer_shape = HostPythonNode( conv2d_infer_shape = HostPythonNode(
'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25, 'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25,
...@@ -480,8 +497,9 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -480,8 +497,9 @@ class TestProfilerStatistic(unittest.TestCase):
'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 25, 25, 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 25, 25,
1000, 1001) 1000, 1001)
conv2d_kernel = DevicePythonNode( conv2d_kernel = DevicePythonNode('conv2d_kernel',
'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0) profiler.TracerEventType.Kernel, 35,
35, 0, 0, 0)
another_kernel = DevicePythonNode( another_kernel = DevicePythonNode(
'void phi::funcs::VectorizedBroadcastKernel<float, float, phi::funcs::AddFunctor<float>, phi::funcs::AddFunctor<float>>()', 'void phi::funcs::VectorizedBroadcastKernel<float, float, phi::funcs::AddFunctor<float>, phi::funcs::AddFunctor<float>>()',
profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0) profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0)
...@@ -500,15 +518,16 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -500,15 +518,16 @@ class TestProfilerStatistic(unittest.TestCase):
'Process Cpu Utilization': '1.02', 'Process Cpu Utilization': '1.02',
'System Cpu Utilization': '0.68' 'System Cpu Utilization': '0.68'
} }
statistic_data = profiler.profiler_statistic.StatisticData(thread_tree, statistic_data = profiler.profiler_statistic.StatisticData(
extra_info) thread_tree, extra_info)
time_range_summary = statistic_data.time_range_summary time_range_summary = statistic_data.time_range_summary
event_summary = statistic_data.event_summary event_summary = statistic_data.event_summary
self.assertEqual(event_summary.items['conv2d'].cpu_time, 0) self.assertEqual(event_summary.items['conv2d'].cpu_time, 0)
self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 0) self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 0)
self.assertEqual(event_summary.userdefined_items['Communication Time'] self.assertEqual(
.general_gpu_time, 0) event_summary.userdefined_items['Communication Time'].
general_gpu_time, 0)
for sort_key in [ for sort_key in [
profiler.SortedKeys.CPUTotal, profiler.SortedKeys.CPUMax, profiler.SortedKeys.CPUTotal, profiler.SortedKeys.CPUMax,
profiler.SortedKeys.CPUMin, profiler.SortedKeys.CPUAvg, profiler.SortedKeys.CPUMin, profiler.SortedKeys.CPUAvg,
...@@ -516,8 +535,7 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -516,8 +535,7 @@ class TestProfilerStatistic(unittest.TestCase):
profiler.SortedKeys.GPUMin, profiler.SortedKeys.GPUAvg profiler.SortedKeys.GPUMin, profiler.SortedKeys.GPUAvg
]: ]:
print( print(
profiler.profiler_statistic._build_table( profiler.profiler_statistic._build_table(statistic_data,
statistic_data,
sorted_by=sort_key, sorted_by=sort_key,
op_detail=True, op_detail=True,
thread_sep=False, thread_sep=False,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册