Unverified · Commit 8ea83400 · authored by chenjian, committed by GitHub

Add input shape record for new dygraph operator (#44999)

* fix

* add control flag and input shapes for new dygraph

* fix file mode

* improve code coverage

* fix a bug in statistic

* fix according to review

* optimize performance

* fix
Parent 35166902
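For orientation before the diff, a minimal usage sketch of the two profiler options this commit adds (the option names and their `False` defaults come from the `Profiler` changes below; the scheduler values and workload are illustrative only):

```python
import paddle
import paddle.profiler as profiler

x = paddle.randn([4, 4])

# record_shapes / profile_memory are the two options added by this commit.
with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU],
                       scheduler=(2, 5),
                       record_shapes=True,
                       profile_memory=True) as prof:
    for _ in range(7):
        y = x / 2.0
        prof.step()
prof.summary()
```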
@@ -400,7 +400,7 @@ static void PerformBackwardInplace(const std::string& op_type,
void BasicEngine::Execute() {
  platform::RecordEvent backward_record_event(
-      "backward", platform::TracerEventType::Operator, 1);
+      "backward", platform::TracerEventType::UserDefined, 1);
  if (init_nodes_.empty()) {
    return;
...
@@ -42,6 +42,10 @@ DEFINE_bool(enable_host_event_recorder_hook,
            false,
            "enable HostEventRecorder, hook Profiler");
+DEFINE_bool(enable_record_input_shape, false, "enable input shape recorder");
+
+DEFINE_bool(enable_record_memory, false, "enable memory recorder");
+
namespace paddle {
namespace platform {
@@ -258,6 +262,9 @@ RecordOpInfoSupplement::RecordOpInfoSupplement(
  if (FLAGS_enable_host_event_recorder_hook == false) {
    return;
  }
+  if (IsEnabled() == false) {
+    return;
+  }
  std::map<std::string, std::vector<framework::DDim>> input_shapes;
  std::map<std::string, std::vector<framework::proto::VarType::Type>> dtypes;
  for (auto it = ctx.inputs.begin(); it != ctx.inputs.end(); it++) {
@@ -285,6 +292,9 @@ RecordOpInfoSupplement::RecordOpInfoSupplement(
  if (FLAGS_enable_host_event_recorder_hook == false) {
    return;
  }
+  if (IsEnabled() == false) {
+    return;
+  }
  std::map<std::string, std::vector<framework::DDim>> input_shapes;
  std::map<std::string, std::vector<framework::proto::VarType::Type>> dtypes;
  for (auto it = kernel_signature.input_names.begin();
@@ -308,6 +318,33 @@ RecordOpInfoSupplement::RecordOpInfoSupplement(
      PosixInNsec(), type, input_shapes, dtypes, callstack);
}

+RecordOpInfoSupplement::RecordOpInfoSupplement(
+    const std::string &type,
+    const std::vector<std::pair<const char *, std::vector<framework::DDim>>>
+        &input_shapes) {
+  if (FLAGS_enable_host_event_recorder_hook == false) {
+    return;
+  }
+  if (IsEnabled() == false) {
+    return;
+  }
+  std::map<std::string, std::vector<framework::proto::VarType::Type>> dtypes;
+  std::vector<std::string> callstack;
+  HostEventRecorder<OperatorSupplementOriginEvent>::GetInstance().RecordEvent(
+      PosixInNsec(), type, input_shapes, dtypes, callstack);
+}
+
+bool RecordEvent::IsEnabled() {
+  return FLAGS_enable_host_event_recorder_hook || g_enable_nvprof_hook ||
+         g_state != ProfilerState::kDisabled;
+}
+
+bool RecordOpInfoSupplement::IsEnabled() {
+  return FLAGS_enable_record_input_shape;
+}
+
+bool RecordMemEvent::IsEnabled() { return FLAGS_enable_record_memory; }
+
std::map<const char *, std::map<uint64_t, std::vector<uint64_t>>>
    RecordMemEvent::size_cache;
@@ -322,6 +359,11 @@ RecordMemEvent::RecordMemEvent(const void *ptr,
      FLAGS_enable_host_event_recorder_hook == false) {
    return;
  }
+  if (IsEnabled() == false) {
+    return;
+  }
+
  if (type == TracerMemEventType::Allocate) {
    uint64_t current_allocated;
    uint64_t peak_allocated;
@@ -1045,6 +1087,14 @@ void DisableHostEventRecorder() {
  FLAGS_enable_host_event_recorder_hook = false;
}

+void EnableInputShapeRecorder() { FLAGS_enable_record_input_shape = true; }
+
+void DisableInputShapeRecorder() { FLAGS_enable_record_input_shape = false; }
+
+void EnableMemoryRecorder() { FLAGS_enable_record_memory = true; }
+
+void DisableMemoryRecorder() { FLAGS_enable_record_memory = false; }
+
std::string PrintHostEvents() {
  std::ostringstream oss;
  auto host_evt_sec =
...
@@ -248,6 +248,12 @@ void NvprofDisableRecordEvent();
void EnableHostEventRecorder();
void DisableHostEventRecorder();

+void EnableMemoryRecorder();
+void DisableMemoryRecorder();
+
+void EnableInputShapeRecorder();
+void DisableInputShapeRecorder();
+
// Defined for UT
std::string PrintHostEvents();
...
@@ -124,6 +124,23 @@ struct OperatorSupplementOriginEvent {
    strncpy(buf, type_name.c_str(), type_name.length() + 1);
    op_type = buf;
  }
+
+  OperatorSupplementOriginEvent(
+      std::function<void *(size_t)> arena_allocator,
+      uint64_t timestamp_ns,
+      const std::string &type_name,
+      const std::vector<std::pair<const char *, std::vector<framework::DDim>>>
+          &shapes,
+      const std::map<std::string, std::vector<framework::proto::VarType::Type>>
+          &dtypes,
+      const std::vector<std::string> callstack)
+      : timestamp_ns(timestamp_ns), dtypes(dtypes), callstack(callstack) {
+    auto buf = static_cast<char *>(arena_allocator(type_name.length() + 1));
+    strncpy(buf, type_name.c_str(), type_name.length() + 1);
+    op_type = buf;
+    for (auto it = shapes.begin(); it != shapes.end(); it++) {
+      input_shapes[std::string((*it).first)] = (*it).second;
+    }
+  }
+
  uint64_t timestamp_ns;
  const char *op_type = nullptr;  // not owned, designed for performance
  // input shapes
...
@@ -48,6 +48,7 @@ struct RecordInstantEvent {
// Chrome Trace Viewer Format: Duration Event/Complete Event
class RecordEvent {
 public:
+  static bool IsEnabled();
  /**
   * @param name: If your string argument has a longer lifetime (e.g.: string
   * literal, static variables, etc) than the event, use 'const char* name'.
...
@@ -27,6 +27,7 @@ namespace platform {
// The events can be used to draw memory variation curve.
class RecordMemEvent {
 public:
+  static bool IsEnabled();
  /**
   * @param ptr: Pointer address allocated or free.
   * @param place: Device for this memory event.
...
@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once

+#include <map>
#include <string>
+#include <utility>

#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/type_defs.h"
@@ -30,6 +32,8 @@ namespace platform {
class RecordOpInfoSupplement {
 public:
+  static bool IsEnabled();
+
  /**
   * @param type: Operator type name.
   * @param attrs: Attribute map of op.
@@ -50,6 +54,14 @@ class RecordOpInfoSupplement {
      const framework::AttributeMap& attrs,
      const framework::InferShapeContext& shape_ctx,
      const phi::KernelSignature& kernel_signature);
+
+  /**
+   * @param type: Operator type name.
+   * @param input_shapes: Shapes of the input tensors, keyed by argument name.
+   */
+  explicit RecordOpInfoSupplement(
+      const std::string& type,
+      const std::vector<std::pair<const char*, std::vector<framework::DDim>>>&
+          input_shapes);
};

}  // namespace platform
...
@@ -2133,6 +2133,12 @@ All parameter, weight, gradient are variables in Paddle.
      .value("PythonUserDefined",
             paddle::platform::TracerEventType::PythonUserDefined);
  m.def("load_profiler_result", &paddle::platform::LoadProfilerResult);
+  m.def("enable_memory_recorder", &paddle::platform::EnableMemoryRecorder);
+  m.def("disable_memory_recorder", &paddle::platform::DisableMemoryRecorder);
+  m.def("enable_input_shape_recorder",
+        &paddle::platform::EnableInputShapeRecorder);
+  m.def("disable_input_shape_recorder",
+        &paddle::platform::DisableInputShapeRecorder);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  m.def("set_cublas_switch", platform::SetAllowTF32Cublas);
...
@@ -13,6 +13,7 @@
# limitations under the License.

import re
+import collections

PREFIX_TENSOR_NAME = 'input_'
PREFIX_META_TENSOR_NAME = 'meta_'
@@ -569,6 +570,7 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
        kernel_param = input_names + attr_names

        input_tensor_code = ""
+        input_name_tensor_map = collections.defaultdict(list)
        for i, input_name in enumerate(input_names):
            # set input code
            if input_name in kernel_param:
@@ -582,17 +584,23 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
                        'support_trans_dtype']:
                    trans_flag = "{false, true}"
                if input_name in self.optional_vars:
+                    input_name_tensor_map[input_name].append(
+                        (f"{PREFIX_TENSOR_NAME}{input_name}", False))
                    input_tensor_code = input_tensor_code + f"""
{code_indent}  auto {PREFIX_TENSOR_NAME}{input_name} = PrepareData({input_name}, kernel.InputAt({kernel_param.index(input_name)}), {trans_flag});"""
                else:
                    if self.inputs['input_info'][
                            input_name] == "const Tensor&":
+                        input_name_tensor_map[input_name].append(
+                            (f"{PREFIX_TENSOR_NAME}{input_name}", False))
                        input_tensor_code = input_tensor_code + f"""
{code_indent}  auto {PREFIX_TENSOR_NAME}{input_name} = PrepareData({input_name}, kernel.InputAt({kernel_param.index(input_name)}), {trans_flag});"""
                    elif self.inputs['input_info'][
                            input_name] == "const std::vector<Tensor>&":
+                        input_name_tensor_map[input_name].append(
+                            (f"{PREFIX_TENSOR_NAME}{input_name}_vec", True))
                        input_tensor_code = input_tensor_code + f"""
{code_indent}  auto {PREFIX_TENSOR_NAME}{input_name}_vec = PrepareData({input_name}, kernel.InputAt({kernel_param.index(input_name)}), {trans_flag});
{code_indent}  std::vector<const phi::DenseTensor*> {PREFIX_TENSOR_NAME}{input_name}({PREFIX_TENSOR_NAME}{input_name}_vec->size());
@@ -604,8 +612,11 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
                        # do nothing
                        pass
                    else:  # input is selected_rows
+                        input_name_tensor_map[input_name].append(
+                            (f"{PREFIX_TENSOR_NAME}{input_name}", False))
                        input_tensor_code = input_tensor_code + f"""
-{code_indent}  auto {PREFIX_TENSOR_NAME}{input_name} = TensorToSelectedRows({input_name});"""
+{code_indent}  auto {PREFIX_TENSOR_NAME}{input_name} = TensorToSelectedRows({input_name});
+"""
            else:
                if input_name in self.infer_meta['param']:
                    if input_name in self.optional_vars:
@@ -621,7 +632,65 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
                else:
                    input_tensor_code = input_tensor_code + f"""
{code_indent}  auto {PREFIX_TENSOR_NAME}{input_name} = {input_name}.impl();"""
+
+        input_tensor_code = input_tensor_code + f"""
+{code_indent}  if(platform::RecordOpInfoSupplement::IsEnabled()){{"""
+        single_tensor_names = []
+        list_tensor_names = []
+        for input_name, input_tensors in input_name_tensor_map.items():
+            has_vector_tensor = False
+            for input_tensor, is_vector in input_tensors:
+                if is_vector is True:
+                    has_vector_tensor = True
+            if has_vector_tensor is False:
+                single_tensor_names.append(input_name)
+            else:
+                list_tensor_names.append(input_name)
+        if not single_tensor_names:
+            input_tensor_code = input_tensor_code + f"""
+{code_indent}  std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes;"""
+        else:
+            input_tensor_code = input_tensor_code + f"""
+{code_indent}  std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{{"""
+            for input_name in single_tensor_names[:-1]:
+                input_tensors = input_name_tensor_map[input_name]
+                input_tensor_code = input_tensor_code + f"""
+{code_indent}  {{"{input_name}", {{"""
+                for input_tensor, _ in input_tensors[:-1]:
+                    input_tensor_code = input_tensor_code + f"""
+{code_indent}  (*{input_tensor}).dims(),"""
+                input_tensor_code = input_tensor_code + f"""
+{code_indent}  (*{input_tensors[-1][0]}).dims()}}}},"""
+            input_tensors = input_name_tensor_map[single_tensor_names[-1]]
+            input_tensor_code = input_tensor_code + f"""
+{code_indent}  {{"{single_tensor_names[-1]}", {{"""
+            for input_tensor, _ in input_tensors[:-1]:
+                input_tensor_code = input_tensor_code + f"""
+{code_indent}  (*{input_tensor}).dims(),"""
+            input_tensor_code = input_tensor_code + f"""
+{code_indent}  (*{input_tensors[-1][0]}).dims()}}}}}};"""
+        if list_tensor_names:
+            input_tensor_code = input_tensor_code + f"""
+{code_indent}  std::vector<phi::DDim> ddims_vec;"""
+        for input_name in list_tensor_names:
+            input_tensor_code = input_tensor_code + f"""
+{code_indent}  ddims_vec.clear();"""
+            for input_tensor, is_vector in input_name_tensor_map[input_name]:
+                if is_vector:
+                    input_tensor_code = input_tensor_code + f"""
+{code_indent}  ddims_vec.reserve({input_tensor[:-4]}.size());
+{code_indent}  for (size_t i = 0; i < {input_tensor[:-4]}.size(); ++i) {{
+{code_indent}    ddims_vec.emplace_back((*{input_tensor[:-4]}[i]).dims());
+{code_indent}  }}"""
+                else:
+                    input_tensor_code = input_tensor_code + f"""
+                  ddims_vec.emplace_back((*{input_tensor}).dims());
+{code_indent}  """
+            input_tensor_code = input_tensor_code + f"""
+{code_indent}  input_shapes.emplace_back("{input_name}", ddims_vec);"""
+        input_tensor_code = input_tensor_code + f"""
+{code_indent}  platform::RecordOpInfoSupplement("{self.api}", input_shapes);
+{code_indent}  }}"""
+
        kernel_args = ["*dev_ctx"]
        for param in kernel_param:
            if param in input_names:
@@ -709,17 +778,26 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
{code_indent}      "{kernel_name}", {{kernel_backend, kernel_layout, kernel_data_type}}{cudnn_args});
{code_indent}  const auto& kernel = kernel_result.kernel;
{code_indent}  VLOG(6) << "{kernel_name} kernel: " << kernel;
{code_indent}  auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);
{input_tensors}
{output_create}
+{code_indent}  paddle::platform::RecordEvent *infer_shape_record_event = nullptr;
+{code_indent}  if(paddle::platform::RecordEvent::IsEnabled()){{
+{code_indent}    infer_shape_record_event = new paddle::platform::RecordEvent(\"{self.api} infer_meta\", paddle::platform::TracerEventType::OperatorInner, 1);
+{code_indent}  }}
{self.gene_infer_meta(kernel_output_names, code_indent)}
+{code_indent}  if(infer_shape_record_event != nullptr){{
+{code_indent}    delete infer_shape_record_event;
+{code_indent}  }}
{code_indent}  using kernel_signature = {kernel_signature};
{code_indent}  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
-{code_indent}  {{
-{code_indent}    paddle::platform::RecordEvent kernel_record_event(\"{kernel_name} compute\", paddle::platform::TracerEventType::OperatorInner, 1);
+{code_indent}  paddle::platform::RecordEvent* kernel_record_event = nullptr;
+{code_indent}  if(paddle::platform::RecordEvent::IsEnabled()){{
+{code_indent}    kernel_record_event = new paddle::platform::RecordEvent(\"{self.api} compute\", paddle::platform::TracerEventType::OperatorInner, 1);
+{code_indent}  }}
{code_indent}  (*kernel_fn)({kernel_args}, {", ".join(outputs_args)});
+{code_indent}  if(kernel_record_event != nullptr){{
+{code_indent}    delete kernel_record_event;
{code_indent}  }}
{code_indent}  if (kernel_result.has_fallback_cpu) {{
{fallback_kernel_output_trans}
...
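To make the codegen hunks above concrete, here is roughly the C++ guard they would emit for a hypothetical API `foo` with two single-Tensor inputs `x` and `y` (the `foo` name is illustrative; the `input_` prefix follows PREFIX_TENSOR_NAME, and real output depends on `code_indent` and the kernel signature):

```cpp
// Hypothetical expansion of the generator above for api "foo"
// with dense-tensor inputs x and y.
if (platform::RecordOpInfoSupplement::IsEnabled()) {
  std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{
      {"x", {(*input_x).dims()}},
      {"y", {(*input_y).dims()}}};
  platform::RecordOpInfoSupplement("foo", input_shapes);
}
```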
@@ -252,6 +252,7 @@ def source_include(header_file_path):
#include "paddle/phi/infermeta/ternary.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/fluid/platform/profiler/supplement_tracing.h"

DECLARE_bool(conv2d_disable_cudnn);
"""
...
@@ -220,6 +220,7 @@ def source_include(header_file_path):
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/fluid/platform/profiler/supplement_tracing.h"

DECLARE_bool(conv2d_disable_cudnn);
"""
...
@@ -52,6 +52,7 @@ def source_include(header_file_path):
#include "paddle/phi/infermeta/ternary.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/fluid/platform/profiler/supplement_tracing.h"
"""
...
@@ -927,7 +927,7 @@ class Layer(object):
        self._built = True

        if in_profiler_mode():
-            with profiler.RecordEvent(self.full_name(),
+            with profiler.RecordEvent(self.__class__.__name__,
                                       profiler.TracerEventType.Forward):
                outputs = self.forward(*inputs, **kwargs)
        else:
...
@@ -135,7 +135,9 @@ class TestProfiler(unittest.TestCase):
                              record=2,
                              repeat=1,
                              skip_first=1),
-                on_trace_ready=my_trace_back) as prof:
+                on_trace_ready=my_trace_back,
+                profile_memory=True,
+                record_shapes=True) as prof:
            for i in range(5):
                y = x / 2.0
                paddle.grad(outputs=y, inputs=[x], grad_outputs=ones_like_y)
...
@@ -23,7 +23,10 @@ import json

import paddle
-from paddle.fluid.core import (_Profiler, _ProfilerResult, ProfilerOptions,
-                               TracerEventType)
+from paddle.fluid.core import (_Profiler, _ProfilerResult, ProfilerOptions,
+                               TracerEventType, enable_memory_recorder,
+                               enable_input_shape_recorder,
+                               disable_memory_recorder,
+                               disable_input_shape_recorder)

from .utils import RecordEvent, wrap_optimizers
from .profiler_statistic import StatisticData, _build_table, SortedKeys
@@ -279,6 +282,8 @@ class Profiler:
            This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. The default value is :ref:`export_chrome_tracing <api_paddle_profiler_export_chrome_tracing>` (./profiler_log/).
        timer_only (bool, optional): If it is True, the cost of Dataloader and every step of the model will be counted without profiling. Otherwise, the model will
            be timed and profiled. Default: False.
+        record_shapes (bool, optional): If it is True, collect op's input shape information. Default: False.
+        profile_memory (bool, optional): If it is True, collect tensor memory allocation and release information. Default: False.

    Examples:
        1. profiling range [2, 5).
@@ -396,6 +401,8 @@ class Profiler:
                 scheduler: Union[Callable[[int], ProfilerState], tuple,
                                  None] = None,
                 on_trace_ready: Optional[Callable[..., Any]] = None,
+                 record_shapes: Optional[bool] = False,
+                 profile_memory: Optional[bool] = False,
                 timer_only: Optional[bool] = False):
        supported_targets = _get_supported_targets()
        if targets:
@@ -447,6 +454,8 @@ class Profiler:
        self.record_event = None
        self.profiler_result = None
        self.timer_only = timer_only
+        self.record_shapes = record_shapes
+        self.profile_memory = profile_memory

    def __enter__(self):
        self.start()
@@ -481,6 +490,10 @@ class Profiler:
        benchmark().begin()
        if self.timer_only:
            return
+        if self.record_shapes:
+            enable_input_shape_recorder()
+        if self.profile_memory:
+            enable_memory_recorder()
        # CLOSED -> self.current_state
        utils._is_profiler_used = True
        if self.current_state == ProfilerState.READY:
...
@@ -520,6 +533,10 @@ class Profiler:
        benchmark().end()
        if self.timer_only:
            return
+        if self.record_shapes:
+            disable_input_shape_recorder()
+        if self.profile_memory:
+            disable_memory_recorder()
        # self.current_state -> CLOSED
        # In this situation, RECORD state is regarded as RECORD_AND_RETURN
        if self.record_event:
...
@@ -86,6 +86,7 @@ class HostStatisticNode:
        for rt in self.runtime_node:
            rt.cal_statistic()
        self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns
+        self.self_cpu_time = self.cpu_time
        for child in self.children_node:
            self.gpu_time += child.gpu_time
            self.general_gpu_time += child.general_gpu_time
@@ -918,7 +919,7 @@ def _build_table(statistic_data,
        accmulation_time = 0
        gpu_accmulation_time = 0
        gpu_total_time = statistic_data.event_summary.model_perspective_items[
-            'ProfileStep'].general_gpu_time
+            'ProfileStep'].gpu_time
        for name in [
                'ProfileStep', 'Dataloader', 'Forward', 'Backward',
                'Optimization'
@@ -928,7 +929,7 @@ def _build_table(statistic_data,
            if gpu_total_time == 0:
                gpu_ratio = 0
            else:
-                gpu_ratio = float(item.general_gpu_time) / gpu_total_time
+                gpu_ratio = float(item.gpu_time) / gpu_total_time
            name = '{}'.format(
                name) if 'ProfileStep' in name else '  {}'.format(name)
            row_values = [
@@ -949,7 +950,7 @@ def _build_table(statistic_data,
            all_row_values.append(row_values)
            if 'ProfileStep' not in name:
                accmulation_time += item.cpu_time
-                gpu_accmulation_time += item.general_gpu_time
+                gpu_accmulation_time += item.gpu_time
        other_time = total_time - accmulation_time
        other_gpu_time = gpu_total_time - gpu_accmulation_time
...
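For completeness, the lower-level toggles this commit wires through pybind can also be driven directly, a minimal sketch using only names added in this diff:

```python
from paddle.fluid.core import (enable_input_shape_recorder,
                               disable_input_shape_recorder,
                               enable_memory_recorder,
                               disable_memory_recorder)

enable_input_shape_recorder()  # sets FLAGS_enable_record_input_shape
enable_memory_recorder()       # sets FLAGS_enable_record_memory
# ... run the workload to be profiled ...
disable_input_shape_recorder()
disable_memory_recorder()
```

The public entry point, however, remains the `record_shapes`/`profile_memory` arguments of `paddle.profiler.Profiler`, which call these toggles in `start()` and `stop()`.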