未验证 提交 c27c9268 编写于 作者: M Mihai Maruseac 提交者: GitHub

Merge pull request #44411 from yisitu/cherrypicks_6XDIY

[Cherrypick r2.4] Delay start of profiler when configured
......@@ -55,6 +55,7 @@ cc_library(
"//tensorflow/core/profiler/protobuf:xplane_proto_cc",
"//tensorflow/core/profiler:profiler_options_proto_cc",
"@com_google_absl//absl/memory",
"@com_google_absl//absl/time",
"//tensorflow/core:protos_all_cc",
] + if_not_android([
":profiler_lock",
......
......@@ -18,7 +18,10 @@ limitations under the License.
#include <memory>
#include "absl/memory/memory.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "tensorflow/core/platform/env_time.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/platform.h"
......@@ -65,6 +68,7 @@ tensorflow::Status ProfilerSession::Status() {
Status ProfilerSession::CollectData(profiler::XSpace* space) {
mutex_lock l(mutex_);
if (!status_.ok()) return status_;
LOG(INFO) << "Profiler session collecting data.";
for (auto& profiler : profilers_) {
profiler->Stop().IgnoreError();
}
......@@ -116,7 +120,6 @@ ProfilerSession::ProfilerSession(ProfileOptions options)
#else
: active_(false),
#endif
start_time_ns_(EnvTime::NowNanos()),
options_(std::move(options)) {
if (!active_) {
#if !defined(IS_MOBILE_PLATFORM)
......@@ -130,6 +133,25 @@ ProfilerSession::ProfilerSession(ProfileOptions options)
return;
}
LOG(INFO) << "Profiler session initializing.";
// Sleep until it is time to start profiling.
const bool delayed_start = options_.start_timestamp_ns() > 0;
if (delayed_start) {
absl::Time scheduled_start =
absl::FromUnixNanos(options_.start_timestamp_ns());
auto now = absl::Now();
if (scheduled_start < now) {
LOG(WARNING) << "Profiling is late (" << now
<< ") for the scheduled start (" << scheduled_start
<< ") and will start immediately.";
} else {
absl::Duration sleep_duration = scheduled_start - now;
LOG(INFO) << "Delaying start of profiler session by " << sleep_duration;
absl::SleepFor(sleep_duration);
}
}
start_time_ns_ = EnvTime::NowNanos();
LOG(INFO) << "Profiler session started.";
#if !defined(IS_MOBILE_PLATFORM)
......@@ -147,7 +169,7 @@ ProfilerSession::ProfilerSession(ProfileOptions options)
}
ProfilerSession::~ProfilerSession() {
VLOG(1) << "Profiler session stopping.";
LOG(INFO) << "Profiler session tear down.";
for (auto& profiler : profilers_) {
profiler->Stop().IgnoreError();
}
......
......@@ -79,7 +79,7 @@ class ProfilerSession {
bool active_ TF_GUARDED_BY(mutex_);
tensorflow::Status status_ TF_GUARDED_BY(mutex_);
const uint64 start_time_ns_;
uint64 start_time_ns_;
mutex mutex_;
ProfileOptions options_;
};
......
......@@ -63,7 +63,7 @@ message ProfileOptions {
}
// Options for remote profiler session manager.
// Next ID: 5
// Next ID: 6
message RemoteProfilerSessionManagerOptions {
// Options for each local profiler.
ProfileOptions profiler_options = 1;
......@@ -79,4 +79,7 @@ message RemoteProfilerSessionManagerOptions {
// continues until interrupted. Otherwise, value must be greater than
// profiler_options.duration_ms.
uint64 max_session_duration_ms = 4;
// Start of profiling is delayed by this much (in milliseconds).
uint64 delay_ms = 5;
}
......@@ -37,6 +37,7 @@ cc_library(
"//tensorflow/core/profiler/convert:xplane_to_profile_response",
"//tensorflow/core/profiler/protobuf:xplane_proto_cc",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/time",
],
)
......
......@@ -21,6 +21,8 @@ limitations under the License.
#include "absl/strings/str_join.h"
#include "absl/strings/str_split.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/host_info.h"
#include "tensorflow/core/platform/status.h"
......@@ -176,7 +178,7 @@ Status NewSession(absl::string_view repository_root,
} // namespace
Status Trace(const std::string& logdir, int num_tracing_attempts,
const RemoteProfilerSessionManagerOptions& opts,
RemoteProfilerSessionManagerOptions& opts,
bool is_cloud_tpu_session) {
DCHECK_GT(opts.profiler_options().duration_ms(), 0);
DCHECK(!opts.service_addresses().empty());
......@@ -190,6 +192,14 @@ Status Trace(const std::string& logdir, int num_tracing_attempts,
Status status;
int remaining_attempts = num_tracing_attempts;
while (true) {
auto start_timestamp = absl::Now() + absl::Milliseconds(opts.delay_ms());
opts.mutable_profiler_options()->set_start_timestamp_ns(
absl::ToUnixNanos(start_timestamp));
LOG(INFO) << "Profiler delay_ms was " << opts.delay_ms()
<< ", start_timestamp_ns set to "
<< opts.profiler_options().start_timestamp_ns() << " ["
<< start_timestamp << "]";
std::cout << "Starting to trace for " << duration_ms << " ms. "
<< "Remaining attempt(s): " << --remaining_attempts << std::endl;
......
......@@ -40,7 +40,7 @@ Status Monitor(const std::string& service_addr, int duration_ms,
// in the given logdir. If no trace was collected, retries tracing for
// num_tracing_attempts. Assumes that options have been validated.
Status Trace(const std::string& logdir, int num_tracing_attempts,
const RemoteProfilerSessionManagerOptions& opts,
RemoteProfilerSessionManagerOptions& opts,
bool is_cloud_tpu_session);
} // namespace profiler
......
......@@ -116,8 +116,8 @@ class ProfilerApiTest(test_util.TensorFlowTestCase):
thread_worker.join(120)
self._check_xspace_pb_exist(logdir)
def test_single_worker_sampling_mode_delayed(self):
"""Test single worker sampling mode with delay."""
def test_single_worker_sampling_mode_long_delay(self):
"""Test single worker sampling mode with a long delay."""
self.test_single_worker_sampling_mode(delay_ms=1000)
......
......@@ -146,7 +146,6 @@ RemoteProfilerSessionManagerOptions GetOptionsLocked(absl::string_view logdir,
VLOG(2) << "repository_path set to "
<< options.profiler_options().repository_path();
int delay_ms = 0;
for (const auto& kw : opts) {
std::string key = py::cast<std::string>(kw.first);
if (key == "host_tracer_level") {
......@@ -163,26 +162,15 @@ RemoteProfilerSessionManagerOptions GetOptionsLocked(absl::string_view logdir,
VLOG(1) << "python_tracer_level set to " << value;
} else if (key == "delay_ms") {
if (!kw.second.is_none()) {
delay_ms = py::cast<int>(kw.second);
auto value = py::cast<int>(kw.second);
options.set_delay_ms(value);
VLOG(1) << "delay_ms was set to " << value;
}
} else {
LOG(WARNING) << "Unrecognised key: " << key;
}
}
if (delay_ms) {
absl::Time start_timestamp = now + absl::Milliseconds(delay_ms);
tensorflow::int64 start_timestamp_ns = absl::ToUnixNanos(start_timestamp);
options.mutable_profiler_options()->set_start_timestamp_ns(
start_timestamp_ns);
LOG(INFO) << "delay_ms was " << delay_ms << ", start_timestamp_ns set to "
<< start_timestamp_ns << " [" << start_timestamp << "]";
} else {
DCHECK_EQ(options.mutable_profiler_options()->start_timestamp_ns(), 0);
LOG(INFO) << "Profiling will start immediately because delay_ms was unset "
"or zero.";
}
return options;
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册