提交 3971c27b 编写于 作者: A Alexey Surkov 提交者: TensorFlower Gardener

Better logging for errors.

1) for resumable uploads, log the last causing failure in the final AbortedError
2) log all retry attempts

Logging example from the unittest:
W0309 18:00:39.706445  687030 retrying_utils.cc:67] The operation failed and will be automatically retried in 0.995024 seconds (attempt 1 out of 10), caused by: Unavailable: Failed.
W0309 18:00:39.706562  687030 retrying_utils.cc:67] The operation failed and will be automatically retried in 1.39374 seconds (attempt 2 out of 10), caused by: Unavailable: Failed.
W0309 18:00:39.706605  687030 retrying_utils.cc:67] The operation failed and will be automatically retried in 2.63587 seconds (attempt 3 out of 10), caused by: Unavailable: Failed.
W0309 18:00:39.706652  687030 retrying_utils.cc:67] The operation failed and will be automatically retried in 4.30139 seconds (attempt 4 out of 10), caused by: Unavailable: Failed.
W0309 18:00:39.706693  687030 retrying_utils.cc:67] The operation failed and will be automatically retried in 8.32369 seconds (attempt 5 out of 10), caused by: Unavailable: Failed.
W0309 18:00:39.706751  687030 retrying_utils.cc:67] The operation failed and will be automatically retried in 16.4618 seconds (attempt 6 out of 10), caused by: Unavailable: Failed.
W0309 18:00:39.706820  687030 retrying_utils.cc:67] The operation failed and will be automatically retried in 32.2621 seconds (attempt 7 out of 10), caused by: Unavailable: Failed.
W0309 18:00:39.706867  687030 retrying_utils.cc:67] The operation failed and will be automatically retried in 32.2265 seconds (attempt 8 out of 10), caused by: Unavailable: Failed.
W0309 18:00:39.706903  687030 retrying_utils.cc:67] The operation failed and will be automatically retried in 32.8241 seconds (attempt 9 out of 10), caused by: Unavailable: Failed.
W0309 18:00:39.706939  687030 retrying_utils.cc:67] The operation failed and will be automatically retried in 32.4939 seconds (attempt 10 out of 10), caused by: Unavailable: Failed.
Change: 150027572
上级 4aaf956e
......@@ -411,22 +411,14 @@ class GcsWritableFile : public WritableFile {
return UploadToSession(session_uri, already_uploaded);
},
initial_retry_delay_usec_);
switch (upload_status.code()) {
case errors::Code::OK:
return Status::OK();
case errors::Code::NOT_FOUND:
// GCS docs recommend retrying the whole upload. We're relying on the
// RetryingFileSystem to retry the Sync() call.
return errors::Unavailable("Could not upload gs://", bucket_, "/",
object_);
case errors::Code::UNAVAILABLE:
// Return ABORTED so that RetryingFileSystem doesn't retry again.
return errors::Aborted("Upload gs://", bucket_, "/", object_,
" failed.");
default:
// Something unexpected happen, fail.
return upload_status;
if (upload_status.code() == errors::Code::NOT_FOUND) {
// GCS docs recommend retrying the whole upload. We're relying on the
// RetryingFileSystem to retry the Sync() call.
return errors::Unavailable(
strings::StrCat("Upload to gs://", bucket_, "/", object_,
" failed, caused by: ", upload_status.ToString()));
}
return upload_status;
}
Status CheckWritable() const {
......
......@@ -332,18 +332,19 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadAllAttemptsFail) {
"Put body: content1,content2\n",
"", errors::Unavailable("503"), 503)});
for (int i = 0; i < 10; i++) {
requests.emplace_back(new FakeHttpRequest(
"Uri: https://custom/upload/location\n"
"Auth Token: fake_token\n"
"Header Content-Range: bytes */17\n"
"Put: yes\n",
"", errors::Unavailable("308"), nullptr, {{"Range", "0-10"}}, 308));
requests.emplace_back(
new FakeHttpRequest("Uri: https://custom/upload/location\n"
"Auth Token: fake_token\n"
"Header Content-Range: bytes 11-16/17\n"
"Put body: ntent2\n",
"", errors::Unavailable("503"), 503));
"Header Content-Range: bytes */17\n"
"Put: yes\n",
"", errors::Unavailable("important HTTP error 308"),
nullptr, {{"Range", "0-10"}}, 308));
requests.emplace_back(new FakeHttpRequest(
"Uri: https://custom/upload/location\n"
"Auth Token: fake_token\n"
"Header Content-Range: bytes 11-16/17\n"
"Put body: ntent2\n",
"", errors::Unavailable("important HTTP error 503"), 503));
}
// These calls will be made in the Close() attempt from the destructor.
// Letting the destructor succeed.
......@@ -370,7 +371,12 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadAllAttemptsFail) {
TF_EXPECT_OK(file->Append("content1,"));
TF_EXPECT_OK(file->Append("content2"));
EXPECT_EQ(errors::Code::ABORTED, file->Close().code());
const auto& status = file->Close();
EXPECT_EQ(errors::Code::ABORTED, status.code());
EXPECT_TRUE(StringPiece(status.error_message())
.contains("All 10 retry attempts failed. The last failure: "
"Unavailable: important HTTP error 503"))
<< status;
}
TEST(GcsFileSystemTest, NewWritableFile_UploadReturns404) {
......@@ -386,7 +392,8 @@ TEST(GcsFileSystemTest, NewWritableFile_UploadReturns404) {
"Auth Token: fake_token\n"
"Header Content-Range: bytes 0-16/17\n"
"Put body: content1,content2\n",
"", errors::NotFound("404"), 404),
"", errors::NotFound("important HTTP error 404"),
404),
// These calls will be made in the Close() attempt from the destructor.
// Letting the destructor succeed.
new FakeHttpRequest(
......@@ -411,7 +418,14 @@ TEST(GcsFileSystemTest, NewWritableFile_UploadReturns404) {
TF_EXPECT_OK(file->Append("content1,"));
TF_EXPECT_OK(file->Append("content2"));
EXPECT_EQ(errors::Code::UNAVAILABLE, file->Close().code());
const auto& status = file->Close();
EXPECT_EQ(errors::Code::UNAVAILABLE, status.code());
EXPECT_TRUE(
StringPiece(status.error_message())
.contains(
"Upload to gs://bucket/path/writeable.txt failed, caused by: "
"Not found: important HTTP error 404"))
<< status;
}
TEST(GcsFileSystemTest, NewWritableFile_NoObjectName) {
......
......@@ -234,8 +234,10 @@ TEST(RetryingFileSystemTest, NewRandomAccessFile_AllRetriesFailed) {
// Use it and check the results.
StringPiece result;
char scratch[10];
EXPECT_EQ("Retriable error #10",
random_access_file->Read(0, 10, &result, scratch).error_message());
const auto& status = random_access_file->Read(0, 10, &result, scratch);
EXPECT_TRUE(
StringPiece(status.error_message()).contains("Retriable error #10"))
<< status;
}
TEST(RetryingFileSystemTest, NewRandomAccessFile_NoRetriesForSomeErrors) {
......@@ -386,7 +388,10 @@ TEST(RetryingFileSystemTest, NewWritableFile_AllRetriesFailed) {
TF_EXPECT_OK(fs.NewWritableFile("filename.txt", &writable_file));
// Use it and check the results.
EXPECT_EQ("Retriable error #10", writable_file->Sync().error_message());
const auto& status = writable_file->Sync();
EXPECT_TRUE(
StringPiece(status.error_message()).contains("Retriable error #10"))
<< status;
}
TEST(RetryingFileSystemTest,
......@@ -411,9 +416,11 @@ TEST(RetryingFileSystemTest, NewReadOnlyMemoryRegionFromFile_AllRetriesFailed) {
RetryingFileSystem fs(std::move(base_fs), 0);
std::unique_ptr<ReadOnlyMemoryRegion> result;
EXPECT_EQ("Retriable error #10",
fs.NewReadOnlyMemoryRegionFromFile("filename.txt", &result)
.error_message());
const auto& status =
fs.NewReadOnlyMemoryRegionFromFile("filename.txt", &result);
EXPECT_TRUE(
StringPiece(status.error_message()).contains("Retriable error #10"))
<< status;
}
TEST(RetryingFileSystemTest, GetChildren_SuccessWith2ndTry) {
......@@ -436,8 +443,10 @@ TEST(RetryingFileSystemTest, GetChildren_AllRetriesFailed) {
RetryingFileSystem fs(std::move(base_fs), 0);
std::vector<string> result;
EXPECT_EQ("Retriable error #10",
fs.GetChildren("gs://path", &result).error_message());
const auto& status = fs.GetChildren("gs://path", &result);
EXPECT_TRUE(
StringPiece(status.error_message()).contains("Retriable error #10"))
<< status;
}
TEST(RetryingFileSystemTest, GetMatchingPaths_SuccessWith2ndTry) {
......@@ -461,8 +470,10 @@ TEST(RetryingFileSystemTest, GetMatchingPaths_AllRetriesFailed) {
RetryingFileSystem fs(std::move(base_fs), 0);
std::vector<string> result;
EXPECT_EQ("Retriable error #10",
fs.GetMatchingPaths("gs://path/dir", &result).error_message());
const auto& status = fs.GetMatchingPaths("gs://path/dir", &result);
EXPECT_TRUE(
StringPiece(status.error_message()).contains("Retriable error #10"))
<< status;
}
TEST(RetryingFileSystemTest, DeleteFile_SuccessWith2ndTry) {
......@@ -484,8 +495,10 @@ TEST(RetryingFileSystemTest, DeleteFile_AllRetriesFailed) {
RetryingFileSystem fs(std::move(base_fs), 0);
std::vector<string> result;
EXPECT_EQ("Retriable error #10",
fs.DeleteFile("gs://path/file.txt").error_message());
const auto& status = fs.DeleteFile("gs://path/file.txt");
EXPECT_TRUE(
StringPiece(status.error_message()).contains("Retriable error #10"))
<< status;
}
TEST(RetryingFileSystemTest, CreateDir_SuccessWith2ndTry) {
......@@ -507,8 +520,10 @@ TEST(RetryingFileSystemTest, CreateDir_AllRetriesFailed) {
RetryingFileSystem fs(std::move(base_fs), 0);
std::vector<string> result;
EXPECT_EQ("Retriable error #10",
fs.CreateDir("gs://path/newdir").error_message());
const auto& status = fs.CreateDir("gs://path/newdir");
EXPECT_TRUE(
StringPiece(status.error_message()).contains("Retriable error #10"))
<< status;
}
TEST(RetryingFileSystemTest, DeleteDir_SuccessWith2ndTry) {
......@@ -530,8 +545,10 @@ TEST(RetryingFileSystemTest, DeleteDir_AllRetriesFailed) {
RetryingFileSystem fs(std::move(base_fs), 0);
std::vector<string> result;
EXPECT_EQ("Retriable error #10",
fs.DeleteDir("gs://path/dir").error_message());
const auto& status = fs.DeleteDir("gs://path/dir");
EXPECT_TRUE(
StringPiece(status.error_message()).contains("Retriable error #10"))
<< status;
}
TEST(RetryingFileSystemTest, GetFileSize_SuccessWith2ndTry) {
......@@ -554,8 +571,10 @@ TEST(RetryingFileSystemTest, GetFileSize_AllRetriesFailed) {
RetryingFileSystem fs(std::move(base_fs), 0);
uint64 size;
EXPECT_EQ("Retriable error #10",
fs.GetFileSize("gs://path/file.txt", &size).error_message());
const auto& status = fs.GetFileSize("gs://path/file.txt", &size);
EXPECT_TRUE(
StringPiece(status.error_message()).contains("Retriable error #10"))
<< status;
}
TEST(RetryingFileSystemTest, RenameFile_SuccessWith2ndTry) {
......@@ -575,8 +594,10 @@ TEST(RetryingFileSystemTest, RenameFile_AllRetriesFailed) {
new MockFileSystem(expected_fs_calls));
RetryingFileSystem fs(std::move(base_fs), 0);
EXPECT_EQ("Retriable error #10",
fs.RenameFile("old_name", "new_name").error_message());
const auto& status = fs.RenameFile("old_name", "new_name");
EXPECT_TRUE(
StringPiece(status.error_message()).contains("Retriable error #10"))
<< status;
}
TEST(RetryingFileSystemTest, Stat_SuccessWith2ndTry) {
......@@ -598,7 +619,10 @@ TEST(RetryingFileSystemTest, Stat_AllRetriesFailed) {
RetryingFileSystem fs(std::move(base_fs), 0);
FileStatistics stat;
EXPECT_EQ("Retriable error #10", fs.Stat("file_name", &stat).error_message());
const auto& status = fs.Stat("file_name", &stat);
EXPECT_TRUE(
StringPiece(status.error_message()).contains("Retriable error #10"))
<< status;
}
TEST(RetryingFileSystemTest, FileExists_AllRetriesFailed) {
......@@ -607,7 +631,10 @@ TEST(RetryingFileSystemTest, FileExists_AllRetriesFailed) {
new MockFileSystem(expected_fs_calls));
RetryingFileSystem fs(std::move(base_fs), 0);
EXPECT_EQ("Retriable error #10", fs.FileExists("file_name").error_message());
const auto& status = fs.FileExists("file_name");
EXPECT_TRUE(
StringPiece(status.error_message()).contains("Retriable error #10"))
<< status;
}
TEST(RetryingFileSystemTest, FileExists_SuccessWith2ndTry) {
......@@ -639,8 +666,10 @@ TEST(RetryingFileSystemTest, IsDirectory_AllRetriesFailed) {
new MockFileSystem(expected_fs_calls));
RetryingFileSystem fs(std::move(base_fs), 0);
EXPECT_EQ("Retriable error #10",
fs.IsDirectory("gs://path/dir").error_message());
const auto& status = fs.IsDirectory("gs://path/dir");
EXPECT_TRUE(
StringPiece(status.error_message()).contains("Retriable error #10"))
<< status;
}
TEST(RetryingFileSystemTest, DeleteRecursively_SuccessWith2ndTry) {
......@@ -665,10 +694,11 @@ TEST(RetryingFileSystemTest, DeleteRecursively_AllRetriesFailed) {
RetryingFileSystem fs(std::move(base_fs), 0);
int64 undeleted_files, undeleted_dirs;
EXPECT_EQ(
"Retriable error #10",
fs.DeleteRecursively("gs://path/dir", &undeleted_files, &undeleted_dirs)
.error_message());
const auto& status =
fs.DeleteRecursively("gs://path/dir", &undeleted_files, &undeleted_dirs);
EXPECT_TRUE(
StringPiece(status.error_message()).contains("Retriable error #10"))
<< status;
}
} // namespace
......
......@@ -28,8 +28,8 @@ constexpr int kMaxRetries = 10;
// Maximum backoff time in microseconds.
constexpr int64 kMaximumBackoffMicroseconds = 32000000; // 32 seconds.
bool IsRetriable(Status status) {
switch (status.code()) {
bool IsRetriable(error::Code code) {
switch (code) {
case error::UNAVAILABLE:
case error::DEADLINE_EXCEEDED:
case error::UNKNOWN:
......@@ -40,13 +40,6 @@ bool IsRetriable(Status status) {
}
}
void WaitBeforeRetry(const int64 delay_micros,
const std::function<void(int64)>& sleep_usec) {
const int64 random_micros = random::New64() % 1000000;
sleep_usec(std::min(delay_micros, kMaximumBackoffMicroseconds) +
random_micros);
}
} // namespace
Status RetryingUtils::CallWithRetries(const std::function<Status()>& f,
......@@ -62,13 +55,30 @@ Status RetryingUtils::CallWithRetries(
int retries = 0;
while (true) {
auto status = f();
if (!IsRetriable(status) || retries >= kMaxRetries) {
if (!IsRetriable(status.code())) {
return status;
}
const int64 delay_micros = initial_delay_microseconds << retries;
if (delay_micros > 0) {
WaitBeforeRetry(delay_micros, sleep_usec);
if (retries >= kMaxRetries) {
// Return AbortedError, so that it doesn't get retried again somewhere
// at a higher level.
return Status(
error::ABORTED,
strings::StrCat(
"All ", kMaxRetries,
" retry attempts failed. The last failure: ", status.ToString()));
}
int64 delay_micros = 0;
if (initial_delay_microseconds > 0) {
const int64 random_micros = random::New64() % 1000000;
delay_micros = std::min(initial_delay_microseconds << retries,
kMaximumBackoffMicroseconds) +
random_micros;
}
LOG(INFO) << "The operation failed and will be automatically retried in "
<< (delay_micros / 1000000.0) << " seconds (attempt "
<< (retries + 1) << " out of " << kMaxRetries
<< "), caused by: " << status.ToString();
sleep_usec(delay_micros);
retries++;
}
}
......
......@@ -27,10 +27,15 @@ TEST(RetryingUtilsTest, RetryDelays) {
std::function<void(int64)> sleep = [&requested_delays](int64 delay) {
requested_delays.emplace_back(delay / 1000000.0);
};
std::function<Status()> f = []() { return errors::Unavailable(""); };
std::function<Status()> f = []() { return errors::Unavailable("Failed."); };
EXPECT_EQ(errors::Code::UNAVAILABLE,
RetryingUtils::CallWithRetries(f, 500000L, sleep).code());
const auto& status = RetryingUtils::CallWithRetries(f, 500000L, sleep);
EXPECT_EQ(errors::Code::ABORTED, status.code());
EXPECT_TRUE(StringPiece(status.error_message())
.contains("All 10 retry attempts "
"failed. The last failure: "
"Unavailable: Failed."))
<< status;
EXPECT_EQ(10, requested_delays.size());
EXPECT_NEAR(0.5, requested_delays[0], 1.0);
......@@ -48,7 +53,8 @@ TEST(RetryingUtilsTest, RetryDelays) {
}
TEST(RetryingUtilsTest, NotFoundIsNotRetried) {
std::vector<Status> results({errors::Unavailable(""), errors::NotFound("")});
std::vector<Status> results(
{errors::Unavailable("Failed."), errors::NotFound("Not found.")});
std::function<Status()> f = [&results]() {
auto result = results[0];
results.erase(results.begin());
......@@ -59,8 +65,9 @@ TEST(RetryingUtilsTest, NotFoundIsNotRetried) {
}
TEST(RetryingUtilsTest, EventualSuccess) {
std::vector<Status> results(
{errors::Unavailable(""), errors::Unavailable(""), Status::OK()});
std::vector<Status> results({errors::Unavailable("Failed."),
errors::Unavailable("Failed again."),
Status::OK()});
std::function<Status()> f = [&results]() {
auto result = results[0];
results.erase(results.begin());
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册