未验证 提交 9074366f 编写于 作者: G groot 提交者: GitHub

fix potential hang bug (#1884)

* test
Signed-off-by: Ngroot <yihua.mo@zilliz.com>

* test
Signed-off-by: Ngroot <yihua.mo@zilliz.com>

* test
Signed-off-by: Ngroot <yihua.mo@zilliz.com>

* unittest
Signed-off-by: Ngroot <yihua.mo@zilliz.com>

* format code
Signed-off-by: Ngroot <yihua.mo@zilliz.com>

* fix crash
Signed-off-by: Ngroot <yihua.mo@zilliz.com>

* merge master
Signed-off-by: Ngroot <yihua.mo@zilliz.com>
上级 f4626a78
...@@ -52,7 +52,7 @@ XBuildIndexTask::XBuildIndexTask(SegmentSchemaPtr file, TaskLabelPtr label) ...@@ -52,7 +52,7 @@ XBuildIndexTask::XBuildIndexTask(SegmentSchemaPtr file, TaskLabelPtr label)
void void
XBuildIndexTask::Load(milvus::scheduler::LoadType type, uint8_t device_id) { XBuildIndexTask::Load(milvus::scheduler::LoadType type, uint8_t device_id) {
TimeRecorder rc(""); TimeRecorder rc("XBuildIndexTask::Load");
Status stat = Status::OK(); Status stat = Status::OK();
std::string error_msg; std::string error_msg;
std::string type_str; std::string type_str;
...@@ -101,7 +101,7 @@ XBuildIndexTask::Load(milvus::scheduler::LoadType type, uint8_t device_id) { ...@@ -101,7 +101,7 @@ XBuildIndexTask::Load(milvus::scheduler::LoadType type, uint8_t device_id) {
std::string info = "Build index task load file id:" + std::to_string(file_->id_) + " " + type_str + std::string info = "Build index task load file id:" + std::to_string(file_->id_) + " " + type_str +
" file type:" + std::to_string(file_->file_type_) + " size:" + std::to_string(file_size) + " file type:" + std::to_string(file_->file_type_) + " size:" + std::to_string(file_size) +
" bytes from location: " + file_->location_ + " totally cost"; " bytes from location: " + file_->location_ + " totally cost";
double span = rc.ElapseFromBegin(info); rc.ElapseFromBegin(info);
to_index_id_ = file_->id_; to_index_id_ = file_->id_;
to_index_type_ = file_->file_type_; to_index_type_ = file_->file_type_;
...@@ -110,19 +110,21 @@ XBuildIndexTask::Load(milvus::scheduler::LoadType type, uint8_t device_id) { ...@@ -110,19 +110,21 @@ XBuildIndexTask::Load(milvus::scheduler::LoadType type, uint8_t device_id) {
void void
XBuildIndexTask::Execute() { XBuildIndexTask::Execute() {
if (to_index_engine_ == nullptr) { TimeRecorderAuto rc("XBuildIndexTask::Execute " + std::to_string(to_index_id_));
return;
}
TimeRecorder rc("DoBuildIndex file id:" + std::to_string(to_index_id_));
if (auto job = job_.lock()) { if (auto job = job_.lock()) {
auto build_index_job = std::static_pointer_cast<scheduler::BuildIndexJob>(job); auto build_index_job = std::static_pointer_cast<scheduler::BuildIndexJob>(job);
if (to_index_engine_ == nullptr) {
build_index_job->BuildIndexDone(to_index_id_);
build_index_job->GetStatus() = Status(DB_ERROR, "source index is null");
return;
}
std::string location = file_->location_; std::string location = file_->location_;
EngineType engine_type = (EngineType)file_->engine_type_; EngineType engine_type = (EngineType)file_->engine_type_;
std::shared_ptr<engine::ExecutionEngine> index; std::shared_ptr<engine::ExecutionEngine> index;
// step 2: create collection file // step 1: create collection file
engine::meta::SegmentSchema table_file; engine::meta::SegmentSchema table_file;
table_file.collection_id_ = file_->collection_id_; table_file.collection_id_ = file_->collection_id_;
table_file.segment_id_ = file_->file_id_; table_file.segment_id_ = file_->file_id_;
...@@ -131,6 +133,7 @@ XBuildIndexTask::Execute() { ...@@ -131,6 +133,7 @@ XBuildIndexTask::Execute() {
engine::meta::MetaPtr meta_ptr = build_index_job->meta(); engine::meta::MetaPtr meta_ptr = build_index_job->meta();
Status status = meta_ptr->CreateCollectionFile(table_file); Status status = meta_ptr->CreateCollectionFile(table_file);
fiu_do_on("XBuildIndexTask.Execute.create_table_success", status = Status::OK()); fiu_do_on("XBuildIndexTask.Execute.create_table_success", status = Status::OK());
if (!status.ok()) { if (!status.ok()) {
ENGINE_LOG_ERROR << "Failed to create collection file: " << status.ToString(); ENGINE_LOG_ERROR << "Failed to create collection file: " << status.ToString();
...@@ -140,73 +143,63 @@ XBuildIndexTask::Execute() { ...@@ -140,73 +143,63 @@ XBuildIndexTask::Execute() {
return; return;
} }
// step 3: build index auto failed_build_index = [&](std::string log_msg, std::string err_msg) {
table_file.file_type_ = engine::meta::SegmentSchema::TO_DELETE;
status = meta_ptr->UpdateCollectionFile(table_file);
ENGINE_LOG_ERROR << log_msg;
build_index_job->BuildIndexDone(to_index_id_);
build_index_job->GetStatus() = Status(DB_ERROR, err_msg);
to_index_engine_ = nullptr;
};
// step 2: build index
try { try {
ENGINE_LOG_DEBUG << "Begin build index for file:" + table_file.location_; ENGINE_LOG_DEBUG << "Begin build index for file:" + table_file.location_;
index = to_index_engine_->BuildIndex(table_file.location_, (EngineType)table_file.engine_type_); index = to_index_engine_->BuildIndex(table_file.location_, (EngineType)table_file.engine_type_);
fiu_do_on("XBuildIndexTask.Execute.build_index_fail", index = nullptr); fiu_do_on("XBuildIndexTask.Execute.build_index_fail", index = nullptr);
if (index == nullptr) { if (index == nullptr) {
throw Exception(DB_ERROR, "index NULL"); std::string log_msg = "Failed to build index " + table_file.file_id_ + ", reason: source index is null";
failed_build_index(log_msg, "source index is null");
return;
} }
} catch (std::exception& ex) { } catch (std::exception& ex) {
std::string msg = "Build index exception: " + std::string(ex.what()); std::string msg = "Failed to build index " + table_file.file_id_ + ", reason: " + std::string(ex.what());
ENGINE_LOG_ERROR << msg; failed_build_index(msg, ex.what());
table_file.file_type_ = engine::meta::SegmentSchema::TO_DELETE;
status = meta_ptr->UpdateCollectionFile(table_file);
ENGINE_LOG_DEBUG << "Build index fail, mark file: " << table_file.file_id_ << " to to_delete";
build_index_job->BuildIndexDone(to_index_id_);
build_index_job->GetStatus() = Status(DB_ERROR, msg);
to_index_engine_ = nullptr;
return; return;
} }
// step 4: if collection has been deleted, don't save index file // step 3: if collection has been deleted, don't save index file
bool has_collection = false; bool has_collection = false;
meta_ptr->HasCollection(file_->collection_id_, has_collection); meta_ptr->HasCollection(file_->collection_id_, has_collection);
fiu_do_on("XBuildIndexTask.Execute.has_collection", has_collection = true); fiu_do_on("XBuildIndexTask.Execute.has_collection", has_collection = true);
if (!has_collection) { if (!has_collection) {
meta_ptr->DeleteTableFiles(file_->collection_id_); std::string msg = "Failed to build index " + table_file.file_id_ + ", reason: collection has been deleted";
failed_build_index(msg, "Collection has been deleted");
build_index_job->BuildIndexDone(to_index_id_);
build_index_job->GetStatus() = Status(DB_ERROR, "Collection has been deleted, discard index file.");
to_index_engine_ = nullptr;
return; return;
} }
// step 5: save index file // step 4: save index file
try { try {
fiu_do_on("XBuildIndexTask.Execute.throw_std_exception", throw std::exception()); fiu_do_on("XBuildIndexTask.Execute.throw_std_exception", throw std::exception());
status = index->Serialize(); status = index->Serialize();
if (!status.ok()) { if (!status.ok()) {
ENGINE_LOG_ERROR << status.message(); std::string msg =
"Failed to persist index file: " + table_file.location_ + ", reason: " + status.message();
failed_build_index(msg, status.message());
return;
} }
} catch (std::exception& ex) { } catch (std::exception& ex) {
std::string msg = "Serialize index encounter exception: " + std::string(ex.what());
ENGINE_LOG_ERROR << msg;
status = Status(DB_ERROR, msg);
}
fiu_do_on("XBuildIndexTask.Execute.save_index_file_success", status = Status::OK());
if (!status.ok()) {
// if failed to serialize index file to disk // if failed to serialize index file to disk
// typical error: out of disk space, out of memory or permission denied // typical error: out of disk space, out of memory or permission denied
table_file.file_type_ = engine::meta::SegmentSchema::TO_DELETE; std::string msg =
status = meta_ptr->UpdateCollectionFile(table_file); "Failed to persist index file:" + table_file.location_ + ", exception:" + std::string(ex.what());
ENGINE_LOG_DEBUG << "Failed to update file to index, mark file: " << table_file.file_id_ << " to to_delete"; failed_build_index(msg, ex.what());
ENGINE_LOG_ERROR << "Failed to persist index file: " << table_file.location_
<< ", possible out of disk space or memory";
build_index_job->BuildIndexDone(to_index_id_);
build_index_job->GetStatus() = status;
to_index_engine_ = nullptr;
return; return;
} }
// step 6: update meta // step 5: update meta
table_file.file_type_ = engine::meta::SegmentSchema::INDEX; table_file.file_type_ = engine::meta::SegmentSchema::INDEX;
table_file.file_size_ = server::CommonUtil::GetFileSize(table_file.location_); table_file.file_size_ = server::CommonUtil::GetFileSize(table_file.location_);
table_file.row_count_ = file_->row_count_; // index->Count(); table_file.row_count_ = file_->row_count_; // index->Count();
...@@ -243,8 +236,6 @@ XBuildIndexTask::Execute() { ...@@ -243,8 +236,6 @@ XBuildIndexTask::Execute() {
build_index_job->BuildIndexDone(to_index_id_); build_index_job->BuildIndexDone(to_index_id_);
} }
rc.ElapseFromBegin("totally cost");
to_index_engine_ = nullptr; to_index_engine_ = nullptr;
} }
......
...@@ -188,10 +188,7 @@ XSearchTask::Load(LoadType type, uint8_t device_id) { ...@@ -188,10 +188,7 @@ XSearchTask::Load(LoadType type, uint8_t device_id) {
std::string info = "Search task load file id:" + std::to_string(file_->id_) + " " + type_str + std::string info = "Search task load file id:" + std::to_string(file_->id_) + " " + type_str +
" file type:" + std::to_string(file_->file_type_) + " size:" + std::to_string(file_size) + " file type:" + std::to_string(file_->file_type_) + " size:" + std::to_string(file_size) +
" bytes from location: " + file_->location_ + " totally cost"; " bytes from location: " + file_->location_ + " totally cost";
double span = rc.ElapseFromBegin(info); rc.ElapseFromBegin(info);
// for (auto &context : search_contexts_) {
// context->AccumLoadCost(span);
// }
CollectFileMetrics(file_->file_type_, file_size); CollectFileMetrics(file_->file_type_, file_size);
...@@ -205,10 +202,6 @@ void ...@@ -205,10 +202,6 @@ void
XSearchTask::Execute() { XSearchTask::Execute() {
milvus::server::ContextFollower tracer(context_, "XSearchTask::Execute " + std::to_string(index_id_)); milvus::server::ContextFollower tracer(context_, "XSearchTask::Execute " + std::to_string(index_id_));
if (index_engine_ == nullptr) {
return;
}
// ENGINE_LOG_DEBUG << "Searching in file id:" << index_id_ << " with " // ENGINE_LOG_DEBUG << "Searching in file id:" << index_id_ << " with "
// << search_contexts_.size() << " tasks"; // << search_contexts_.size() << " tasks";
...@@ -222,6 +215,12 @@ XSearchTask::Execute() { ...@@ -222,6 +215,12 @@ XSearchTask::Execute() {
if (auto job = job_.lock()) { if (auto job = job_.lock()) {
auto search_job = std::static_pointer_cast<scheduler::SearchJob>(job); auto search_job = std::static_pointer_cast<scheduler::SearchJob>(job);
if (index_engine_ == nullptr) {
search_job->SearchDone(index_id_);
return;
}
// step 1: allocate memory // step 1: allocate memory
uint64_t nq = search_job->nq(); uint64_t nq = search_job->nq();
uint64_t topk = search_job->topk(); uint64_t topk = search_job->topk();
......
...@@ -123,13 +123,6 @@ TEST(TaskTest, TEST_TASK) { ...@@ -123,13 +123,6 @@ TEST(TaskTest, TEST_TASK) {
build_index_task.Execute(); build_index_task.Execute();
fiu_disable("XBuildIndexTask.Execute.throw_std_exception"); fiu_disable("XBuildIndexTask.Execute.throw_std_exception");
// always enable 'save_index_file_success'
fiu_enable("XBuildIndexTask.Execute.save_index_file_success", 1, NULL, 0);
build_index_task.to_index_engine_ =
EngineFactory::Build(file->dimension_, file->location_, (EngineType)file->engine_type_,
(MetricType)file->metric_type_, json);
build_index_task.Execute();
fiu_enable("XBuildIndexTask.Execute.update_table_file_fail", 1, NULL, 0); fiu_enable("XBuildIndexTask.Execute.update_table_file_fail", 1, NULL, 0);
build_index_task.to_index_engine_ = build_index_task.to_index_engine_ =
EngineFactory::Build(file->dimension_, file->location_, (EngineType)file->engine_type_, EngineFactory::Build(file->dimension_, file->location_, (EngineType)file->engine_type_,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册