未验证 提交 85aca5fa 编写于 作者: J Jin Hai 提交者: GitHub

Merge pull request #474 from yhmo/0.6.0

#470 raw files should not be build index
......@@ -44,6 +44,7 @@ Please mark all change in change log and use the ticket from JIRA.
- \#409 - Add a Fallback pass in optimizer
- \#433 - C++ SDK query result is not easy to use
- \#449 - Add ShowPartitions example for C++ SDK
- \#470 - Index should not be built for small raw files
## Task
......
......@@ -838,6 +838,25 @@ DBImpl::BackgroundBuildIndex() {
// ENGINE_LOG_TRACE << "Background build index thread exit";
}
// Collect the files of table `table_id` whose type is listed in `file_types` and that
// should be considered for index building.
//
// The files are fetched through meta_ptr_->FilesByType(); RAW files whose row count is
// below meta::BUILD_INDEX_THRESHOLD are then removed from the result.
//
// @param table_id    table whose files are queried
// @param file_types  file types forwarded to the meta query
// @param files       [out] cleared first, then filled with the files that remain after filtering
// @return the status reported by the meta query, so that a failed query is no longer
//         reported as success
Status
DBImpl::GetFilesToBuildIndex(const std::string& table_id, const std::vector<int>& file_types,
                             meta::TableFilesSchema& files) {
    files.clear();
    auto status = meta_ptr_->FilesByType(table_id, file_types, files);

    // Only build index for files whose row count reaches the threshold. The filter is
    // applied regardless of the query status so the content of `files` is unchanged
    // for existing callers.
    for (auto it = files.begin(); it != files.end();) {
        if (it->file_type_ == static_cast<int>(meta::TableFileSchema::RAW) &&
            it->row_count_ < meta::BUILD_INDEX_THRESHOLD) {
            // erase() returns the iterator following the removed element
            it = files.erase(it);
        } else {
            ++it;
        }
    }

    // Propagate the meta query result instead of unconditionally returning OK.
    return status;
}
Status
DBImpl::GetFilesToSearch(const std::string& table_id, const std::vector<size_t>& file_ids, const meta::DatesT& dates,
meta::TableFilesSchema& files) {
......@@ -946,18 +965,18 @@ DBImpl::BuildTableIndexRecursively(const std::string& table_id, const TableIndex
}
// get files to build index
std::vector<std::string> file_ids;
auto status = meta_ptr_->FilesByType(table_id, file_types, file_ids);
meta::TableFilesSchema table_files;
auto status = GetFilesToBuildIndex(table_id, file_types, table_files);
int times = 1;
while (!file_ids.empty()) {
while (!table_files.empty()) {
ENGINE_LOG_DEBUG << "Non index files detected! Will build index " << times;
if (index.engine_type_ != (int)EngineType::FAISS_IDMAP) {
status = meta_ptr_->UpdateTableFilesToIndex(table_id);
}
std::this_thread::sleep_for(std::chrono::milliseconds(std::min(10 * 1000, times * 100)));
status = meta_ptr_->FilesByType(table_id, file_types, file_ids);
GetFilesToBuildIndex(table_id, file_types, table_files);
times++;
}
......
......@@ -152,6 +152,10 @@ class DBImpl : public DB {
Status
MemSerialize();
// Fetch the files of table `table_id` whose type is in `file_types` into `files`,
// leaving out RAW files whose row count is below meta::BUILD_INDEX_THRESHOLD.
// `files` is cleared before it is filled.
Status
GetFilesToBuildIndex(const std::string& table_id, const std::vector<int>& file_types,
meta::TableFilesSchema& files);
Status
GetFilesToSearch(const std::string& table_id, const std::vector<size_t>& file_ids, const meta::DatesT& dates,
meta::TableFilesSchema& files);
......
......@@ -109,8 +109,7 @@ class Meta {
FilesToIndex(TableFilesSchema&) = 0;
virtual Status
FilesByType(const std::string& table_id, const std::vector<int>& file_types,
std::vector<std::string>& file_ids) = 0;
FilesByType(const std::string& table_id, const std::vector<int>& file_types, TableFilesSchema& table_files) = 0;
virtual Status
Size(uint64_t& result) = 0;
......
......@@ -32,6 +32,13 @@ const size_t H_SEC = 60 * M_SEC;
const size_t D_SEC = 24 * H_SEC;
const size_t W_SEC = 7 * D_SEC;
// This threshold is used to skip small raw files when building an index.
// The reasons are:
// 1. Brute-force search over a small raw file can perform better than search over a small index file.
// 2. Small raw files can be merged into larger files, which reduces the number of fragmented files.
// The value was chosen based on tests with small raw/index files.
const size_t BUILD_INDEX_THRESHOLD = 5000;
} // namespace meta
} // namespace engine
} // namespace milvus
......@@ -959,6 +959,7 @@ MySQLMetaImpl::UpdateTableFilesToIndex(const std::string& table_id) {
updateTableFilesToIndexQuery << "UPDATE " << META_TABLEFILES
<< " SET file_type = " << std::to_string(TableFileSchema::TO_INDEX)
<< " WHERE table_id = " << mysqlpp::quote << table_id
<< " AND row_count >= " << std::to_string(meta::BUILD_INDEX_THRESHOLD)
<< " AND file_type = " << std::to_string(TableFileSchema::RAW) << ";";
ENGINE_LOG_DEBUG << "MySQLMetaImpl::UpdateTableFilesToIndex: " << updateTableFilesToIndexQuery.str();
......@@ -1527,13 +1528,13 @@ MySQLMetaImpl::FilesToIndex(TableFilesSchema& files) {
Status
MySQLMetaImpl::FilesByType(const std::string& table_id, const std::vector<int>& file_types,
std::vector<std::string>& file_ids) {
TableFilesSchema& table_files) {
if (file_types.empty()) {
return Status(DB_ERROR, "file types array is empty");
}
try {
file_ids.clear();
table_files.clear();
mysqlpp::StoreQueryResult res;
{
......@@ -1553,9 +1554,10 @@ MySQLMetaImpl::FilesByType(const std::string& table_id, const std::vector<int>&
mysqlpp::Query hasNonIndexFilesQuery = connectionPtr->query();
// since table_id is a unique column we just need to check whether it exists or not
hasNonIndexFilesQuery << "SELECT file_id, file_type"
<< " FROM " << META_TABLEFILES << " WHERE table_id = " << mysqlpp::quote << table_id
<< " AND file_type in (" << types << ");";
hasNonIndexFilesQuery
<< "SELECT id, engine_type, file_id, file_type, file_size, row_count, date, created_on"
<< " FROM " << META_TABLEFILES << " WHERE table_id = " << mysqlpp::quote << table_id
<< " AND file_type in (" << types << ");";
ENGINE_LOG_DEBUG << "MySQLMetaImpl::FilesByType: " << hasNonIndexFilesQuery.str();
......@@ -1566,9 +1568,18 @@ MySQLMetaImpl::FilesByType(const std::string& table_id, const std::vector<int>&
int raw_count = 0, new_count = 0, new_merge_count = 0, new_index_count = 0;
int to_index_count = 0, index_count = 0, backup_count = 0;
for (auto& resRow : res) {
std::string file_id;
resRow["file_id"].to_string(file_id);
file_ids.push_back(file_id);
TableFileSchema file_schema;
file_schema.id_ = resRow["id"];
file_schema.table_id_ = table_id;
file_schema.engine_type_ = resRow["engine_type"];
resRow["file_id"].to_string(file_schema.file_id_);
file_schema.file_type_ = resRow["file_type"];
file_schema.file_size_ = resRow["file_size"];
file_schema.row_count_ = resRow["row_count"];
file_schema.date_ = resRow["date"];
file_schema.created_on_ = resRow["created_on"];
table_files.emplace_back(file_schema);
int32_t file_type = resRow["file_type"];
switch (file_type) {
......
......@@ -108,7 +108,7 @@ class MySQLMetaImpl : public Meta {
Status
FilesByType(const std::string& table_id, const std::vector<int>& file_types,
std::vector<std::string>& file_ids) override;
TableFilesSchema& table_files) override;
Status
Archive() override;
......
此差异已折叠。
......@@ -108,7 +108,7 @@ class SqliteMetaImpl : public Meta {
Status
FilesByType(const std::string& table_id, const std::vector<int>& file_types,
std::vector<std::string>& file_ids) override;
TableFilesSchema& table_files) override;
Status
Size(uint64_t& result) override;
......
......@@ -306,9 +306,9 @@ TEST_F(MetaTest, TABLE_FILES_TEST) {
ASSERT_EQ(dated_files[table_file.date_].size(), 0);
std::vector<int> file_types;
std::vector<std::string> file_ids;
status = impl_->FilesByType(table.table_id_, file_types, file_ids);
ASSERT_TRUE(file_ids.empty());
milvus::engine::meta::TableFilesSchema table_files;
status = impl_->FilesByType(table.table_id_, file_types, table_files);
ASSERT_TRUE(table_files.empty());
ASSERT_FALSE(status.ok());
file_types = {
......@@ -317,11 +317,11 @@ TEST_F(MetaTest, TABLE_FILES_TEST) {
milvus::engine::meta::TableFileSchema::INDEX, milvus::engine::meta::TableFileSchema::RAW,
milvus::engine::meta::TableFileSchema::BACKUP,
};
status = impl_->FilesByType(table.table_id_, file_types, file_ids);
status = impl_->FilesByType(table.table_id_, file_types, table_files);
ASSERT_TRUE(status.ok());
uint64_t total_cnt = new_index_files_cnt + new_merge_files_cnt + backup_files_cnt + new_files_cnt + raw_files_cnt +
to_index_files_cnt + index_files_cnt;
ASSERT_EQ(file_ids.size(), total_cnt);
ASSERT_EQ(table_files.size(), total_cnt);
status = impl_->DeleteTableFiles(table_id);
ASSERT_TRUE(status.ok());
......
......@@ -169,9 +169,9 @@ TEST_F(MySqlMetaTest, ARCHIVE_TEST_DAYS) {
std::vector<int> file_types = {
(int)milvus::engine::meta::TableFileSchema::NEW,
};
std::vector<std::string> file_ids;
status = impl.FilesByType(table_id, file_types, file_ids);
ASSERT_FALSE(file_ids.empty());
milvus::engine::meta::TableFilesSchema table_files;
status = impl.FilesByType(table_id, file_types, table_files);
ASSERT_FALSE(table_files.empty());
status = impl.UpdateTableFilesToIndex(table_id);
ASSERT_TRUE(status.ok());
......@@ -326,9 +326,9 @@ TEST_F(MySqlMetaTest, TABLE_FILES_TEST) {
ASSERT_EQ(dated_files[table_file.date_].size(), 0);
std::vector<int> file_types;
std::vector<std::string> file_ids;
status = impl_->FilesByType(table.table_id_, file_types, file_ids);
ASSERT_TRUE(file_ids.empty());
milvus::engine::meta::TableFilesSchema table_files;
status = impl_->FilesByType(table.table_id_, file_types, table_files);
ASSERT_TRUE(table_files.empty());
ASSERT_FALSE(status.ok());
file_types = {
......@@ -337,11 +337,11 @@ TEST_F(MySqlMetaTest, TABLE_FILES_TEST) {
milvus::engine::meta::TableFileSchema::INDEX, milvus::engine::meta::TableFileSchema::RAW,
milvus::engine::meta::TableFileSchema::BACKUP,
};
status = impl_->FilesByType(table.table_id_, file_types, file_ids);
status = impl_->FilesByType(table.table_id_, file_types, table_files);
ASSERT_TRUE(status.ok());
uint64_t total_cnt = new_index_files_cnt + new_merge_files_cnt + backup_files_cnt + new_files_cnt + raw_files_cnt +
to_index_files_cnt + index_files_cnt;
ASSERT_EQ(file_ids.size(), total_cnt);
ASSERT_EQ(table_files.size(), total_cnt);
status = impl_->DeleteTableFiles(table_id);
ASSERT_TRUE(status.ok());
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册