From c80d8f82ce975272f22614a2a9ca51e77e5c2016 Mon Sep 17 00:00:00 2001 From: "cai.zhang" Date: Sat, 6 May 2023 10:32:39 +0800 Subject: [PATCH] Reduce memory usage for diskann index (#23600) Signed-off-by: cai.zhang --- .../core/src/storage/DiskFileManagerImpl.cpp | 78 +++++++++++++------ .../core/src/storage/DiskFileManagerImpl.h | 14 +++- 2 files changed, 67 insertions(+), 25 deletions(-) diff --git a/internal/core/src/storage/DiskFileManagerImpl.cpp b/internal/core/src/storage/DiskFileManagerImpl.cpp index 36ef1fd4b..12377e122 100644 --- a/internal/core/src/storage/DiskFileManagerImpl.cpp +++ b/internal/core/src/storage/DiskFileManagerImpl.cpp @@ -123,42 +123,72 @@ DiskFileManagerImpl::AddFile(const std::string& file) noexcept { auto fileName = GetFileName(file); auto fileSize = local_chunk_manager.Size(file); - // Split local data to multi part with specified size + std::vector batch_remote_files; + std::vector remote_file_sizes; + std::vector local_file_offsets; + int slice_num = 0; - auto remotePrefix = GetRemoteIndexObjectPrefix(); - std::vector>> futures; + auto parallel_degree = uint64_t(DEFAULT_DISK_INDEX_MAX_MEMORY_LIMIT / + (index_file_slice_size << 20)); for (int64_t offset = 0; offset < fileSize; slice_num++) { + if (batch_remote_files.size() >= parallel_degree) { + AddBatchIndexFiles(file, + local_file_offsets, + batch_remote_files, + remote_file_sizes); + batch_remote_files.clear(); + remote_file_sizes.clear(); + local_file_offsets.clear(); + } + auto batch_size = std::min(index_file_slice_size << 20, int64_t(fileSize) - offset); - // Put file to remote - char objectKey[200]; - snprintf(objectKey, - sizeof(objectKey), - "%s/%s_%d", - remotePrefix.c_str(), - fileName.c_str(), - slice_num); - - // use multi-thread to put part file + batch_remote_files.emplace_back( + GenerateRemoteIndexFile(fileName, slice_num)); + remote_file_sizes.emplace_back(batch_size); + local_file_offsets.emplace_back(offset); + offset += batch_size; + } + if (batch_remote_files.size() > 0) { + AddBatchIndexFiles( + file, local_file_offsets, batch_remote_files, remote_file_sizes); + } + FILEMANAGER_CATCH + FILEMANAGER_END + + return true; +} // namespace knowhere + +void +DiskFileManagerImpl::AddBatchIndexFiles( + const std::string& local_file_name, + const std::vector& local_file_offsets, + const std::vector& remote_files, + const std::vector& remote_file_sizes) { + auto& pool = ThreadPool::GetInstance(); + + std::vector>> futures; + AssertInfo(local_file_offsets.size() == remote_files.size(), + "inconsistent size of offset slices with file slices"); + AssertInfo(remote_files.size() == remote_file_sizes.size(), + "inconsistent size of file slices with size slices"); + + for (int64_t i = 0; i < remote_files.size(); ++i) { futures.push_back(pool.Submit(EncodeAndUploadIndexSlice, rcm_.get(), - file, - offset, - batch_size, + local_file_name, + local_file_offsets[i], + remote_file_sizes[i], index_meta_, field_meta_, - std::string(objectKey))); - offset += batch_size; + remote_files[i])); } + for (auto& future : futures) { auto res = future.get(); remote_paths_to_size_[res.first] = res.second; } - FILEMANAGER_CATCH - FILEMANAGER_END - - return true; -} // namespace knowhere +} void DiskFileManagerImpl::CacheIndexToDisk(std::vector remote_files) { @@ -262,7 +292,7 @@ DiskFileManagerImpl::GetFileName(const std::string& localfile) { } std::string -DiskFileManagerImpl::GetRemoteIndexObjectPrefix() { +DiskFileManagerImpl::GetRemoteIndexObjectPrefix() const { return remote_root_path_ + "/" + std::string(INDEX_ROOT_PATH) + "/" + std::to_string(index_meta_.build_id) + "/" + std::to_string(index_meta_.index_version) + "/" + diff --git a/internal/core/src/storage/DiskFileManagerImpl.h b/internal/core/src/storage/DiskFileManagerImpl.h index baaef28a1..6b34042cb 100644 --- a/internal/core/src/storage/DiskFileManagerImpl.h +++ b/internal/core/src/storage/DiskFileManagerImpl.h @@ -58,7 +58,7 @@ class DiskFileManagerImpl : public FileManagerImpl { } std::string - GetRemoteIndexObjectPrefix(); + GetRemoteIndexObjectPrefix() const; std::string GetLocalIndexObjectPrefix(); @@ -76,6 +76,12 @@ class DiskFileManagerImpl : public FileManagerImpl { return local_paths_; } + std::string + GenerateRemoteIndexFile(std::string file_name, int64_t slice_num) const { + return GetRemoteIndexObjectPrefix() + "/" + file_name + "_" + + std::to_string(slice_num); + } + void CacheIndexToDisk(std::vector remote_files); @@ -84,6 +90,12 @@ class DiskFileManagerImpl : public FileManagerImpl { const std::string& local_file_name, uint64_t local_file_init_offfset); + void + AddBatchIndexFiles(const std::string& local_file_name, + const std::vector& local_file_offsets, + const std::vector& remote_files, + const std::vector& remote_file_sizes); + FieldDataMeta GetFileDataMeta() const { return field_meta_; -- GitLab