diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/clue_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/clue_op.cc index 0081461d88a737dee1d1aafae310e9b0a505ed02..efe2d8d061307279db085eeac80ee4a4d563a225 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/clue_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/clue_op.cc @@ -408,8 +408,7 @@ Status ClueOp::FillIOBlockQueue(const std::vector &i_keys) { break; } } - auto file_it = filename_index_->Search(*it); - file_index.emplace_back(std::pair(file_it.value(), *it)); + file_index.emplace_back(std::pair((*filename_index_)[*it], *it)); } } else { for (auto it = filename_index_->begin(); it != filename_index_->end(); ++it) { diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/io_block.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/io_block.cc index 9f45e2179fb9259d0802ddb014c0009aa4ea79e4..0963f1a67a31ac48ee7bfe60e9b86d7b1ee071fa 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/io_block.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/io_block.cc @@ -72,8 +72,9 @@ Status FilenameBlock::GetFilename(std::string *out_filename, const AutoIndexObj< RETURN_IF_NOT_OK(IOBlock::GetKey(&fetched_key)); // Do an index lookup using that key to get the filename. - auto it = index.Search(fetched_key); - if (it != index.end()) { + auto r = index.Search(fetched_key); + if (r.second) { + auto &it = r.first; *out_filename = it.value(); } else { RETURN_STATUS_UNEXPECTED("Could not find filename from index"); diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/text_file_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/text_file_op.cc index d31495a09bd8ebd20a38429468241c054124f9cf..8e22c102dd9513af64804777ae1f503df2727ac3 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/text_file_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/text_file_op.cc @@ -314,8 +314,7 @@ Status TextFileOp::FillIOBlockQueue(const std::vector &i_keys) { break; } } - auto file_it = filename_index_->Search(*it); - file_index.emplace_back(std::pair(file_it.value(), *it)); + file_index.emplace_back(std::pair((*filename_index_)[*it], *it)); } } else { for (auto it = filename_index_->begin(); it != filename_index_->end(); ++it) { diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.cc index 208965e88c748cd5ef4b711fcabd60a4730ccfb5..4d3851488a7c55441de65c3fa424ca004281000a 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.cc @@ -451,8 +451,7 @@ Status TFReaderOp::FillIOBlockShuffle(const std::vector &i_keys) { } } else { // Do an index lookup using that key to get the filename. - auto file_it = filename_index_->Search(*it); - std::string file_name = file_it.value(); + std::string file_name = (*filename_index_)[*it]; if (NeedPushFileToblockQueue(file_name, &start_offset, &end_offset, pre_count)) { auto ioBlock = std::make_unique(*it, start_offset, end_offset, IOBlock::kDeIoBlockNone); RETURN_IF_NOT_OK(PushIoBlockQueue(queue_index, std::move(ioBlock))); diff --git a/mindspore/ccsrc/dataset/util/btree.h b/mindspore/ccsrc/dataset/util/btree.h index df7cb8516fac965301f09f60019ea9513c024233..ccf642e366df95d4e32e34822421982d3bacfbd7 100644 --- a/mindspore/ccsrc/dataset/util/btree.h +++ b/mindspore/ccsrc/dataset/util/btree.h @@ -40,8 +40,6 @@ struct BPlusTreeTraits { static constexpr slot_type kLeafSlots = 256; // Number of slots in each inner node of the tree static constexpr slot_type kInnerSlots = 128; - // If kAppendMode is true, we will split high instead of 50/50 split - static constexpr bool kAppendMode = false; }; /// Implementation of B+ tree @@ -123,19 +121,14 @@ class BPlusTree { std::unique_ptr DoUpdate(const key_type &key, const value_type &new_value); std::unique_ptr DoUpdate(const key_type &key, std::unique_ptr &&new_value); - void PopulateNumKeys(); - - key_type KeyAtPos(uint64_t inx); - // Statistics struct tree_stats { std::atomic size_; uint32_t leaves_; uint32_t inner_nodes_; uint32_t level_; - bool num_keys_array_valid_; - tree_stats() : size_(0), leaves_(0), inner_nodes_(0), level_(0), num_keys_array_valid_(false) {} + tree_stats() : size_(0), leaves_(0), inner_nodes_(0), level_(0) {} }; private: @@ -160,10 +153,6 @@ class BPlusTree { Node lru_; }; - uint64_t PopulateNumKeys(BaseNode *n); - - key_type KeyAtPos(BaseNode *n, uint64_t inx); - // This control block keeps track of all the nodes we traverse on insert. // To maximize concurrency, internal nodes are latched S. If a node split // is required, we must releases all the latches and redo it again and change @@ -255,7 +244,6 @@ class BPlusTree { slot_type slot_dir_[traits::kInnerSlots] = {0}; key_type keys_[traits::kInnerSlots] = {0}; BaseNode *data_[traits::kInnerSlots + 1] = {nullptr}; - uint64_t num_keys_[traits::kInnerSlots + 1] = {0}; slot_type slotuse_; }; @@ -391,7 +379,6 @@ class BPlusTree { Iterator operator--(int); bool operator==(const Iterator &x) const { return (x.cur_ == cur_) && (x.slot_ == slot_); } - bool operator!=(const Iterator &x) const { return (x.cur_ != cur_) || (x.slot_ != slot_); } private: @@ -441,7 +428,6 @@ class BPlusTree { ConstIterator operator--(int); bool operator==(const ConstIterator &x) const { return (x.cur_ == cur_) && (x.slot_ == slot_); } - bool operator!=(const ConstIterator &x) const { return (x.cur_ != cur_) || (x.slot_ != slot_); } private: @@ -451,20 +437,17 @@ class BPlusTree { }; Iterator begin(); - Iterator end(); ConstIterator begin() const; - ConstIterator end() const; ConstIterator cbegin() const; - ConstIterator cend() const; // Locate the entry with key - ConstIterator Search(const key_type &key) const; - Iterator Search(const key_type &key); + std::pair Search(const key_type &key) const; + std::pair Search(const key_type &key); value_type operator[](key_type key); }; diff --git a/mindspore/ccsrc/dataset/util/btree_impl.tpp b/mindspore/ccsrc/dataset/util/btree_impl.tpp index 54fd112db49c60403c4099a25e77e90445da6d02..8148a8d12cb741c3748fcd25d8afd0d7ba890af5 100644 --- a/mindspore/ccsrc/dataset/util/btree_impl.tpp +++ b/mindspore/ccsrc/dataset/util/btree_impl.tpp @@ -269,26 +269,17 @@ typename BPlusTree::IndexRc BPlusTree::LeafInsertK RETURN_IF_BAD_RC(rc); leaf_nodes_.InsertAfter(node, new_leaf); *split_node = new_leaf; - if (slot == node->slotuse_ && traits::kAppendMode) { - // Split high. Good for bulk load and keys are in asending order on insert - *split_key = key; - // Just insert the new key to the new leaf. No further need to move the keys - // from one leaf to the other. - rc = new_leaf->InsertIntoSlot(nullptr, 0, key, std::move(value)); + // 50/50 split + rc = node->Split(new_leaf); + RETURN_IF_BAD_RC(rc); + *split_key = new_leaf->keys_[0]; + if (LessThan(key, *split_key)) { + rc = node->InsertIntoSlot(nullptr, slot, key, std::move(value)); RETURN_IF_BAD_RC(rc); } else { - // 50/50 split - rc = node->Split(new_leaf); + slot -= node->slotuse_; + rc = new_leaf->InsertIntoSlot(nullptr, slot, key, std::move(value)); RETURN_IF_BAD_RC(rc); - *split_key = new_leaf->keys_[0]; - if (LessThan(key, *split_key)) { - rc = node->InsertIntoSlot(nullptr, slot, key, std::move(value)); - RETURN_IF_BAD_RC(rc); - } else { - slot -= node->slotuse_; - rc = new_leaf->InsertIntoSlot(nullptr, slot, key, std::move(value)); - RETURN_IF_BAD_RC(rc); - } } } return rc; @@ -309,25 +300,18 @@ typename BPlusTree::IndexRc BPlusTree::InnerInsert rc = AllocateInner(&new_inner); RETURN_IF_BAD_RC(rc); *split_node = new_inner; - if (slot == node->slotuse_ && traits::kAppendMode) { - *split_key = key; - new_inner->data_[0] = node->data_[node->slotuse_]; - rc = new_inner->InsertIntoSlot(0, key, ptr); + rc = node->Split(new_inner, split_key); + RETURN_IF_BAD_RC(rc); + if (LessThan(key, *split_key)) { + // Need to readjust the slot position since the split key is no longer in the two children. + slot = FindSlot(node, key); + rc = node->InsertIntoSlot(slot, key, ptr); RETURN_IF_BAD_RC(rc); } else { - rc = node->Split(new_inner, split_key); + // Same reasoning as above + slot = FindSlot(new_inner, key); + rc = new_inner->InsertIntoSlot(slot, key, ptr); RETURN_IF_BAD_RC(rc); - if (LessThan(key, *split_key)) { - // Need to readjust the slot position since the split key is no longer in the two children. - slot = FindSlot(node, key); - rc = node->InsertIntoSlot(slot, key, ptr); - RETURN_IF_BAD_RC(rc); - } else { - // Same reasoning as above - slot = FindSlot(new_inner, key); - rc = new_inner->InsertIntoSlot(slot, key, ptr); - RETURN_IF_BAD_RC(rc); - } } } return rc; @@ -377,8 +361,7 @@ typename BPlusTree::IndexRc BPlusTree::InsertKeyVa } template -typename BPlusTree::IndexRc BPlusTree::Locate(RWLock *parent_lock, - bool forUpdate, +typename BPlusTree::IndexRc BPlusTree::Locate(RWLock *parent_lock, bool forUpdate, BPlusTree::BaseNode *top, const key_type &key, BPlusTree::LeafNode **ln, @@ -481,9 +464,6 @@ Status BPlusTree::DoInsert(const key_type &key, std::unique_ptr BPlusTree::DoUpdate(const key_type &key, std:: } } -template -void BPlusTree::PopulateNumKeys() { - // Start from the root and we calculate how many leaf nodes as pointed to by each inner node. - // The results are stored in the numKeys array in each inner node. - (void)PopulateNumKeys(root_); - // Indicate the result is accurate since we have the tree locked exclusive. - stats_.num_keys_array_valid_ = true; -} - -template -uint64_t BPlusTree::PopulateNumKeys(BPlusTree::BaseNode *n) { - if (n->is_leafnode()) { - auto *leaf = static_cast(n); - return leaf->slotuse_; - } else { - auto *inner = static_cast(n); - uint64_t num_keys = 0; - for (auto i = 0; i < inner->slotuse_ + 1; i++) { - inner->num_keys_[i] = PopulateNumKeys(inner->data_[i]); - num_keys += inner->num_keys_[i]; - } - return num_keys; - } -} - -template -typename BPlusTree::key_type BPlusTree::KeyAtPos(uint64_t inx) { - if (stats_.num_keys_array_valid_ == false) { - // We need exclusive access to the tree. If concurrent insert is going on, it is hard to get accurate numbers - UniqueLock lck(&rw_lock_); - // Check again. - if (stats_.num_keys_array_valid_ == false) { - PopulateNumKeys(); - } - } - // Now we know how many keys each inner branch contains, we can now traverse the correct node in log n time. - return KeyAtPos(root_, inx); -} - -template -typename BPlusTree::key_type BPlusTree::KeyAtPos(BPlusTree::BaseNode *n, - uint64_t inx) { - if (n->is_leafnode()) { - auto *leaf = static_cast(n); - return leaf->keys_[leaf->slot_dir_[inx]]; - } else { - auto *inner = static_cast(n); - if ((inx + 1) > inner->num_keys_[0]) { - inx -= inner->num_keys_[0]; - } else { - return KeyAtPos(inner->data_[0], inx); - } - for (auto i = 0; i < inner->slotuse_; i++) { - if ((inx + 1) > inner->num_keys_[inner->slot_dir_[i] + 1]) { - inx -= inner->num_keys_[inner->slot_dir_[i] + 1]; - } else { - return KeyAtPos(inner->data_[inner->slot_dir_[i] + 1], inx); - } - } - } - // If we get here, inx is way too big. Instead of throwing exception, we will just return the default value - // of key_type whatever it is. - return key_type(); -} } // namespace dataset } // namespace mindspore #endif diff --git a/mindspore/ccsrc/dataset/util/btree_iterator.tpp b/mindspore/ccsrc/dataset/util/btree_iterator.tpp index ef3a47f176229d629f080e21d001f9106d7260ca..91ba2acd7ab8d1d8a80901f01e05f5fb9a8df5ec 100644 --- a/mindspore/ccsrc/dataset/util/btree_iterator.tpp +++ b/mindspore/ccsrc/dataset/util/btree_iterator.tpp @@ -286,7 +286,8 @@ typename BPlusTree::ConstIterator &BPlusTree::Cons } template -typename BPlusTree::ConstIterator BPlusTree::Search(const key_type &key) const { +std::pair::ConstIterator, bool> BPlusTree::Search( + const key_type &key) const { if (root_ != nullptr) { LeafNode *leaf = nullptr; slot_type slot; @@ -294,21 +295,15 @@ typename BPlusTree::ConstIterator BPlusTree::Searc // Lock the tree in S, pass the lock to Locate which will unlock it for us underneath. myLock->LockShared(); IndexRc rc = Locate(myLock, false, root_, key, &leaf, &slot); - if (rc == IndexRc::kOk) { - // All locks from the tree to the parent of leaf are all gone. We still have a S lock - // on the leaf. The unlock will be handled by the iterator when it goes out of scope. - return ConstIterator(leaf, slot, true); - } else { - MS_LOG(DEBUG) << "Key not found. rc = " << static_cast(rc) << "."; - return cend(); - } + bool find = (rc == IndexRc::kOk); + return std::make_pair(ConstIterator(leaf, slot, find), find); } else { - return cend(); + return std::make_pair(cend(), false); } } template -typename BPlusTree::Iterator BPlusTree::Search(const key_type &key) { +std::pair::Iterator, bool> BPlusTree::Search(const key_type &key) { if (root_ != nullptr) { LeafNode *leaf = nullptr; slot_type slot; @@ -316,23 +311,17 @@ typename BPlusTree::Iterator BPlusTree::Search(con // Lock the tree in S, pass the lock to Locate which will unlock it for us underneath. myLock->LockShared(); IndexRc rc = Locate(myLock, false, root_, key, &leaf, &slot); - if (rc == IndexRc::kOk) { - // All locks from the tree to the parent of leaf are all gone. We still have a S lock - // on the leaf. The unlock will be handled by the iterator when it goes out of scope. - return Iterator(leaf, slot, true); - } else { - MS_LOG(DEBUG) << "Key not found. rc = " << static_cast(rc) << "."; - return end(); - } + bool find = (rc == IndexRc::kOk); + return std::make_pair(Iterator(leaf, slot, find), find); } else { - return end(); + return std::make_pair(end(), false); } } template typename BPlusTree::value_type BPlusTree::operator[](key_type key) { - Iterator it = Search(key); - return it.value(); + auto r = Search(key); + return r.first.value(); } template diff --git a/tests/ut/cpp/dataset/btree_test.cc b/tests/ut/cpp/dataset/btree_test.cc index 2e40f4a6618cbb9f477b62f5e7047b8f53ac3243..168f550f34989f6a2563fde6696f56507f0e99f0 100644 --- a/tests/ut/cpp/dataset/btree_test.cc +++ b/tests/ut/cpp/dataset/btree_test.cc @@ -32,13 +32,8 @@ using mindspore::LogStream; // For testing purposes, we will make the branching factor very low. struct mytraits { using slot_type = uint16_t; - static const slot_type kLeafSlots = 6; - static const slot_type kInnerSlots = 3; - - static const bool kAppendMode = false; - }; @@ -95,13 +90,14 @@ TEST_F(MindDataTestBPlusTree, Test1) { // Test search { MS_LOG(INFO) << "Locate key " << 100 << " Expect found."; - auto it = btree.Search(100); - EXPECT_FALSE(it == btree.end()); + auto r = btree.Search(100); + auto &it = r.first; + EXPECT_TRUE(r.second); EXPECT_EQ(it.key(), 100); EXPECT_EQ(it.value(), "Hello World. I am 100"); MS_LOG(INFO) << "Locate key " << 300 << " Expect not found."; - it = btree.Search(300); - EXPECT_TRUE(it == btree.end()); + auto q = btree.Search(300); + EXPECT_FALSE(q.second); } // Test duplicate key @@ -169,26 +165,18 @@ TEST_F(MindDataTestBPlusTree, Test2) { { MS_LOG(INFO) << "Locating key from 0 to 9999. Expect found."; for (int i = 0; i < 10000; i++) { - auto it = btree.Search(i); - bool eoS = (it == btree.end()); - EXPECT_FALSE(eoS); - if (!eoS) { + auto r = btree.Search(i); + EXPECT_TRUE(r.second); + if (r.second) { + auto &it = r.first; EXPECT_EQ(it.key(), i); std::string val = "Hello World. I am " + std::to_string(i); EXPECT_EQ(it.value(), val); } } MS_LOG(INFO) << "Locate key " << 10000 << ". Expect not found"; - auto it = btree.Search(10000); - EXPECT_TRUE(it == btree.end()); - } - - // Test to retrieve key at certain position. - { - for (int i = 0; i < 10000; i++) { - int k = btree.KeyAtPos(i); - EXPECT_EQ(k, i); - } + auto q = btree.Search(10000); + EXPECT_FALSE(q.second); } } @@ -204,7 +192,8 @@ TEST_F(MindDataTestBPlusTree, Test3) { uint64_t max = ai.max_key(); EXPECT_EQ(min, 1); EXPECT_EQ(max, 4); - auto it = ai.Search(3); + auto r = ai.Search(3); + auto &it = r.first; EXPECT_EQ(it.value(), "b"); MS_LOG(INFO) << "Dump all the values using [] operator."; for (uint64_t i = min; i <= max; i++) {