未验证 提交 bafb183a 编写于 作者: J Jiquan Long 提交者: GitHub

Optimize bitset usage (#26096)

Signed-off-by: Nlongjiquan <jiquan.long@zilliz.com>
上级 0e103efc
...@@ -188,15 +188,17 @@ ExecPlanNodeVisitor::visit(RetrievePlanNode& node) { ...@@ -188,15 +188,17 @@ ExecPlanNodeVisitor::visit(RetrievePlanNode& node) {
return; return;
} }
bitset_holder.flip(); bool false_filtered_out = false;
if (GetExprUsePkIndex() && IsTermExpr(node.predicate_.value().get())) { if (GetExprUsePkIndex() && IsTermExpr(node.predicate_.value().get())) {
segment->search_ids_filter( segment->timestamp_filter(
bitset_holder, expr_cached_pk_id_offsets_, timestamp_); bitset_holder, expr_cached_pk_id_offsets_, timestamp_);
} else { } else {
segment->search_ids_filter(bitset_holder, timestamp_); bitset_holder.flip();
false_filtered_out = true;
segment->timestamp_filter(bitset_holder, timestamp_);
} }
retrieve_result.result_offsets_ = retrieve_result.result_offsets_ =
segment->find_first(node.limit_, bitset_holder); segment->find_first(node.limit_, bitset_holder, false_filtered_out);
retrieve_result_opt_ = std::move(retrieve_result); retrieve_result_opt_ = std::move(retrieve_result);
} }
......
...@@ -55,7 +55,9 @@ class OffsetMap { ...@@ -55,7 +55,9 @@ class OffsetMap {
using OffsetType = int64_t; using OffsetType = int64_t;
// TODO: in fact, we can retrieve the pk here. Not sure which way is more efficient. // TODO: in fact, we can retrieve the pk here. Not sure which way is more efficient.
virtual std::vector<OffsetType> virtual std::vector<OffsetType>
find_first(int64_t limit, const BitsetType& bitset) const = 0; find_first(int64_t limit,
const BitsetType& bitset,
bool false_filtered_out) const = 0;
}; };
template <typename T> template <typename T>
...@@ -85,29 +87,36 @@ class OffsetOrderedMap : public OffsetMap { ...@@ -85,29 +87,36 @@ class OffsetOrderedMap : public OffsetMap {
} }
std::vector<OffsetType> std::vector<OffsetType>
find_first(int64_t limit, const BitsetType& bitset) const override { find_first(int64_t limit,
const BitsetType& bitset,
bool false_filtered_out) const override {
if (limit == Unlimited || limit == NoLimit) { if (limit == Unlimited || limit == NoLimit) {
limit = map_.size(); limit = map_.size();
} }
// TODO: we can't retrieve pk by offset very conveniently. // TODO: we can't retrieve pk by offset very conveniently.
// Selectivity should be done outside. // Selectivity should be done outside.
return find_first_by_index(limit, bitset); return find_first_by_index(limit, bitset, false_filtered_out);
} }
private: private:
std::vector<OffsetType> std::vector<OffsetType>
find_first_by_index(int64_t limit, const BitsetType& bitset) const { find_first_by_index(int64_t limit,
const BitsetType& bitset,
bool false_filtered_out) const {
std::vector<int64_t> seg_offsets; std::vector<int64_t> seg_offsets;
seg_offsets.reserve(limit); seg_offsets.reserve(limit);
int64_t hit_num = 0; // avoid counting the number everytime. int64_t hit_num = 0; // avoid counting the number everytime.
auto cnt = bitset.count(); auto cnt = bitset.count();
if (!false_filtered_out) {
cnt = bitset.size() - bitset.count();
}
for (auto it = map_.begin(); it != map_.end(); it++) { for (auto it = map_.begin(); it != map_.end(); it++) {
if (hit_num >= limit || hit_num >= cnt) { if (hit_num >= limit || hit_num >= cnt) {
break; break;
} }
for (auto seg_offset : it->second) { for (auto seg_offset : it->second) {
if (bitset[seg_offset]) { if (!(bitset[seg_offset] ^ false_filtered_out)) {
seg_offsets.push_back(seg_offset); seg_offsets.push_back(seg_offset);
hit_num++; hit_num++;
} }
...@@ -163,7 +172,9 @@ class OffsetOrderedArray : public OffsetMap { ...@@ -163,7 +172,9 @@ class OffsetOrderedArray : public OffsetMap {
} }
std::vector<OffsetType> std::vector<OffsetType>
find_first(int64_t limit, const BitsetType& bitset) const override { find_first(int64_t limit,
const BitsetType& bitset,
bool false_filtered_out) const override {
check_search(); check_search();
if (limit == Unlimited || limit == NoLimit) { if (limit == Unlimited || limit == NoLimit) {
...@@ -172,21 +183,26 @@ class OffsetOrderedArray : public OffsetMap { ...@@ -172,21 +183,26 @@ class OffsetOrderedArray : public OffsetMap {
// TODO: we can't retrieve pk by offset very conveniently. // TODO: we can't retrieve pk by offset very conveniently.
// Selectivity should be done outside. // Selectivity should be done outside.
return find_first_by_index(limit, bitset); return find_first_by_index(limit, bitset, false_filtered_out);
} }
private: private:
std::vector<OffsetType> std::vector<OffsetType>
find_first_by_index(int64_t limit, const BitsetType& bitset) const { find_first_by_index(int64_t limit,
const BitsetType& bitset,
bool false_filtered_out) const {
std::vector<int64_t> seg_offsets; std::vector<int64_t> seg_offsets;
seg_offsets.reserve(limit); seg_offsets.reserve(limit);
int64_t hit_num = 0; // avoid counting the number everytime. int64_t hit_num = 0; // avoid counting the number everytime.
auto cnt = bitset.count(); auto cnt = bitset.count();
if (!false_filtered_out) {
cnt = bitset.size() - bitset.count();
}
for (auto it = array_.begin(); it != array_.end(); it++) { for (auto it = array_.begin(); it != array_.end(); it++) {
if (hit_num >= limit || hit_num >= cnt) { if (hit_num >= limit || hit_num >= cnt) {
break; break;
} }
if (bitset[it->second]) { if (!(bitset[it->second] ^ false_filtered_out)) {
seg_offsets.push_back(it->second); seg_offsets.push_back(it->second);
hit_num++; hit_num++;
} }
......
...@@ -235,8 +235,11 @@ class SegmentGrowingImpl : public SegmentGrowing { ...@@ -235,8 +235,11 @@ class SegmentGrowingImpl : public SegmentGrowing {
} }
std::vector<OffsetMap::OffsetType> std::vector<OffsetMap::OffsetType>
find_first(int64_t limit, const BitsetType& bitset) const override { find_first(int64_t limit,
return insert_record_.pk2offset_->find_first(limit, bitset); const BitsetType& bitset,
bool false_filtered_out) const override {
return insert_record_.pk2offset_->find_first(
limit, bitset, false_filtered_out);
} }
protected: protected:
......
...@@ -179,31 +179,39 @@ SegmentInternalInterface::get_real_count() const { ...@@ -179,31 +179,39 @@ SegmentInternalInterface::get_real_count() const {
} }
void void
SegmentInternalInterface::search_ids_filter(BitsetType& bitset, SegmentInternalInterface::timestamp_filter(BitsetType& bitset,
Timestamp timestamp) const { Timestamp timestamp) const {
auto& timestamps = get_timestamps(); auto& timestamps = get_timestamps();
for (int offset = bitset.find_first(); offset < bitset.size(); auto cnt = bitset.size();
offset = bitset.find_next(offset)) { if (timestamps[cnt - 1] <= timestamp) {
// no need to filter out anything.
return;
}
auto pilot = upper_bound(timestamps, 0, cnt, timestamp);
// offset bigger than pilot should be filtered out.
for (int offset = pilot; offset < cnt; offset = bitset.find_next(offset)) {
if (offset == BitsetType::npos) { if (offset == BitsetType::npos) {
return; return;
} }
// You can't see an entity which is inserted after the point when you search. bitset[offset] = false;
if (timestamps[offset] > timestamp) {
bitset[offset] = false;
}
} }
} }
void void
SegmentInternalInterface::search_ids_filter(BitsetType& bitset, SegmentInternalInterface::timestamp_filter(BitsetType& bitset,
const std::vector<int64_t>& offsets, const std::vector<int64_t>& offsets,
Timestamp timestamp) const { Timestamp timestamp) const {
BitsetType bitset_copy = bitset;
bitset.reset();
auto& timestamps = get_timestamps(); auto& timestamps = get_timestamps();
auto cnt = bitset.size();
if (timestamps[cnt - 1] <= timestamp) {
// no need to filter out anything.
return;
}
// point query, faster than binary search.
for (auto& offset : offsets) { for (auto& offset : offsets) {
// You can't see an entity which is inserted after the point when you search. if (timestamps[offset] > timestamp) {
if (bitset_copy[offset] && timestamps[offset] <= timestamp) {
bitset.set(offset, true); bitset.set(offset, true);
} }
} }
......
...@@ -186,16 +186,47 @@ class SegmentInternalInterface : public SegmentInterface { ...@@ -186,16 +186,47 @@ class SegmentInternalInterface : public SegmentInterface {
virtual std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>> virtual std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
search_ids(const IdArray& id_array, Timestamp timestamp) const = 0; search_ids(const IdArray& id_array, Timestamp timestamp) const = 0;
/**
* Apply timestamp filtering on bitset, the query can't see an entity whose
* timestamp is bigger than the timestamp of query.
*
* @param bitset The final bitset after scalar filtering and delta filtering,
* `false` means that the entity will be filtered out.
* @param timestamp The timestamp of query.
*/
void void
search_ids_filter(BitsetType& bitset, Timestamp timestamp) const; timestamp_filter(BitsetType& bitset, Timestamp timestamp) const;
/**
* Apply timestamp filtering on bitset, the query can't see an entity whose
* timestamp is bigger than the timestamp of query. The passed offsets are
* all candidate entities.
*
* @param bitset The final bitset after scalar filtering and delta filtering,
* `true` means that the entity will be filtered out.
* @param offsets The segment offsets of all candidates.
* @param timestamp The timestamp of query.
*/
void void
search_ids_filter(BitsetType& bitset, timestamp_filter(BitsetType& bitset,
const std::vector<int64_t>& offsets, const std::vector<int64_t>& offsets,
Timestamp timestamp) const; Timestamp timestamp) const;
/**
* Sort all candidates in ascending order, and then return the limit smallest.
* Bitset is used to check if the candidate will be filtered out. `false_filtered_out`
* determines how to filter out candidates. If `false_filtered_out` is true, we will
* filter all candidates whose related bit is false.
*
* @param limit
* @param bitset
* @param false_filtered_out
* @return All candidates offsets.
*/
virtual std::vector<OffsetMap::OffsetType> virtual std::vector<OffsetMap::OffsetType>
find_first(int64_t limit, const BitsetType& bitset) const = 0; find_first(int64_t limit,
const BitsetType& bitset,
bool false_filtered_out) const = 0;
protected: protected:
// internal API: return chunk_data in span // internal API: return chunk_data in span
......
...@@ -111,8 +111,11 @@ class SegmentSealedImpl : public SegmentSealed { ...@@ -111,8 +111,11 @@ class SegmentSealedImpl : public SegmentSealed {
const Timestamp* timestamps) override; const Timestamp* timestamps) override;
std::vector<OffsetMap::OffsetType> std::vector<OffsetMap::OffsetType>
find_first(int64_t limit, const BitsetType& bitset) const override { find_first(int64_t limit,
return insert_record_.pk2offset_->find_first(limit, bitset); const BitsetType& bitset,
bool false_filtered_out) const override {
return insert_record_.pk2offset_->find_first(
limit, bitset, false_filtered_out);
} }
protected: protected:
......
...@@ -588,4 +588,21 @@ LoadFieldDatasFromRemote(std::vector<std::string>& remote_files, ...@@ -588,4 +588,21 @@ LoadFieldDatasFromRemote(std::vector<std::string>& remote_files,
channel->close(); channel->close();
} }
int64_t
upper_bound(const ConcurrentVector<Timestamp>& timestamps,
int64_t first,
int64_t last,
Timestamp value) {
int64_t mid;
while (first < last) {
mid = first + (last - first) / 2;
if (value >= timestamps[mid]) {
first = mid + 1;
} else {
last = mid;
}
}
return first;
}
} // namespace milvus::segcore } // namespace milvus::segcore
...@@ -9,6 +9,8 @@ ...@@ -9,6 +9,8 @@
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License // or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include <unordered_map> #include <unordered_map>
#include <exception> #include <exception>
#include <memory> #include <memory>
...@@ -151,4 +153,20 @@ ReverseDataFromIndex(const index::IndexBase* index, ...@@ -151,4 +153,20 @@ ReverseDataFromIndex(const index::IndexBase* index,
void void
LoadFieldDatasFromRemote(std::vector<std::string>& remote_files, LoadFieldDatasFromRemote(std::vector<std::string>& remote_files,
storage::FieldDataChannelPtr channel); storage::FieldDataChannelPtr channel);
/**
* Returns an index pointing to the first element in the range [first, last) such that `value < element` is true
* (i.e. that is strictly greater than value), or last if no such element is found.
*
* @param timestamps
* @param first
* @param last
* @param value
* @return The index of answer, last will be returned if no timestamp is bigger than the value.
*/
int64_t
upper_bound(const ConcurrentVector<Timestamp>& timestamps,
int64_t first,
int64_t last,
Timestamp value);
} // namespace milvus::segcore } // namespace milvus::segcore
...@@ -68,7 +68,7 @@ TYPED_TEST_P(TypedOffsetOrderedArrayTest, find_first) { ...@@ -68,7 +68,7 @@ TYPED_TEST_P(TypedOffsetOrderedArrayTest, find_first) {
std::vector<int64_t> offsets; std::vector<int64_t> offsets;
// not sealed. // not sealed.
ASSERT_ANY_THROW(this->map_.find_first(Unlimited, {})); ASSERT_ANY_THROW(this->map_.find_first(Unlimited, {}, true));
// insert 10 entities. // insert 10 entities.
int num = 10; int num = 10;
...@@ -83,12 +83,12 @@ TYPED_TEST_P(TypedOffsetOrderedArrayTest, find_first) { ...@@ -83,12 +83,12 @@ TYPED_TEST_P(TypedOffsetOrderedArrayTest, find_first) {
// all is satisfied. // all is satisfied.
BitsetType all(num); BitsetType all(num);
all.set(); all.set();
offsets = this->map_.find_first(num / 2, all); offsets = this->map_.find_first(num / 2, all, true);
ASSERT_EQ(num / 2, offsets.size()); ASSERT_EQ(num / 2, offsets.size());
for (int i = 1; i < offsets.size(); i++) { for (int i = 1; i < offsets.size(); i++) {
ASSERT_TRUE(data[offsets[i - 1]] <= data[offsets[i]]); ASSERT_TRUE(data[offsets[i - 1]] <= data[offsets[i]]);
} }
offsets = this->map_.find_first(Unlimited, all); offsets = this->map_.find_first(Unlimited, all, true);
ASSERT_EQ(num, offsets.size()); ASSERT_EQ(num, offsets.size());
for (int i = 1; i < offsets.size(); i++) { for (int i = 1; i < offsets.size(); i++) {
ASSERT_TRUE(data[offsets[i - 1]] <= data[offsets[i]]); ASSERT_TRUE(data[offsets[i - 1]] <= data[offsets[i]]);
...@@ -97,9 +97,9 @@ TYPED_TEST_P(TypedOffsetOrderedArrayTest, find_first) { ...@@ -97,9 +97,9 @@ TYPED_TEST_P(TypedOffsetOrderedArrayTest, find_first) {
// none is satisfied. // none is satisfied.
BitsetType none(num); BitsetType none(num);
none.reset(); none.reset();
offsets = this->map_.find_first(num / 2, none); offsets = this->map_.find_first(num / 2, none, true);
ASSERT_EQ(0, offsets.size()); ASSERT_EQ(0, offsets.size());
offsets = this->map_.find_first(NoLimit, none); offsets = this->map_.find_first(NoLimit, none, true);
ASSERT_EQ(0, offsets.size()); ASSERT_EQ(0, offsets.size());
} }
......
...@@ -63,7 +63,7 @@ TYPED_TEST_P(TypedOffsetOrderedMapTest, find_first) { ...@@ -63,7 +63,7 @@ TYPED_TEST_P(TypedOffsetOrderedMapTest, find_first) {
std::vector<int64_t> offsets; std::vector<int64_t> offsets;
// no data. // no data.
offsets = this->map_.find_first(Unlimited, {}); offsets = this->map_.find_first(Unlimited, {}, true);
ASSERT_EQ(0, offsets.size()); ASSERT_EQ(0, offsets.size());
// insert 10 entities. // insert 10 entities.
...@@ -76,12 +76,12 @@ TYPED_TEST_P(TypedOffsetOrderedMapTest, find_first) { ...@@ -76,12 +76,12 @@ TYPED_TEST_P(TypedOffsetOrderedMapTest, find_first) {
// all is satisfied. // all is satisfied.
BitsetType all(num); BitsetType all(num);
all.set(); all.set();
offsets = this->map_.find_first(num / 2, all); offsets = this->map_.find_first(num / 2, all, true);
ASSERT_EQ(num / 2, offsets.size()); ASSERT_EQ(num / 2, offsets.size());
for (int i = 1; i < offsets.size(); i++) { for (int i = 1; i < offsets.size(); i++) {
ASSERT_TRUE(data[offsets[i - 1]] <= data[offsets[i]]); ASSERT_TRUE(data[offsets[i - 1]] <= data[offsets[i]]);
} }
offsets = this->map_.find_first(Unlimited, all); offsets = this->map_.find_first(Unlimited, all, true);
ASSERT_EQ(num, offsets.size()); ASSERT_EQ(num, offsets.size());
for (int i = 1; i < offsets.size(); i++) { for (int i = 1; i < offsets.size(); i++) {
ASSERT_TRUE(data[offsets[i - 1]] <= data[offsets[i]]); ASSERT_TRUE(data[offsets[i - 1]] <= data[offsets[i]]);
...@@ -90,9 +90,9 @@ TYPED_TEST_P(TypedOffsetOrderedMapTest, find_first) { ...@@ -90,9 +90,9 @@ TYPED_TEST_P(TypedOffsetOrderedMapTest, find_first) {
// none is satisfied. // none is satisfied.
BitsetType none(num); BitsetType none(num);
none.reset(); none.reset();
offsets = this->map_.find_first(num / 2, none); offsets = this->map_.find_first(num / 2, none, true);
ASSERT_EQ(0, offsets.size()); ASSERT_EQ(0, offsets.size());
offsets = this->map_.find_first(NoLimit, none); offsets = this->map_.find_first(NoLimit, none, true);
ASSERT_EQ(0, offsets.size()); ASSERT_EQ(0, offsets.size());
} }
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "common/Utils.h" #include "common/Utils.h"
#include "query/Utils.h" #include "query/Utils.h"
#include "test_utils/DataGen.h" #include "test_utils/DataGen.h"
#include "common/Types.h"
TEST(Util, StringMatch) { TEST(Util, StringMatch) {
using namespace milvus; using namespace milvus;
...@@ -117,3 +118,18 @@ TEST(Util, OutOfRange) { ...@@ -117,3 +118,18 @@ TEST(Util, OutOfRange) {
ASSERT_TRUE(out_of_range<int32_t>( ASSERT_TRUE(out_of_range<int32_t>(
static_cast<int64_t>(std::numeric_limits<int32_t>::min()) - 1)); static_cast<int64_t>(std::numeric_limits<int32_t>::min()) - 1));
} }
TEST(Util, upper_bound) {
using milvus::Timestamp;
using milvus::segcore::ConcurrentVector;
using milvus::segcore::upper_bound;
std::vector<Timestamp> data{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
ConcurrentVector<Timestamp> timestamps(1);
timestamps.grow_to_at_least(data.size());
timestamps.set_data(0, data.data(), data.size());
ASSERT_EQ(1, upper_bound(timestamps, 0, data.size(), 0));
ASSERT_EQ(5, upper_bound(timestamps, 0, data.size(), 4));
ASSERT_EQ(10, upper_bound(timestamps, 0, data.size(), 10));
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册