From 8f52e5b6c75af284a0ee079610911f7ef71e6c86 Mon Sep 17 00:00:00 2001 From: Letian Jiang Date: Thu, 10 Mar 2022 16:33:59 +0800 Subject: [PATCH] Add unit tests in segcore (#15960) Signed-off-by: Letian Jiang --- internal/core/unittest/test_segcore.cpp | 75 +++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/internal/core/unittest/test_segcore.cpp b/internal/core/unittest/test_segcore.cpp index 64f2b053b..ea668c421 100644 --- a/internal/core/unittest/test_segcore.cpp +++ b/internal/core/unittest/test_segcore.cpp @@ -56,6 +56,7 @@ TEST(SegmentCoreTest, NormalDistributionTest) { segment->PreDelete(N); } +// Test insert row-based data TEST(SegmentCoreTest, MockTest) { using namespace milvus::segcore; using namespace milvus::engine; @@ -95,6 +96,80 @@ TEST(SegmentCoreTest, MockTest) { i++; } +// Test insert column-based data +TEST(SegmentCoreTest, MockTest2) { + using namespace milvus::segcore; + using namespace milvus::engine; + + // schema + auto schema = std::make_shared(); + schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2); + schema->AddDebugField("age", DataType::INT32); + + // generate random row-based data + std::vector row_data; + std::vector timestamps; + std::vector uids; + int N = 10000; // number of records + std::default_random_engine e(67); + for (int i = 0; i < N; ++i) { + uids.push_back(100000 + i); + timestamps.push_back(0); + // append vec + float vec[16]; + for (auto& x : vec) { + x = e() % 2000 * 0.001 - 1.0; + } + row_data.insert(row_data.end(), (const char*)std::begin(vec), (const char*)std::end(vec)); + int age = e() % 100; + row_data.insert(row_data.end(), (const char*)&age, ((const char*)&age) + sizeof(age)); + } + auto line_sizeof = (sizeof(int) + sizeof(float) * 16); + assert(row_data.size() == line_sizeof * N); + + int64_t size = N; + const int64_t* uids_raw = uids.data(); + const Timestamp* timestamps_raw = timestamps.data(); + std::vector> ordering(size); // timestamp, pk, order_index + for (int i = 0; i < size; ++i) { + ordering[i] = std::make_tuple(timestamps_raw[i], uids_raw[i], i); + } + std::sort(ordering.begin(), ordering.end()); // sort according to timestamp + + // convert row-based data to column-based data accordingly + auto sizeof_infos = schema->get_sizeof_infos(); + std::vector offset_infos(schema->size() + 1, 0); + std::partial_sum(sizeof_infos.begin(), sizeof_infos.end(), offset_infos.begin() + 1); + std::vector> entities(schema->size()); + + for (int fid = 0; fid < schema->size(); ++fid) { + auto len = sizeof_infos[fid]; + entities[fid].resize(len * size); + } + + auto raw_data = row_data.data(); + std::vector sorted_uids(size); + std::vector sorted_timestamps(size); + for (int index = 0; index < size; ++index) { + auto [t, uid, order_index] = ordering[index]; + sorted_timestamps[index] = t; + sorted_uids[index] = uid; + for (int fid = 0; fid < schema->size(); ++fid) { + auto len = sizeof_infos[fid]; + auto offset = offset_infos[fid]; + auto src = raw_data + order_index * line_sizeof + offset; + auto dst = entities[fid].data() + index * len; + memcpy(dst, src, len); + } + } + + // insert column-based data + ColumnBasedRawData data_chunk{entities, N}; + auto segment = CreateGrowingSegment(schema); + auto reserved_begin = segment->PreInsert(N); + segment->Insert(reserved_begin, size, sorted_uids.data(), sorted_timestamps.data(), data_chunk); +} + TEST(SegmentCoreTest, SmallIndex) { using namespace milvus::segcore; using namespace milvus::engine; -- GitLab