From 9f53aad13ad840a2b49546bb5832eb74ee268687 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 3 Dec 2018 10:16:40 +0800 Subject: [PATCH] add test for read csv data --- paddle/fluid/operators/reader/ctr_reader.cc | 144 ++++++++++++++++-- .../fluid/operators/reader/ctr_reader_test.cc | 68 +++++++++ 2 files changed, 196 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index 9834d7183..3595d771b 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -76,22 +76,6 @@ static inline void parse_line( // label slot1:fea_sign slot2:fea_sign slot1:fea_sign static inline void parse_svm_line(const std::string& line) {} -// label,dense_fea,dense_fea,sparse_fea,sparse_fea -static inline void parse_csv_line(const std::string& line, - const DataDesc& data_desc, int64_t* label, - std::vector* dense_datas, - std::vector* sparse_datas) { - std::vector ret; - string_split(line, ',', &ret); - *label = std::stol(ret[2]) > 0; - for (auto& idx : data_desc.dense_slot_index_) { - dense_datas->push_back(std::stof(ret[idx])); - } - for (auto& idx : data_desc.sparse_slot_index_) { - sparse_datas->push_back(std::stol(ret[idx])); - } -} - class Reader { public: virtual ~Reader() {} @@ -250,6 +234,132 @@ void ReadSvmData(const DataDesc& data_desc, std::shared_ptr reader, } } +// label dense_fea,dense_fea sparse_fea,sparse_fea +static inline void parse_csv_line( + const std::string& line, const DataDesc& data_desc, int64_t* label, + std::vector>* dense_datas, + std::vector>* sparse_datas) { + std::vector ret; + string_split(line, ' ', &ret); + *label = std::stol(ret[0]); + dense_datas->resize(data_desc.dense_slot_index_.size()); + for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) { + int slot_idx = data_desc.dense_slot_index_[i]; + auto& slot_data = ret[slot_idx]; + std::vector data_in_slot_str; + string_split(ret[slot_idx], ',', &data_in_slot_str); + std::vector data_in_slot; + for (auto& data_str : data_in_slot_str) { + (*dense_datas)[i].push_back(std::stof(data_str)); + } + } + sparse_datas->resize(data_desc.sparse_slot_index_.size()); + for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) { + int slot_idx = data_desc.sparse_slot_index_[i]; + auto& slot_data = ret[slot_idx]; + std::vector data_in_slot_str; + string_split(ret[slot_idx], ',', &data_in_slot_str); + std::vector data_in_slot; + for (auto& data_str : data_in_slot_str) { + (*sparse_datas)[i].push_back(std::stol(data_str)); + } + } +} + +void ReadCsvData(const DataDesc& data_desc, std::shared_ptr reader, + std::shared_ptr queue) { + std::string line; + while (reader->HasNext()) { + std::vector batch_label; + batch_label.reserve(data_desc.batch_size_); + + std::vector>> batch_dense_data; + batch_dense_data.reserve(data_desc.batch_size_); + + std::vector>> batch_sparse_data; + batch_sparse_data.reserve(data_desc.batch_size_); + + // read batch_size data + for (int i = 0; i < data_desc.batch_size_; ++i) { + if (reader->HasNext()) { + reader->NextLine(&line); + int64_t label; + std::vector> dense_datas; + std::vector> sparse_datas; + parse_csv_line(line, data_desc, &label, &dense_datas, &sparse_datas); + batch_label.push_back(label); + if (!batch_dense_data.empty()) { + PADDLE_ENFORCE_EQ(batch_dense_data[0].size(), dense_datas.size(), + "dense data should have the same shape"); + } + batch_dense_data.push_back(dense_datas); + batch_sparse_data.push_back(sparse_datas); + } else { + break; + } + } + + // the order of output data is label, dense_datas, sparse_datas + std::vector lod_datas; + + // insert label tensor + framework::LoDTensor label_tensor; + auto* label_tensor_data = label_tensor.mutable_data( + framework::make_ddim({static_cast(batch_label.size()), 1}), + platform::CPUPlace()); + memcpy(label_tensor_data, batch_label.data(), + batch_label.size() * sizeof(int64_t)); + auto dim = + framework::make_ddim({static_cast(batch_label.size()), 1}); + lod_datas.push_back(label_tensor); + + // insert tensor for each dense_slots + for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) { + framework::LoDTensor lod_tensor; + size_t width = batch_dense_data[0][i].size(); + auto* tensor_data = lod_tensor.mutable_data( + framework::make_ddim( + {static_cast(batch_dense_data.size()), // batch_size + static_cast(width)}), + platform::CPUPlace()); + + for (size_t j = 0; j < batch_dense_data.size(); ++j) { + auto& dense_data_row = batch_dense_data[j][i]; + memcpy(tensor_data + j * width, dense_data_row.data(), + width * sizeof(float)); + } + + lod_datas.push_back(lod_tensor); + } + + // insert tensor for each sparse_slots + for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) { + std::vector lod_data{0}; + std::vector batch_feasign; + + for (size_t row_idx = 0; row_idx < batch_sparse_data.size(); ++row_idx) { + auto& sparse_ids = batch_sparse_data[row_idx][i]; + lod_data.push_back(lod_data.back() + sparse_ids.size()); + batch_feasign.insert(batch_feasign.end(), sparse_ids.begin(), + sparse_ids.end()); + } + + framework::LoDTensor lod_tensor; + framework::LoD lod{lod_data}; + lod_tensor.set_lod(lod); + int64_t* tensor_data = lod_tensor.mutable_data( + framework::make_ddim({static_cast(batch_feasign.size()), 1}), + platform::CPUPlace()); + memcpy(tensor_data, batch_feasign.data(), + batch_feasign.size() * sizeof(int64_t)); + lod_datas.push_back(lod_tensor); + } + + queue->Push(lod_datas); + VLOG(4) << "push one data, queue_size=" << queue->Size(); + } +} + void ReadThread(const std::vector& file_list, const DataDesc& data_desc, int thread_id, std::vector* thread_status, @@ -276,6 +386,8 @@ void ReadThread(const std::vector& file_list, if (data_desc.file_format_ == "svm") { ReadSvmData(data_desc, reader, queue); + } else if (data_desc.file_format_ == "csv") { + ReadCsvData(data_desc, reader, queue); } (*thread_status)[thread_id] = Stopped; diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index dfdaae3a0..9f3a254c8 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -159,3 +159,71 @@ TEST(CTR_READER, read_data) { &reader); reader.Shutdown(); } + +static void GenereteCsvData(const std::string& file_name, + const std::vector& data) { + std::ofstream out(file_name.c_str()); + PADDLE_ENFORCE(out.good(), "open file %s failed!", file_name); + for (auto& c : data) { + out << c; + } + out.close(); + PADDLE_ENFORCE(out.good(), "save file %s failed!", file_name); +} + +static void CheckReadCsvOut(const std::vector& out) { + ASSERT_EQ(out.size(), 3); + ASSERT_EQ(out[0].dims()[1], 1); + ASSERT_EQ(out[1].dims()[1], 2); + ASSERT_EQ(out[2].dims()[1], 1); + for (size_t i = 0; i < out[0].numel(); ++i) { + int64_t label = out[0].data()[i]; + auto& dense_dim = out[1].dims(); + for (size_t j = 0; j < dense_dim[1]; ++j) { + ASSERT_EQ(out[1].data()[i * dense_dim[1] + j], + static_cast(label + 0.1)); + } + auto& sparse_lod = out[2].lod(); + for (size_t j = sparse_lod[0][i]; j < sparse_lod[0][i + 1]; ++j) { + ASSERT_EQ(out[2].data()[j], label); + } + } +} + +TEST(CTR_READER, read_csv_data) { + std::string file_name = "test_ctr_reader_data.csv"; + const std::vector csv_data = { + "0 0.1,0.1 0,0,0,0\n", "1 1.1,1.1 1,1,1,1\n", "2 2.1,2.1 2,2,2,2\n", + "3 3.1,3.1 3,3,3,3\n", + }; + GenereteCsvData(file_name, csv_data); + + LoDTensorBlockingQueueHolder queue_holder; + int capacity = 64; + queue_holder.InitOnce(capacity, false); + + std::shared_ptr queue = queue_holder.GetQueue(); + + int batch_size = 3; + int thread_num = 1; + std::vector file_list; + for (int i = 0; i < thread_num; ++i) { + file_list.push_back(file_name); + } + DataDesc data_desc(batch_size, file_list, "plain", "csv", {1}, {2}, {}); + + CTRReader reader(queue, thread_num, data_desc); + + for (size_t i = 0; i < 2; ++i) { + reader.Start(); + std::vector out; + while (true) { + reader.ReadNext(&out); + if (out.empty()) { + break; + } + CheckReadCsvOut(out); + } + reader.Shutdown(); + } +} -- GitLab