Switch the CTR reader's gzip decoding from boost::iostreams to gzstream.

GzipReader now wraps an igzstream, and NextLine() on both GzipReader and
MultiGzipReader takes a std::string* output parameter instead of a
std::string&. GzipReader::NextLine dereferences that pointer for
std::getline (which requires a std::string&), and MultiGzipReader::NextLine
forwards the pointer unchanged. ReadThread now reads one line through
MultiGzipReader and parses it.

diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt
index 341aeda4e41a533f517e47ad16a3868714775c3c..4ad376c6170b47b547a9cad62d81c8e0871005d8 100644
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
@@ -16,7 +16,7 @@ function(reader_library TARGET_NAME)
 endfunction()
 
 cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool)
-cc_library(ctr_reader SRCS ctr_reader.cc DEPS reader simple_threadpool boost)
+cc_library(ctr_reader SRCS ctr_reader.cc DEPS reader simple_threadpool boost gzstream)
 reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader)
 reader_library(create_ctr_reader_op SRCS create_ctr_reader_op.cc DEPS ctr_reader)
 reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc)
diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc
index a4197a54349eeb1582bbba2cf0c65bc8e0d20b9c..8be9f68c9410ac9dede0a70f8137e552d3009ef8 100644
--- a/paddle/fluid/operators/reader/ctr_reader.cc
+++ b/paddle/fluid/operators/reader/ctr_reader.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/fluid/operators/reader/ctr_reader.h"
 
+#include
+
 #include
 #include
 #include
@@ -24,10 +26,6 @@
 #include
 #include
 
-#include
-#include
-#include
-
 namespace paddle {
 namespace operators {
 namespace reader {
@@ -75,23 +73,19 @@ static inline void parse_line(
 
 class GzipReader {
  public:
-  explicit GzipReader(const std::string& file_name) : instream_(&inbuf_) {
-    file_ = std::ifstream(file_name, std::ios_base::in | std::ios_base::binary);
-    inbuf_.push(boost::iostreams::gzip_decompressor());
-    inbuf_.push(file_);
-    // Convert streambuf to istream
-  }
+  explicit GzipReader(const std::string& file_name)
+      : gzstream_(file_name.c_str()) {}
 
-  ~GzipReader() { file_.close(); }
+  ~GzipReader() {}
 
-  bool HasNext() { return instream_.peek() != EOF; }
+  bool HasNext() { return gzstream_.peek() != EOF; }
 
-  void NextLine(std::string& line) { std::getline(instream_, line); }  // NOLINT
+  void NextLine(std::string* line) {  // NOLINT
+    std::getline(gzstream_, *line);
+  }
 
  private:
-  boost::iostreams::filtering_streambuf inbuf_;
-  std::ifstream file_;
-  std::istream instream_;
+  igzstream gzstream_;
 };
 
 class MultiGzipReader {
@@ -113,8 +107,8 @@ class MultiGzipReader {
     return true;
   }
 
-  void NextLine(std::string& line) {  // NOLINT
+  void NextLine(std::string* line) {
     readers_[current_reader_index_]->NextLine(line);
   }
 
  private:
@@ -122,12 +116,6 @@ class MultiGzipReader {
   size_t current_reader_index_ = 0;
 };
 
-// void CTRReader::ReadThread(
-//    const std::vector &file_list,
-//    const std::vector& slots,
-//    int batch_size,
-//    std::shared_ptr& queue) {}
-
 void CTRReader::ReadThread(const std::vector& file_list,
                            const std::vector& slots,
                            int batch_size,
@@ -135,14 +123,12 @@ void CTRReader::ReadThread(const std::vector& file_list,
   std::string line;
 
   // read all files
-  std::vector all_lines;
   MultiGzipReader reader(file_list);
+  reader.NextLine(&line);
 
-  for (int j = 0; j < all_lines.size(); ++j) {
-    std::unordered_map> slots_to_data;
-    int64_t label;
-    parse_line(all_lines[j], slots, &label, &slots_to_data);
-  }
+  std::unordered_map> slots_to_data;
+  int64_t label;
+  parse_line(line, slots, &label, &slots_to_data);
 }
 
 }  // namespace reader
diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h
index 1ef6e6d551fb155e7de12a370303a67992415b80..11eb4f97864a849942088694840b98a3a808877b 100644
--- a/paddle/fluid/operators/reader/ctr_reader.h
+++ b/paddle/fluid/operators/reader/ctr_reader.h
@@ -22,10 +22,6 @@
 #include
 #include
 
-#include
-#include
-#include
-
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"