提交 fe183415 编写于 作者: D dongzhihong

"seperate internal library and exported library"

上级 7364348d
cc_library(header SRCS header.cc) # internal library.
cc_test(header_test SRCS header_test.cc DEPS header)
cc_library(io SRCS io.cc DEPS stringpiece) cc_library(io SRCS io.cc DEPS stringpiece)
cc_test(io_test SRCS io_test.cc DEPS io) cc_test(io_test SRCS io_test.cc DEPS io)
cc_library(header SRCS header.cc DEPS io)
cc_test(header_test SRCS header_test.cc DEPS header)
cc_library(chunk SRCS chunk.cc DEPS snappy) cc_library(chunk SRCS chunk.cc DEPS snappy)
cc_test(chunk_test SRCS chunk_test.cc DEPS chunk)
cc_library(range_scanner SRCS range_scanner.cc DEPS io chunk)
cc_test(range_scanner_test SRCS range_scanner_test.cc DEPS range_scanner)
cc_library(scanner SRCS scanner.cc DEPS range_scanner)
cc_test(scanner_test SRCS scanner_test.cc DEPS scanner)
# exported library.
cc_library(recordio SRCS recordio.cc DEPS scanner chunk header)
cc_test(recordio_test SRCS recordio_test.cc DEPS scanner)
...@@ -32,9 +32,10 @@ public: ...@@ -32,9 +32,10 @@ public:
bool Dump(Stream* fo, Compressor ct); bool Dump(Stream* fo, Compressor ct);
void Parse(Stream* fi, size_t offset); void Parse(Stream* fi, size_t offset);
size_t NumBytes() { return num_bytes_; } size_t NumBytes() { return num_bytes_; }
// Returns a copy of the i-th record stored in this chunk.
// records_ is a std::forward_list, which offers no operator[], so the list is
// walked from the front; the cost is linear in i.
// NOTE(review): i is not range-checked; callers must pass an index below the
// number of stored records — confirm whether a bounds check is wanted here.
const std::string Record(int i) {
  auto it = records_.begin();
  for (int step = 0; step < i; ++step) {
    ++it;
  }
  return *it;
}
private: private:
std::forward_list<std::string> records_; std::forward_list<const std::string> records_;
// sum of record lengths in bytes. // sum of record lengths in bytes.
size_t num_bytes_; size_t num_bytes_;
DISABLE_COPY_AND_ASSIGN(Chunk); DISABLE_COPY_AND_ASSIGN(Chunk);
......
...@@ -22,12 +22,18 @@ using namespace paddle::recordio; ...@@ -22,12 +22,18 @@ using namespace paddle::recordio;
TEST(Recordio, ChunkHead) { TEST(Recordio, ChunkHead) {
Header hdr(0, 1, Compressor::kGzip, 3); Header hdr(0, 1, Compressor::kGzip, 3);
Stream* oss = Stream::Open("/tmp/record_1", "w"); {
hdr->Write(oss); Stream* oss = Stream::Open("/tmp/record_1", "w");
hdr.Write(oss);
delete oss;
}
// Stream* iss = Stream::Open("/tmp/record_1", "r"); Header hdr2;
// Header hdr2; {
// hdr2.Parse(iss); Stream* iss = Stream::Open("/tmp/record_1", "r");
hdr2.Parse(iss);
delete iss;
}
// EXPECT_TRUE(hdr == hdr2); EXPECT_TRUE(hdr == hdr2);
} }
...@@ -17,10 +17,37 @@ ...@@ -17,10 +17,37 @@
namespace paddle { namespace paddle {
namespace recordio { namespace recordio {
void Index::LoadIndex(FileStream* fi) {
int64_t offset = 0;
while (!fi->Eof()) {
Header hdr;
hdr.Parse(fi);
chunk_offsets_.push_back(offset);
chunk_lens_.push_back(hdr.NumRecords());
chunk_records_.push_back(hdr.NumRecords());
num_records_ += hdr.NumRecords();
offset += hdr.CompressSize();
}
}
Index Index::ChunkIndex(int i) { Index idx; } Index Index::ChunkIndex(int i) { Index idx; }
RangeScanner::RangeScanner(std::istream is, Index idx, int start, int len) std::pair<int, int> Index::Locate(int record_idx) {
: stream_(is.rdbuf()), index_(idx) { std::pair<int, int> range(-1, -1);
  // Locate maps a file-wide record index to the pair
  // (chunk index, record index inside that chunk). `range` keeps its initial
  // (-1, -1) when record_idx does not fall inside any chunk.
  if (record_idx < 0) {
    // A negative index would otherwise satisfy `record_idx < sum` for the
    // first chunk and yield a negative in-chunk position.
    return range;
  }
  int sum = 0;
  for (size_t i = 0; i < chunk_lens_.size(); ++i) {
    int len = static_cast<int>(chunk_lens_[i]);
    sum += len;
    if (record_idx < sum) {
      range.first = static_cast<int>(i);
      range.second = record_idx - sum + len;
      // Stop at the first chunk whose cumulative count exceeds record_idx.
      // Every later chunk satisfies the same test, so continuing the loop
      // would overwrite the result with the last chunk and a wrong position.
      return range;
    }
  }
  return range;
}
RangeScanner::RangeScanner(Stream* fi, Index idx, int start, int len)
: stream_(fi), index_(idx) {
if (start < 0) { if (start < 0) {
start = 0; start = 0;
} }
...@@ -30,16 +57,28 @@ RangeScanner::RangeScanner(std::istream is, Index idx, int start, int len) ...@@ -30,16 +57,28 @@ RangeScanner::RangeScanner(std::istream is, Index idx, int start, int len)
start_ = start; start_ = start;
end_ = start + len; end_ = start + len;
cur_ = start - 1; cur_ = start - 1; // The initial status required by Scan
chunk_index_ = -1; chunk_index_ = -1;
// chunk_->reset(new Chunk()); chunk_.reset(new Chunk);
} }
bool RangeScanner::Scan() {} bool RangeScanner::Scan() {
++cur_;
if (cur_ >= end_) {
return false;
} else {
auto cursor = index_.Locate(cur_);
if (chunk_index_ != cursor.first) {
chunk_index_ = cursor.first;
chunk_->Parse(fi, index_.ChunkOffsets[chunk_index_]);
}
}
return true;
}
const std::string RangeScanner::Record() { const std::string RangeScanner::Record() {
// int i = index_.Locate(cur_); auto cursor = index_.Locate(cur_);
// return chunk_->Record(i); return chunk_->Record(cursor.second);
} }
} // namespace recordio } // namespace recordio
......
...@@ -14,6 +14,9 @@ ...@@ -14,6 +14,9 @@
#pragma once #pragma once
#include <utility>
#include "paddle/fluid/recordio/chunk.h"
#include "paddle/fluid/recordio/io.h" #include "paddle/fluid/recordio/io.h"
namespace paddle { namespace paddle {
...@@ -26,29 +29,22 @@ namespace recordio { ...@@ -26,29 +29,22 @@ namespace recordio {
// for the correct encoding and decoding using Gob. // for the correct encoding and decoding using Gob.
class Index { class Index {
public: public:
Index() : num_records_(0) {}
// LoadIndex scans the file and parses chunkOffsets, chunkLens, and len.
void LoadIndex(Stream* fi);
// NumRecords returns the total number of all records in a RecordIO file.
int NumRecords() { return num_records_; } int NumRecords() { return num_records_; }
// NumChunks returns the total number of chunks in a RecordIO file. // NumChunks returns the total number of chunks in a RecordIO file.
int NumChunks() { return chunk_lens_.size(); } int NumChunks() { return chunk_lens_.size(); }
// ChunkIndex return the Index of i-th Chunk. // ChunkIndex return the Index of i-th Chunk.
int ChunkIndex(int i); int ChunkIndex(int i);
int64_t ChunkOffsets(int i) { return chunk_offsets_[i]; }
// Locate returns the index of chunk that contains the given record, // Locate returns the index of chunk that contains the given record,
// and the record index within the chunk. It returns (-1, -1) if the // and the record index within the chunk. It returns (-1, -1) if the
// record is out of range. // record is out of range.
void Locate(int record_idx, std::pair<int, int>* out) { std::pair<int, int> Locate(int record_idx);
size_t sum = 0;
for (size_t i = 0; i < chunk_lens_.size(); ++i) {
sum += chunk_lens_[i];
if (static_cast<size_t>(record_idx) < sum) {
out->first = i;
out->second = record_idx - sum + chunk_lens_[i];
return;
}
}
// out->swap(std::make_pair<int,int>(-1, -1));
out->first = -1;
out->second = -1;
}
private: private:
// the offset of each chunk in a file. // the offset of each chunk in a file.
...@@ -62,12 +58,14 @@ private: ...@@ -62,12 +58,14 @@ private:
}; };
// RangeScanner // RangeScanner
// creates a scanner that sequentially reads records in the
// range [start, start+len). If start < 0, it scans from the
// beginning. If len < 0, it scans till the end of file.
class RangeScanner { class RangeScanner {
public: public:
// creates a scanner that sequentially reads records in the
// range [start, start+len). If start < 0, it scans from the
// beginning. If len < 0, it scans till the end of file.
RangeScanner(Stream* fi, Index idx, int start, int end); RangeScanner(Stream* fi, Index idx, int start, int end);
// Scan moves the cursor forward for one record and loads the chunk
// containing the record if not yet.
bool Scan(); bool Scan();
const std::string Record(); const std::string Record();
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/recordio/range_scanner.h"
#include "gtest/gtest.h"
using namespace paddle::recordio;
// Placeholder test for RangeScanner: it currently only opens the output
// stream and makes no assertions yet.
TEST(RangeScanner, Recordio) {
  Stream* fo = Stream::Open("/tmp/record_range", "w");
  // Stream::Open returns a raw pointer; release it here so the test does not
  // leak the stream (the other recordio tests delete theirs the same way).
  delete fo;
}
...@@ -12,13 +12,9 @@ ...@@ -12,13 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once #include "paddle/fluid/recordio/io.h"
#include "paddle/fluid/string/piece.h"
#include <fcntl.h> namespace paddle {
#include <stdio.h> namespace recordio {} // namespace recordio
#include <unistd.h> } // namespace paddle
class DefaultFileSys {
public:
private:
};
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/recordio/chunk.h"
#include "paddle/fluid/recordio/header.h"
#include "paddle/fluid/recordio/io.h"
#include "paddle/fluid/recordio/scanner.h"
#include "paddle/fluid/recordio/writer.h"
...@@ -31,7 +31,7 @@ Scanner::Scanner(const char* paths) ...@@ -31,7 +31,7 @@ Scanner::Scanner(const char* paths)
} }
bool Scanner::Scan() { bool Scanner::Scan() {
if (err_ == -1 || end_ == true) { if (end_ == true) {
return false; return false;
} }
if (cur_scanner_ == nullptr) { if (cur_scanner_ == nullptr) {
...@@ -39,20 +39,30 @@ bool Scanner::Scan() { ...@@ -39,20 +39,30 @@ bool Scanner::Scan() {
end_ = true; end_ = true;
return false; return false;
} }
if (err_ == -1) {
return false;
}
} }
if (!cur_scanner_->Scan()) { if (!cur_scanner_->Scan()) {
if (err_ == -1) { end_ = true;
return false; cur_file_ = nullptr;
} return false;
} }
return true; return true;
} }
bool Scanner::NextFile() {} bool Scanner::NextFile() {
// NextFile opens the next entry of paths_, loads its index and installs a
// scanner over it. Returns false when path_idx_ has run past paths_ or when
// Stream::Open yields nullptr; returns true otherwise.
if (path_idx_ >= paths_.size()) {
return false;
}
std::string path = paths_[path_idx_];
// path_idx_ is advanced before the open is attempted, so a path that fails to
// open is not retried by the next call.
++path_idx_;
// NOTE(review): other call sites pass Stream::Open a path and a mode string;
// confirm that a single-argument overload taking std::string exists.
cur_file_ = Stream::Open(path);
if (cur_file_ == nullptr) {
return false;
}
Index idx;
idx.LoadIndex(cur_file_);
// start = 0 and len = -1 are passed; the RangeScanner header comment says a
// negative len scans until the end of the file.
// NOTE(review): Scan() compares cur_scanner_ against nullptr, which suggests
// a pointer-like member; assigning a RangeScanner value may not compile —
// confirm the member's type. Also confirm who releases the previous cur_file_.
cur_scanner_ = RangeScanner(cur_file_, idx, 0, -1);
return true;
}
} // namespace recordio } // namespace recordio
} // namespace paddle } // namespace paddle
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册