range_scanner.h 2.6 KB
Newer Older
D
"init"  
dzhwinter 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

17 18 19
#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "paddle/fluid/recordio/chunk.h"
#include "paddle/fluid/recordio/io.h"
D
"init"  
dzhwinter 已提交
21

22 23 24 25 26 27 28 29
namespace paddle {
namespace recordio {

// Index consists of the offsets and sizes of the consecutive chunks in a
// RecordIO file.
//
// Index supports Gob. Every field in the Index needs to be exported
// for the correct encoding and decoding using Gob.
D
"init"  
dzhwinter 已提交
30 31
class Index {
public:
  Index() : num_records_(0) {}

  // LoadIndex scans the file and parses the chunk offsets, the chunk lengths
  // and the record counts.
  void LoadIndex(Stream* fi);

  // NumRecords returns the total number of all records in a RecordIO file.
  int NumRecords() const { return num_records_; }

  // NumChunks returns the total number of chunks in a RecordIO file.
  // The container size is narrowed to int explicitly so the conversion is
  // visible at the call site instead of happening silently.
  int NumChunks() const { return static_cast<int>(chunk_lens_.size()); }

  // ChunkIndex returns the index of the i-th chunk.
  int ChunkIndex(int i);

  // ChunkOffsets returns the stored offset of the i-th chunk.
  // The index is not range-checked: i must be a valid position in the
  // offsets vector.
  int64_t ChunkOffsets(int i) const { return chunk_offsets_[i]; }

  // Locate returns the index of chunk that contains the given record,
  // and the record index within the chunk.  It returns (-1, -1) if the
  // record is out of range.
  std::pair<int, int> Locate(int record_idx);

private:
  // the offset of each chunk in a file.
  std::vector<int64_t> chunk_offsets_;
  // the length of each chunk in a file.
  std::vector<uint32_t> chunk_lens_;
  // the number of all records in a file.
  int num_records_;
  // the number of records in chunks.
  std::vector<int> chunk_records_;
};

// RangeScanner
class RangeScanner {
public:
63 64 65
  // creates a scanner that sequencially reads records in the
  // range [start, start+len).  If start < 0, it scans from the
  // beginning.  If len < 0, it scans till the end of file.
66
  RangeScanner(Stream* fi, Index idx, int start, int end);
67 68
  // Scan moves the cursor forward for one record and loads the chunk
  // containing the record if not yet.
D
"init"  
dzhwinter 已提交
69 70 71 72
  bool Scan();
  const std::string Record();

private:
73
  Stream* fi;
D
"init"  
dzhwinter 已提交
74 75 76 77 78
  Index index_;
  int start_, end_, cur_;
  int chunk_index_;
  std::unique_ptr<Chunk> chunk_;
};
79 80 81

}  // namespace recordio
}  // namespace paddle