ctr_reader.cc 7.9 KB
Newer Older
Q
Qiao Longfei 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/operators/reader/ctr_reader.h"

Q
Qiao Longfei 已提交
17 18
#include <gzstream.h>

Q
Qiao Longfei 已提交
19 20 21 22 23 24 25 26 27 28
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <random>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>

Q
Qiao Longfei 已提交
29 30
namespace paddle {
namespace operators {
Q
Qiao Longfei 已提交
31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
namespace reader {

// Splits `s` at every occurrence of `delimiter` and appends the pieces, in
// order, to `*output` (existing elements of `*output` are kept).
//
// Empty pieces are preserved: an empty input appends one empty string, and
// leading, trailing or adjacent delimiters each produce an empty piece.
static inline void string_split(const std::string& s, const char delimiter,
                                std::vector<std::string>* output) {
  size_t piece_begin = 0;
  for (;;) {
    const size_t delim_pos = s.find(delimiter, piece_begin);
    if (delim_pos == std::string::npos) {
      // No further delimiter: the remainder of the string is the last piece.
      output->emplace_back(s.substr(piece_begin));
      return;
    }
    output->emplace_back(s.substr(piece_begin, delim_pos - piece_begin));
    piece_begin = delim_pos + 1;
  }
}

static inline void parse_line(
Q
Qiao Longfei 已提交
49 50
    const std::string& line,
    const std::unordered_map<std::string, size_t>& slot_to_index,
Q
Qiao Longfei 已提交
51
    int64_t* label,
Q
Qiao Longfei 已提交
52
    std::unordered_map<std::string, std::vector<int64_t>>* slot_to_data) {
Q
Qiao Longfei 已提交
53 54 55
  std::vector<std::string> ret;
  string_split(line, ' ', &ret);
  *label = std::stoi(ret[2]) > 0;
Q
Qiao Longfei 已提交
56

Q
Qiao Longfei 已提交
57 58
  for (size_t i = 3; i < ret.size(); ++i) {
    const std::string& item = ret[i];
Q
Qiao Longfei 已提交
59 60 61
    std::vector<std::string> feasign_and_slot;
    string_split(item, ':', &feasign_and_slot);
    if (feasign_and_slot.size() == 2 &&
Q
Qiao Longfei 已提交
62
        slot_to_index.find(feasign_and_slot[1]) != slot_to_index.end()) {
Q
Qiao Longfei 已提交
63 64
      int64_t feasign = std::strtoll(feasign_and_slot[0].c_str(), NULL, 10);
      (*slot_to_data)[feasign_and_slot[1]].push_back(feasign);
Q
Qiao Longfei 已提交
65 66
    }
  }
Q
Qiao Longfei 已提交
67 68

  // NOTE:: if the slot has no value, then fill [0] as it's data.
Q
Qiao Longfei 已提交
69 70 71
  for (auto& item : slot_to_index) {
    if (slot_to_data->find(item.first) == slot_to_data->end()) {
      (*slot_to_data)[item.first].push_back(0);
Q
Qiao Longfei 已提交
72 73
    }
  }
Q
Qiao Longfei 已提交
74 75
}

Q
Qiao Longfei 已提交
76 77 78 79 80 81 82 83
// Abstract interface for a line-oriented input source.
class Reader {
 public:
  virtual ~Reader() = default;

  // Reports whether another line can be fetched with NextLine().
  virtual bool HasNext() = 0;

  // Stores the next line of input into `*line`.
  virtual void NextLine(std::string* line) = 0;
};

class GzipReader : public Reader {
Q
Qiao Longfei 已提交
84
 public:
Q
Qiao Longfei 已提交
85 86
  explicit GzipReader(const std::string& file_name)
      : gzstream_(file_name.c_str()) {}
Q
Qiao Longfei 已提交
87

Q
Qiao Longfei 已提交
88
  ~GzipReader() {}
Q
Qiao Longfei 已提交
89

Q
Qiao Longfei 已提交
90
  bool HasNext() override { return gzstream_.peek() != EOF; }
Q
Qiao Longfei 已提交
91

Q
Qiao Longfei 已提交
92
  void NextLine(std::string* line) override { std::getline(gzstream_, *line); }
Q
Qiao Longfei 已提交
93 94

 private:
Q
Qiao Longfei 已提交
95
  igzstream gzstream_;
Q
Qiao Longfei 已提交
96 97
};

Q
Qiao Longfei 已提交
98
class PlainFileReader : public Reader {
Q
Qiao Longfei 已提交
99
 public:
Q
Qiao Longfei 已提交
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
  explicit PlainFileReader(const std::string& file_name)
      : myfile_(file_name.c_str()) {}

  ~PlainFileReader() {}

  bool HasNext() override { return myfile_.peek() != EOF; }

  void NextLine(std::string* line) override { std::getline(myfile_, *line); }

 private:
  std::ifstream myfile_;
};

// Presents several files as one continuous Reader. Each file is wrapped in a
// `SingleFileReader` (constructed from the file name) and the files are
// consumed in the order given.
template <typename SingleFileReader>
class MultiFileReader : public Reader {
 public:
  // Creates one SingleFileReader per entry of `file_list`, all up front.
  explicit MultiFileReader(const std::vector<std::string>& file_list) {
    for (const auto& path : file_list) {
      readers_.push_back(std::make_shared<SingleFileReader>(path));
    }
  }

  // Advances past exhausted readers until one still has data. Returns false
  // when every reader has been exhausted.
  bool HasNext() override {
    while (current_reader_index_ < readers_.size()) {
      if (readers_[current_reader_index_]->HasNext()) {
        return true;
      }
      ++current_reader_index_;
    }
    return false;
  }

  // Reads the next line from the current reader. The index is not
  // bounds-checked here, so callers must only call this after HasNext()
  // returned true.
  void NextLine(std::string* line) override {
    auto& active = readers_[current_reader_index_];
    active->NextLine(line);
  }

 private:
  std::vector<std::shared_ptr<SingleFileReader>> readers_;
  size_t current_reader_index_ = 0;
};

Q
Qiao Longfei 已提交
142 143
void MonitorThread(std::vector<ReaderThreadStatus>* thread_status,
                   std::shared_ptr<LoDTensorBlockingQueue> queue) {
Q
Qiao Longfei 已提交
144
  VLOG(3) << "monitor thread in";
Q
Qiao Longfei 已提交
145 146
  bool reader_thread_is_running = true;
  while (reader_thread_is_running) {
Q
Qiao Longfei 已提交
147
    VLOG(3) << "reader_thread_is_running";
Q
Qiao Longfei 已提交
148 149 150
    reader_thread_is_running = false;
    for (size_t i = 0; i < (*thread_status).size(); ++i) {
      if ((*thread_status)[i] == Running) {
Q
Qiao Longfei 已提交
151
        VLOG(3) << "reader is running!";
Q
Qiao Longfei 已提交
152 153 154 155 156
        reader_thread_is_running = true;
      }
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(1000));
  }
Q
Qiao Longfei 已提交
157
  VLOG(3) << "all reader thread is stopped, push empty data into queue";
Q
Qiao Longfei 已提交
158
  queue->Push({});
Q
Qiao Longfei 已提交
159
  VLOG(3) << "monitor thread exited";
Q
Qiao Longfei 已提交
160 161
}

Q
Qiao Longfei 已提交
162
void ReadThread(const std::vector<std::string>& file_list,
Q
Qiao Longfei 已提交
163 164 165
                const std::string& file_type, const std::string& file_format,
                const std::vector<std::string>& dense_slots,
                const std::vector<std::string>& sparse_slots, int batch_size,
Q
Qiao Longfei 已提交
166
                int thread_id, std::vector<ReaderThreadStatus>* thread_status,
Q
Qiao Longfei 已提交
167
                std::shared_ptr<LoDTensorBlockingQueue> queue) {
Q
Qiao Longfei 已提交
168 169
  VLOG(3) << "[" << thread_id << "]"
          << " reader thread start! thread_id = " << thread_id;
Q
Qiao Longfei 已提交
170
  for (auto& file : file_list) {
Q
Qiao Longfei 已提交
171 172
    VLOG(3) << "[" << thread_id << "]"
            << " file " << file;
Q
Qiao Longfei 已提交
173
  }
Q
Qiao Longfei 已提交
174
  (*thread_status)[thread_id] = Running;
Q
Qiao Longfei 已提交
175
  VLOG(3) << "set status to running";
Q
Qiao Longfei 已提交
176 177

  std::unordered_map<std::string, size_t> slot_to_index;
Q
Qiao Longfei 已提交
178 179
  for (size_t i = 0; i < sparse_slots.size(); ++i) {
    slot_to_index[sparse_slots[i]] = i;
Q
Qiao Longfei 已提交
180
  }
Q
Qiao Longfei 已提交
181

Q
Qiao Longfei 已提交
182
  std::string line;
Q
Qiao Longfei 已提交
183 184 185

  std::vector<std::unordered_map<std::string, std::vector<int64_t>>> batch_data;
  std::vector<int64_t> batch_label;
Q
Qiao Longfei 已提交
186

Q
Qiao Longfei 已提交
187 188 189 190 191 192 193 194
  std::unique_ptr<Reader> reader;
  if (file_type == "gzip") {
    reader.reset(new MultiFileReader<GzipReader>(file_list));
  } else if (file_type == "plain") {
    reader.reset(new MultiFileReader<PlainFileReader>(file_list));
  } else {
    PADDLE_THROW("do not support file format %s", file_type);
  }
Q
Qiao Longfei 已提交
195

Q
Qiao Longfei 已提交
196
  VLOG(3) << "reader inited";
Q
Qiao Longfei 已提交
197

Q
Qiao Longfei 已提交
198
  while (reader->HasNext()) {
Q
Qiao Longfei 已提交
199
    batch_data.clear();
Q
Qiao Longfei 已提交
200 201
    batch_data.reserve(batch_size);

Q
Qiao Longfei 已提交
202
    batch_label.clear();
Q
Qiao Longfei 已提交
203
    batch_label.reserve(batch_size);
Q
Qiao Longfei 已提交
204 205

    // read batch_size data
Q
Qiao Longfei 已提交
206
    for (int i = 0; i < batch_size; ++i) {
Q
Qiao Longfei 已提交
207 208
      if (reader->HasNext()) {
        reader->NextLine(&line);
Q
Qiao Longfei 已提交
209
        std::unordered_map<std::string, std::vector<int64_t>> slot_to_data;
Q
Qiao Longfei 已提交
210
        int64_t label;
Q
Qiao Longfei 已提交
211 212
        parse_line(line, slot_to_index, &label, &slot_to_data);
        batch_data.push_back(slot_to_data);
Q
Qiao Longfei 已提交
213 214 215 216
        batch_label.push_back(label);
      } else {
        break;
      }
Q
Qiao Longfei 已提交
217
    }
Q
Qiao Longfei 已提交
218

Q
Qiao Longfei 已提交
219 220
    std::vector<framework::LoDTensor> lod_datas;

Q
Qiao Longfei 已提交
221 222
    // first insert tensor for each sparse_slots
    for (auto& slot : sparse_slots) {
Q
Qiao Longfei 已提交
223 224 225
      std::vector<size_t> lod_data{0};
      std::vector<int64_t> batch_feasign;

Q
Qiao Longfei 已提交
226 227 228
      for (size_t i = 0; i < batch_data.size(); ++i) {
        auto& feasign = batch_data[i][slot];
        lod_data.push_back(lod_data.back() + feasign.size());
Q
Qiao Longfei 已提交
229 230
        batch_feasign.insert(batch_feasign.end(), feasign.begin(),
                             feasign.end());
Q
Qiao Longfei 已提交
231
      }
Q
Qiao Longfei 已提交
232 233 234 235 236 237 238

      framework::LoDTensor lod_tensor;
      framework::LoD lod{lod_data};
      lod_tensor.set_lod(lod);
      int64_t* tensor_data = lod_tensor.mutable_data<int64_t>(
          framework::make_ddim({1, static_cast<int64_t>(batch_feasign.size())}),
          platform::CPUPlace());
Q
Qiao Longfei 已提交
239 240
      memcpy(tensor_data, batch_feasign.data(),
             batch_feasign.size() * sizeof(int64_t));
Q
Qiao Longfei 已提交
241 242
      lod_datas.push_back(lod_tensor);
    }
Q
Qiao Longfei 已提交
243 244 245

    // insert label tensor
    framework::LoDTensor label_tensor;
Q
Qiao Longfei 已提交
246
    auto* label_tensor_data = label_tensor.mutable_data<int64_t>(
Q
Qiao Longfei 已提交
247 248
        framework::make_ddim({1, static_cast<int64_t>(batch_label.size())}),
        platform::CPUPlace());
Q
Qiao Longfei 已提交
249 250
    memcpy(label_tensor_data, batch_label.data(),
           batch_label.size() * sizeof(int64_t));
Q
Qiao Longfei 已提交
251 252
    lod_datas.push_back(label_tensor);

Q
Qiao Longfei 已提交
253
    queue->Push(lod_datas);
Q
Qiao Longfei 已提交
254
    VLOG(4) << "push one data, queue_size=" << queue->Size();
Q
Qiao Longfei 已提交
255
  }
Q
Qiao Longfei 已提交
256 257

  (*thread_status)[thread_id] = Stopped;
Q
Qiao Longfei 已提交
258
  VLOG(3) << "set status to stopped, thread " << thread_id << " exited";
Q
Qiao Longfei 已提交
259 260 261
}

}  // namespace reader
Q
Qiao Longfei 已提交
262 263
}  // namespace operators
}  // namespace paddle