ctr_reader_test.cc 4.5 KB
Newer Older
Q
Qiao Longfei 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

Q
Qiao Longfei 已提交
15 16
#include "paddle/fluid/operators/reader/ctr_reader.h"

Q
Qiao Longfei 已提交
17
#include <gzstream.h>
Q
Qiao Longfei 已提交
18 19
#include <time.h>

Q
Qiao Longfei 已提交
20 21 22 23 24 25
#include <math.h>
#include <stdio.h>
#include <cstring>
#include <fstream>
#include <tuple>

Q
Qiao Longfei 已提交
26 27 28 29 30 31 32 33
#include "gtest/gtest.h"

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/operators/reader/blocking_queue.h"

using paddle::operators::reader::LoDTensorBlockingQueue;
using paddle::operators::reader::LoDTensorBlockingQueueHolder;
using paddle::operators::reader::CTRReader;
Q
Qiao Longfei 已提交
34
using paddle::framework::LoDTensor;
Q
Qiao Longfei 已提交
35
using paddle::framework::LoD;
Q
Qiao Longfei 已提交
36
using paddle::framework::DDim;
Q
Qiao Longfei 已提交
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
using paddle::platform::CPUPlace;

// Writes every string in `data`, back to back with no separator added, to
// `file_name` through an ogzstream. If a file can already be opened at that
// path it is removed first.
//
// PADDLE_ENFORCE is evaluated after opening and again after closing the
// output stream, so a failure to open or to save the file is reported there.
static void generatedata(const std::vector<std::string>& data,
                         const std::string& file_name) {
  // Probe for an existing file inside its own scope so the read handle is
  // released before remove() runs. Removing a file that the current process
  // still holds open is implementation-defined (and refused on Windows).
  bool already_exists = false;
  {
    std::ifstream probe(file_name.c_str());
    already_exists = probe.good();
  }
  if (already_exists) {
    VLOG(3) << "file " << file_name << " exist, delete it first!";
    remove(file_name.c_str());
  }

  ogzstream out(file_name.c_str());
  PADDLE_ENFORCE(out.good(), "open file %s failed!", file_name);
  for (const auto& chunk : data) {
    out << chunk;
  }
  out.close();
  PADDLE_ENFORCE(out.good(), "save file %s failed!", file_name);
}
Q
Qiao Longfei 已提交
57 58

// Writes ten CTR samples to a gzip file, reads them back through a CTRReader
// in batches of three, and checks each batch's label tensor and the tensor of
// slot "6002" against the hand-written expectations below.
TEST(CTR_READER, read_data) {
  // NOTE(review): judging by the expected values further down, the third
  // field of each sample is the label and every "a:b" pair is
  // "feasign:slot" -- confirm against the CTRReader parser.
  const std::vector<std::string> ctr_data = {
      "aaaa 1 0 0:6002 1:6003 2:6004 3:6005 4:6006 -1\n",
      "bbbb 1 0 5:6003 6:6003 7:6003 8:6004 9:6004 -1\n",
      "cccc 1 1 10:6002 11:6002 12:6002 13:6002 14:6002 -2\n",
      "dddd 1 0 15:6003 16:6003 17:6003 18:6003 19:6004 -3\n",
      "1111 1 1 20:6001 21:6001 22:6001 23:6001 24:6001 12\n",
      "2222 1 1 25:6004 26:6004 27:6004 28:6005 29:6005 aa\n",
      "3333 1 0 30:6002 31:6003 32:6004 33:6004 34:6005 er\n",
      "eeee 1 1 35:6003 36:6003 37:6005 38:6005 39:6005 dd\n",
      "ffff 1 1 40:6002 41:6003 42:6004 43:6004 44:6005 66\n",
      "gggg 1 1 46:6006 45:6006 47:6003 48:6003 49:6003 ba\n",
  };
  std::string gz_file_name = "test_ctr_reader_data.gz";
  generatedata(ctr_data, gz_file_name);

  // Expected label of every sample, in file order.
  std::vector<int64_t> label_value = {0, 0, 1, 0, 1, 1, 0, 1, 1, 1};

  // Expected (LoD, flattened values) per batch for slot "6002".
  std::vector<std::tuple<LoD, std::vector<int64_t>>> data_slot_6002{
      {{{0, 1, 2, 7}}, {0, 0, 10, 11, 12, 13, 14}},
      {{{0, 1, 2, 3}}, {0, 0, 0}},
      {{{0, 1, 2, 3}}, {30, 0, 40}},
      {{{0, 1}}, {0}}};
  // Expected (LoD, flattened values) per batch for slot "6003".
  // TODO(review): this table is never compared against the reader output in
  // this test; consider verifying out[1] once its layout is confirmed.
  std::vector<std::tuple<LoD, std::vector<int64_t>>> data_slot_6003{
      {{{0, 1, 4, 5}}, {1, 5, 6, 7, 0}},
      {{{0, 4, 5, 6}}, {15, 16, 17, 18, 0, 0}},
      {{{0, 1, 3, 4}}, {31, 35, 36, 41}},
      {{{0, 3}}, {47, 48, 49}}};

  // Expected shape of the label tensor for each batch; the last batch holds
  // the single leftover sample.
  std::vector<DDim> label_dims = {{1, 3}, {1, 3}, {1, 3}, {1, 1}};

  LoDTensorBlockingQueueHolder queue_holder;
  int capacity = 64;
  queue_holder.InitOnce(capacity, {}, false);

  std::shared_ptr<LoDTensorBlockingQueue> queue = queue_holder.GetQueue();

  int batch_size = 3;
  int thread_num = 1;
  std::vector<std::string> slots = {"6002", "6003"};
  // One copy of the input file per reader thread.
  std::vector<std::string> file_list;
  for (int i = 0; i < thread_num; ++i) {
    file_list.push_back(gz_file_name);
  }

  CTRReader reader(queue, batch_size, thread_num, slots, file_list);

  reader.Start();

  size_t batch_num =
      std::ceil(static_cast<float>(ctr_data.size()) / batch_size) * thread_num;
  // Unsigned copy of batch_size so index arithmetic and comparisons below do
  // not mix signed and unsigned operands.
  const size_t samples_per_batch = static_cast<size_t>(batch_size);

  for (size_t i = 0; i < batch_num; ++i) {
    std::vector<LoDTensor> out;
    reader.ReadNext(&out);
    // One tensor per slot, followed by the label tensor.
    ASSERT_EQ(out.size(), slots.size() + 1);
    auto& label_tensor = out.back();
    ASSERT_EQ(label_tensor.dims(), label_dims[i]);
    // The global sample index is i * batch_size + j. The previous bound used
    // batch_num here, which stopped early and left the labels of the final
    // samples (including the whole last batch) unchecked.
    for (size_t j = 0; j < samples_per_batch &&
                       i * samples_per_batch + j < ctr_data.size();
         ++j) {
      auto& label = label_tensor.data<int64_t>()[j];
      ASSERT_TRUE(label == 0 || label == 1);
      ASSERT_EQ(label, label_value[i * samples_per_batch + j]);
    }
    auto& tensor_6002 = out[0];
    ASSERT_EQ(std::get<0>(data_slot_6002[i]), tensor_6002.lod());
    // NOTE(review): compares dims()[1] elements; this presumes the slot
    // tensor is laid out as {1, N} -- confirm against CTRReader.
    ASSERT_EQ(std::memcmp(std::get<1>(data_slot_6002[i]).data(),
                          tensor_6002.data<int64_t>(),
                          tensor_6002.dims()[1] * sizeof(int64_t)),
              0);
  }
  // Every batch produced by the reader must have been consumed.
  ASSERT_EQ(queue->Size(), 0);
}