device_worker.cc 15.6 KB
Newer Older
D
dongdaxiang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/device_worker.h"

D
danleifeng 已提交
17
#include <chrono>
18
#include "paddle/fluid/framework/convert_utils.h"
19
namespace phi {
20
class DenseTensor;
21
}  // namespace phi
22

D
dongdaxiang 已提交
23 24 25
namespace paddle {
namespace framework {

W
wanghuancoder 已提交
26 27
class Scope;

D
dongdaxiang 已提交
28 29
// Stores `root_scope` in root_scope_.
void DeviceWorker::SetRootScope(Scope* root_scope) {
  this->root_scope_ = root_scope;
}

J
jiaqi 已提交
30
// Stores `data_feed` in device_reader_.
void DeviceWorker::SetDataFeed(DataFeed* data_feed) {
  this->device_reader_ = data_feed;
}

34
template <typename T>
35
std::string PrintLodTensorType(phi::DenseTensor* tensor,
D
danleifeng 已提交
36 37 38 39
                               int64_t start,
                               int64_t end,
                               char separator = ',',
                               bool need_leading_separator = true) {
40 41 42 43 44
  auto count = tensor->numel();
  if (start < 0 || end > count) {
    VLOG(3) << "access violation";
    return "access violation";
  }
D
danleifeng 已提交
45
  if (start >= end) return "";
46
  std::ostringstream os;
D
danleifeng 已提交
47 48 49 50
  if (!need_leading_separator) {
    os << tensor->data<T>()[start];
    start++;
  }
51
  for (int64_t i = start; i < end; i++) {
D
danleifeng 已提交
52 53
    // os << ":" << tensor->data<T>()[i];
    os << separator << tensor->data<T>()[i];
54 55 56
  }
  return os.str();
}
D
danleifeng 已提交
57
template <typename T>
58
void PrintLodTensorType(phi::DenseTensor* tensor,
D
danleifeng 已提交
59 60
                        int64_t start,
                        int64_t end,
61
                        std::string& out_val,  // NOLINT
D
danleifeng 已提交
62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
                        char separator = ',',
                        bool need_leading_separator = true) {
  auto count = tensor->numel();
  if (start < 0 || end > count) {
    VLOG(3) << "access violation";
    out_val += "access violation";
    return;
  }
  if (start >= end) return;
  if (!need_leading_separator) {
    out_val += std::to_string(tensor->data<T>()[start]);
    // os << tensor->data<T>()[start];
    start++;
  }
  for (int64_t i = start; i < end; i++) {
    // os << ":" << tensor->data<T>()[i];
    // os << separator << tensor->data<T>()[i];
    out_val += separator;
    out_val += std::to_string(tensor->data<T>()[i]);
  }
}
83

D
danleifeng 已提交
84 85 86
#define FLOAT_EPS 1e-8
#define MAX_FLOAT_BUFF_SIZE 40
template <>
87
void PrintLodTensorType<float>(phi::DenseTensor* tensor,
D
danleifeng 已提交
88 89
                               int64_t start,
                               int64_t end,
90
                               std::string& out_val,  // NOLINT
D
danleifeng 已提交
91 92 93 94 95 96 97 98 99 100 101 102 103
                               char separator,
                               bool need_leading_separator) {
  char buf[MAX_FLOAT_BUFF_SIZE];
  auto count = tensor->numel();
  if (start < 0 || end > count) {
    VLOG(3) << "access violation";
    out_val += "access violation";
    return;
  }
  if (start >= end) return;
  for (int64_t i = start; i < end; i++) {
    if (i != start || need_leading_separator) out_val += separator;
    if (tensor->data<float>()[i] > -FLOAT_EPS &&
104
        tensor->data<float>()[i] < FLOAT_EPS) {
D
danleifeng 已提交
105
      out_val += "0";
106 107
    } else {
      sprintf(buf, "%.9f", tensor->data<float>()[i]);  // NOLINT
D
danleifeng 已提交
108 109 110 111
      out_val += buf;
    }
  }
}
112
std::string PrintLodTensorIntType(phi::DenseTensor* tensor,
D
danleifeng 已提交
113 114 115 116
                                  int64_t start,
                                  int64_t end,
                                  char separator = ',',
                                  bool need_leading_separator = true) {
117 118 119 120 121
  auto count = tensor->numel();
  if (start < 0 || end > count) {
    VLOG(3) << "access violation";
    return "access violation";
  }
D
danleifeng 已提交
122
  if (start >= end) return "";
123
  std::ostringstream os;
D
danleifeng 已提交
124 125 126 127
  if (!need_leading_separator) {
    os << static_cast<uint64_t>(tensor->data<int64_t>()[start]);
    start++;
  }
128
  for (int64_t i = start; i < end; i++) {
D
danleifeng 已提交
129 130
    // os << ":" << static_cast<uint64_t>(tensor->data<int64_t>()[i]);
    os << separator << static_cast<uint64_t>(tensor->data<int64_t>()[i]);
131 132 133 134
  }
  return os.str();
}

135
void PrintLodTensorIntType(phi::DenseTensor* tensor,
D
danleifeng 已提交
136 137
                           int64_t start,
                           int64_t end,
138
                           std::string& out_val,  // NOLINT
D
danleifeng 已提交
139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
                           char separator = ',',
                           bool need_leading_separator = true) {
  auto count = tensor->numel();
  if (start < 0 || end > count) {
    VLOG(3) << "access violation";
    out_val += "access violation";
    return;
  }
  if (start >= end) return;
  if (!need_leading_separator) {
    out_val +=
        std::to_string(static_cast<uint64_t>(tensor->data<int64_t>()[start]));
    start++;
  }
  for (int64_t i = start; i < end; i++) {
    // os << ":" << static_cast<uint64_t>(tensor->data<int64_t>()[i]);
    // os << separator << static_cast<uint64_t>(tensor->data<int64_t>()[i]);
    out_val += separator;
    out_val +=
        std::to_string(static_cast<uint64_t>(tensor->data<int64_t>()[i]));
  }
  // return os.str();
}

163
std::string PrintLodTensor(phi::DenseTensor* tensor,
D
danleifeng 已提交
164 165 166 167
                           int64_t start,
                           int64_t end,
                           char separator,
                           bool need_leading_separator) {
168
  std::string out_val;
169
  if (framework::TransToProtoVarType(tensor->dtype()) == proto::VarType::FP32) {
D
danleifeng 已提交
170 171
    out_val = PrintLodTensorType<float>(
        tensor, start, end, separator, need_leading_separator);
172 173
  } else if (framework::TransToProtoVarType(tensor->dtype()) ==
             proto::VarType::INT64) {
D
danleifeng 已提交
174 175
    out_val = PrintLodTensorIntType(
        tensor, start, end, separator, need_leading_separator);
176 177
  } else if (framework::TransToProtoVarType(tensor->dtype()) ==
             proto::VarType::FP64) {
D
danleifeng 已提交
178 179
    out_val = PrintLodTensorType<double>(
        tensor, start, end, separator, need_leading_separator);
180 181 182 183 184 185
  } else {
    out_val = "unsupported type";
  }
  return out_val;
}

186
void PrintLodTensor(phi::DenseTensor* tensor,
D
danleifeng 已提交
187 188
                    int64_t start,
                    int64_t end,
189
                    std::string& out_val,  // NOLINT
D
danleifeng 已提交
190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
                    char separator,
                    bool need_leading_separator) {
  if (framework::TransToProtoVarType(tensor->dtype()) == proto::VarType::FP32) {
    PrintLodTensorType<float>(
        tensor, start, end, out_val, separator, need_leading_separator);
  } else if (framework::TransToProtoVarType(tensor->dtype()) ==
             proto::VarType::INT64) {
    PrintLodTensorIntType(
        tensor, start, end, out_val, separator, need_leading_separator);
  } else if (framework::TransToProtoVarType(tensor->dtype()) ==
             proto::VarType::FP64) {
    PrintLodTensorType<double>(
        tensor, start, end, out_val, separator, need_leading_separator);
  } else {
    out_val += "unsupported type";
  }
}

208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
// Returns the flat element range {first, last} that belongs to entry `index`
// of `tensor`.
//
// With LoD information present, the range is taken from the level-0 offsets
// (lod[index] .. lod[index + 1]) scaled by dims[1]; otherwise entry `index`
// is simply row `index`, i.e. index * dims[1] .. (index + 1) * dims[1].
std::pair<int64_t, int64_t> GetTensorBound(LoDTensor* tensor, int index) {
  const auto& shape = tensor->dims();
  const auto row_width = shape[1];
  if (tensor->lod().size() == 0) {
    return {index * row_width, (index + 1) * row_width};
  }
  const auto& offsets = tensor->lod()[0];
  return {offsets[index] * row_width, offsets[index + 1] * row_width};
}

// Checks whether `tensor` can be dumped per instance for a batch of
// `batch_size` instances.
//
// The tensor must be 2-D. With LoD information present, the level-0 offsets
// must have exactly batch_size + 1 entries; without it, dims[0] must equal
// batch_size.
bool CheckValidOutput(LoDTensor* tensor, size_t batch_size) {
  const auto& shape = tensor->dims();
  if (shape.size() != 2) return false;
  if (tensor->lod().size() != 0) {
    const auto& offsets = tensor->lod()[0];
    return offsets.size() == batch_size + 1;
  }
  return shape[0] == static_cast<int>(batch_size);
}

H
hutuxian 已提交
234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253
void DeviceWorker::DumpParam(const Scope& scope, const int batch_id) {
  std::ostringstream os;
  for (auto& param : *dump_param_) {
    os.str("");
    Variable* var = scope.FindVar(param);
    if (var == nullptr) {
      continue;
    }
    LoDTensor* tensor = var->GetMutable<LoDTensor>();
    framework::LoDTensor cpu_tensor;
    if (platform::is_gpu_place(tensor->place())) {
      TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
      tensor = &cpu_tensor;
    }
    int64_t len = tensor->numel();
    os << "(" << batch_id << "," << param << ")"
       << PrintLodTensor(tensor, 0, len);
    writer_ << os.str();
  }
}
X
xujiaqi01 已提交
254

H
hutuxian 已提交
255
void DeviceWorker::InitRandomDumpConfig(const TrainerDesc& desc) {
D
danleifeng 已提交
256 257 258 259 260
  bool is_dump_in_simple_mode = desc.is_dump_in_simple_mode();
  if (is_dump_in_simple_mode) {
    dump_mode_ = 3;
    return;
  }
H
hutuxian 已提交
261 262 263 264 265 266 267 268 269 270 271 272 273
  bool enable_random_dump = desc.enable_random_dump();
  if (!enable_random_dump) {
    dump_mode_ = 0;
  } else {
    if (desc.random_with_lineid()) {
      dump_mode_ = 1;
    } else {
      dump_mode_ = 2;
    }
  }
  dump_interval_ = desc.dump_interval();
}

274 275
void DeviceWorker::DumpField(const Scope& scope,
                             int dump_mode,
H
hutuxian 已提交
276 277 278
                             int dump_interval) {  // dump_mode: 0: no random,
                                                   // 1: random with insid hash,
                                                   // 2: random with random
D
danleifeng 已提交
279 280 281
  // 3: simple mode using multi-threads, for gpugraphps-mode
  auto start1 = std::chrono::steady_clock::now();

H
hutuxian 已提交
282 283 284
  size_t batch_size = device_reader_->GetCurBatchSize();
  auto& ins_id_vec = device_reader_->GetInsIdVec();
  auto& ins_content_vec = device_reader_->GetInsContentVec();
D
danleifeng 已提交
285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310
  if (dump_mode_ == 3) {
    batch_size = std::string::npos;
    bool has_valid_batch = false;
    for (auto& field : *dump_fields_) {
      Variable* var = scope.FindVar(field);
      if (var == nullptr) {
        VLOG(0) << "Note: field[" << field
                << "] cannot be find in scope, so it was skipped.";
        continue;
      }
      LoDTensor* tensor = var->GetMutable<LoDTensor>();
      if (!tensor->IsInitialized()) {
        VLOG(0) << "Note: field[" << field
                << "] is not initialized, so it was skipped.";
        continue;
      }
      auto& dims = tensor->dims();
      if (dims.size() == 2 && dims[0] > 0) {
        batch_size = std::min(batch_size, static_cast<size_t>(dims[0]));
        // VLOG(0)<<"in dump field ---> "<<field<<" dim_size = "<<dims[0]<<"
        // "<<dims[1]<<" batch_size = "<<batch_size;
        has_valid_batch = true;
      }
    }
    if (!has_valid_batch) return;
  } else if (ins_id_vec.size() > 0) {
H
hutuxian 已提交
311 312 313
    batch_size = ins_id_vec.size();
  }
  std::vector<std::string> ars(batch_size);
D
danleifeng 已提交
314 315 316 317 318 319 320 321 322 323 324
  if (dump_mode_ == 3) {
    if (dump_fields_ == NULL || (*dump_fields_).size() == 0) {
      return;
    }
    auto set_output_str = [&, this](
                              size_t begin, size_t end, LoDTensor* tensor) {
      std::pair<int64_t, int64_t> bound;
      auto& dims = tensor->dims();
      for (size_t i = begin; i < end; ++i) {
        bound = {i * dims[1], (i + 1) * dims[1]};
        // auto bound = GetTensorBound(tensor, i);
H
hutuxian 已提交
325

D
danleifeng 已提交
326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363
        if (ars[i].size() > 0) ars[i] += "\t";
        // ars[i] += '[';
        PrintLodTensor(tensor, bound.first, bound.second, ars[i], ' ', false);
        // ars[i] += ']';
        // ars[i] += "<" + PrintLodTensor(tensor, bound.first, bound.second, '
        // ', false) + ">";
      }
    };
    std::vector<std::thread> threads(tensor_iterator_thread_num);
    for (auto& field : *dump_fields_) {
      Variable* var = scope.FindVar(field);
      if (var == nullptr) {
        VLOG(0) << "Note: field[" << field
                << "] cannot be find in scope, so it was skipped.";
        continue;
      }
      LoDTensor* tensor = var->GetMutable<LoDTensor>();
      if (!tensor->IsInitialized()) {
        VLOG(0) << "Note: field[" << field
                << "] is not initialized, so it was skipped.";
        continue;
      }
      framework::LoDTensor cpu_tensor;
      if (platform::is_gpu_place(tensor->place())) {
        TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
        cpu_tensor.set_lod(tensor->lod());
        tensor = &cpu_tensor;
      }
      auto& dims = tensor->dims();
      if (dims.size() != 2 || dims[0] <= 0) {
        VLOG(0) << "Note: field[" << field
                << "] cannot pass check, so it was "
                   "skipped. Maybe the dimension is "
                   "wrong ";
        VLOG(0) << dims.size() << " " << dims[0] << " * " << dims[1];
        continue;
      }
      size_t acutal_thread_num =
364
          std::min(static_cast<size_t>(batch_size), tensor_iterator_thread_num);
D
danleifeng 已提交
365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380
      for (size_t i = 0; i < acutal_thread_num; i++) {
        size_t average_size = batch_size / acutal_thread_num;
        size_t begin =
            average_size * i + std::min(batch_size % acutal_thread_num, i);
        size_t end =
            begin + average_size + (i < batch_size % acutal_thread_num ? 1 : 0);
        threads[i] = std::thread(set_output_str, begin, end, tensor);
      }
      for (size_t i = 0; i < acutal_thread_num; i++) threads[i].join();
    }
    auto end1 = std::chrono::steady_clock::now();
    auto tt =
        std::chrono::duration_cast<std::chrono::microseconds>(end1 - start1);
    VLOG(1) << "writing a batch takes " << tt.count() << " us";

    size_t acutal_thread_num =
381
        std::min(static_cast<size_t>(batch_size), tensor_iterator_thread_num);
D
danleifeng 已提交
382 383 384 385 386 387 388 389 390 391 392 393 394 395 396
    for (size_t i = 0; i < acutal_thread_num; i++) {
      size_t average_size = batch_size / acutal_thread_num;
      size_t begin =
          average_size * i + std::min(batch_size % acutal_thread_num, i);
      size_t end =
          begin + average_size + (i < batch_size % acutal_thread_num ? 1 : 0);
      for (size_t j = begin + 1; j < end; j++) {
        if (ars[begin].size() > 0 && ars[j].size() > 0) ars[begin] += "\n";
        ars[begin] += ars[j];
      }
      if (ars[begin].size() > 0) writer_ << ars[begin];
    }
    return;
  }
  std::vector<bool> hit(batch_size, false);
H
hutuxian 已提交
397 398 399 400 401 402 403 404 405 406 407 408 409
  std::default_random_engine engine(0);
  std::uniform_int_distribution<size_t> dist(0U, INT_MAX);
  for (size_t i = 0; i < batch_size; i++) {
    size_t r = 0;
    if (dump_mode == 1) {
      r = XXH64(ins_id_vec[i].data(), ins_id_vec[i].length(), 0);
    } else if (dump_mode == 2) {
      r = dist(engine);
    }
    if (r % dump_interval != 0) {
      continue;
    }
    hit[i] = true;
410
  }  // dump_mode = 0
H
hutuxian 已提交
411 412 413 414 415
  for (size_t i = 0; i < ins_id_vec.size(); i++) {
    if (!hit[i]) {
      continue;
    }
    ars[i] += ins_id_vec[i];
416
    ars[i] += "\t" + ins_content_vec[i];
H
hutuxian 已提交
417 418 419 420
  }
  for (auto& field : *dump_fields_) {
    Variable* var = scope.FindVar(field);
    if (var == nullptr) {
421 422
      VLOG(0) << "Note: field[" << field
              << "] cannot be find in scope, so it was skipped.";
H
hutuxian 已提交
423 424 425
      continue;
    }
    LoDTensor* tensor = var->GetMutable<LoDTensor>();
426 427 428 429 430
    if (!tensor->IsInitialized()) {
      VLOG(0) << "Note: field[" << field
              << "] is not initialized, so it was skipped.";
      continue;
    }
H
hutuxian 已提交
431 432 433
    framework::LoDTensor cpu_tensor;
    if (platform::is_gpu_place(tensor->place())) {
      TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
434
      cpu_tensor.set_lod(tensor->lod());
H
hutuxian 已提交
435 436 437
      tensor = &cpu_tensor;
    }
    if (!CheckValidOutput(tensor, batch_size)) {
438 439 440 441
      VLOG(0) << "Note: field[" << field
              << "] cannot pass check, so it was "
                 "skipped. Maybe the dimension is "
                 "wrong ";
H
hutuxian 已提交
442 443 444 445 446 447 448
      continue;
    }
    for (size_t i = 0; i < batch_size; ++i) {
      if (!hit[i]) {
        continue;
      }
      auto bound = GetTensorBound(tensor, i);
449
      ars[i] += "\t" + field + ":" + std::to_string(bound.second - bound.first);
H
hutuxian 已提交
450 451 452
      ars[i] += PrintLodTensor(tensor, bound.first, bound.second);
    }
  }
D
danleifeng 已提交
453

H
hutuxian 已提交
454 455 456 457 458 459 460 461 462
  // #pragma omp parallel for
  for (size_t i = 0; i < ars.size(); i++) {
    if (ars[i].length() == 0) {
      continue;
    }
    writer_ << ars[i];
  }
}

D
dongdaxiang 已提交
463 464
}  // namespace framework
}  // namespace paddle