trainer.h 12.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

W
wanghuancoder 已提交
17
#include <ctime>
18
#include <fstream>
W
wanghuancoder 已提交
19
#include <map>
20 21 22 23 24 25 26
#include <memory>
#include <mutex>  // NOLINT
#include <string>
#include <thread>  // NOLINT
#include <vector>

#include "paddle/fluid/framework/data_feed.h"
D
dongdaxiang 已提交
27
#include "paddle/fluid/framework/data_set.h"
28
#include "paddle/fluid/framework/device_worker.h"
T
Thunderbrook 已提交
29 30
#include "paddle/fluid/framework/fleet/heter_context.h"
#include "paddle/fluid/framework/heter_util.h"
31 32 33 34 35 36
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/trainer_desc.pb.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/reader/blocking_queue.h"
37
#include "paddle/pten/backends/dynload/port.h"
38 39 40 41

namespace paddle {
namespace framework {

W
wanghuancoder 已提交
42 43 44 45 46
// Forward declarations of framework types that the trainer interfaces below
// refer to. Several of them are also provided by the headers included above;
// the declarations are kept so the names are visible regardless of include
// order.
class Dataset;
class ProgramDesc;
class PullDenseWorker;
class Scope;
class VarDesc;
class DeviceWorker;
class HeterWrapper;
class HeterRequest;
class HeterResponse;

// Channel template used by TrainerBase for its dump queue.
template <class T>
class ChannelObject;

55 56 57 58 59 60 61
class TrainerBase {
 public:
  TrainerBase() {}
  virtual ~TrainerBase() {}
  // model memory are hosted in root_scope
  void SetScope(Scope* root_scope);
  void SetDebug(const bool debug) { debug_ = debug; }
62
  void SetDataset(Dataset* dataset_ptr) { dataset_ptr_ = dataset_ptr; }
D
dongdaxiang 已提交
63
  virtual void Initialize(const TrainerDesc& trainer_desc,
64
                          Dataset* data_set) = 0;
65 66 67 68 69
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place) = 0;
  virtual void InitOtherEnv(const ProgramDesc& main_program) = 0;
  virtual void Run() = 0;
  virtual void Finalize() = 0;
70
  virtual Scope* GetWorkerScope(int thread_id) = 0;
H
hutuxian 已提交
71 72
  virtual void InitDumpEnv() = 0;
  virtual void DumpWork(int tid);
73
  virtual void ResetDataset(Dataset* dataset_ptr) {}
74 75

 protected:
H
hutuxian 已提交
76 77 78 79
  virtual std::string GetDumpPath(int tid) = 0;
  virtual void ParseDumpConfig(const TrainerDesc& trainer_desc);
  virtual void FinalizeDumpEnv();

80 81
  Scope* root_scope_;
  bool debug_;
82
  Dataset* dataset_ptr_;
T
Thunderbrook 已提交
83
  TrainerDesc trainer_desc_;
H
hutuxian 已提交
84 85 86

  // For dump param or field
  bool need_dump_field_ = false;
Y
yaoxuefeng 已提交
87
  std::string user_define_dump_filename_;
H
hutuxian 已提交
88 89 90 91 92 93 94 95
  bool need_dump_param_ = false;
  std::string dump_fields_path_;
  std::string dump_converter_;
  std::vector<std::string> dump_param_;
  std::vector<std::string> dump_fields_;
  int dump_thread_num_;
  std::vector<std::thread> dump_thread_;
  std::shared_ptr<paddle::framework::ChannelObject<std::string>> queue_;
96 97 98 99 100 101 102 103 104
};

// General-purpose trainer for asynchronous execution.
// Both local and distributed training are supported; which one is used
// depends on the assigned device_worker.
class MultiTrainer : public TrainerBase {
 public:
  MultiTrainer() {}
  virtual ~MultiTrainer() {}
D
dongdaxiang 已提交
105
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
106 107
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
108
  virtual void InitOtherEnv(const ProgramDesc& main_program);
109 110
  virtual void Run();
  virtual void Finalize();
111
  virtual void InitDumpEnv();
112
  virtual Scope* GetWorkerScope(int thread_id);
H
hutuxian 已提交
113
  virtual std::string GetDumpPath(int tid);
114

T
Thunderbrook 已提交
115 116 117 118 119 120 121
  template <typename T>
  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
#ifdef PADDLE_WITH_HETERPS

  void MergeDenseParam();
#endif

122 123 124
 protected:
  int thread_num_;
  std::vector<std::thread> threads_;
J
jiaqi 已提交
125
  std::vector<DataFeed*> readers_;
126
  std::vector<std::shared_ptr<DeviceWorker>> workers_;
127
  std::vector<std::string> need_merge_var_names_;
T
Thunderbrook 已提交
128 129 130
#ifdef PADDLE_WITH_HETERPS
  std::vector<platform::Place> places_;
#endif
131 132 133
  int mpi_rank_;
  int mpi_size_;
  int dump_file_num_;
134 135 136 137 138 139
};

class DistMultiTrainer : public MultiTrainer {
 public:
  DistMultiTrainer() {}
  virtual ~DistMultiTrainer() {}
D
dongdaxiang 已提交
140
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
141 142
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
143
  virtual void InitOtherEnv(const ProgramDesc& main_program);
144
  virtual void Run();
145
  virtual void Finalize();
146 147
  template <typename T>
  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
148
  virtual void InitDumpEnv();
149
  virtual Scope* GetWorkerScope(int thread_id);
T
Thunderbrook 已提交
150
  virtual void RegisterHeterCallback();
151 152 153 154 155

 protected:
  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
};

156 157
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
     defined PADDLE_WITH_XPU) &&                            \
    (defined PADDLE_WITH_PSLIB)
// Per-task context used by HeterXpuTrainer (pooled in object_pool_).
// Owns the operators in ops_: the destructor deletes every pointer.
class HeterServiceContext {
 public:
  HeterServiceContext() {}
  virtual ~HeterServiceContext() {
    for (OperatorBase* op : ops_) {
      delete op;
    }
  }
  // ops_ holds owning raw pointers, so a copy would delete them twice.
  // Copying is therefore disabled explicitly (the vector of futures already
  // made a copy impossible to compile; this just states the intent).
  HeterServiceContext(const HeterServiceContext&) = delete;
  HeterServiceContext& operator=(const HeterServiceContext&) = delete;

  // Drops all pending push-dense futures.
  void Reset() { push_dense_status_.clear(); }
  int place_num_ = 0;
  Scope* scope_{nullptr};

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  gpuEvent_t event_;
#endif
  std::vector<OperatorBase*> ops_;
  std::vector<::std::future<int32_t>> push_dense_status_;
};

// Trainer for heterogeneous (CUDA / HIP / XPU) devices with PSLIB.
// Owns the operators in ops_: the destructor deletes every pointer.
class HeterXpuTrainer : public TrainerBase {
 public:
  HeterXpuTrainer() {}
  ~HeterXpuTrainer() override {
    for (OperatorBase* op : ops_) {
      delete op;
    }
  }

  // TrainerBase overrides.
  void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set) override;
  void InitTrainerEnv(const ProgramDesc& main_program,
                      const platform::Place& place) override;
  void InitOtherEnv(const ProgramDesc& main_program) override;
  void Run() override;
  void Finalize() override;
  void DumpWork(int tid) override;
  // New virtuals introduced by this class.
  virtual void RegisterServiceHandler();
  virtual int RunTask(const HeterRequest* request, HeterResponse* response);
  Scope* GetWorkerScope(int thread_id) override;

  // Replaces program_ with a copy of `main_program`.
  // The copy is built in place with the copy constructor rather than assigned
  // (NOTE(review): presumably because ProgramDesc has no usable copy
  // assignment -- confirm). The previous value is destroyed first so that
  // whatever it owns is released instead of being overwritten and leaked.
  // If the copy constructor throws, program_ is left destroyed; callers must
  // treat that as fatal.
  virtual void CacheProgram(const ProgramDesc& main_program) {
    if (&main_program == &program_) {
      return;  // Self-copy: destroying first would invalidate the source.
    }
    program_.~ProgramDesc();
    new (&program_) ProgramDesc(main_program);
  }
  // This trainer has no dump support: empty path, no-op environment setup.
  std::string GetDumpPath(int tid) override { return ""; }
  void InitDumpEnv() override {}

  // NOTE: the `template <typename T>` header below attaches to whichever
  // HeterMemCpy declaration the preprocessor keeps first: the stream-taking
  // overload under CUDA/HIP, otherwise the XPU overload. When both branches
  // are enabled, the XPU overload is a plain (non-template) member.
  template <typename T>
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
                   const paddle::platform::Place& thread_place,
                   gpuStream_t stream);
#endif
#ifdef PADDLE_WITH_XPU
  void HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor,
                   const paddle::platform::Place& thread_place);
#endif
  void CreateThreadParam(const ProgramDesc& program, int num);
  template <typename T>
  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
  int EndPass(const HeterRequest* request, HeterResponse* response);
  int StopService(const HeterRequest* request, HeterResponse* response);

 protected:
  DownpourWorkerParameter param_;
  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
  std::vector<std::string> need_merge_var_names_;
  // Scalar members are value-initialized so they are never read while
  // indeterminate.
  float scale_datanorm_ = 0.0f;
  int xpu_begin_op_index_ = 0;
  int xpu_end_op_index_ = 0;
  bool running_ = false;
  paddle::platform::Place place_;
  std::mutex mutex_;
  ProgramDesc program_;
  std::condition_variable cond_;
  std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
  std::shared_ptr<paddle::framework::HeterWrapper> heter_ptr_;
  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
  // Owning raw pointers; released in the destructor.
  std::vector<OperatorBase*> ops_;
  std::vector<std::string> op_names_;
  std::vector<Scope*> place_scopes_;
  BtObjectPool<HeterServiceContext> object_pool_;
  std::vector<platform::Place> places_;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  std::vector<gpuStream_t> copy_streams_;
  std::vector<gpuEvent_t> events_;
#endif
};

#endif

247 248
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
    (defined PADDLE_WITH_PSLIB)
T
Thunderbrook 已提交
249 250 251 252 253 254 255 256 257 258 259 260 261 262 263
class PSGPUTrainer : public TrainerBase {
 public:
  PSGPUTrainer() {}
  virtual ~PSGPUTrainer() {}
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
  virtual void InitOtherEnv(const ProgramDesc& main_program);
  virtual void Run();
  virtual void Finalize();
  virtual void RegisterHeterCallback();
  virtual Scope* GetWorkerScope(int thread_id);
  virtual void CacheProgram(const ProgramDesc& main_program) {
    new (&program_) ProgramDesc(main_program);
  }
T
Thunderbrook 已提交
264
  virtual std::string GetDumpPath(int tid);
265
  void InitDumpEnv() override;
266
  virtual void MergeDenseParam();
T
Thunderbrook 已提交
267 268 269 270 271 272 273 274 275

  template <typename T>
  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);

 protected:
  Dataset* dataset_;
  DownpourWorkerParameter param_;
  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
  std::vector<std::string> need_merge_var_names_;
276
  std::vector<std::string> trainable_param_;
T
Thunderbrook 已提交
277 278 279 280 281 282 283 284 285 286
  float scale_datanorm_;
  paddle::platform::Place place_;
  ProgramDesc program_;
  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
  std::vector<std::shared_ptr<DeviceWorker>> workers_;
  std::vector<platform::Place> places_;
  // ps-gpu
  std::vector<std::thread> threads_;
  int use_ps_gpu_;
  int thread_num_;
T
Thunderbrook 已提交
287 288 289
  int mpi_rank_;
  int mpi_size_;
  int dump_file_num_;
T
Thunderbrook 已提交
290 291 292
};
#endif

293
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
    defined(PADDLE_WITH_ASCEND_CL)
// Pipeline trainer: drives a single device worker over a mini-batch scope and
// a vector of micro-batch scopes.
class PipelineTrainer : public TrainerBase {
 public:
  PipelineTrainer() {}
  ~PipelineTrainer() override {}
  void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set) override;
  void InitTrainerEnv(const ProgramDesc& main_program,
                      const platform::Place& place) override;
  void InitOtherEnv(const ProgramDesc& main_program) override;
  void Run() override;
  void Finalize() override;
  // These two also override TrainerBase; marked `override` to match the
  // other members of this class.
  Scope* GetWorkerScope(int thread_id) override;
  void InitDumpEnv() override;
  std::string GetDumpPath(int tid) override;
  // NOTE(review): presumably fills skip_vars_ from `main_program` -- confirm
  // against the implementation file.
  void GetSkipVars(const ProgramDesc& main_program);

 protected:
  int num_microbatches_ = 0;
  platform::Place place_;
  std::vector<std::string> skip_vars_;
  // NOTE(review): this shadows TrainerBase::trainer_desc_. Code in this class
  // sees this member, while TrainerBase code sees its own; kept as-is to
  // preserve behavior.
  TrainerDesc trainer_desc_;

  // NOTE(review): presumably the handle of the asynchronously running
  // section -- confirm against the implementation file.
  std::future<void> section_thread_;
  std::shared_ptr<paddle::framework::DeviceWorker> worker_;
  Scope* minibatch_scope_ = nullptr;
  // microbatch_scopes_: [microbatch_id]
  std::vector<Scope*> microbatch_scopes_;

  void CopyParameters(int microbatch_id, const ProgramDesc& program,
                      const platform::Place& place);
};
#endif
L
lilong12 已提交
326

327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372
#if defined(PADDLE_WITH_PSCORE)
// Pipeline trainer for heterogeneous parameter-server training (built only
// with PADDLE_WITH_PSCORE, see the enclosing #if). It keeps per-id device
// workers, task queues and mini/micro scopes, plus an optional listener
// thread.
class HeterPipelineTrainer : public TrainerBase {
 public:
  HeterPipelineTrainer() {}
  ~HeterPipelineTrainer() override {}
  void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set) override;
  void InitTrainerEnv(const ProgramDesc& main_program,
                      const platform::Place& place) override;
  void InitOtherEnv(const ProgramDesc& main_program) override;
  void Run() override;
  void Finalize() override;
  Scope* GetWorkerScope(int thread_id) override;
  void InitDumpEnv() override;
  std::string GetDumpPath(int tid) override;
  void ResetDataset(Dataset* dataset_ptr) override;

 protected:
  // Integer members are zero-initialized so they never hold indeterminate
  // values before Initialize() assigns them.
  int trainer_id_ = 0;  // stage_trainer_id
  // NOTE(review): meaning of the entries is not visible in this header.
  std::vector<int> trainers_;
  int thread_num_ = 0;
  std::vector<std::thread> threads_;

  int num_microbatches_ = 0;
  platform::Place place_;
  // NOTE(review): this shadows TrainerBase::trainer_desc_. Code in this class
  // sees this member, while TrainerBase code sees its own; kept as-is to
  // preserve behavior.
  TrainerDesc trainer_desc_;

  int num_pipeline_stages_ = 0;
  int pipeline_stage_ = 0;
  // Device workers keyed by an int id.
  std::unordered_map<int, std::shared_ptr<paddle::framework::DeviceWorker>>
      workers_;

  // Shared map from an int id to a blocking queue of (string, int) tasks.
  std::shared_ptr<std::unordered_map<
      int, std::shared_ptr<::paddle::framework::BlockingQueue<
               std::pair<std::string, int>>>>>
      task_queue_;

  platform::DeviceContext* dev_ctx_ = nullptr;

  // Shared maps from an int id to a mini-batch scope and to that id's
  // vector of micro-batch scopes.
  std::shared_ptr<std::unordered_map<int, Scope*>> mini_scopes_;
  std::shared_ptr<std::unordered_map<int, std::shared_ptr<std::vector<Scope*>>>>
      micro_scopes_;

  std::unique_ptr<std::thread> listen_ptr_ = nullptr;
};
#endif

373 374
}  // namespace framework
}  // namespace paddle