trainer.h 12.5 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

W
wanghuancoder 已提交
17
#include <ctime>
18
#include <fstream>
W
wanghuancoder 已提交
19
#include <map>
20 21 22 23 24 25 26
#include <memory>
#include <mutex>  // NOLINT
#include <string>
#include <thread>  // NOLINT
#include <vector>

#include "paddle/fluid/framework/data_feed.h"
D
dongdaxiang 已提交
27
#include "paddle/fluid/framework/data_set.h"
28
#include "paddle/fluid/framework/device_worker.h"
T
Thunderbrook 已提交
29 30
#include "paddle/fluid/framework/fleet/heter_context.h"
#include "paddle/fluid/framework/heter_util.h"
31 32 33 34 35 36
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/trainer_desc.pb.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/reader/blocking_queue.h"
37
#include "paddle/phi/backends/dynload/port.h"
38

39
#ifdef PADDLE_WITH_PSLIB
Z
zmxdream 已提交
40
#include "proto/ps.pb.h"
41 42
#endif

43 44 45
namespace paddle {
namespace framework {

W
wanghuancoder 已提交
46 47 48 49 50
class Dataset;
class ProgramDesc;
class PullDenseWorker;
class Scope;
class VarDesc;
51
class DeviceWorker;
T
Thunderbrook 已提交
52 53 54 55
class HeterWrapper;
class HeterRequest;
class HeterResponse;

W
wanghuancoder 已提交
56 57 58
template <class T>
class ChannelObject;

59 60 61 62 63 64 65
class TrainerBase {
 public:
  TrainerBase() {}
  virtual ~TrainerBase() {}
  // model memory are hosted in root_scope
  void SetScope(Scope* root_scope);
  void SetDebug(const bool debug) { debug_ = debug; }
66
  void SetDataset(Dataset* dataset_ptr) { dataset_ptr_ = dataset_ptr; }
D
dongdaxiang 已提交
67
  virtual void Initialize(const TrainerDesc& trainer_desc,
68
                          Dataset* data_set) = 0;
69 70 71 72 73
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place) = 0;
  virtual void InitOtherEnv(const ProgramDesc& main_program) = 0;
  virtual void Run() = 0;
  virtual void Finalize() = 0;
74
  virtual Scope* GetWorkerScope(int thread_id) = 0;
H
hutuxian 已提交
75 76
  virtual void InitDumpEnv() = 0;
  virtual void DumpWork(int tid);
77
  virtual void ResetDataset(Dataset* dataset_ptr) {}
78 79

 protected:
H
hutuxian 已提交
80 81 82 83
  virtual std::string GetDumpPath(int tid) = 0;
  virtual void ParseDumpConfig(const TrainerDesc& trainer_desc);
  virtual void FinalizeDumpEnv();

84 85
  Scope* root_scope_;
  bool debug_;
86
  Dataset* dataset_ptr_;
T
Thunderbrook 已提交
87
  TrainerDesc trainer_desc_;
H
hutuxian 已提交
88 89 90

  // For dump param or field
  bool need_dump_field_ = false;
Y
yaoxuefeng 已提交
91
  std::string user_define_dump_filename_;
H
hutuxian 已提交
92 93 94 95 96 97 98 99
  bool need_dump_param_ = false;
  std::string dump_fields_path_;
  std::string dump_converter_;
  std::vector<std::string> dump_param_;
  std::vector<std::string> dump_fields_;
  int dump_thread_num_;
  std::vector<std::thread> dump_thread_;
  std::shared_ptr<paddle::framework::ChannelObject<std::string>> queue_;
100 101 102 103 104 105 106 107 108
};

// general trainer for async execution
// local trainer and distributed trainer are supported
// depends on the assigned device_worker
class MultiTrainer : public TrainerBase {
 public:
  MultiTrainer() {}
  virtual ~MultiTrainer() {}
D
dongdaxiang 已提交
109
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
110 111
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
112
  virtual void InitOtherEnv(const ProgramDesc& main_program);
113 114
  virtual void Run();
  virtual void Finalize();
115
  virtual void InitDumpEnv();
116
  virtual Scope* GetWorkerScope(int thread_id);
H
hutuxian 已提交
117
  virtual std::string GetDumpPath(int tid);
118

T
Thunderbrook 已提交
119 120 121 122 123 124 125
  template <typename T>
  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
#ifdef PADDLE_WITH_HETERPS

  void MergeDenseParam();
#endif

126 127 128
 protected:
  int thread_num_;
  std::vector<std::thread> threads_;
J
jiaqi 已提交
129
  std::vector<DataFeed*> readers_;
130
  std::vector<std::shared_ptr<DeviceWorker>> workers_;
131
  std::vector<std::string> need_merge_var_names_;
D
danleifeng 已提交
132
  std::vector<std::string> trainable_param_;
T
Thunderbrook 已提交
133 134 135
#ifdef PADDLE_WITH_HETERPS
  std::vector<platform::Place> places_;
#endif
136 137 138
  int mpi_rank_;
  int mpi_size_;
  int dump_file_num_;
139 140 141 142 143 144
};

class DistMultiTrainer : public MultiTrainer {
 public:
  DistMultiTrainer() {}
  virtual ~DistMultiTrainer() {}
D
dongdaxiang 已提交
145
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
146 147
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
148
  virtual void InitOtherEnv(const ProgramDesc& main_program);
149
  virtual void Run();
150
  virtual void Finalize();
151 152
  template <typename T>
  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
153
  virtual void InitDumpEnv();
154
  virtual Scope* GetWorkerScope(int thread_id);
T
Thunderbrook 已提交
155
  virtual void RegisterHeterCallback();
156 157 158 159 160

 protected:
  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
};

161 162
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
     defined PADDLE_WITH_XPU) &&                            \
T
Thunderbrook 已提交
163
    (defined PADDLE_WITH_PSLIB) && (!defined(PADDLE_WITH_HETERPS))
T
Thunderbrook 已提交
164 165 166 167 168 169 170 171 172 173 174 175
class HeterServiceContext {
 public:
  HeterServiceContext() {}
  virtual ~HeterServiceContext() {
    for (OperatorBase* op : ops_) {
      delete op;
    }
    std::vector<OperatorBase*>().swap(ops_);
  }
  void Reset() { push_dense_status_.clear(); }
  int place_num_;
  Scope* scope_{nullptr};
176 177 178

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  gpuEvent_t event_;
T
Thunderbrook 已提交
179
#endif
T
Thunderbrook 已提交
180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205
  std::vector<OperatorBase*> ops_;
  std::vector<::std::future<int32_t>> push_dense_status_;
};

class HeterXpuTrainer : public TrainerBase {
 public:
  HeterXpuTrainer() {}
  virtual ~HeterXpuTrainer() {
    for (OperatorBase* op : ops_) {
      delete op;
    }
    std::vector<OperatorBase*>().swap(ops_);
  }
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
  virtual void InitOtherEnv(const ProgramDesc& main_program);
  virtual void Run();
  virtual void Finalize();
  virtual void DumpWork(int tid);
  virtual void RegisterServiceHandler();
  virtual int RunTask(const HeterRequest* request, HeterResponse* response);
  virtual Scope* GetWorkerScope(int thread_id);
  virtual void CacheProgram(const ProgramDesc& main_program) {
    new (&program_) ProgramDesc(main_program);
  }
T
Thunderbrook 已提交
206 207
  virtual std::string GetDumpPath(int tid) { return ""; }
  virtual void InitDumpEnv() {}
T
Thunderbrook 已提交
208
  template <typename T>
209
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
T
Thunderbrook 已提交
210 211
  void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
                   const paddle::platform::Place& thread_place,
212
                   gpuStream_t stream);
T
Thunderbrook 已提交
213 214 215 216 217
#endif
#ifdef PADDLE_WITH_XPU
  void HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor,
                   const paddle::platform::Place& thread_place);
#endif
T
Thunderbrook 已提交
218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
  void CreateThreadParam(const ProgramDesc& program, int num);
  template <typename T>
  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
  int EndPass(const HeterRequest* request, HeterResponse* response);
  int StopService(const HeterRequest* request, HeterResponse* response);

 protected:
  DownpourWorkerParameter param_;
  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
  std::vector<std::string> need_merge_var_names_;
  float scale_datanorm_;
  int xpu_begin_op_index_;
  int xpu_end_op_index_;
  bool running_;
  paddle::platform::Place place_;
  std::mutex mutex_;
  ProgramDesc program_;
  std::condition_variable cond_;
  std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
  std::shared_ptr<paddle::framework::HeterWrapper> heter_ptr_;
  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
  std::vector<OperatorBase*> ops_;
  std::vector<std::string> op_names_;
  std::vector<Scope*> place_scopes_;
  BtObjectPool<HeterServiceContext> object_pool_;
  std::vector<platform::Place> places_;
244 245 246
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  std::vector<gpuStream_t> copy_streams_;
  std::vector<gpuEvent_t> events_;
T
Thunderbrook 已提交
247
#endif
T
Thunderbrook 已提交
248
};
T
Thunderbrook 已提交
249

T
Thunderbrook 已提交
250 251
#endif

F
Fan Zhang 已提交
252 253
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \
     defined PADDLE_WITH_XPU_BKCL) &&                        \
254
    (defined PADDLE_WITH_PSLIB)
T
Thunderbrook 已提交
255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
class PSGPUTrainer : public TrainerBase {
 public:
  PSGPUTrainer() {}
  virtual ~PSGPUTrainer() {}
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
  virtual void InitOtherEnv(const ProgramDesc& main_program);
  virtual void Run();
  virtual void Finalize();
  virtual void RegisterHeterCallback();
  virtual Scope* GetWorkerScope(int thread_id);
  virtual void CacheProgram(const ProgramDesc& main_program) {
    new (&program_) ProgramDesc(main_program);
  }
T
Thunderbrook 已提交
270
  virtual std::string GetDumpPath(int tid);
271
  void InitDumpEnv() override;
272
  virtual void MergeDenseParam();
T
Thunderbrook 已提交
273 274 275

  template <typename T>
  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
276
  void InitializeGPUServer(const TrainerDesc& trainer_desc);
T
Thunderbrook 已提交
277 278 279 280 281 282

 protected:
  Dataset* dataset_;
  DownpourWorkerParameter param_;
  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
  std::vector<std::string> need_merge_var_names_;
283
  std::vector<std::string> trainable_param_;
T
Thunderbrook 已提交
284 285 286 287 288 289 290 291 292 293
  float scale_datanorm_;
  paddle::platform::Place place_;
  ProgramDesc program_;
  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
  std::vector<std::shared_ptr<DeviceWorker>> workers_;
  std::vector<platform::Place> places_;
  // ps-gpu
  std::vector<std::thread> threads_;
  int use_ps_gpu_;
  int thread_num_;
T
Thunderbrook 已提交
294 295 296
  int mpi_rank_;
  int mpi_size_;
  int dump_file_num_;
297 298 299

  // _ps_param for gpups optimizer config
  ::paddle::PSParameter _ps_param;
T
Thunderbrook 已提交
300 301 302
};
#endif

303
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
304
    defined(PADDLE_WITH_ASCEND_CL)
H
hutuxian 已提交
305 306 307 308 309 310 311
class PipelineTrainer : public TrainerBase {
 public:
  PipelineTrainer() {}
  ~PipelineTrainer() override {}
  void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set) override;
  void InitTrainerEnv(const ProgramDesc& main_program,
                      const platform::Place& place) override;
H
hutuxian 已提交
312
  void InitOtherEnv(const ProgramDesc& main_program) override;
H
hutuxian 已提交
313 314
  void Run() override;
  void Finalize() override;
315
  virtual Scope* GetWorkerScope(int thread_id);
H
hutuxian 已提交
316 317
  void InitDumpEnv() override;
  virtual std::string GetDumpPath(int tid);
318
  void GetSkipVars(const ProgramDesc& main_program);
H
hutuxian 已提交
319 320

 protected:
L
lilong12 已提交
321
  int num_microbatches_;
322 323
  platform::Place place_;
  std::vector<std::string> skip_vars_;
L
lilong12 已提交
324
  TrainerDesc trainer_desc_;
H
hutuxian 已提交
325

326 327 328 329 330
  std::future<void> section_thread_;
  std::shared_ptr<paddle::framework::DeviceWorker> worker_;
  Scope* minibatch_scope_;
  // microbatch_scopes_: [microbatch_id]
  std::vector<Scope*> microbatch_scopes_;
L
lilong12 已提交
331

332 333
  void CopyParameters(int microbatch_id, const ProgramDesc& program,
                      const platform::Place& place);
H
hutuxian 已提交
334 335
};
#endif
L
lilong12 已提交
336

337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382
#if defined(PADDLE_WITH_PSCORE)
class HeterPipelineTrainer : public TrainerBase {
 public:
  HeterPipelineTrainer() {}
  ~HeterPipelineTrainer() override {}
  void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set) override;
  void InitTrainerEnv(const ProgramDesc& main_program,
                      const platform::Place& place) override;
  void InitOtherEnv(const ProgramDesc& main_program) override;
  void Run() override;
  void Finalize() override;
  Scope* GetWorkerScope(int thread_id) override;
  void InitDumpEnv() override;
  std::string GetDumpPath(int tid) override;
  void ResetDataset(Dataset* dataset_ptr) override;

 protected:
  int trainer_id_;             // stage_trainer_id
  std::vector<int> trainers_;  //  std::vector<int> trainers
  int thread_num_;
  std::vector<std::thread> threads_;

  int num_microbatches_;
  platform::Place place_;
  TrainerDesc trainer_desc_;

  int num_pipeline_stages_;
  int pipeline_stage_;
  std::unordered_map<int, std::shared_ptr<paddle::framework::DeviceWorker>>
      workers_;

  std::shared_ptr<std::unordered_map<
      int, std::shared_ptr<::paddle::framework::BlockingQueue<
               std::pair<std::string, int>>>>>
      task_queue_;

  platform::DeviceContext* dev_ctx_ = nullptr;

  std::shared_ptr<std::unordered_map<int, Scope*>> mini_scopes_;
  std::shared_ptr<std::unordered_map<int, std::shared_ptr<std::vector<Scope*>>>>
      micro_scopes_;

  std::unique_ptr<std::thread> listen_ptr_ = nullptr;
};
#endif

383 384
}  // namespace framework
}  // namespace paddle