trainer.h 12.5 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

W
wanghuancoder 已提交
17
#include <condition_variable>  // NOLINT
#include <ctime>
#include <fstream>
#include <future>  // NOLINT
#include <map>
#include <memory>
#include <mutex>  // NOLINT
#include <new>
#include <string>
#include <thread>  // NOLINT
#include <unordered_map>
#include <utility>
#include <vector>

#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/fleet/heter_context.h"
#include "paddle/fluid/framework/heter_util.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/trainer_desc.pb.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/reader/blocking_queue.h"
#include "paddle/phi/backends/dynload/port.h"

#ifdef PADDLE_WITH_PSLIB
#include <pslib.h>
#endif

43 44 45
namespace paddle {
namespace framework {

W
wanghuancoder 已提交
46 47 48 49 50
class Dataset;
class ProgramDesc;
class PullDenseWorker;
class Scope;
class VarDesc;
51
class DeviceWorker;
T
Thunderbrook 已提交
52 53 54 55
class HeterWrapper;
class HeterRequest;
class HeterResponse;

W
wanghuancoder 已提交
56 57 58
template <class T>
class ChannelObject;

59 60 61 62 63 64 65
class TrainerBase {
 public:
  TrainerBase() {}
  virtual ~TrainerBase() {}
  // model memory are hosted in root_scope
  void SetScope(Scope* root_scope);
  void SetDebug(const bool debug) { debug_ = debug; }
66
  void SetDataset(Dataset* dataset_ptr) { dataset_ptr_ = dataset_ptr; }
D
dongdaxiang 已提交
67
  virtual void Initialize(const TrainerDesc& trainer_desc,
68
                          Dataset* data_set) = 0;
69 70 71 72 73
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place) = 0;
  virtual void InitOtherEnv(const ProgramDesc& main_program) = 0;
  virtual void Run() = 0;
  virtual void Finalize() = 0;
74
  virtual Scope* GetWorkerScope(int thread_id) = 0;
H
hutuxian 已提交
75 76
  virtual void InitDumpEnv() = 0;
  virtual void DumpWork(int tid);
77
  virtual void ResetDataset(Dataset* dataset_ptr) {}
78 79

 protected:
H
hutuxian 已提交
80 81 82 83
  virtual std::string GetDumpPath(int tid) = 0;
  virtual void ParseDumpConfig(const TrainerDesc& trainer_desc);
  virtual void FinalizeDumpEnv();

84 85
  Scope* root_scope_;
  bool debug_;
86
  Dataset* dataset_ptr_;
T
Thunderbrook 已提交
87
  TrainerDesc trainer_desc_;
H
hutuxian 已提交
88 89 90

  // For dump param or field
  bool need_dump_field_ = false;
Y
yaoxuefeng 已提交
91
  std::string user_define_dump_filename_;
H
hutuxian 已提交
92 93 94 95 96 97 98 99
  bool need_dump_param_ = false;
  std::string dump_fields_path_;
  std::string dump_converter_;
  std::vector<std::string> dump_param_;
  std::vector<std::string> dump_fields_;
  int dump_thread_num_;
  std::vector<std::thread> dump_thread_;
  std::shared_ptr<paddle::framework::ChannelObject<std::string>> queue_;
100 101 102 103 104 105 106 107 108
};

// general trainer for async execution
// local trainer and distributed trainer are supported
// depends on the assigned device_worker
class MultiTrainer : public TrainerBase {
 public:
  MultiTrainer() {}
  virtual ~MultiTrainer() {}
D
dongdaxiang 已提交
109
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
110 111
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
112
  virtual void InitOtherEnv(const ProgramDesc& main_program);
113 114
  virtual void Run();
  virtual void Finalize();
115
  virtual void InitDumpEnv();
116
  virtual Scope* GetWorkerScope(int thread_id);
H
hutuxian 已提交
117
  virtual std::string GetDumpPath(int tid);
118

T
Thunderbrook 已提交
119 120 121 122 123 124 125
  template <typename T>
  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
#ifdef PADDLE_WITH_HETERPS

  void MergeDenseParam();
#endif

126 127 128
 protected:
  int thread_num_;
  std::vector<std::thread> threads_;
J
jiaqi 已提交
129
  std::vector<DataFeed*> readers_;
130
  std::vector<std::shared_ptr<DeviceWorker>> workers_;
131
  std::vector<std::string> need_merge_var_names_;
T
Thunderbrook 已提交
132 133 134
#ifdef PADDLE_WITH_HETERPS
  std::vector<platform::Place> places_;
#endif
135 136 137
  int mpi_rank_;
  int mpi_size_;
  int dump_file_num_;
138 139 140 141 142 143
};

class DistMultiTrainer : public MultiTrainer {
 public:
  DistMultiTrainer() {}
  virtual ~DistMultiTrainer() {}
D
dongdaxiang 已提交
144
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
145 146
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
147
  virtual void InitOtherEnv(const ProgramDesc& main_program);
148
  virtual void Run();
149
  virtual void Finalize();
150 151
  template <typename T>
  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
152
  virtual void InitDumpEnv();
153
  virtual Scope* GetWorkerScope(int thread_id);
T
Thunderbrook 已提交
154
  virtual void RegisterHeterCallback();
155 156 157 158 159

 protected:
  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
};

160 161
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
     defined PADDLE_WITH_XPU) &&                            \
T
Thunderbrook 已提交
162
    (defined PADDLE_WITH_PSLIB) && (!defined(PADDLE_WITH_HETERPS))
T
Thunderbrook 已提交
163 164 165 166 167 168 169 170 171 172 173 174
class HeterServiceContext {
 public:
  HeterServiceContext() {}
  virtual ~HeterServiceContext() {
    for (OperatorBase* op : ops_) {
      delete op;
    }
    std::vector<OperatorBase*>().swap(ops_);
  }
  void Reset() { push_dense_status_.clear(); }
  int place_num_;
  Scope* scope_{nullptr};
175 176 177

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  gpuEvent_t event_;
T
Thunderbrook 已提交
178
#endif
T
Thunderbrook 已提交
179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204
  std::vector<OperatorBase*> ops_;
  std::vector<::std::future<int32_t>> push_dense_status_;
};

class HeterXpuTrainer : public TrainerBase {
 public:
  HeterXpuTrainer() {}
  virtual ~HeterXpuTrainer() {
    for (OperatorBase* op : ops_) {
      delete op;
    }
    std::vector<OperatorBase*>().swap(ops_);
  }
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
  virtual void InitOtherEnv(const ProgramDesc& main_program);
  virtual void Run();
  virtual void Finalize();
  virtual void DumpWork(int tid);
  virtual void RegisterServiceHandler();
  virtual int RunTask(const HeterRequest* request, HeterResponse* response);
  virtual Scope* GetWorkerScope(int thread_id);
  virtual void CacheProgram(const ProgramDesc& main_program) {
    new (&program_) ProgramDesc(main_program);
  }
T
Thunderbrook 已提交
205 206
  virtual std::string GetDumpPath(int tid) { return ""; }
  virtual void InitDumpEnv() {}
T
Thunderbrook 已提交
207
  template <typename T>
208
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
T
Thunderbrook 已提交
209 210
  void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
                   const paddle::platform::Place& thread_place,
211
                   gpuStream_t stream);
T
Thunderbrook 已提交
212 213 214 215 216
#endif
#ifdef PADDLE_WITH_XPU
  void HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor,
                   const paddle::platform::Place& thread_place);
#endif
T
Thunderbrook 已提交
217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242
  void CreateThreadParam(const ProgramDesc& program, int num);
  template <typename T>
  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
  int EndPass(const HeterRequest* request, HeterResponse* response);
  int StopService(const HeterRequest* request, HeterResponse* response);

 protected:
  DownpourWorkerParameter param_;
  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
  std::vector<std::string> need_merge_var_names_;
  float scale_datanorm_;
  int xpu_begin_op_index_;
  int xpu_end_op_index_;
  bool running_;
  paddle::platform::Place place_;
  std::mutex mutex_;
  ProgramDesc program_;
  std::condition_variable cond_;
  std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
  std::shared_ptr<paddle::framework::HeterWrapper> heter_ptr_;
  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
  std::vector<OperatorBase*> ops_;
  std::vector<std::string> op_names_;
  std::vector<Scope*> place_scopes_;
  BtObjectPool<HeterServiceContext> object_pool_;
  std::vector<platform::Place> places_;
243 244 245
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  std::vector<gpuStream_t> copy_streams_;
  std::vector<gpuEvent_t> events_;
T
Thunderbrook 已提交
246
#endif
T
Thunderbrook 已提交
247
};
T
Thunderbrook 已提交
248

T
Thunderbrook 已提交
249 250
#endif

F
Fan Zhang 已提交
251 252
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \
     defined PADDLE_WITH_XPU_BKCL) &&                        \
253
    (defined PADDLE_WITH_PSLIB)
T
Thunderbrook 已提交
254 255 256 257 258 259 260 261 262 263 264 265 266 267 268
class PSGPUTrainer : public TrainerBase {
 public:
  PSGPUTrainer() {}
  virtual ~PSGPUTrainer() {}
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
  virtual void InitOtherEnv(const ProgramDesc& main_program);
  virtual void Run();
  virtual void Finalize();
  virtual void RegisterHeterCallback();
  virtual Scope* GetWorkerScope(int thread_id);
  virtual void CacheProgram(const ProgramDesc& main_program) {
    new (&program_) ProgramDesc(main_program);
  }
T
Thunderbrook 已提交
269
  virtual std::string GetDumpPath(int tid);
270
  void InitDumpEnv() override;
271
  virtual void MergeDenseParam();
T
Thunderbrook 已提交
272 273 274

  template <typename T>
  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
275
  void InitializeGPUServer(const TrainerDesc& trainer_desc);
T
Thunderbrook 已提交
276 277 278 279 280 281

 protected:
  Dataset* dataset_;
  DownpourWorkerParameter param_;
  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
  std::vector<std::string> need_merge_var_names_;
282
  std::vector<std::string> trainable_param_;
T
Thunderbrook 已提交
283 284 285 286 287 288 289 290 291 292
  float scale_datanorm_;
  paddle::platform::Place place_;
  ProgramDesc program_;
  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
  std::vector<std::shared_ptr<DeviceWorker>> workers_;
  std::vector<platform::Place> places_;
  // ps-gpu
  std::vector<std::thread> threads_;
  int use_ps_gpu_;
  int thread_num_;
T
Thunderbrook 已提交
293 294 295
  int mpi_rank_;
  int mpi_size_;
  int dump_file_num_;
296 297 298

  // _ps_param for gpups optimizer config
  ::paddle::PSParameter _ps_param;
T
Thunderbrook 已提交
299 300 301
};
#endif

302
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
303
    defined(PADDLE_WITH_ASCEND_CL)
H
hutuxian 已提交
304 305 306 307 308 309 310
class PipelineTrainer : public TrainerBase {
 public:
  PipelineTrainer() {}
  ~PipelineTrainer() override {}
  void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set) override;
  void InitTrainerEnv(const ProgramDesc& main_program,
                      const platform::Place& place) override;
H
hutuxian 已提交
311
  void InitOtherEnv(const ProgramDesc& main_program) override;
H
hutuxian 已提交
312 313
  void Run() override;
  void Finalize() override;
314
  virtual Scope* GetWorkerScope(int thread_id);
H
hutuxian 已提交
315 316
  void InitDumpEnv() override;
  virtual std::string GetDumpPath(int tid);
317
  void GetSkipVars(const ProgramDesc& main_program);
H
hutuxian 已提交
318 319

 protected:
L
lilong12 已提交
320
  int num_microbatches_;
321 322
  platform::Place place_;
  std::vector<std::string> skip_vars_;
L
lilong12 已提交
323
  TrainerDesc trainer_desc_;
H
hutuxian 已提交
324

325 326 327 328 329
  std::future<void> section_thread_;
  std::shared_ptr<paddle::framework::DeviceWorker> worker_;
  Scope* minibatch_scope_;
  // microbatch_scopes_: [microbatch_id]
  std::vector<Scope*> microbatch_scopes_;
L
lilong12 已提交
330

331 332
  void CopyParameters(int microbatch_id, const ProgramDesc& program,
                      const platform::Place& place);
H
hutuxian 已提交
333 334
};
#endif
L
lilong12 已提交
335

336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381
#if defined(PADDLE_WITH_PSCORE)
// Pipeline trainer for the PSCORE build. Keeps per-id maps of device workers,
// task queues and scopes, plus an optional listener thread.
class HeterPipelineTrainer : public TrainerBase {
 public:
  HeterPipelineTrainer() = default;
  ~HeterPipelineTrainer() override = default;

  // TrainerBase hooks.
  void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set) override;
  void InitTrainerEnv(const ProgramDesc& main_program,
                      const platform::Place& place) override;
  void InitOtherEnv(const ProgramDesc& main_program) override;
  void Run() override;
  void Finalize() override;
  Scope* GetWorkerScope(int thread_id) override;
  void InitDumpEnv() override;
  std::string GetDumpPath(int tid) override;
  void ResetDataset(Dataset* dataset_ptr) override;

 protected:
  // Integer members default to zero so they are never read indeterminate
  // before setup code assigns them.
  int trainer_id_ = 0;         // stage_trainer_id
  std::vector<int> trainers_;  // NOTE(review): contents not visible here
  int thread_num_ = 0;
  std::vector<std::thread> threads_;

  int num_microbatches_ = 0;
  platform::Place place_;
  // NOTE(review): hides TrainerBase::trainer_desc_ -- confirm whether the base
  // member could be used instead.
  TrainerDesc trainer_desc_;

  int num_pipeline_stages_ = 0;
  int pipeline_stage_ = 0;
  std::unordered_map<int, std::shared_ptr<paddle::framework::DeviceWorker>>
      workers_;

  // Shared map from an int key to a blocking queue of (string, int) pairs.
  std::shared_ptr<std::unordered_map<
      int, std::shared_ptr<::paddle::framework::BlockingQueue<
               std::pair<std::string, int>>>>>
      task_queue_;

  platform::DeviceContext* dev_ctx_ = nullptr;

  std::shared_ptr<std::unordered_map<int, Scope*>> mini_scopes_;
  std::shared_ptr<std::unordered_map<int, std::shared_ptr<std::vector<Scope*>>>>
      micro_scopes_;

  std::unique_ptr<std::thread> listen_ptr_ = nullptr;
};
#endif

382 383
}  // namespace framework
}  // namespace paddle