trainer.h 12.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

W
wanghuancoder 已提交
17
#include <ctime>
18
#include <fstream>
W
wanghuancoder 已提交
19
#include <map>
20 21 22 23 24 25 26
#include <memory>
#include <mutex>  // NOLINT
#include <string>
#include <thread>  // NOLINT
#include <vector>

#include "paddle/fluid/framework/data_feed.h"
D
dongdaxiang 已提交
27
#include "paddle/fluid/framework/data_set.h"
28
#include "paddle/fluid/framework/device_worker.h"
T
Thunderbrook 已提交
29 30
#include "paddle/fluid/framework/fleet/heter_context.h"
#include "paddle/fluid/framework/heter_util.h"
31 32 33 34 35 36
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/trainer_desc.pb.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/reader/blocking_queue.h"
37
#include "paddle/phi/backends/dynload/port.h"
38

39
#ifdef PADDLE_WITH_PSLIB
40
#include "proto/the_one_ps.pb.h"
41 42
#endif

43 44 45
namespace paddle {
namespace framework {

W
wanghuancoder 已提交
46 47 48 49 50
class Dataset;
class ProgramDesc;
class PullDenseWorker;
class Scope;
class VarDesc;
51
class DeviceWorker;
T
Thunderbrook 已提交
52 53 54 55
class HeterWrapper;
class HeterRequest;
class HeterResponse;

W
wanghuancoder 已提交
56 57 58
template <class T>
class ChannelObject;

59 60 61 62 63 64 65
class TrainerBase {
 public:
  TrainerBase() {}
  virtual ~TrainerBase() {}
  // model memory are hosted in root_scope
  void SetScope(Scope* root_scope);
  void SetDebug(const bool debug) { debug_ = debug; }
66
  void SetDataset(Dataset* dataset_ptr) { dataset_ptr_ = dataset_ptr; }
D
dongdaxiang 已提交
67
  virtual void Initialize(const TrainerDesc& trainer_desc,
68
                          Dataset* data_set) = 0;
69 70 71 72 73
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place) = 0;
  virtual void InitOtherEnv(const ProgramDesc& main_program) = 0;
  virtual void Run() = 0;
  virtual void Finalize() = 0;
74
  virtual Scope* GetWorkerScope(int thread_id) = 0;
H
hutuxian 已提交
75 76
  virtual void InitDumpEnv() = 0;
  virtual void DumpWork(int tid);
77
  virtual void ResetDataset(Dataset* dataset_ptr) {}
78 79

 protected:
H
hutuxian 已提交
80 81 82 83
  virtual std::string GetDumpPath(int tid) = 0;
  virtual void ParseDumpConfig(const TrainerDesc& trainer_desc);
  virtual void FinalizeDumpEnv();

84 85
  Scope* root_scope_;
  bool debug_;
86
  Dataset* dataset_ptr_;
T
Thunderbrook 已提交
87
  TrainerDesc trainer_desc_;
H
hutuxian 已提交
88 89 90

  // For dump param or field
  bool need_dump_field_ = false;
Y
yaoxuefeng 已提交
91
  std::string user_define_dump_filename_;
H
hutuxian 已提交
92 93 94 95 96 97 98 99
  bool need_dump_param_ = false;
  std::string dump_fields_path_;
  std::string dump_converter_;
  std::vector<std::string> dump_param_;
  std::vector<std::string> dump_fields_;
  int dump_thread_num_;
  std::vector<std::thread> dump_thread_;
  std::shared_ptr<paddle::framework::ChannelObject<std::string>> queue_;
100 101 102 103 104 105 106 107 108
};

// general trainer for async execution
// local trainer and distributed trainer are supported
// depends on the assigned device_worker
class MultiTrainer : public TrainerBase {
 public:
  MultiTrainer() {}
  virtual ~MultiTrainer() {}
D
dongdaxiang 已提交
109
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
110 111
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
112
  virtual void InitOtherEnv(const ProgramDesc& main_program);
113 114
  virtual void Run();
  virtual void Finalize();
115
  virtual void InitDumpEnv();
116
  virtual Scope* GetWorkerScope(int thread_id);
H
hutuxian 已提交
117
  virtual std::string GetDumpPath(int tid);
118

T
Thunderbrook 已提交
119 120 121 122 123 124 125
  template <typename T>
  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
#ifdef PADDLE_WITH_HETERPS

  void MergeDenseParam();
#endif

126 127 128
 protected:
  int thread_num_;
  std::vector<std::thread> threads_;
J
jiaqi 已提交
129
  std::vector<DataFeed*> readers_;
130
  std::vector<std::shared_ptr<DeviceWorker>> workers_;
131
  std::vector<std::string> need_merge_var_names_;
D
danleifeng 已提交
132
  std::vector<std::string> trainable_param_;
T
Thunderbrook 已提交
133 134 135
#ifdef PADDLE_WITH_HETERPS
  std::vector<platform::Place> places_;
#endif
136 137 138
  int mpi_rank_;
  int mpi_size_;
  int dump_file_num_;
139 140 141 142 143 144
};

class DistMultiTrainer : public MultiTrainer {
 public:
  DistMultiTrainer() {}
  virtual ~DistMultiTrainer() {}
D
dongdaxiang 已提交
145
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
146 147
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
148
  virtual void InitOtherEnv(const ProgramDesc& main_program);
149
  virtual void Run();
150
  virtual void Finalize();
151 152
  template <typename T>
  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
153
  virtual void InitDumpEnv();
154
  virtual Scope* GetWorkerScope(int thread_id);
T
Thunderbrook 已提交
155
  virtual void RegisterHeterCallback();
156 157 158 159 160

 protected:
  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
};

161 162
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
     defined PADDLE_WITH_XPU) &&                            \
T
Thunderbrook 已提交
163
    (defined PADDLE_WITH_PSLIB) && (!defined(PADDLE_WITH_HETERPS))
T
Thunderbrook 已提交
164 165 166 167 168 169 170 171 172 173 174 175
class HeterServiceContext {
 public:
  HeterServiceContext() {}
  virtual ~HeterServiceContext() {
    for (OperatorBase* op : ops_) {
      delete op;
    }
    std::vector<OperatorBase*>().swap(ops_);
  }
  void Reset() { push_dense_status_.clear(); }
  int place_num_;
  Scope* scope_{nullptr};
176 177 178

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  gpuEvent_t event_;
T
Thunderbrook 已提交
179
#endif
T
Thunderbrook 已提交
180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205
  std::vector<OperatorBase*> ops_;
  std::vector<::std::future<int32_t>> push_dense_status_;
};

class HeterXpuTrainer : public TrainerBase {
 public:
  HeterXpuTrainer() {}
  virtual ~HeterXpuTrainer() {
    for (OperatorBase* op : ops_) {
      delete op;
    }
    std::vector<OperatorBase*>().swap(ops_);
  }
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
  virtual void InitOtherEnv(const ProgramDesc& main_program);
  virtual void Run();
  virtual void Finalize();
  virtual void DumpWork(int tid);
  virtual void RegisterServiceHandler();
  virtual int RunTask(const HeterRequest* request, HeterResponse* response);
  virtual Scope* GetWorkerScope(int thread_id);
  virtual void CacheProgram(const ProgramDesc& main_program) {
    new (&program_) ProgramDesc(main_program);
  }
T
Thunderbrook 已提交
206 207
  virtual std::string GetDumpPath(int tid) { return ""; }
  virtual void InitDumpEnv() {}
T
Thunderbrook 已提交
208
  template <typename T>
209
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
210 211
  void HeterMemCpy(LoDTensor* tensor,
                   LoDTensor* root_tensor,
T
Thunderbrook 已提交
212
                   const paddle::platform::Place& thread_place,
213
                   gpuStream_t stream);
T
Thunderbrook 已提交
214 215
#endif
#ifdef PADDLE_WITH_XPU
216 217
  void HeterMemCpy(LoDTensor* thread_tensor,
                   LoDTensor* root_tensor,
T
Thunderbrook 已提交
218 219
                   const paddle::platform::Place& thread_place);
#endif
T
Thunderbrook 已提交
220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245
  void CreateThreadParam(const ProgramDesc& program, int num);
  template <typename T>
  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
  int EndPass(const HeterRequest* request, HeterResponse* response);
  int StopService(const HeterRequest* request, HeterResponse* response);

 protected:
  DownpourWorkerParameter param_;
  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
  std::vector<std::string> need_merge_var_names_;
  float scale_datanorm_;
  int xpu_begin_op_index_;
  int xpu_end_op_index_;
  bool running_;
  paddle::platform::Place place_;
  std::mutex mutex_;
  ProgramDesc program_;
  std::condition_variable cond_;
  std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
  std::shared_ptr<paddle::framework::HeterWrapper> heter_ptr_;
  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
  std::vector<OperatorBase*> ops_;
  std::vector<std::string> op_names_;
  std::vector<Scope*> place_scopes_;
  BtObjectPool<HeterServiceContext> object_pool_;
  std::vector<platform::Place> places_;
246 247 248
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  std::vector<gpuStream_t> copy_streams_;
  std::vector<gpuEvent_t> events_;
T
Thunderbrook 已提交
249
#endif
T
Thunderbrook 已提交
250
};
T
Thunderbrook 已提交
251

T
Thunderbrook 已提交
252 253
#endif

F
Fan Zhang 已提交
254 255
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \
     defined PADDLE_WITH_XPU_BKCL) &&                        \
256
    (defined PADDLE_WITH_PSLIB)
T
Thunderbrook 已提交
257 258 259 260 261 262 263 264 265 266 267 268 269 270 271
class PSGPUTrainer : public TrainerBase {
 public:
  PSGPUTrainer() {}
  virtual ~PSGPUTrainer() {}
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
  virtual void InitOtherEnv(const ProgramDesc& main_program);
  virtual void Run();
  virtual void Finalize();
  virtual void RegisterHeterCallback();
  virtual Scope* GetWorkerScope(int thread_id);
  virtual void CacheProgram(const ProgramDesc& main_program) {
    new (&program_) ProgramDesc(main_program);
  }
T
Thunderbrook 已提交
272
  virtual std::string GetDumpPath(int tid);
273
  void InitDumpEnv() override;
274
  virtual void MergeDenseParam();
T
Thunderbrook 已提交
275 276 277

  template <typename T>
  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
278
  void InitializeGPUServer(const TrainerDesc& trainer_desc);
T
Thunderbrook 已提交
279 280 281 282 283 284

 protected:
  Dataset* dataset_;
  DownpourWorkerParameter param_;
  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
  std::vector<std::string> need_merge_var_names_;
285
  std::vector<std::string> trainable_param_;
T
Thunderbrook 已提交
286 287 288 289 290 291 292 293 294 295
  float scale_datanorm_;
  paddle::platform::Place place_;
  ProgramDesc program_;
  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
  std::vector<std::shared_ptr<DeviceWorker>> workers_;
  std::vector<platform::Place> places_;
  // ps-gpu
  std::vector<std::thread> threads_;
  int use_ps_gpu_;
  int thread_num_;
T
Thunderbrook 已提交
296 297 298
  int mpi_rank_;
  int mpi_size_;
  int dump_file_num_;
299 300 301

  // _ps_param for gpups optimizer config
  ::paddle::PSParameter _ps_param;
T
Thunderbrook 已提交
302 303 304
};
#endif

305
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
306
    defined(PADDLE_WITH_ASCEND_CL)
H
hutuxian 已提交
307 308 309 310 311 312 313
class PipelineTrainer : public TrainerBase {
 public:
  PipelineTrainer() {}
  ~PipelineTrainer() override {}
  void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set) override;
  void InitTrainerEnv(const ProgramDesc& main_program,
                      const platform::Place& place) override;
H
hutuxian 已提交
314
  void InitOtherEnv(const ProgramDesc& main_program) override;
H
hutuxian 已提交
315 316
  void Run() override;
  void Finalize() override;
317
  virtual Scope* GetWorkerScope(int thread_id);
H
hutuxian 已提交
318 319
  void InitDumpEnv() override;
  virtual std::string GetDumpPath(int tid);
320
  void GetSkipVars(const ProgramDesc& main_program);
H
hutuxian 已提交
321 322

 protected:
L
lilong12 已提交
323
  int num_microbatches_;
324 325
  platform::Place place_;
  std::vector<std::string> skip_vars_;
L
lilong12 已提交
326
  TrainerDesc trainer_desc_;
H
hutuxian 已提交
327

328 329 330 331 332
  std::future<void> section_thread_;
  std::shared_ptr<paddle::framework::DeviceWorker> worker_;
  Scope* minibatch_scope_;
  // microbatch_scopes_: [microbatch_id]
  std::vector<Scope*> microbatch_scopes_;
L
lilong12 已提交
333

334 335
  void CopyParameters(int microbatch_id,
                      const ProgramDesc& program,
336
                      const platform::Place& place);
H
hutuxian 已提交
337 338
};
#endif
L
lilong12 已提交
339

340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370
#if defined(PADDLE_WITH_PSCORE)
class HeterPipelineTrainer : public TrainerBase {
 public:
  HeterPipelineTrainer() {}
  ~HeterPipelineTrainer() override {}
  void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set) override;
  void InitTrainerEnv(const ProgramDesc& main_program,
                      const platform::Place& place) override;
  void InitOtherEnv(const ProgramDesc& main_program) override;
  void Run() override;
  void Finalize() override;
  Scope* GetWorkerScope(int thread_id) override;
  void InitDumpEnv() override;
  std::string GetDumpPath(int tid) override;
  void ResetDataset(Dataset* dataset_ptr) override;

 protected:
  int trainer_id_;             // stage_trainer_id
  std::vector<int> trainers_;  //  std::vector<int> trainers
  int thread_num_;
  std::vector<std::thread> threads_;

  int num_microbatches_;
  platform::Place place_;
  TrainerDesc trainer_desc_;

  int num_pipeline_stages_;
  int pipeline_stage_;
  std::unordered_map<int, std::shared_ptr<paddle::framework::DeviceWorker>>
      workers_;

371 372 373 374
  std::shared_ptr<
      std::unordered_map<int,
                         std::shared_ptr<::paddle::framework::BlockingQueue<
                             std::pair<std::string, int>>>>>
375 376 377 378 379 380 381 382 383 384 385 386
      task_queue_;

  platform::DeviceContext* dev_ctx_ = nullptr;

  std::shared_ptr<std::unordered_map<int, Scope*>> mini_scopes_;
  std::shared_ptr<std::unordered_map<int, std::shared_ptr<std::vector<Scope*>>>>
      micro_scopes_;

  std::unique_ptr<std::thread> listen_ptr_ = nullptr;
};
#endif

387 388
}  // namespace framework
}  // namespace paddle