trainer.h 12.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

W
wanghuancoder 已提交
17
#include <ctime>
18
#include <fstream>
W
wanghuancoder 已提交
19
#include <map>
20 21 22 23 24 25 26
#include <memory>
#include <mutex>  // NOLINT
#include <string>
#include <thread>  // NOLINT
#include <vector>

#include "paddle/fluid/framework/data_feed.h"
D
dongdaxiang 已提交
27
#include "paddle/fluid/framework/data_set.h"
28
#include "paddle/fluid/framework/device_worker.h"
T
Thunderbrook 已提交
29 30
#include "paddle/fluid/framework/fleet/heter_context.h"
#include "paddle/fluid/framework/heter_util.h"
31 32 33 34 35 36
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/trainer_desc.pb.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/reader/blocking_queue.h"
37
#include "paddle/phi/backends/dynload/port.h"
38

39
#ifdef PADDLE_WITH_PSLIB
40
#include "proto/the_one_ps.pb.h"
41 42
#endif

43 44 45
namespace paddle {
namespace framework {

W
wanghuancoder 已提交
46 47 48 49 50
// Forward declarations of the framework types this header names in
// declarations below (sorted alphabetically).
class Dataset;
class DeviceWorker;
class HeterRequest;
class HeterResponse;
class HeterWrapper;
class ProgramDesc;
class PullDenseWorker;
class Scope;
class VarDesc;

// Class template; instantiated below as ChannelObject<std::string>.
template <class T>
class ChannelObject;

59 60 61 62 63 64 65
class TrainerBase {
 public:
  TrainerBase() {}
  virtual ~TrainerBase() {}
  // model memory are hosted in root_scope
  void SetScope(Scope* root_scope);
  void SetDebug(const bool debug) { debug_ = debug; }
66
  void SetDataset(Dataset* dataset_ptr) { dataset_ptr_ = dataset_ptr; }
D
dongdaxiang 已提交
67
  virtual void Initialize(const TrainerDesc& trainer_desc,
68
                          Dataset* data_set) = 0;
69 70 71 72 73
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place) = 0;
  virtual void InitOtherEnv(const ProgramDesc& main_program) = 0;
  virtual void Run() = 0;
  virtual void Finalize() = 0;
74
  virtual Scope* GetWorkerScope(int thread_id) = 0;
H
hutuxian 已提交
75 76
  virtual void InitDumpEnv() = 0;
  virtual void DumpWork(int tid);
77
  virtual void ResetDataset(Dataset* dataset_ptr) {}
78 79

 protected:
H
hutuxian 已提交
80 81 82 83
  virtual std::string GetDumpPath(int tid) = 0;
  virtual void ParseDumpConfig(const TrainerDesc& trainer_desc);
  virtual void FinalizeDumpEnv();

84 85
  Scope* root_scope_;
  bool debug_;
86
  Dataset* dataset_ptr_;
T
Thunderbrook 已提交
87
  TrainerDesc trainer_desc_;
H
hutuxian 已提交
88 89 90

  // For dump param or field
  bool need_dump_field_ = false;
Y
yaoxuefeng 已提交
91
  std::string user_define_dump_filename_;
H
hutuxian 已提交
92 93 94 95 96 97 98 99
  bool need_dump_param_ = false;
  std::string dump_fields_path_;
  std::string dump_converter_;
  std::vector<std::string> dump_param_;
  std::vector<std::string> dump_fields_;
  int dump_thread_num_;
  std::vector<std::thread> dump_thread_;
  std::shared_ptr<paddle::framework::ChannelObject<std::string>> queue_;
100 101 102 103 104 105 106 107 108
};

// general trainer for async execution
// local trainer and distributed trainer are supported
// depends on the assigned device_worker
class MultiTrainer : public TrainerBase {
 public:
  MultiTrainer() {}
  virtual ~MultiTrainer() {}
D
dongdaxiang 已提交
109
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
110 111
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
112
  virtual void InitOtherEnv(const ProgramDesc& main_program);
113 114
  virtual void Run();
  virtual void Finalize();
115
  virtual void InitDumpEnv();
116
  virtual Scope* GetWorkerScope(int thread_id);
H
hutuxian 已提交
117
  virtual std::string GetDumpPath(int tid);
118

T
Thunderbrook 已提交
119
  template <typename T>
120 121
  void MergeToRootScope(phi::DenseTensor* root_tensor,
                        phi::DenseTensor* thread_tensor);
T
Thunderbrook 已提交
122 123 124 125 126
#ifdef PADDLE_WITH_HETERPS

  void MergeDenseParam();
#endif

127 128 129
 protected:
  int thread_num_;
  std::vector<std::thread> threads_;
J
jiaqi 已提交
130
  std::vector<DataFeed*> readers_;
131
  std::vector<std::shared_ptr<DeviceWorker>> workers_;
132
  std::vector<std::string> need_merge_var_names_;
D
danleifeng 已提交
133
  std::vector<std::string> trainable_param_;
T
Thunderbrook 已提交
134 135 136
#ifdef PADDLE_WITH_HETERPS
  std::vector<platform::Place> places_;
#endif
137 138 139
  int mpi_rank_;
  int mpi_size_;
  int dump_file_num_;
140 141 142 143 144 145
};

class DistMultiTrainer : public MultiTrainer {
 public:
  DistMultiTrainer() {}
  virtual ~DistMultiTrainer() {}
D
dongdaxiang 已提交
146
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
147 148
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
149
  virtual void InitOtherEnv(const ProgramDesc& main_program);
150
  virtual void Run();
151
  virtual void Finalize();
152
  template <typename T>
153 154
  void MergeToRootScope(phi::DenseTensor* root_tensor,
                        phi::DenseTensor* thread_tensor);
155
  virtual void InitDumpEnv();
156
  virtual Scope* GetWorkerScope(int thread_id);
T
Thunderbrook 已提交
157
  virtual void RegisterHeterCallback();
158 159 160 161 162

 protected:
  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
};

163 164
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
     defined PADDLE_WITH_XPU) &&                            \
T
Thunderbrook 已提交
165
    (defined PADDLE_WITH_PSLIB) && (!defined(PADDLE_WITH_HETERPS))
T
Thunderbrook 已提交
166 167 168 169 170 171 172 173 174 175 176 177
class HeterServiceContext {
 public:
  HeterServiceContext() {}
  virtual ~HeterServiceContext() {
    for (OperatorBase* op : ops_) {
      delete op;
    }
    std::vector<OperatorBase*>().swap(ops_);
  }
  void Reset() { push_dense_status_.clear(); }
  int place_num_;
  Scope* scope_{nullptr};
178 179 180

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  gpuEvent_t event_;
T
Thunderbrook 已提交
181
#endif
T
Thunderbrook 已提交
182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
  std::vector<OperatorBase*> ops_;
  std::vector<::std::future<int32_t>> push_dense_status_;
};

class HeterXpuTrainer : public TrainerBase {
 public:
  HeterXpuTrainer() {}
  virtual ~HeterXpuTrainer() {
    for (OperatorBase* op : ops_) {
      delete op;
    }
    std::vector<OperatorBase*>().swap(ops_);
  }
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
  virtual void InitOtherEnv(const ProgramDesc& main_program);
  virtual void Run();
  virtual void Finalize();
  virtual void DumpWork(int tid);
  virtual void RegisterServiceHandler();
  virtual int RunTask(const HeterRequest* request, HeterResponse* response);
  virtual Scope* GetWorkerScope(int thread_id);
  virtual void CacheProgram(const ProgramDesc& main_program) {
    new (&program_) ProgramDesc(main_program);
  }
T
Thunderbrook 已提交
208 209
  virtual std::string GetDumpPath(int tid) { return ""; }
  virtual void InitDumpEnv() {}
T
Thunderbrook 已提交
210
  template <typename T>
211
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
212 213
  void HeterMemCpy(phi::DenseTensor* tensor,
                   phi::DenseTensor* root_tensor,
T
Thunderbrook 已提交
214
                   const paddle::platform::Place& thread_place,
215
                   gpuStream_t stream);
T
Thunderbrook 已提交
216 217
#endif
#ifdef PADDLE_WITH_XPU
218 219
  void HeterMemCpy(phi::DenseTensor* thread_tensor,
                   phi::DenseTensor* root_tensor,
T
Thunderbrook 已提交
220 221
                   const paddle::platform::Place& thread_place);
#endif
T
Thunderbrook 已提交
222 223
  void CreateThreadParam(const ProgramDesc& program, int num);
  template <typename T>
224 225
  void MergeToRootScope(phi::DenseTensor* root_tensor,
                        phi::DenseTensor* thread_tensor);
T
Thunderbrook 已提交
226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248
  int EndPass(const HeterRequest* request, HeterResponse* response);
  int StopService(const HeterRequest* request, HeterResponse* response);

 protected:
  DownpourWorkerParameter param_;
  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
  std::vector<std::string> need_merge_var_names_;
  float scale_datanorm_;
  int xpu_begin_op_index_;
  int xpu_end_op_index_;
  bool running_;
  paddle::platform::Place place_;
  std::mutex mutex_;
  ProgramDesc program_;
  std::condition_variable cond_;
  std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
  std::shared_ptr<paddle::framework::HeterWrapper> heter_ptr_;
  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
  std::vector<OperatorBase*> ops_;
  std::vector<std::string> op_names_;
  std::vector<Scope*> place_scopes_;
  BtObjectPool<HeterServiceContext> object_pool_;
  std::vector<platform::Place> places_;
249 250 251
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  std::vector<gpuStream_t> copy_streams_;
  std::vector<gpuEvent_t> events_;
T
Thunderbrook 已提交
252
#endif
T
Thunderbrook 已提交
253
};
T
Thunderbrook 已提交
254

T
Thunderbrook 已提交
255 256
#endif

F
Fan Zhang 已提交
257 258
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \
     defined PADDLE_WITH_XPU_BKCL) &&                        \
259
    (defined PADDLE_WITH_PSLIB)
T
Thunderbrook 已提交
260 261 262 263 264 265 266 267 268 269 270 271 272 273 274
class PSGPUTrainer : public TrainerBase {
 public:
  PSGPUTrainer() {}
  virtual ~PSGPUTrainer() {}
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
  virtual void InitOtherEnv(const ProgramDesc& main_program);
  virtual void Run();
  virtual void Finalize();
  virtual void RegisterHeterCallback();
  virtual Scope* GetWorkerScope(int thread_id);
  virtual void CacheProgram(const ProgramDesc& main_program) {
    new (&program_) ProgramDesc(main_program);
  }
T
Thunderbrook 已提交
275
  virtual std::string GetDumpPath(int tid);
276
  void InitDumpEnv() override;
277
  virtual void MergeDenseParam();
T
Thunderbrook 已提交
278 279

  template <typename T>
280 281
  void MergeToRootScope(phi::DenseTensor* root_tensor,
                        phi::DenseTensor* thread_tensor);
282
  void InitializeGPUServer(const TrainerDesc& trainer_desc);
T
Thunderbrook 已提交
283 284 285 286 287 288

 protected:
  Dataset* dataset_;
  DownpourWorkerParameter param_;
  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
  std::vector<std::string> need_merge_var_names_;
289
  std::vector<std::string> trainable_param_;
T
Thunderbrook 已提交
290 291 292 293 294 295 296 297 298 299
  float scale_datanorm_;
  paddle::platform::Place place_;
  ProgramDesc program_;
  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
  std::vector<std::shared_ptr<DeviceWorker>> workers_;
  std::vector<platform::Place> places_;
  // ps-gpu
  std::vector<std::thread> threads_;
  int use_ps_gpu_;
  int thread_num_;
T
Thunderbrook 已提交
300 301 302
  int mpi_rank_;
  int mpi_size_;
  int dump_file_num_;
303 304 305

  // _ps_param for gpups optimizer config
  ::paddle::PSParameter _ps_param;
T
Thunderbrook 已提交
306 307 308
};
#endif

309
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
310
    defined(PADDLE_WITH_ASCEND_CL)
H
hutuxian 已提交
311 312 313 314 315 316 317
class PipelineTrainer : public TrainerBase {
 public:
  PipelineTrainer() {}
  ~PipelineTrainer() override {}
  void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set) override;
  void InitTrainerEnv(const ProgramDesc& main_program,
                      const platform::Place& place) override;
H
hutuxian 已提交
318
  void InitOtherEnv(const ProgramDesc& main_program) override;
H
hutuxian 已提交
319 320
  void Run() override;
  void Finalize() override;
321
  virtual Scope* GetWorkerScope(int thread_id);
H
hutuxian 已提交
322 323
  void InitDumpEnv() override;
  virtual std::string GetDumpPath(int tid);
324
  void GetSkipVars(const ProgramDesc& main_program);
H
hutuxian 已提交
325 326

 protected:
L
lilong12 已提交
327
  int num_microbatches_;
328 329
  platform::Place place_;
  std::vector<std::string> skip_vars_;
L
lilong12 已提交
330
  TrainerDesc trainer_desc_;
H
hutuxian 已提交
331

332 333 334 335 336
  std::future<void> section_thread_;
  std::shared_ptr<paddle::framework::DeviceWorker> worker_;
  Scope* minibatch_scope_;
  // microbatch_scopes_: [microbatch_id]
  std::vector<Scope*> microbatch_scopes_;
L
lilong12 已提交
337

338 339
  void CopyParameters(int microbatch_id,
                      const ProgramDesc& program,
340
                      const platform::Place& place);
H
hutuxian 已提交
341 342
};
#endif
L
lilong12 已提交
343

344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374
#if defined(PADDLE_WITH_PSCORE)
class HeterPipelineTrainer : public TrainerBase {
 public:
  HeterPipelineTrainer() {}
  ~HeterPipelineTrainer() override {}
  void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set) override;
  void InitTrainerEnv(const ProgramDesc& main_program,
                      const platform::Place& place) override;
  void InitOtherEnv(const ProgramDesc& main_program) override;
  void Run() override;
  void Finalize() override;
  Scope* GetWorkerScope(int thread_id) override;
  void InitDumpEnv() override;
  std::string GetDumpPath(int tid) override;
  void ResetDataset(Dataset* dataset_ptr) override;

 protected:
  int trainer_id_;             // stage_trainer_id
  std::vector<int> trainers_;  //  std::vector<int> trainers
  int thread_num_;
  std::vector<std::thread> threads_;

  int num_microbatches_;
  platform::Place place_;
  TrainerDesc trainer_desc_;

  int num_pipeline_stages_;
  int pipeline_stage_;
  std::unordered_map<int, std::shared_ptr<paddle::framework::DeviceWorker>>
      workers_;

375 376 377 378
  std::shared_ptr<
      std::unordered_map<int,
                         std::shared_ptr<::paddle::framework::BlockingQueue<
                             std::pair<std::string, int>>>>>
379 380 381 382 383 384 385 386 387 388 389 390
      task_queue_;

  platform::DeviceContext* dev_ctx_ = nullptr;

  std::shared_ptr<std::unordered_map<int, Scope*>> mini_scopes_;
  std::shared_ptr<std::unordered_map<int, std::shared_ptr<std::vector<Scope*>>>>
      micro_scopes_;

  std::unique_ptr<std::thread> listen_ptr_ = nullptr;
};
#endif

391 392
}  // namespace framework
}  // namespace paddle