/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/device/ipu/ipu_executor.h"

#include <popart/devicemanager.hpp>
#include <popdist/popdist_poplar.hpp>

#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device/ipu/ipu_compiler.h"
#include "paddle/fluid/platform/device/ipu/ipu_names.h"
#include "paddle/fluid/platform/device/ipu/ipu_strategy.h"

namespace paddle {
namespace platform {
namespace ipu {

namespace {

// Get the popart prefix and paddle postfix of weight states
// Format: {popart_prefix, paddle_postfix}
std::vector<std::pair<std::string, std::string>> GetOptPrePostfix(
    const std::string &opt_type) {
  std::vector<std::pair<std::string, std::string>> pre_post_fix;
  // Weight self
  pre_post_fix.push_back(std::make_pair("", ""));

  // Weight states
  // TODO(alleng) support pair("Accl1___", "_moment1_{id!=0}")
  if (opt_type == "adam" || opt_type == "lamb" || opt_type == "adamw") {
    pre_post_fix.push_back(std::make_pair("Accl1___", "_moment1_0"));
    pre_post_fix.push_back(std::make_pair("Accl2___", "_moment2_0"));
    pre_post_fix.push_back(std::make_pair("Step___", "_beta1_pow_acc_0"));
  } else if (opt_type == "momentum") {
    pre_post_fix.push_back(std::make_pair("Accl___", "_velocity_0"));
  } else if (opt_type == "adamax") {
    pre_post_fix.push_back(std::make_pair("Accl1___", "_moment_0"));
    pre_post_fix.push_back(std::make_pair("Accl2___", "_inf_norm__0"));
    pre_post_fix.push_back(std::make_pair("Step___", "_beta1_pow_acc_0"));
  } else if (opt_type == "adagrad") {
    pre_post_fix.push_back(std::make_pair("Accl1___", "_moment_0"));
  } else if (opt_type == "adadelta") {
    pre_post_fix.push_back(std::make_pair("Accl1___", "__avg_squared_grad_0"));
    pre_post_fix.push_back(
        std::make_pair("Accl2___", "__avg_squared_update_0"));
  } else if (opt_type == "rmsprop") {
    pre_post_fix.push_back(std::make_pair("Accl1___", "_mean_square_0"));
    pre_post_fix.push_back(std::make_pair("Accl2___", "_mean_grad_0"));
    pre_post_fix.push_back(std::make_pair("Accl3___", "_momentum__0"));
  }
  return pre_post_fix;
}

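// Wrap a paddle Tensor as a popart::IArray so PopART can read and write the
// tensor buffer directly; ShareDataWith avoids copying the underlying data.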
class PdIArray final : public popart::IArray {
 public:
  explicit PdIArray(const Tensor *tensor) {
    tensor_.ShareDataWith(*tensor);
    for (int i = 0; i < tensor->dims().size(); ++i) {
      shape_.push_back(tensor->dims().at(i));
    }
  }

 public:
  void *data() { return tensor_.data(); }
  popart::DataType dataType() const {
    return PhiDType2PopartDType(tensor_.dtype());
  }
  std::size_t rank() const { return tensor_.dims().size(); }
  int64_t dim(size_t index) const { return tensor_.dims().at(index); }
  std::size_t nelms() const {
    return std::accumulate(shape_.begin(),
                           shape_.end(),
                           static_cast<int64_t>(1),
                           std::multiplies<int64_t>());
  }
  const popart::Shape shape() const { return shape_; }

 private:
  Tensor tensor_;
  std::vector<int64_t> shape_;
};

}  // namespace

Executor::~Executor() { Reset(); }

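// Build the PopART session from the serialized ONNX proto: acquire an IPU
// device, declare the output anchors, create a Training- or InferenceSession,
// then connect the paddle weights and copy them to PopART.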
void Executor::Prepare(const std::string &proto) {
  VLOG(10) << "enter Executor::Prepare";
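  // In compile-only mode the graph is compiled and exported to
  // ./offline_cache.popart without attaching to real IPU hardware, and
  // Executor::Run is skipped.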
  compile_only_ = GetBoolEnv("IPU_COMPILE_ONLY");

  AcquireDevice();
  executor_resources_ = std::make_unique<ExecutorResources>();

  auto art = popart::AnchorReturnType("All");
  std::map<popart::TensorId, popart::AnchorReturnType> anchor_ids;
  for (const auto &id : compiler_resources_->outputs) {
    anchor_ids.emplace(id, art);
  }
  auto dataFlow = popart::DataFlow(ipu_strategy_->batches_per_step, anchor_ids);

  if (ipu_strategy_->is_training) {
    VLOG(10) << "Creating TrainingSession from Onnx Model...";
    auto optimizer = compiler_resources_->NewOptimizer();
    session_ = popart::TrainingSession::createFromOnnxModel(
        proto,
        dataFlow,
        compiler_resources_->loss_var,
        *optimizer,
        device_,
        popart::InputShapeInfo(),
        ipu_strategy_->popart_options,
        ipu_strategy_->popart_patterns);
  } else {
    VLOG(10) << "Creating InferenceSession from Onnx Model...";
    session_ = popart::InferenceSession::createFromOnnxModel(
        proto,
        dataFlow,
        device_,
        popart::InputShapeInfo(),
        ipu_strategy_->popart_options,
        ipu_strategy_->popart_patterns);
  }
  VLOG(10) << "Creating session from Onnx Model...done";

  if (compile_only_) {
    LOG(INFO)
        << "Save the offline cache as offline_cache.popart in current path.";
    VLOG(10) << "Compile only...";
    session_->compileAndExport("./offline_cache.popart");
    VLOG(10) << "Compile only...done";
    return;
  } else {
    VLOG(10) << "Preparing session device...";
    session_->prepareDevice();
    VLOG(10) << "Preparing session device...done";
  }

  SetWeightsIO();

  VLOG(10) << "Copy weights from paddle to popart...";
  WeightsFromPaddle();
  VLOG(10) << "Copy weights from paddle to popart...done";

  if (ipu_strategy_->random_seed != std::numeric_limits<std::uint64_t>::max()) {
    VLOG(10) << "Setting random seed to: " << ipu_strategy_->random_seed;
    session_->setRandomSeed(ipu_strategy_->random_seed);
  }
}

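// Execute one step: wrap feeds as IArrays, resize the fetch tensors according
// to batches_per_step, gradient accumulation and graph replication, refresh
// the optimizer if needed, then run a popart::StepIO.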
void Executor::Run(const std::vector<const Tensor *> &inputs,
                   const std::vector<Tensor *> &outputs,
                   const framework::ExecutionContext &ctx) {
  if (compile_only_) {
    LOG(INFO) << "If IPU_COMPILE_ONLY=True, skip exe.run";
    return;
  }

  VLOG(10) << "enter Executor::Run";
  // inputs
  std::map<popart::TensorId, popart::IArray &> popart_inputs;
  std::map<popart::TensorId, PdIArray> input_wrappers;
  for (size_t i = 0; i < inputs.size(); i++) {
    auto tensor_id = compiler_resources_->inputs[i];
    input_wrappers.emplace(tensor_id, PdIArray(inputs[i]));
    popart_inputs.emplace(tensor_id, input_wrappers.at(tensor_id));
  }
  // anchors
  std::map<popart::TensorId, popart::IArray &> popart_anchors;
  std::map<popart::TensorId, PdIArray> anchor_wrappers;
  for (size_t i = 0; i < outputs.size(); i++) {
    auto tensor_id = compiler_resources_->outputs[i];
    // get dims & dtype from session
    auto fetch_info = session_->getInfo(tensor_id);
    auto output_shape = fetch_info.shape();
    if (ipu_strategy_->batches_per_step > 1) {
      output_shape.insert(output_shape.begin(),
                          ipu_strategy_->batches_per_step);
    }
    if (ipu_strategy_->popart_options.enableGradientAccumulation) {
      output_shape.insert(output_shape.begin(),
                          ipu_strategy_->popart_options.accumulationFactor);
    }
    if (ipu_strategy_->popart_options.enableReplicatedGraphs) {
      output_shape.insert(output_shape.begin(),
                          ipu_strategy_->popart_options.replicatedGraphCount);
    }

    auto *tensor = outputs[i];
    tensor->Resize(phi::make_ddim(output_shape));
    auto fetch_dtype = fetch_info.dataType();
    auto paddle_type = PopartDType2VarType(fetch_dtype);
    tensor->mutable_data(ctx.GetPlace(),
                         framework::TransToPhiDataType(paddle_type));
    anchor_wrappers.emplace(tensor_id, PdIArray(tensor));
    popart_anchors.emplace(tensor_id, anchor_wrappers.at(tensor_id));
  }
  VLOG(10) << "Prepared inputs/anchors";

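  // For training with a learning-rate schedule, refresh the optimizer on the
  // host before running: switch to the eval optimizer, or rebuild the
  // optimizer with the latest learning rate.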
  if (ipu_strategy_->is_training && compiler_resources_->with_lr_sched) {
    popart::Optimizer *optimizer;
    if (ipu_strategy_->runtime_options.enable_eval) {
      VLOG(10) << "Switch optimizer to eval mode";
      optimizer = compiler_resources_->eval_optimizer.get();
    } else {
      VLOG(10) << "Update learning_rate";
      float new_lr;
      if (ipu_strategy_->is_dynamic) {
        new_lr = ipu_strategy_->lr;
      } else {
        new_lr =
            GetSingleVarFromScope<float>(scope_, compiler_resources_->lr_var);
      }
      VLOG(10) << "New Lr: " << new_lr;
      optimizer = compiler_resources_->UpdateOptimizer(new_lr);
    }
    auto *session = dynamic_cast<popart::TrainingSession *>(session_.get());
    session->updateOptimizerFromHost(optimizer);
  }

  popart::StepIO stepio(popart_inputs, popart_anchors);
  VLOG(10) << "Running...";
  session_->run(stepio);
  VLOG(10) << "Running...done";
}

void Executor::WeightsToHost() {
  if (ipu_strategy_->is_training && session_) {
    WeightsToPaddle();
  } else {
    LOG(WARNING) << "For a non-training graph, cannot sync weights from IPU.";
  }
}

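// Attach to a device, in priority order: the IPU model (simulator) when
// POPLAR_IPUMODEL is set, an offline device in compile-only mode, a popdist
// device for distributed training, otherwise an available physical IPU set.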
void Executor::AcquireDevice() {
  VLOG(10) << "enter Executor::AcquireDevice";
  if (device_) {
    Detach();
    device_.reset();
  }

  bool use_ipu_model = GetBoolEnv("POPLAR_IPUMODEL");
  bool enable_distribution = ipu_strategy_->enable_distribution;
  if (use_ipu_model) {
    VLOG(10) << "Create IPU model device...";
    std::map<std::string, std::string> deviceOpts{
        {
            "numIPUs",
            std::to_string(ipu_strategy_->num_ipus),
        },
        {"tilesPerIPU", std::to_string(ipu_strategy_->tiles_per_ipu)},
        {"ipuVersion", "ipu2"},
    };
    device_ = popart::DeviceManager::createDeviceManager().createIpuModelDevice(
        deviceOpts);
    VLOG(10) << "Create IPU model device...done";
  } else if (compile_only_) {
    VLOG(10) << "Create offline device...";
    std::map<std::string, std::string> deviceOpts{
        {
            "numIPUs",
            std::to_string(ipu_strategy_->num_ipus),
        },
        {"tilesPerIPU", std::to_string(ipu_strategy_->tiles_per_ipu)},
        {"ipuVersion", "ipu2"},
    };
    device_ =
        popart::DeviceManager::createDeviceManager().createOfflineIPUDevice(
            deviceOpts);
    VLOG(10) << "Create offline device...done";
  } else if (enable_distribution) {
    VLOG(10) << "Create distribution device...";
    auto ipus_per_replica = ipu_strategy_->num_ipus /
                            ipu_strategy_->popart_options.replicatedGraphCount;
    auto device_id = popdist::getDeviceId(ipus_per_replica);
    device_ = popart::DeviceManager::createDeviceManager().acquireDeviceById(
        device_id);
    PADDLE_ENFORCE_NOT_NULL(
        device_,
        errors::Unavailable("Can't attach IPU in distribution, ipu_num = %d.",
                            RequestIpus(ipu_strategy_->num_ipus)));
    VLOG(10) << "Create distribution device...done";
  } else {
    VLOG(10) << "Create IPU device...";
    device_ =
        popart::DeviceManager::createDeviceManager().acquireAvailableDevice(
            RequestIpus(ipu_strategy_->num_ipus));
    PADDLE_ENFORCE_NOT_NULL(
        device_,
        errors::Unavailable("Can't attach IPU, ipu_num = %d.",
                            RequestIpus(ipu_strategy_->num_ipus)));
    VLOG(10) << "Create IPU device...done";
  }
  VLOG(10) << "leave Executor::AcquireDevice";
}

void Executor::Detach() {
  if (device_ && device_->isAttached()) {
    VLOG(10) << "trying to detach IPU";
    device_->detach();
    VLOG(10) << " detached IPU";
  }
}

void Executor::Reset() {
  Detach();
  session_.reset();
  executor_resources_.reset();
}

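// Register each paddle weight (and its optimizer states) with the
// popart::WeightsIO so the buffers can be streamed in both directions;
// variables missing on either side are skipped.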
void Executor::SetWeightsIO() {
  auto opt_type = compiler_resources_->optimizer_type;
  VLOG(10) << "SetWeightsIO for " << opt_type;
  auto pre_post_fix = GetOptPrePostfix(opt_type);
  for (const auto &weight_pd : compiler_resources_->weights) {
    for (const auto &pair : pre_post_fix) {
      // pair.first : popart prefix, pair.second : paddle postfix
      auto weight_pop = compiler_resources_->tensors[weight_pd];
      auto popart_var = pair.first + weight_pop;
      auto paddle_var = weight_pd + pair.second;

      if (scope_->FindVar(paddle_var) == nullptr) {
        continue;
      }
      if (!session_->hasInfo(popart_var)) {
        continue;
      }

      VLOG(10) << "Connect paddle weight: " << paddle_var
               << " with popart weight: " << popart_var;
      auto var = scope_->GetVar(paddle_var);
      auto data_ptr = var->GetMutable<framework::LoDTensor>()->data();
      auto tensor_info = session_->getInfo(popart_var);
      executor_resources_->weights_io.insert(popart_var,
                                             {data_ptr, tensor_info});
      executor_resources_->weights_and_opt_state.emplace_back(
          std::make_pair(popart_var, paddle_var));
    }
  }
}

// align_to_popart: align dtype to popart if true, else to paddle
void Executor::ConvertWeights(bool align_to_popart) {
  for (auto weight_pair : executor_resources_->weights_and_opt_state) {
    auto paddle_var = scope_->GetVar(weight_pair.second);
    auto paddle_var_dtype = PhiDType2PopartDType(
        paddle_var->GetMutable<framework::LoDTensor>()->dtype());

    PADDLE_ENFORCE_EQ((paddle_var_dtype == popart::DataType::FLOAT ||
                       paddle_var_dtype == popart::DataType::FLOAT16),
                      true,
                      errors::InvalidArgument(
                          "Currently, we only support FLOAT16 and FLOAT with "
                          "Paddle, but received type is %s.",
                          paddle_var_dtype));

    popart::TensorInfo info = session_->getInfo(weight_pair.first);
    auto popart_var_dtype = info.dataType();
    PADDLE_ENFORCE_EQ((popart_var_dtype == popart::DataType::FLOAT ||
                       popart_var_dtype == popart::DataType::FLOAT16),
                      true,
                      errors::InvalidArgument(
                          "Currently, we only support FLOAT16 and FLOAT with "
                          "popart, but received type is %s.",
                          popart_var_dtype));

    if (paddle_var_dtype == popart_var_dtype) {
      VLOG(10) << weight_pair.first << " and " << weight_pair.second
               << " have the same dtype : " << popart_var_dtype;
      continue;
    } else if (paddle_var_dtype == popart::DataType::FLOAT) {
      VLOG(10) << weight_pair.first << " and " << weight_pair.second
               << " have different dtype : " << popart_var_dtype;
      auto *data_ptr =
          paddle_var->GetMutable<framework::LoDTensor>()->data<float>();

      auto num_elem = info.nelms();
      if (align_to_popart) {
        std::vector<uint16_t> fp16_data;
        std::transform(data_ptr,
                       data_ptr + num_elem,
                       std::back_inserter(fp16_data),
                       [&](float elem) { return popart::floatToHalf(elem); });
        memcpy(reinterpret_cast<void *>(data_ptr),
               fp16_data.data(),
               num_elem * sizeof(float16));
      } else {
        std::vector<float> fp32_data;
        auto fp16_data_ptr = reinterpret_cast<uint16_t *>(data_ptr);
        std::transform(
            fp16_data_ptr,
            fp16_data_ptr + num_elem,
            std::back_inserter(fp32_data),
            [&](uint16_t elem) { return popart::halfToFloat(elem); });
        memcpy(reinterpret_cast<void *>(data_ptr),
               fp32_data.data(),
               num_elem * sizeof(float));
      }
    } else {
      PADDLE_THROW(
          errors::Unimplemented("Convert Paddle FLOAT16 to popart FLOAT"));
    }
  }
}

// |-----------------------------------------------------|
// | Paddle  | Popart  |             Method              |
// |-----------------------------------------------------|
// |  FLOAT  |  FLOAT  |         Paddle -> Popart        |
// |  FLOAT  | FLOAT16 | floatToHalf -> Paddle -> Popart |
// | FLOAT16 |  FLOAT  |         Unimplemented           |
// | FLOAT16 | FLOAT16 |         Paddle -> Popart        |
// |-----------------------------------------------------|
// floatToHalf -> Paddle: cast then save to paddle
// Paddle -> Popart: copy from paddle to popart
void Executor::WeightsFromPaddle() {
  ConvertWeights(true);
  session_->writeWeights(executor_resources_->weights_io);
  session_->weightsFromHost();
}

// |-----------------------------------------------------|
// | Paddle  | Popart  |             Method              |
// |-----------------------------------------------------|
// |  FLOAT  |  FLOAT  |         Popart -> Paddle        |
// |  FLOAT  | FLOAT16 | Popart -> Paddle -> halfToFloat |
// | FLOAT16 |  FLOAT  |         Unimplemented           |
// | FLOAT16 | FLOAT16 |         Popart -> Paddle        |
// |-----------------------------------------------------|
// Paddle -> halfToFloat: cast then save to paddle
// Popart -> Paddle: copy from popart to paddle
void Executor::WeightsToPaddle() {
  session_->weightsToHost();
  session_->readWeights(executor_resources_->weights_io);
  ConvertWeights(false);
}

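// Copy the trained weights back from the IPU and serialize the ONNX model to
// the given path.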
void Executor::SaveModelToHost(const std::string &path) {
  if (session_) {
    WeightsToPaddle();
    session_->modelToHost(path);
  } else {
    LOG(WARNING) << "Model is empty";
  }
}

}  // namespace ipu
}  // namespace platform
}  // namespace paddle