ipu_executor.cc
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/device/ipu/ipu_executor.h"

using float16 = paddle::platform::float16;

namespace paddle {
namespace platform {
namespace ipu {

// Get the popart prefix and paddle postfix of weight states
// Format: {popart_prefix, paddle_postfix}
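// Example (adam): the moment1 state of a paddle weight "w" is named
// "w_moment1_0" on the paddle side and "Accl1___" + <popart id of "w"> on the
// popart side.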
std::vector<std::pair<std::string, std::string>> GetOptPrePostfix(
    const std::string &opt_type) {
  std::vector<std::pair<std::string, std::string>> pre_post_fix;
  // Weight self
  pre_post_fix.push_back(std::make_pair("", ""));

  // Weight states
  // TODO(alleng) support pair("Accl1___", "_moment1_{id!=0}")
  if (opt_type == "adam" || opt_type == "lamb" || opt_type == "adamw") {
    pre_post_fix.push_back(std::make_pair("Accl1___", "_moment1_0"));
    pre_post_fix.push_back(std::make_pair("Accl2___", "_moment2_0"));
    pre_post_fix.push_back(std::make_pair("Step___", "_beta1_pow_acc_0"));
  } else if (opt_type == "momentum") {
    pre_post_fix.push_back(std::make_pair("Accl___", "_velocity_0"));
  } else if (opt_type == "adamax") {
    pre_post_fix.push_back(std::make_pair("Accl1___", "_moment_0"));
    pre_post_fix.push_back(std::make_pair("Accl2___", "_inf_norm__0"));
    pre_post_fix.push_back(std::make_pair("Step___", "_beta1_pow_acc_0"));
  } else if (opt_type == "adagrad") {
    pre_post_fix.push_back(std::make_pair("Accl1___", "_moment_0"));
  } else if (opt_type == "adadelta") {
    pre_post_fix.push_back(std::make_pair("Accl1___", "__avg_squared_grad_0"));
    pre_post_fix.push_back(
        std::make_pair("Accl2___", "__avg_squared_update_0"));
  } else if (opt_type == "rmsprop") {
    pre_post_fix.push_back(std::make_pair("Accl1___", "_mean_square_0"));
    pre_post_fix.push_back(std::make_pair("Accl2___", "_mean_grad_0"));
    pre_post_fix.push_back(std::make_pair("Accl3___", "_momentum__0"));
  }
  return pre_post_fix;
}

Executor::~Executor() {
  Detach();
  session_.reset();
  executor_resources_.reset();
}

void Executor::Prepare(const std::string &proto) {
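  // Build a popart session from the serialized ONNX proto: acquire a device,
  // create a Training/InferenceSession, compile it for the device, and push
  // the initial weights from paddle to the IPU.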
  VLOG(10) << "enter Executor::Prepare";

  AcquireDevice();
  executor_resources_ = std::make_unique<ExecutorResources>();

  auto art = popart::AnchorReturnType("All");
  std::map<popart::TensorId, popart::AnchorReturnType> anchor_ids;
  for (const auto &id : compiler_resources_->outputs) {
    anchor_ids.emplace(id, art);
  }
  auto dataFlow = popart::DataFlow(ipu_strategy_->batches_per_step, anchor_ids);

  if (ipu_strategy_->is_training) {
    VLOG(10) << "Creating TrainingSession from Onnx Model...";
    auto optimizer = compiler_resources_->NewOptimizer();
    session_ = popart::TrainingSession::createFromOnnxModel(
        proto, dataFlow, compiler_resources_->loss_var, *optimizer, device_,
        popart::InputShapeInfo(), ipu_strategy_->popart_options,
        ipu_strategy_->popart_patterns);
  } else {
    VLOG(10) << "Creating InferenceSession from Onnx Model...";
    session_ = popart::InferenceSession::createFromOnnxModel(
        proto, dataFlow, device_, popart::InputShapeInfo(),
        ipu_strategy_->popart_options, ipu_strategy_->popart_patterns);
  }
  VLOG(10) << "Creating session from Onnx Model...done";

  VLOG(10) << "Preparing session device...";
  session_->prepareDevice();
  VLOG(10) << "Preparing session device...done";

  SetWeightsIO();

  VLOG(10) << "Copy weights from paddle to popart...";
  WeightsFromPaddle();
  VLOG(10) << "Copy weights from paddle to popart...done";

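  // The maximum uint64 value is used as the "seed not set" sentinel; only an
  // explicitly configured seed is forwarded to popart.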
  if (ipu_strategy_->random_seed != std::numeric_limits<std::uint64_t>::max()) {
    VLOG(10) << "Setting random seed to: " << ipu_strategy_->random_seed;
    session_->setRandomSeed(ipu_strategy_->random_seed);
  }
}

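// Run one step: wrap paddle inputs/outputs as popart IArrays, resize output
// tensors to match the anchor shapes reported by the session, optionally
// refresh the optimizer, then execute the StepIO.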
void Executor::Run(const std::vector<const Tensor *> &inputs,
                   const std::vector<Tensor *> &outputs,
                   const framework::ExecutionContext &ctx) {
  VLOG(10) << "enter Executor::Run";
  // inputs
  std::map<popart::TensorId, popart::IArray &> popart_inputs;
  std::map<popart::TensorId, PaddleIArray> input_wrappers;
  for (size_t i = 0; i < inputs.size(); i++) {
    auto tensor_id = compiler_resources_->inputs[i];
    input_wrappers.emplace(tensor_id, PaddleIArray(inputs[i]));
    popart_inputs.emplace(tensor_id, input_wrappers.at(tensor_id));
  }
  // anchors
  std::map<popart::TensorId, popart::IArray &> popart_anchors;
  std::map<popart::TensorId, PaddleIArray> anchor_wrappers;
  for (size_t i = 0; i < outputs.size(); i++) {
    auto tensor_id = compiler_resources_->outputs[i];
    // get dims & dtype from session
    auto fetch_info = session_->getInfo(tensor_id);
    auto output_shape = fetch_info.shape();
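    // The session reports the per-step shape; popart prepends extra leading
    // dimensions when batches_per_step, gradient accumulation or graph
    // replication is enabled, so mirror that before resizing the tensor.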
    if (ipu_strategy_->batches_per_step > 1) {
      output_shape.insert(output_shape.begin(),
                          ipu_strategy_->batches_per_step);
    }
    if (ipu_strategy_->popart_options.enableGradientAccumulation) {
      output_shape.insert(output_shape.begin(),
                          ipu_strategy_->popart_options.accumulationFactor);
    }
    if (ipu_strategy_->popart_options.enableReplicatedGraphs) {
      output_shape.insert(output_shape.begin(),
                          ipu_strategy_->popart_options.replicatedGraphCount);
    }

    auto *tensor = outputs[i];
    tensor->Resize(phi::make_ddim(output_shape));
    auto fetch_dtype = fetch_info.dataType();
    auto paddle_type = PopartType2VarType(fetch_dtype);
    tensor->mutable_data(ctx.GetPlace(),
                         framework::TransToPhiDataType(paddle_type));
    anchor_wrappers.emplace(tensor_id, PaddleIArray(tensor));
    popart_anchors.emplace(tensor_id, anchor_wrappers.at(tensor_id));
  }
  VLOG(10) << "Prepared inputs/anchors";

  if (ipu_strategy_->is_training && compiler_resources_->with_lr_sched) {
    popart::Optimizer *optimizer;
    if (ipu_strategy_->runtime_options.enable_eval) {
      VLOG(10) << "Switch optimizer to eval mode";
      optimizer = compiler_resources_->eval_optimizer.get();
    } else {
      VLOG(10) << "Update learning_rate";
      auto new_lr =
          GetSingleVarFromScope<float>(scope_, compiler_resources_->lr_var);
      VLOG(10) << "New Lr: " << new_lr;
      optimizer = compiler_resources_->UpdateOptimizer(new_lr);
    }
    auto *session = dynamic_cast<popart::TrainingSession *>(session_.get());
    session->updateOptimizerFromHost(optimizer);
  }

  popart::StepIO stepio(popart_inputs, popart_anchors);
  VLOG(10) << "Running...";
  session_->run(stepio);
  VLOG(10) << "Running...done";
}

void Executor::WeightsToHost() {
  if (ipu_strategy_->is_training && session_) {
    WeightsToPaddle();
  } else {
    LOG(WARNING) << "For a non-training graph, cannot sync weights from IPU.";
  }
}

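// Acquire an IPU device: a simulated IpuModel when POPLAR_IPUMODEL is set,
// a popdist-assigned device when distribution is enabled, or any available
// physical IPUs otherwise.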
void Executor::AcquireDevice() {
  VLOG(10) << "enter Executor::AcquireDevice";
  if (device_) {
    Detach();
    device_.reset();
  }

  bool use_ipu_model = GetBoolEnv("POPLAR_IPUMODEL");
  bool enable_distribution = ipu_strategy_->enable_distribution;
  if (use_ipu_model) {
    std::map<std::string, std::string> deviceOpts{
        {"numIPUs", std::to_string(ipu_strategy_->num_ipus)},
        {"ipuVersion", "ipu2"},
    };
    device_ = popart::DeviceManager::createDeviceManager().createIpuModelDevice(
        deviceOpts);
  } else if (enable_distribution) {
    auto ipus_per_replica = ipu_strategy_->num_ipus /
                            ipu_strategy_->popart_options.replicatedGraphCount;
    auto device_id = popdist_get_device(ipus_per_replica);
    device_ = popart::DeviceManager::createDeviceManager().acquireDeviceById(
        device_id);
    PADDLE_ENFORCE_NOT_NULL(
        device_, platform::errors::Unavailable(
                     "Can't attach IPU in distribution, ipu_num = %d.",
                     RequestIpus(ipu_strategy_->num_ipus)));
  } else {
    device_ =
        popart::DeviceManager::createDeviceManager().acquireAvailableDevice(
            RequestIpus(ipu_strategy_->num_ipus));
    PADDLE_ENFORCE_NOT_NULL(device_, platform::errors::Unavailable(
                                         "Can't attach IPU, ipu_num = %d.",
                                         RequestIpus(ipu_strategy_->num_ipus)));
  }
  VLOG(10) << "leave Executor::AcquireDevice";
}

void Executor::Detach() {
  if (device_ && device_->isAttached()) {
    VLOG(10) << "trying to detach IPU";
    device_->detach();
    VLOG(10) << " detached IPU";
  }
}

void Executor::SetWeightsIO() {
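  // Connect every paddle weight (and its optimizer state tensors) to the
  // matching popart tensor through weights_io so weights can be copied in
  // both directions.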
  auto opt_type = compiler_resources_->optimizer_type;
  VLOG(10) << "SetWeightsIO for " << opt_type;
  auto pre_post_fix = GetOptPrePostfix(opt_type);
  for (const auto &weight_pd : compiler_resources_->weights) {
    for (const auto &pair : pre_post_fix) {
      // pair.first : popart prefix, pair.second : paddle postfix
      auto weight_pop = compiler_resources_->tensors[weight_pd];
      auto popart_var = pair.first + weight_pop;
      auto paddle_var = weight_pd + pair.second;

      if (scope_->FindVar(paddle_var) == nullptr) {
        continue;
      }
      if (!session_->hasInfo(popart_var)) {
        continue;
      }

      VLOG(10) << "Connect paddle weight: " << paddle_var
               << " with popart weight: " << popart_var;
      auto var = scope_->GetVar(paddle_var);
      auto data_ptr = var->GetMutable<framework::LoDTensor>()->data();
      auto tensor_info = session_->getInfo(popart_var);
      executor_resources_->weights_io.insert(popart_var,
                                             {data_ptr, tensor_info});
      executor_resources_->weights_and_opt_state.emplace_back(
          std::make_pair(popart_var, paddle_var));
    }
  }
}

// align_to_popart: align dtype to popart if true, else to paddle
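// The cast is done in place inside the paddle tensor's buffer; this relies on
// the buffer having been allocated for the wider FLOAT type.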
void Executor::ConvertWeights(bool align_to_popart) {
  for (auto weight_pair : executor_resources_->weights_and_opt_state) {
    auto paddle_var = scope_->GetVar(weight_pair.second);
    auto paddle_var_dtype = PdDataType2PopartType(
        paddle_var->GetMutable<framework::LoDTensor>()->dtype());

    PADDLE_ENFORCE_EQ((paddle_var_dtype == popart::DataType::FLOAT ||
                       paddle_var_dtype == popart::DataType::FLOAT16),
                      true,
                      platform::errors::InvalidArgument(
                          "Currently, we only support FLOAT16 and FLOAT with "
                          "Paddle, but received type is %s.",
                          paddle_var_dtype));

    popart::TensorInfo info = session_->getInfo(weight_pair.first);
    auto popart_var_dtype = info.dataType();
    PADDLE_ENFORCE_EQ((popart_var_dtype == popart::DataType::FLOAT ||
                       popart_var_dtype == popart::DataType::FLOAT16),
                      true,
                      platform::errors::InvalidArgument(
                          "Currently, we only support FLOAT16 and FLOAT with "
                          "popart, but received type is %s.",
                          popart_var_dtype));

    if (paddle_var_dtype == popart_var_dtype) {
      VLOG(10) << weight_pair.first << " and " << weight_pair.second
               << " have the same dtype : " << popart_var_dtype;
      continue;
    } else if (paddle_var_dtype == popart::DataType::FLOAT) {
      VLOG(10) << weight_pair.first << " and " << weight_pair.second
               << " have different dtype : " << popart_var_dtype;
      auto *data_ptr =
          paddle_var->GetMutable<framework::LoDTensor>()->data<float>();

      auto num_elem = info.nelms();
      if (align_to_popart) {
        std::vector<uint16_t> fp16_data;
        std::transform(data_ptr, data_ptr + num_elem,
                       std::back_inserter(fp16_data),
                       [&](float elem) { return popart::floatToHalf(elem); });
        memcpy(reinterpret_cast<void *>(data_ptr), fp16_data.data(),
               num_elem * sizeof(float16));
      } else {
        std::vector<float> fp32_data;
        auto fp16_data_ptr = reinterpret_cast<uint16_t *>(data_ptr);
        std::transform(fp16_data_ptr, fp16_data_ptr + num_elem,
                       std::back_inserter(fp32_data), [&](uint16_t elem) {
                         return popart::halfToFloat(elem);
                       });
        memcpy(reinterpret_cast<void *>(data_ptr), fp32_data.data(),
               num_elem * sizeof(float));
      }
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Convert Paddle FLOAT16 to popart FLOAT"));
    }
  }
}

// |-----------------------------------------------------|
// | Paddle  | Popart  |             Method              |
// |-----------------------------------------------------|
// |  FLOAT  |  FLOAT  |         Paddle -> Popart        |
// |  FLOAT  | FLOAT16 | floatToHalf -> Paddle -> Popart |
// | FLOAT16 |  FLOAT  |         Unimplemented           |
// | FLOAT16 | FLOAT16 |         Paddle -> Popart        |
// |-----------------------------------------------------|
// floatToHalf -> Paddle: cast then save to paddle
// Paddle -> Popart: copy from paddle to popart
void Executor::WeightsFromPaddle() {
  ConvertWeights(true);
  session_->writeWeights(executor_resources_->weights_io);
  session_->weightsFromHost();
}

// |-----------------------------------------------------|
// | Paddle  | Popart  |             Method              |
// |-----------------------------------------------------|
// |  FLOAT  |  FLOAT  |         Popart -> Paddle        |
// |  FLOAT  | FLOAT16 | Popart -> Paddle -> halfToFloat |
// | FLOAT16 |  FLOAT  |         Unimplemented           |
// | FLOAT16 | FLOAT16 |         Popart -> Paddle        |
// |-----------------------------------------------------|
// Paddle -> halfToFloat: cast then save to paddle
// Popart -> Paddle: copy from popart to paddle
void Executor::WeightsToPaddle() {
  session_->weightsToHost();
  session_->readWeights(executor_resources_->weights_io);
  ConvertWeights(false);
}

void Executor::SaveModelToHost(const std::string &path) {
  if (session_) {
    WeightsToPaddle();
    session_->modelToHost(path);
  } else {
    LOG(WARNING) << "Model is empty";
  }
}

}  // namespace ipu
}  // namespace platform
}  // namespace paddle