convert_to_mixed_precision.cc 32.5 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h"

W
Wilber 已提交
17 18
#include <algorithm>
#include <iterator>
19
#include <string>
W
Wilber 已提交
20
#include <unordered_map>
21
#include <unordered_set>
22
#include <utility>
23 24 25

#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/executor.h"
26
#include "paddle/fluid/framework/framework.pb.h"
27 28 29
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
30
#include "paddle/fluid/framework/ir/node.h"
31 32
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
33
#include "paddle/fluid/framework/var_desc.h"
34 35 36 37 38 39 40 41 42 43 44 45 46
#include "paddle/fluid/inference/io.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/tensor_meta.h"

using namespace paddle::framework;  // NOLINT

namespace paddle {
namespace inference {
namespace analysis {

namespace {

W
Wilber 已提交
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
// Serializes the tensors named in `params` into a single byte string.
//
// Each name is looked up in `scope` (an error is raised if it is missing), the
// variable is read as a LoDTensor, and the tensor is appended to the output
// stream via framework::SerializeToStream. Tensors are written in the order
// the names appear in `params`.
inline std::string SerializeParams(framework::Scope* scope,
                                   const std::vector<std::string>& params) {
  phi::CPUContext ctx;
  std::ostringstream buffer;
  for (const auto& name : params) {
    VLOG(3) << "Serialize param: " << name;
    auto* var = scope->FindVar(name);
    PADDLE_ENFORCE_NOT_NULL(
        var,
        platform::errors::NotFound("Block should already have a '%s' variable",
                                   name));
    auto* param_tensor = var->GetMutable<framework::LoDTensor>();
    framework::SerializeToStream(buffer, *param_tensor, ctx);
  }
  return buffer.str();
}

// Writes the raw bytes of `str` to the file at `path`, opened in binary mode.
// An existing file at `path` is overwritten.
inline void StrToBinary(const std::string& path, const std::string& str) {
  std::ofstream out(path.c_str(), std::ios::binary);
  out.write(str.data(), static_cast<std::streamsize>(str.size()));
  out.close();
}
68

W
Wilber 已提交
69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
// Returns true when `node` is a non-control variable node whose VarDesc type
// is one of SELECTED_ROWS, LOD_TENSOR, LOD_TENSOR_ARRAY, STRINGS or VOCAB.
// Callers in this file use it as a guard before reading or writing the
// variable's data type.
inline bool NodeVarHasDtype(framework::ir::Node* node) {
  if (node->IsCtrlVar() || !node->IsVar()) return false;

  switch (node->Var()->GetType()) {
    case paddle::framework::proto::VarType::SELECTED_ROWS:
    case paddle::framework::proto::VarType::LOD_TENSOR:
    case paddle::framework::proto::VarType::LOD_TENSOR_ARRAY:
    case paddle::framework::proto::VarType::STRINGS:
    case paddle::framework::proto::VarType::VOCAB:
      return true;
    default:
      return false;
  }
}
86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142

// Returns the Node* from the block in which the variable first appears.
//
// If `node`'s name is recorded in `vars_in_multi_block_map` with an origin
// block different from `block_idx`, the same-named node from that origin
// block's graph is returned. In every other case (name not recorded, origin is
// the current block, or no same-named node found) `node` itself is returned.
framework::ir::Node* GetRealNode(
    const std::vector<framework::ir::Graph*>& graphes,
    int block_idx,
    framework::ir::Node* node,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map) {
  auto entry = vars_in_multi_block_map->find(node->Name());
  if (entry == vars_in_multi_block_map->end()) return node;

  const int origin_block_id = entry->second.second;
  if (origin_block_id == block_idx) return node;

  for (auto* candidate : graphes[origin_block_id]->Nodes()) {
    if (candidate->Name() == node->Name()) return candidate;
  }
  return node;
}

// Returns true if any non-control output of `op_node` is a non-persistable
// variable whose name is listed in `vars_appear_multi_in_one_block` for the
// current block. Persistability is read from the node returned by
// GetRealNode. The caller uses a true result to skip low-precision conversion
// for this op.
inline bool VarIsMultiOpsOut(
    const std::vector<framework::ir::Graph*>& graphes,
    int block_idx,
    framework::ir::Node* op_node,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map,
    const std::vector<std::set<std::string>>& vars_appear_multi_in_one_block) {
  CHECK_EQ(op_node->IsOp(), true);
  for (auto* out_node : op_node->outputs) {
    if (out_node->IsCtrlVar()) continue;

    auto* origin_node =
        GetRealNode(graphes, block_idx, out_node, vars_in_multi_block_map);
    if (origin_node->Var()->Persistable()) continue;
    if (vars_appear_multi_in_one_block[block_idx].count(out_node->Name()) ==
        0) {
      continue;
    }

    VLOG(2) << out_node->Name()
            << " is multi op's out, so we skip convert to fp16";
    return true;
  }
  return false;
}

void SaveMixedModel(
    framework::ir::Graph* graph,
    framework::Scope* scope,
    framework::ProgramDesc* mixed_program_desc,
    const std::string& mixed_model_file,
    const std::string& mixed_params_file,
    phi::DataType mixed_precision,
    const std::unordered_map<std::string,
                             std::pair<framework::proto::VarType::Type, int>>&
        vars_in_multi_block_map) {
W
Wilber 已提交
143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
  paddle::CPUPlace place;
  auto parameters = scope->LocalVarNames();
  std::sort(parameters.begin(), parameters.end());

  std::unordered_set<std::string> weights_should_be_fp32;
  for (auto* node : graph->Nodes()) {
    if (!(node->IsVar() && !node->IsCtrlVar())) continue;
    if (NodeVarHasDtype(node)) {
      if (node->Var()->Persistable() &&
          node->Var()->GetDataType() ==
              paddle::framework::proto::VarType::FP32) {
        VLOG(2) << "weights keep to fp32: " << node->Name();
        weights_should_be_fp32.insert(node->Name());
      }
    }
  }

  for (const auto& param_name : parameters) {
    auto* var = scope->FindLocalVar(param_name);
    if (var->IsType<framework::LoDTensor>() ||
        var->IsType<framework::Tensor>()) {
      auto* t = var->GetMutable<framework::LoDTensor>();
W
Wilber 已提交
165 166
      if (t->dtype() != phi::DataType::FLOAT32) continue;

W
Wilber 已提交
167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
      framework::Tensor mixed_tensor;
      mixed_tensor.Resize(t->dims());
      auto* data = t->mutable_data<float>(platform::CPUPlace());

      if (mixed_precision == phi::DataType::FLOAT16 &&
          !weights_should_be_fp32.count(param_name)) {
        mixed_tensor.set_type(paddle::experimental::DataType::FLOAT16);
        auto* mixed_data =
            mixed_tensor.mutable_data<float16>(platform::CPUPlace());
        for (int i = 0; i < t->numel(); i++) {
          mixed_data[i] = static_cast<float16>(data[i]);
        }
        t->clear();
        paddle::framework::TensorCopySync(mixed_tensor, place, t);
      } else if (mixed_precision == phi::DataType::BFLOAT16 &&
                 !weights_should_be_fp32.count(param_name)) {
        mixed_tensor.set_type(paddle::experimental::DataType::BFLOAT16);
        auto* mixed_data =
            mixed_tensor.mutable_data<bfloat16>(platform::CPUPlace());
        for (int i = 0; i < t->numel(); i++) {
          mixed_data[i] = static_cast<bfloat16>(data[i]);
        }
        t->clear();
        paddle::framework::TensorCopySync(mixed_tensor, place, t);
      }
    }
  }

  StrToBinary(mixed_model_file,
              mixed_program_desc->Proto()->SerializeAsString());
  StrToBinary(mixed_params_file, SerializeParams(scope, parameters));
}

bool PhiKernelSupportPrecision(
201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
    const std::string& op_type,
    phi::Backend backend,
    phi::DataType data_type,
    phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) {
  auto kernels = phi::KernelFactory::Instance().kernels();
  if (kernels.find(op_type) == kernels.end()) {
    return false;
  }
  phi::KernelKey kernel_key(backend, layout, data_type);
  return phi::KernelFactory::Instance().HasKernel(op_type, kernel_key);
}

bool GpuKernelSupportPrecision(
    const std::string& op_type,
    phi::DataType data_type,
    phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) {
W
Wilber 已提交
217 218 219 220 221 222 223 224 225 226 227
  auto phi_op_type = phi::TransToPhiKernelName(op_type);
  bool res = PhiKernelSupportPrecision(
      phi_op_type, phi::Backend::GPU, data_type, layout);
  res |= PhiKernelSupportPrecision(
      phi_op_type, phi::Backend::GPUDNN, data_type, layout);

  if (!res) {
    auto& all_kernels = OperatorWithKernel::AllOpKernels();
    auto it = all_kernels.find(op_type);
    if (it != all_kernels.end()) {
      for (auto& kern_pair : it->second) {
228 229
        if (platform::is_gpu_place(kern_pair.first.place_) &&
            kern_pair.first.data_type_ == framework::proto::VarType::FP16) {
W
Wilber 已提交
230 231 232 233 234
          res = true;
        }
      }
    }
  }
235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
  return res;
}

// Just process special cases.
//
// Returns true when `var_node` must keep its dtype: it is produced (first
// input op of the var node) by a batch_norm op and is listed under one of the
// MeanOut, VarianceOut, SavedMean or SavedVariance outputs.
bool OutShouldNotConvert(ir::Node* var_node) {
  auto* producer_desc = var_node->inputs[0]->Op();
  if (producer_desc->Type() != "batch_norm") return false;

  // batch_norm's input and output (variance and mean) are the same.
  const char* const kept_slots[] = {
      "MeanOut", "VarianceOut", "SavedMean", "SavedVariance"};
  for (const char* slot : kept_slots) {
    auto slot_args = producer_desc->Output(slot);
    if (std::find(slot_args.begin(), slot_args.end(), var_node->Name()) !=
        slot_args.end()) {
      return true;
    }
  }
  return false;
}
265 266 267 268 269 270 271 272 273 274 275 276
// Switches an op output variable from FP32 to `to_type`.
//
// The dtype is read from and written to the node returned by GetRealNode;
// nodes rejected by NodeVarHasDtype are ignored. FP32 outputs flagged by
// OutShouldNotConvert are left unchanged. Outputs that are not FP32 are only
// logged.
void ProcessOutputNode(
    const std::vector<framework::ir::Graph*>& graphes,
    int block_idx,
    ir::Node* var_node,
    framework::proto::VarType::Type to_type,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map) {
  auto* origin_node =
      GetRealNode(graphes, block_idx, var_node, vars_in_multi_block_map);
  if (!NodeVarHasDtype(origin_node)) return;

  auto* out_desc = origin_node->Var();
  const bool is_fp32 =
      out_desc->GetDataType() == framework::proto::VarType::FP32;
  if (is_fp32) {
    if (OutShouldNotConvert(var_node)) return;
    out_desc->SetDataType(to_type);
  }
  VLOG(3) << " out_node name " << var_node->Name() << " data_type "
          << out_desc->GetDataType();
}

285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308
// Just process special cases for weights conversion.
//
// Returns true when `var_node` feeds an op input that must keep its dtype:
//   - batch_norm: Bias, Mean, Scale, Variance (these just be float32, so the
//     dtype can not be converted);
//   - fused_multi_transformer: LnScale, LnBias, FFNLnScale, FFNLnBias.
bool WeightsShouldNotConvert(ir::Node* var_node) {
  // True when the weight is wired into any of `slots` on `desc`.
  auto feeds_any_slot = [var_node](framework::OpDesc* desc,
                                   const std::vector<std::string>& slots) {
    for (const auto& slot : slots) {
      auto slot_args = desc->Input(slot);
      if (std::find(slot_args.begin(), slot_args.end(), var_node->Name()) !=
          slot_args.end()) {
        return true;
      }
    }
    return false;
  };

  const std::vector<std::string> batch_norm_slots = {
      "Bias", "Mean", "Scale", "Variance"};
  const std::vector<std::string> transformer_slots = {
      "LnScale", "LnBias", "FFNLnScale", "FFNLnBias"};

  auto consumers = var_node->outputs;
  for (auto* consumer : consumers) {
    auto* consumer_desc = consumer->Op();
    if (consumer_desc->Type() == "batch_norm") {
      if (feeds_any_slot(consumer_desc, batch_norm_slots)) return true;
    } else if (consumer_desc->Type() == "fused_multi_transformer") {
      if (feeds_any_slot(consumer_desc, transformer_slots)) return true;
    }
  }

  return false;
}
W
Wilber 已提交
334 335 336
// Returns true for the floating-point var types this pass converts between:
// FP16, FP32 and BF16.
inline bool IsFloatVarType(framework::proto::VarType::Type type) {
  switch (type) {
    case framework::proto::VarType::FP16:
    case framework::proto::VarType::FP32:
    case framework::proto::VarType::BF16:
      return true;
    default:
      return false;
  }
}
W
Wilber 已提交
341 342
void ProcessInputNode(
    bool support_precision,
343
    std::vector<framework::ir::Graph*> graphes,
W
Wilber 已提交
344 345 346 347 348 349
    ir::Node* in_node,
    ir::Node* op_node,
    int* suffix,
    framework::BlockDesc* block_desc,
    std::unordered_map<framework::ir::Node*, framework::ir::Node*>* cast_map,
    framework::proto::VarType::Type to_type,
350 351 352
    int block_idx,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
W
Wilber 已提交
353
        vars_in_multi_block_map) {
354 355 356 357 358 359
  auto* real_node =
      GetRealNode(graphes, block_idx, in_node, vars_in_multi_block_map);
  if (!NodeVarHasDtype(real_node)) return;
  auto graph = graphes[block_idx];
  bool is_main_block = block_idx == 0;
  auto* in_var = real_node->Var();
W
Wilber 已提交
360
  auto in_var_type = in_var->GetDataType();
361 362 363 364
  bool is_in_multi_block = vars_in_multi_block_map->count(in_var->Name());

  if (!is_main_block && is_in_multi_block) {
    in_var_type = vars_in_multi_block_map->at(in_var->Name()).first;
W
Wilber 已提交
365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394
  }
  if (support_precision) {
    if (in_var->Persistable() &&
        in_var_type == framework::proto::VarType::FP32) {
      if (WeightsShouldNotConvert(in_node)) return;
      in_var->SetDataType(to_type);
    } else if (!in_var->Persistable() && IsFloatVarType(in_var_type) &&
               in_var_type != to_type) {
      AddCastOp(graph,
                in_node,
                op_node,
                in_var_type,
                to_type,
                suffix,
                block_desc,
                cast_map);
    }
  } else {
    if (!in_var->Persistable() && IsFloatVarType(in_var_type) &&
        in_var_type != to_type) {
      AddCastOp(graph,
                in_node,
                op_node,
                in_var_type,
                to_type,
                suffix,
                block_desc,
                cast_map);
    }
  }
395
  VLOG(3) << " in_node name " << in_var->Name() << " data_type " << in_var_type;
W
Wilber 已提交
396
}
W
Wilber 已提交
397

398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470
// Rewrites FP64 to FP32 throughout `graph`.
//
// For every op except feed/fetch:
//   - fill_constant, assign_value, eye, fill_any_like: the "dtype" attribute
//     is changed from FP64 to FP32;
//   - cast: the "in_dtype" and "out_dtype" attributes are changed likewise;
//   - every non-persistable input variable whose data type is FP64 is retyped
//     to FP32.
void ConvertAllFp64ToFp32(framework::ir::Graph* graph) {
  const int fp64 = static_cast<int>(framework::proto::VarType::FP64);
  const int fp32 = static_cast<int>(framework::proto::VarType::FP32);

  // Changes integer attribute `attr` of `desc` from FP64 to FP32.
  auto demote_attr = [fp64, fp32](framework::OpDesc* desc,
                                  const std::string& attr) {
    if (PADDLE_GET_CONST(int, desc->GetAttr(attr)) == fp64) {
      desc->SetAttr(attr, fp32);
    }
  };

  auto op_nodes = framework::ir::TopologySortOperations(*graph);
  for (auto* op_node : op_nodes) {
    if (!op_node->IsOp()) continue;
    auto* op_desc = op_node->Op();
    const auto op_type = op_desc->Type();
    if (op_type == "feed" || op_type == "fetch") continue;

    if (op_type == "fill_constant" || op_type == "assign_value" ||
        op_type == "eye" || op_type == "fill_any_like") {
      demote_attr(op_desc, "dtype");
    } else if (op_type == "cast") {
      demote_attr(op_desc, "in_dtype");
      demote_attr(op_desc, "out_dtype");
    }

    for (auto* in_node : op_node->inputs) {
      // Skip control vars and var types rejected by NodeVarHasDtype before
      // touching the data type; this file guards GetDataType() the same way
      // in its other helpers.
      if (!NodeVarHasDtype(in_node)) continue;
      auto* in_var = in_node->Var();
      if (!in_var->Persistable() &&
          in_var->GetDataType() == framework::proto::VarType::FP64) {
        in_var->SetDataType(framework::proto::VarType::FP32);
      }
    }
  }
}

// Handle special ops which contain a dtype attribute: fill_constant,
// assign_value, eye, fill_any_like and fill_constant_batch_size_like.
//
// If such an op's "dtype" attribute is FP32 it is rewritten to `to_type`.
// `to_type` defaults to FP16, which is the value this function previously
// hard-coded, so single-argument calls behave exactly as before; pass BF16
// explicitly when converting to bfloat16.
void HandleSpecialOps(
    framework::OpDesc* op_desc,
    framework::proto::VarType::Type to_type = framework::proto::VarType::FP16) {
  const auto op_type = op_desc->Type();
  const bool has_dtype_attr =
      op_type == "fill_constant" || op_type == "assign_value" ||
      op_type == "eye" || op_type == "fill_any_like" ||
      op_type == "fill_constant_batch_size_like";
  if (!has_dtype_attr) return;

  if (PADDLE_GET_CONST(int, op_desc->GetAttr("dtype")) ==
      static_cast<int>(framework::proto::VarType::FP32)) {
    op_desc->SetAttr("dtype", static_cast<int>(to_type));
  }
}

// We modify op's input output precision, and we need to fix cast op in_dtype
// and out_dtype attribute.
//
// For every cast op in `graph`, "in_dtype" is set to the data type of its
// first input variable and "out_dtype" to that of its first output variable.
void FixCastAttr(framework::ir::Graph* graph) {
  for (auto* op_node : framework::ir::TopologySortOperations(*graph)) {
    if (!op_node->IsOp()) continue;

    auto* cast_desc = op_node->Op();
    if (cast_desc->Type() != "cast") continue;

    auto* src_node = op_node->inputs[0];
    auto* dst_node = op_node->outputs[0];
    const int in_dtype = static_cast<int>(src_node->Var()->GetDataType());
    cast_desc->SetAttr("in_dtype", in_dtype);
    const int out_dtype = static_cast<int>(dst_node->Var()->GetDataType());
    cast_desc->SetAttr("out_dtype", out_dtype);
  }
}

W
Wilber 已提交
497 498
void FindVarsInMultiBlock(
    framework::ProgramDesc* program_desc,
499 500 501 502 503 504
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map,
    std::vector<std::set<std::string>>* vars_appear_multi_in_one_block) {
  std::vector<std::set<std::string>> block_var_names_set(program_desc->Size());
  for (size_t i = 0; i < program_desc->Size(); ++i) {
W
Wilber 已提交
505 506
    for (auto op : program_desc->Block(i).AllOps()) {
      auto in_names = op->InputArgumentNames();
507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533
      block_var_names_set[i].insert(in_names.begin(), in_names.end());
      auto out_names = op->OutputArgumentNames();
      if (op->HasAttr("sub_block") == false) {
        for (auto& n : out_names) {
          if (block_var_names_set[i].count(n)) {
            (*vars_appear_multi_in_one_block)[i].insert(n);
          }
        }
      }
      block_var_names_set[i].insert(out_names.begin(), out_names.end());
    }
  }

  for (size_t i = 0; i < program_desc->Size() - 1; ++i) {
    for (size_t j = i + 1; j < program_desc->Size(); ++j) {
      std::set<std::string> vars_in_multi_block;
      std::set_intersection(
          block_var_names_set[i].begin(),
          block_var_names_set[i].end(),
          block_var_names_set[j].begin(),
          block_var_names_set[j].end(),
          std::inserter(vars_in_multi_block, vars_in_multi_block.begin()));

      for (auto name : vars_in_multi_block) {
        vars_in_multi_block_map->emplace(
            name, std::make_pair(framework::proto::VarType::FP32, i));
      }
534
    }
535 536
  }
}
W
Wilber 已提交
537

538 539 540 541 542 543 544 545 546 547 548 549 550 551 552
bool OpInOutHasTensorArray(
    std::vector<framework::ir::Graph*> graphes,
    int block_idx,
    framework::ir::Node* op_node,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map) {
  CHECK_EQ(op_node->IsOp(), true);
  for (auto in : op_node->inputs) {
    auto* real_node =
        GetRealNode(graphes, block_idx, in, vars_in_multi_block_map);
    if (!NodeVarHasDtype(real_node)) continue;
    if (real_node->Var()->GetType() ==
        framework::proto::VarType::LOD_TENSOR_ARRAY)
      return true;
W
Wilber 已提交
553 554
  }

555 556 557 558 559 560 561 562
  for (auto out : op_node->outputs) {
    auto* real_node =
        GetRealNode(graphes, block_idx, out, vars_in_multi_block_map);
    if (!NodeVarHasDtype(real_node)) continue;

    if (real_node->Var()->GetType() ==
        framework::proto::VarType::LOD_TENSOR_ARRAY)
      return true;
563
  }
564
  return false;
565 566
}

W
Wilber 已提交
567 568
void ConvertTensorDtype(
    framework::ProgramDesc* program_desc,
569
    std::vector<framework::ir::Graph*> graphes,
W
Wilber 已提交
570 571 572 573
    const std::unordered_set<std::string>& blacklist,
    bool keep_io_types,
    phi::Backend backend,
    phi::DataType tensor_dtype,
574 575 576 577 578 579
    int block_idx,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map,
    const std::vector<std::set<std::string>>& vars_appear_multi_in_one_block) {
  auto graph = graphes[block_idx];
580 581 582 583 584 585 586
  framework::proto::VarType::Type to_type;
  if (tensor_dtype == phi::DataType::FLOAT16) {
    to_type = framework::proto::VarType::FP16;
  } else if (tensor_dtype == phi::DataType::BFLOAT16) {
    to_type = framework::proto::VarType::BF16;
  } else {
    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
W
Wilber 已提交
587
        "mixed_precision currently not supported dtype %d, we now only "
588
        "support fp16 and bf16.",
589 590 591
        static_cast<int>(tensor_dtype)));
  }

W
Wilber 已提交
592 593 594
  auto* block_desc =
      framework::ir::TopologySortOperations(*graph)[0]->Op()->Block();

595 596 597 598
  int num_low_precision = 0;
  int suffix = 0;
  std::vector<framework::ir::Node*> output_nodes;
  std::unordered_map<framework::ir::Node*, framework::ir::Node*> cast_map;
599 600
  auto op_nodes = framework::ir::TopologySortOperations(*graph);
  for (auto* op_node : op_nodes) {
601 602
    if (!op_node->IsOp()) continue;
    auto op_type = op_node->Op()->Type();
W
Wilber 已提交
603 604
    VLOG(3) << "-------------------- op_type " << op_type << ", phi_type "
            << phi::TransToPhiKernelName(op_type);
605 606 607 608 609 610 611 612 613 614 615
    // 1. set input dtype.
    if (op_type == "feed") {
      auto feed_var = op_node->outputs[0]->Var();
      if (!keep_io_types &&
          feed_var->GetDataType() == framework::proto::VarType::FP32) {
        feed_var->SetDataType(to_type);
      }
    } else if (op_type == "fetch") {
      auto* fetch_var = op_node->inputs[0];
      output_nodes.push_back(fetch_var);
      continue;
616 617
    } else if (op_type == "cast") {
      continue;
618 619
    }

W
Wilber 已提交
620 621 622 623 624
    else if (op_node->Op()->HasAttr("sub_block")) {  // NOLINT
      // sub_block op's output dtype should be same as input dtype, if have the
      // same name.
      std::unordered_map<std::string, framework::ir::Node*> in_name_to_node;
      for (auto* in : op_node->inputs) {
625 626 627
        auto* real_node =
            GetRealNode(graphes, block_idx, in, vars_in_multi_block_map);
        if (NodeVarHasDtype(real_node)) {
W
Wilber 已提交
628 629 630 631 632
          in_name_to_node[in->Name()] = in;
        }
      }

      for (auto out : op_node->outputs) {
633 634 635
        auto* real_node =
            GetRealNode(graphes, block_idx, out, vars_in_multi_block_map);
        if (NodeVarHasDtype(real_node)) {
W
Wilber 已提交
636
          if (in_name_to_node.count(out->Name()))
637
            real_node->Var()->SetDataType(
W
Wilber 已提交
638 639 640 641 642 643 644
                in_name_to_node[out->Name()]->Var()->GetDataType());
        }
      }

      continue;
    }

645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661
    // A strange case found in multi block.
    else if (op_type == "assign" &&  // NOLINT
             op_node->inputs[0]->Name() == op_node->outputs[0]->Name()) {
      VLOG(2) << " in out are same, continue";
      continue;
    }

    // Handle tensor array.
    else if (OpInOutHasTensorArray(  // NOLINT
                 graphes,
                 block_idx,
                 op_node,
                 vars_in_multi_block_map)) {
      VLOG(2) << "  in or out has tensor array, continue";
      continue;
    }

662 663 664 665
    // 2. if op support fp16/bf16 and not in blacklist.
    //      - cast weight to fp16/bf16.
    //      - add cast op if the input dtype is not fp16/bf16.
    //      - set output dtype.
666 667 668 669 670 671 672 673 674
    //
    // If a var(op's out var) appears multiple times in a block, we should not
    // convert to fp16.
    else if (blacklist.count(op_type) == 0 &&  // NOLINT
             !VarIsMultiOpsOut(graphes,
                               block_idx,
                               op_node,
                               vars_in_multi_block_map,
                               vars_appear_multi_in_one_block)) {
675
      bool support_precision =
W
Wilber 已提交
676
          OpSupportPrecision(op_type, backend, tensor_dtype, blacklist);
677
      VLOG(2) << " support low precision " << support_precision;
678

679
      if (support_precision) {
680
        HandleSpecialOps(op_node->Op());
681 682
        ++num_low_precision;
        auto inputs = op_node->inputs;
W
Wilber 已提交
683
        // Process inputs.
684
        for (auto* in_node : inputs) {
W
Wilber 已提交
685
          ProcessInputNode(true,
686
                           graphes,
W
Wilber 已提交
687 688 689 690 691 692
                           in_node,
                           op_node,
                           &suffix,
                           block_desc,
                           &cast_map,
                           to_type,
693
                           block_idx,
W
Wilber 已提交
694
                           vars_in_multi_block_map);
695
        }
W
Wilber 已提交
696
        // Process outputs.
697
        for (auto* out_node : op_node->outputs) {
698 699
          ProcessOutputNode(
              graphes, block_idx, out_node, to_type, vars_in_multi_block_map);
700 701 702 703
        }
      } else {
        auto inputs = op_node->inputs;
        for (auto* in_node : inputs) {
W
Wilber 已提交
704
          ProcessInputNode(false,
705
                           graphes,
W
Wilber 已提交
706 707 708 709 710 711
                           in_node,
                           op_node,
                           &suffix,
                           block_desc,
                           &cast_map,
                           framework::proto::VarType::FP32,
712
                           block_idx,
W
Wilber 已提交
713
                           vars_in_multi_block_map);
714 715 716 717 718 719 720
        }
      }
    }

    // 3. check op not support fp16/bf16 or in blacklist.
    //      - add cast op if the input dtype is not fp32.
    else {  // NOLINT
721 722
      auto ins = op_node->inputs;
      for (auto* in_node : ins) {
W
Wilber 已提交
723
        if (in_node->IsCtrlVar()) continue;
724 725 726 727 728 729 730 731 732 733 734 735 736 737 738
        auto* in_var = in_node->Var();
        if (in_var->GetDataType() == to_type) {
          AddCastOp(graph,
                    in_node,
                    op_node,
                    to_type,
                    framework::proto::VarType::FP32,
                    &suffix,
                    block_desc,
                    &cast_map);
        }
      }
    }
  }

W
Wilber 已提交
739 740
  // 4. if output_op's dtype is not compatible to output dtype, then just
  // insert cast.
741
  for (auto* node : output_nodes) {
W
Wilber 已提交
742
    if (node->IsCtrlVar()) continue;
743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767
    auto var = node->Var();
    if (keep_io_types && var->GetDataType() == to_type) {
      // fp16/bf16 -> fp32.
      AddCastOp(graph,
                node,
                node->outputs[0],
                to_type,
                framework::proto::VarType::FP32,
                &suffix,
                block_desc,
                &cast_map);
    } else if (!keep_io_types &&
               var->GetDataType() == framework::proto::VarType::FP32) {
      // fp32 -> fp16/bf16
      AddCastOp(graph,
                node,
                node->outputs[0],
                framework::proto::VarType::FP32,
                to_type,
                &suffix,
                block_desc,
                &cast_map);
    }
  }

768 769 770 771 772 773 774 775 776
  for (auto node : graph->Nodes()) {
    auto* real_node =
        GetRealNode(graphes, block_idx, node, vars_in_multi_block_map);
    if (!NodeVarHasDtype(real_node)) continue;

    if (vars_in_multi_block_map->count(real_node->Name()) &&
        vars_in_multi_block_map->at(real_node->Name()).second == block_idx) {
      vars_in_multi_block_map->at(real_node->Name()).first =
          real_node->Var()->GetDataType();
W
Wilber 已提交
777 778 779
    }
  }

780
  if (num_low_precision)
781 782
    LOG(INFO) << "---  detected " << num_low_precision
              << " low precision ops in " << block_idx << " subgraph";
783 784 785
}
}  // namespace

W
Wilber 已提交
786
bool OpSupportPrecision(const std::string& op_type,
787 788 789
                        phi::Backend backend,
                        phi::DataType precision,
                        const std::unordered_set<std::string>& blacklist) {
W
Wilber 已提交
790
  auto phi_op_type = phi::TransToPhiKernelName(op_type);
791
  bool support_precision = false;
W
Wilber 已提交
792
  if (blacklist.count(op_type) == 0) {
793
    if (backend == phi::Backend::GPU)
W
Wilber 已提交
794
      support_precision = GpuKernelSupportPrecision(op_type, precision);
795 796
    else
      support_precision =
W
Wilber 已提交
797
          PhiKernelSupportPrecision(phi_op_type, backend, precision);
798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864
  }
  return support_precision;
}

/// Routes the value of variable `node` through a `cast` op before it reaches
/// `next_op`, creating the cast on first use and reusing it afterwards.
///
/// @param graph       Graph that receives the new cast op node and its output
///                    var node.
/// @param node        Variable node whose value has to be cast.
/// @param next_op     Op node consuming `node`; its input named `node->Name()`
///                    is renamed to the cast output.
/// @param from_type   Stored in the cast op's `in_dtype` attribute.
/// @param to_type     Stored in `out_dtype` and set as the dtype of the cast
///                    output variable.
/// @param suffix      Counter appended to the generated output name; it is
///                    incremented every time a new cast is created.
/// @param block_desc  Block that owns the new op desc and output var desc.
///                    Checked for null only when a cast has to be created.
/// @param map         Cache from a source var node to its cast output var
///                    node, so each variable is cast at most once.
void AddCastOp(
    framework::ir::Graph* graph,
    framework::ir::Node* node,
    framework::ir::Node* next_op,
    framework::proto::VarType::Type from_type,
    framework::proto::VarType::Type to_type,
    int* suffix,
    framework::BlockDesc* block_desc,
    std::unordered_map<framework::ir::Node*, framework::ir::Node*>* map) {
  // Fills `desc` so that it describes `out_name = cast(x_name)`.
  auto update_cast_desc = [&](framework::OpDesc& desc,
                              const std::string& x_name,
                              const std::string& out_name,
                              const int in_dtype,
                              const int out_dtype) {
    desc.SetType("cast");
    desc.SetInput("X", {x_name});
    desc.SetOutput("Out", {out_name});
    desc.SetAttr("in_dtype", in_dtype);
    desc.SetAttr("out_dtype", out_dtype);
    desc.SetAttr("use_mkldnn", false);
    desc.SetAttr("with_quant_attr", false);
    desc.Flush();
  };

  if (map->count(node) == 0) {
    // First request for this variable: insert a cast op right after `node`.
    std::string cast_input_name = node->Var()->Name();
    std::string cast_output_name =
        node->Var()->Name() + "_cast.tmp_" + std::to_string((*suffix)++);
    CHECK_NOTNULL(block_desc);
    framework::OpDesc cast_op_desc(block_desc);
    update_cast_desc(cast_op_desc,
                     cast_input_name,
                     cast_output_name,
                     static_cast<int>(from_type),
                     static_cast<int>(to_type));
    auto* cast_op_node = graph->CreateOpNode(&cast_op_desc);

    // The cast result is a non-persistable variable with the source shape.
    auto* cast_output_vardesc = block_desc->Var(cast_output_name);
    cast_output_vardesc->SetPersistable(false);
    cast_output_vardesc->SetDataType(to_type);
    cast_output_vardesc->SetShape(node->Var()->GetShape());
    auto* cast_output_node = graph->CreateVarNode(cast_output_vardesc);

    IR_NODE_LINK_TO(cast_op_node, cast_output_node);
    // Wire node -> cast op exactly once, here at creation. Doing it on every
    // call would push a duplicate edge into node->outputs and
    // cast_op_node->inputs each time the cached cast is reused.
    IR_NODE_LINK_TO(node, cast_op_node);
    (*map)[node] = cast_output_node;
  }

  // Redirect the consumer to the (new or cached) cast output.
  auto* cast_result_node = map->at(node);
  next_op->Op()->RenameInput(node->Name(), cast_result_node->Name());
  IR_NODE_LINK_TO(cast_result_node, next_op);
}

void ConvertToMixedPrecision(const std::string& model_file,
                             const std::string& params_file,
                             const std::string& mixed_model_file,
                             const std::string& mixed_params_file,
                             phi::DataType mixed_precision,
                             phi::Backend backend,
                             bool keep_io_types,
                             std::unordered_set<std::string> black_list) {
  paddle::CPUPlace place;
  framework::Executor executor(place);
  framework::Scope scope;
  auto program_desc =
      inference::Load(&executor, &scope, model_file, params_file);
W
Wilber 已提交
865
  auto main_graph = std::unique_ptr<framework::ir::Graph>(
866 867
      new framework::ir::Graph(*program_desc));

868 869
  std::unordered_map<std::string,
                     std::pair<framework::proto::VarType::Type, int>>
W
Wilber 已提交
870
      vars_in_multi_block_map;
871 872 873 874 875
  std::vector<std::set<std::string>> vars_appear_multi_in_one_block(
      program_desc->Size());
  FindVarsInMultiBlock(program_desc.get(),
                       &vars_in_multi_block_map,
                       &vars_appear_multi_in_one_block);
W
Wilber 已提交
876

877
  std::vector<framework::ir::Graph*> graphes;
W
Wilber 已提交
878 879
  for (size_t i = 0; i < main_graph->SubGraphsSize(); ++i) {
    auto graph = main_graph->GetSubGraph(i);
880
    graphes.push_back(graph);
W
Wilber 已提交
881
    VLOG(2) << " --------  handle subgraph " << i << ", has "
882
            << graph->Nodes().size() << " nodes --------";
W
Wilber 已提交
883 884 885

    ConvertAllFp64ToFp32(graph);
    ConvertTensorDtype(program_desc.get(),
886
                       graphes,
W
Wilber 已提交
887 888 889 890
                       black_list,
                       keep_io_types,
                       backend,
                       mixed_precision,
891 892 893
                       i,
                       &vars_in_multi_block_map,
                       vars_appear_multi_in_one_block);
W
Wilber 已提交
894
    FixCastAttr(graph);
895 896
  }

W
Wilber 已提交
897 898 899 900 901 902 903 904
  framework::ProgramDesc mixed_program_desc;
  framework::ir::GraphToProgram(*main_graph, &mixed_program_desc);

  SaveMixedModel(main_graph.get(),
                 &scope,
                 &mixed_program_desc,
                 mixed_model_file,
                 mixed_params_file,
905 906
                 mixed_precision,
                 vars_in_multi_block_map);
907 908 909 910 911
}

}  // namespace analysis
}  // namespace inference
}  // namespace paddle