convert_to_mixed_precision.cc 33.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h"

W
Wilber 已提交
17 18
#include <algorithm>
#include <iterator>
19
#include <string>
W
Wilber 已提交
20
#include <unordered_map>
21
#include <unordered_set>
22
#include <utility>
23 24 25

#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/executor.h"
26
#include "paddle/fluid/framework/framework.pb.h"
27 28 29
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
30
#include "paddle/fluid/framework/ir/node.h"
31 32
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
33
#include "paddle/fluid/framework/var_desc.h"
34 35 36 37 38 39 40 41 42 43 44 45 46
#include "paddle/fluid/inference/io.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/tensor_meta.h"

using namespace paddle::framework;  // NOLINT

namespace paddle {
namespace inference {
namespace analysis {

namespace {

W
Wilber 已提交
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
inline std::string SerializeParams(framework::Scope* scope,
                                   const std::vector<std::string>& params) {
  std::ostringstream os;
  phi::CPUContext ctx;
  for (const auto& param : params) {
    VLOG(3) << "Serialize param: " << param;
    PADDLE_ENFORCE_NOT_NULL(
        scope->FindVar(param),
        platform::errors::NotFound("Block should already have a '%s' variable",
                                   param));
    auto* tensor = scope->FindVar(param)->GetMutable<framework::LoDTensor>();
    framework::SerializeToStream(os, *tensor, ctx);
  }
  return os.str();
}

inline void StrToBinary(const std::string& path, const std::string& str) {
  std::ofstream file(path.c_str(), std::ios::binary);
  file.write(str.c_str(), str.size());
  file.close();
}
68

W
Wilber 已提交
69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
inline bool NodeVarHasDtype(framework::ir::Node* node) {
  if (node->IsCtrlVar()) return false;

  if (node->IsVar() &&
      (node->Var()->GetType() ==
           paddle::framework::proto::VarType::SELECTED_ROWS ||
       node->Var()->GetType() ==
           paddle::framework::proto::VarType::LOD_TENSOR ||
       node->Var()->GetType() ==
           paddle::framework::proto::VarType::LOD_TENSOR_ARRAY ||
       node->Var()->GetType() == paddle::framework::proto::VarType::STRINGS ||
       node->Var()->GetType() == paddle::framework::proto::VarType::VOCAB)) {
    return true;
  }

  return false;
}
86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142

// Return Node* which first appers in block.
framework::ir::Node* GetRealNode(
    const std::vector<framework::ir::Graph*>& graphes,
    int block_idx,
    framework::ir::Node* node,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map) {
  if (vars_in_multi_block_map->count(node->Name())) {
    int var_origin_block_id = vars_in_multi_block_map->at(node->Name()).second;
    if (block_idx != var_origin_block_id) {
      auto graph = graphes[var_origin_block_id];
      for (auto nd : graph->Nodes()) {
        if (nd->Name() == node->Name()) {
          return nd;
        }
      }
    }
  }

  return node;
}

inline bool VarIsMultiOpsOut(
    const std::vector<framework::ir::Graph*>& graphes,
    int block_idx,
    framework::ir::Node* op_node,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map,
    const std::vector<std::set<std::string>>& vars_appear_multi_in_one_block) {
  CHECK_EQ(op_node->IsOp(), true);
  for (auto* out : op_node->outputs) {
    if (out->IsCtrlVar()) continue;
    auto* real_node =
        GetRealNode(graphes, block_idx, out, vars_in_multi_block_map);
    if (!real_node->Var()->Persistable() &&
        vars_appear_multi_in_one_block[block_idx].count(out->Name())) {
      VLOG(2) << out->Name()
              << " is multi op's out, so we skip convert to fp16";
      return true;
    }
  }
  return false;
}

void SaveMixedModel(
    framework::ir::Graph* graph,
    framework::Scope* scope,
    framework::ProgramDesc* mixed_program_desc,
    const std::string& mixed_model_file,
    const std::string& mixed_params_file,
    phi::DataType mixed_precision,
    const std::unordered_map<std::string,
                             std::pair<framework::proto::VarType::Type, int>>&
        vars_in_multi_block_map) {
W
Wilber 已提交
143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
  paddle::CPUPlace place;
  auto parameters = scope->LocalVarNames();
  std::sort(parameters.begin(), parameters.end());

  std::unordered_set<std::string> weights_should_be_fp32;
  for (auto* node : graph->Nodes()) {
    if (!(node->IsVar() && !node->IsCtrlVar())) continue;
    if (NodeVarHasDtype(node)) {
      if (node->Var()->Persistable() &&
          node->Var()->GetDataType() ==
              paddle::framework::proto::VarType::FP32) {
        VLOG(2) << "weights keep to fp32: " << node->Name();
        weights_should_be_fp32.insert(node->Name());
      }
    }
  }

  for (const auto& param_name : parameters) {
    auto* var = scope->FindLocalVar(param_name);
    if (var->IsType<framework::LoDTensor>() ||
        var->IsType<framework::Tensor>()) {
      auto* t = var->GetMutable<framework::LoDTensor>();
W
Wilber 已提交
165 166
      if (t->dtype() != phi::DataType::FLOAT32) continue;

W
Wilber 已提交
167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
      framework::Tensor mixed_tensor;
      mixed_tensor.Resize(t->dims());
      auto* data = t->mutable_data<float>(platform::CPUPlace());

      if (mixed_precision == phi::DataType::FLOAT16 &&
          !weights_should_be_fp32.count(param_name)) {
        mixed_tensor.set_type(paddle::experimental::DataType::FLOAT16);
        auto* mixed_data =
            mixed_tensor.mutable_data<float16>(platform::CPUPlace());
        for (int i = 0; i < t->numel(); i++) {
          mixed_data[i] = static_cast<float16>(data[i]);
        }
        t->clear();
        paddle::framework::TensorCopySync(mixed_tensor, place, t);
      } else if (mixed_precision == phi::DataType::BFLOAT16 &&
                 !weights_should_be_fp32.count(param_name)) {
        mixed_tensor.set_type(paddle::experimental::DataType::BFLOAT16);
        auto* mixed_data =
            mixed_tensor.mutable_data<bfloat16>(platform::CPUPlace());
        for (int i = 0; i < t->numel(); i++) {
          mixed_data[i] = static_cast<bfloat16>(data[i]);
        }
        t->clear();
        paddle::framework::TensorCopySync(mixed_tensor, place, t);
      }
    }
  }

  StrToBinary(mixed_model_file,
              mixed_program_desc->Proto()->SerializeAsString());
  StrToBinary(mixed_params_file, SerializeParams(scope, parameters));
}

bool PhiKernelSupportPrecision(
201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
    const std::string& op_type,
    phi::Backend backend,
    phi::DataType data_type,
    phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) {
  auto kernels = phi::KernelFactory::Instance().kernels();
  if (kernels.find(op_type) == kernels.end()) {
    return false;
  }
  phi::KernelKey kernel_key(backend, layout, data_type);
  return phi::KernelFactory::Instance().HasKernel(op_type, kernel_key);
}

bool GpuKernelSupportPrecision(
    const std::string& op_type,
    phi::DataType data_type,
    phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) {
W
Wilber 已提交
217 218 219 220 221 222 223 224 225 226 227
  auto phi_op_type = phi::TransToPhiKernelName(op_type);
  bool res = PhiKernelSupportPrecision(
      phi_op_type, phi::Backend::GPU, data_type, layout);
  res |= PhiKernelSupportPrecision(
      phi_op_type, phi::Backend::GPUDNN, data_type, layout);

  if (!res) {
    auto& all_kernels = OperatorWithKernel::AllOpKernels();
    auto it = all_kernels.find(op_type);
    if (it != all_kernels.end()) {
      for (auto& kern_pair : it->second) {
228 229
        if (platform::is_gpu_place(kern_pair.first.place_) &&
            kern_pair.first.data_type_ == framework::proto::VarType::FP16) {
W
Wilber 已提交
230 231 232 233 234
          res = true;
        }
      }
    }
  }
235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
  return res;
}

// Just process special cases.
bool OutShouldNotConvert(ir::Node* var_node) {
  auto op_node = var_node->inputs[0];
  auto* op_desc = op_node->Op();

  // batch_norm's input and output (variance and mean) are the same.
  if (op_desc->Type() == "batch_norm") {
    auto vecs = op_desc->Output("MeanOut");
    if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
      return true;
    }
    vecs = op_desc->Output("VarianceOut");
    if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
      return true;
    }
    vecs = op_desc->Output("SavedMean");
    if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
      return true;
    }
    vecs = op_desc->Output("SavedVariance");
    if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
      return true;
    }
  }

  return false;
}
265 266 267 268 269 270 271 272 273 274 275 276
void ProcessOutputNode(
    const std::vector<framework::ir::Graph*>& graphes,
    int block_idx,
    ir::Node* var_node,
    framework::proto::VarType::Type to_type,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map) {
  auto* real_node =
      GetRealNode(graphes, block_idx, var_node, vars_in_multi_block_map);
  if (!NodeVarHasDtype(real_node)) return;
  auto* out_var = real_node->Var();
W
Wilber 已提交
277 278 279
  if (out_var->GetDataType() == framework::proto::VarType::FP32) {
    if (OutShouldNotConvert(var_node)) return;
    out_var->SetDataType(to_type);
280
  }
W
Wilber 已提交
281 282
  VLOG(3) << " out_node name " << var_node->Name() << " data_type "
          << out_var->GetDataType();
283 284
}

285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308
// Just process special cases for weights conversion.
bool WeightsShouldNotConvert(ir::Node* var_node) {
  auto op_nodes = var_node->outputs;
  for (auto* op_node : op_nodes) {
    auto* op_desc = op_node->Op();
    // batch_norm op's bias, mean, scale and variance just be float32, so we can
    // not convert the dtype.
    if (op_desc->Type() == "batch_norm") {
      auto vecs = op_desc->Input("Bias");
      if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
        return true;
      }
      vecs = op_desc->Input("Mean");
      if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
        return true;
      }
      vecs = op_desc->Input("Scale");
      if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
        return true;
      }
      vecs = op_desc->Input("Variance");
      if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
        return true;
      }
309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328
    } else if (op_desc->Type() == "fused_multi_transformer") {
      auto vecs = op_desc->Input("LnScale");
      if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
        return true;
      }

      vecs = op_desc->Input("LnBias");
      if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
        return true;
      }

      vecs = op_desc->Input("FFNLnScale");
      if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
        return true;
      }

      vecs = op_desc->Input("FFNLnBias");
      if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
        return true;
      }
329 330 331 332 333
    }
  }

  return false;
}
W
Wilber 已提交
334 335 336
inline bool IsFloatVarType(framework::proto::VarType::Type type) {
  if (type == framework::proto::VarType::FP16 ||
      type == framework::proto::VarType::FP32 ||
337
      type == framework::proto::VarType::BF16)
W
Wilber 已提交
338 339 340
    return true;
  return false;
}
W
Wilber 已提交
341 342
void ProcessInputNode(
    bool support_precision,
343
    std::vector<framework::ir::Graph*> graphes,
W
Wilber 已提交
344 345 346 347 348 349
    ir::Node* in_node,
    ir::Node* op_node,
    int* suffix,
    framework::BlockDesc* block_desc,
    std::unordered_map<framework::ir::Node*, framework::ir::Node*>* cast_map,
    framework::proto::VarType::Type to_type,
350 351 352
    int block_idx,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
W
Wilber 已提交
353
        vars_in_multi_block_map) {
354 355 356 357 358 359
  auto* real_node =
      GetRealNode(graphes, block_idx, in_node, vars_in_multi_block_map);
  if (!NodeVarHasDtype(real_node)) return;
  auto graph = graphes[block_idx];
  bool is_main_block = block_idx == 0;
  auto* in_var = real_node->Var();
W
Wilber 已提交
360
  auto in_var_type = in_var->GetDataType();
361 362 363 364
  bool is_in_multi_block = vars_in_multi_block_map->count(in_var->Name());

  if (!is_main_block && is_in_multi_block) {
    in_var_type = vars_in_multi_block_map->at(in_var->Name()).first;
W
Wilber 已提交
365 366 367 368 369 370
  }
  if (support_precision) {
    if (in_var->Persistable() &&
        in_var_type == framework::proto::VarType::FP32) {
      if (WeightsShouldNotConvert(in_node)) return;
      in_var->SetDataType(to_type);
371
      in_var_type = to_type;
W
Wilber 已提交
372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395
    } else if (!in_var->Persistable() && IsFloatVarType(in_var_type) &&
               in_var_type != to_type) {
      AddCastOp(graph,
                in_node,
                op_node,
                in_var_type,
                to_type,
                suffix,
                block_desc,
                cast_map);
    }
  } else {
    if (!in_var->Persistable() && IsFloatVarType(in_var_type) &&
        in_var_type != to_type) {
      AddCastOp(graph,
                in_node,
                op_node,
                in_var_type,
                to_type,
                suffix,
                block_desc,
                cast_map);
    }
  }
396
  VLOG(3) << " in_node name " << in_var->Name() << " data_type " << in_var_type;
W
Wilber 已提交
397
}
W
Wilber 已提交
398

399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471
void ConvertAllFp64ToFp32(framework::ir::Graph* graph) {
  auto op_nodes = framework::ir::TopologySortOperations(*graph);
  for (auto* op_node : op_nodes) {
    if (!op_node->IsOp()) continue;
    auto op_type = op_node->Op()->Type();
    if (op_type == "feed" || op_type == "fetch") continue;

    if (op_type == "fill_constant") {
      if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) ==
          static_cast<int>(framework::proto::VarType::FP64))
        op_node->Op()->SetAttr(
            "dtype", static_cast<int>(framework::proto::VarType::FP32));
    } else if (op_type == "assign_value") {
      if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) ==
          static_cast<int>(framework::proto::VarType::FP64))
        op_node->Op()->SetAttr(
            "dtype", static_cast<int>(framework::proto::VarType::FP32));
    } else if (op_type == "eye") {
      if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) ==
          static_cast<int>(framework::proto::VarType::FP64))
        op_node->Op()->SetAttr(
            "dtype", static_cast<int>(framework::proto::VarType::FP32));
    } else if (op_type == "fill_any_like") {
      if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) ==
          static_cast<int>(framework::proto::VarType::FP64))
        op_node->Op()->SetAttr(
            "dtype", static_cast<int>(framework::proto::VarType::FP32));
    } else if (op_type == "cast") {
      if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("in_dtype")) ==
          static_cast<int>(framework::proto::VarType::FP64))
        op_node->Op()->SetAttr(
            "in_dtype", static_cast<int>(framework::proto::VarType::FP32));
      if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("out_dtype")) ==
          static_cast<int>(framework::proto::VarType::FP64))
        op_node->Op()->SetAttr(
            "out_dtype", static_cast<int>(framework::proto::VarType::FP32));
    }

    auto inputs = op_node->inputs;
    for (auto* in_node : inputs) {
      if (in_node->IsCtrlVar()) continue;
      auto* in_var = in_node->Var();
      if (!in_var->Persistable() &&
          in_var->GetDataType() == framework::proto::VarType::FP64) {
        in_var->SetDataType(framework::proto::VarType::FP32);
      }
    }
  }
}

// Handle special ops which contains dtype attribute. e.g., fill_constant,
// assign_value.
void HandleSpecialOps(framework::OpDesc* op_desc) {
  if (op_desc->Type() == "fill_constant") {
    if (PADDLE_GET_CONST(int, op_desc->GetAttr("dtype")) ==
        static_cast<int>(framework::proto::VarType::FP32))
      op_desc->SetAttr("dtype",
                       static_cast<int>(framework::proto::VarType::FP16));
  } else if (op_desc->Type() == "assign_value") {
    if (PADDLE_GET_CONST(int, op_desc->GetAttr("dtype")) ==
        static_cast<int>(framework::proto::VarType::FP32))
      op_desc->SetAttr("dtype",
                       static_cast<int>(framework::proto::VarType::FP16));
  } else if (op_desc->Type() == "eye") {
    if (PADDLE_GET_CONST(int, op_desc->GetAttr("dtype")) ==
        static_cast<int>(framework::proto::VarType::FP32))
      op_desc->SetAttr("dtype",
                       static_cast<int>(framework::proto::VarType::FP16));
  } else if (op_desc->Type() == "fill_any_like") {
    if (PADDLE_GET_CONST(int, op_desc->GetAttr("dtype")) ==
        static_cast<int>(framework::proto::VarType::FP32))
      op_desc->SetAttr("dtype",
                       static_cast<int>(framework::proto::VarType::FP16));
W
Wilber 已提交
472 473 474 475 476
  } else if (op_desc->Type() == "fill_constant_batch_size_like") {
    if (PADDLE_GET_CONST(int, op_desc->GetAttr("dtype")) ==
        static_cast<int>(framework::proto::VarType::FP32))
      op_desc->SetAttr("dtype",
                       static_cast<int>(framework::proto::VarType::FP16));
477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497
  }
}

// We modify op's input output precision, and we need to fix cast op in_dtype
// and out_dtype attribute.
void FixCastAttr(framework::ir::Graph* graph) {
  auto op_nodes = framework::ir::TopologySortOperations(*graph);
  for (auto* op_node : op_nodes) {
    if (!op_node->IsOp()) continue;
    auto op_type = op_node->Op()->Type();
    if (op_type != "cast") continue;

    auto input = op_node->inputs[0];
    auto output = op_node->outputs[0];
    op_node->Op()->SetAttr("in_dtype",
                           static_cast<int>(input->Var()->GetDataType()));
    op_node->Op()->SetAttr("out_dtype",
                           static_cast<int>(output->Var()->GetDataType()));
  }
}

W
Wilber 已提交
498 499
void FindVarsInMultiBlock(
    framework::ProgramDesc* program_desc,
500 501 502 503 504 505
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map,
    std::vector<std::set<std::string>>* vars_appear_multi_in_one_block) {
  std::vector<std::set<std::string>> block_var_names_set(program_desc->Size());
  for (size_t i = 0; i < program_desc->Size(); ++i) {
W
Wilber 已提交
506 507
    for (auto op : program_desc->Block(i).AllOps()) {
      auto in_names = op->InputArgumentNames();
508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534
      block_var_names_set[i].insert(in_names.begin(), in_names.end());
      auto out_names = op->OutputArgumentNames();
      if (op->HasAttr("sub_block") == false) {
        for (auto& n : out_names) {
          if (block_var_names_set[i].count(n)) {
            (*vars_appear_multi_in_one_block)[i].insert(n);
          }
        }
      }
      block_var_names_set[i].insert(out_names.begin(), out_names.end());
    }
  }

  for (size_t i = 0; i < program_desc->Size() - 1; ++i) {
    for (size_t j = i + 1; j < program_desc->Size(); ++j) {
      std::set<std::string> vars_in_multi_block;
      std::set_intersection(
          block_var_names_set[i].begin(),
          block_var_names_set[i].end(),
          block_var_names_set[j].begin(),
          block_var_names_set[j].end(),
          std::inserter(vars_in_multi_block, vars_in_multi_block.begin()));

      for (auto name : vars_in_multi_block) {
        vars_in_multi_block_map->emplace(
            name, std::make_pair(framework::proto::VarType::FP32, i));
      }
535
    }
536 537
  }
}
W
Wilber 已提交
538

539 540 541 542 543 544 545 546 547 548 549 550 551 552 553
bool OpInOutHasTensorArray(
    std::vector<framework::ir::Graph*> graphes,
    int block_idx,
    framework::ir::Node* op_node,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map) {
  CHECK_EQ(op_node->IsOp(), true);
  for (auto in : op_node->inputs) {
    auto* real_node =
        GetRealNode(graphes, block_idx, in, vars_in_multi_block_map);
    if (!NodeVarHasDtype(real_node)) continue;
    if (real_node->Var()->GetType() ==
        framework::proto::VarType::LOD_TENSOR_ARRAY)
      return true;
W
Wilber 已提交
554 555
  }

556 557 558 559 560 561 562 563
  for (auto out : op_node->outputs) {
    auto* real_node =
        GetRealNode(graphes, block_idx, out, vars_in_multi_block_map);
    if (!NodeVarHasDtype(real_node)) continue;

    if (real_node->Var()->GetType() ==
        framework::proto::VarType::LOD_TENSOR_ARRAY)
      return true;
564
  }
565
  return false;
566 567
}

W
Wilber 已提交
568 569
void ConvertTensorDtype(
    framework::ProgramDesc* program_desc,
570
    std::vector<framework::ir::Graph*> graphes,
W
Wilber 已提交
571 572 573 574
    const std::unordered_set<std::string>& blacklist,
    bool keep_io_types,
    phi::Backend backend,
    phi::DataType tensor_dtype,
575 576 577 578 579 580
    int block_idx,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map,
    const std::vector<std::set<std::string>>& vars_appear_multi_in_one_block) {
  auto graph = graphes[block_idx];
581 582 583 584 585 586 587
  framework::proto::VarType::Type to_type;
  if (tensor_dtype == phi::DataType::FLOAT16) {
    to_type = framework::proto::VarType::FP16;
  } else if (tensor_dtype == phi::DataType::BFLOAT16) {
    to_type = framework::proto::VarType::BF16;
  } else {
    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
W
Wilber 已提交
588
        "mixed_precision currently not supported dtype %d, we now only "
589
        "support fp16 and bf16.",
590 591 592
        static_cast<int>(tensor_dtype)));
  }

W
Wilber 已提交
593 594 595
  auto* block_desc =
      framework::ir::TopologySortOperations(*graph)[0]->Op()->Block();

596 597 598 599
  int num_low_precision = 0;
  int suffix = 0;
  std::vector<framework::ir::Node*> output_nodes;
  std::unordered_map<framework::ir::Node*, framework::ir::Node*> cast_map;
600 601
  auto op_nodes = framework::ir::TopologySortOperations(*graph);
  for (auto* op_node : op_nodes) {
602 603
    if (!op_node->IsOp()) continue;
    auto op_type = op_node->Op()->Type();
W
Wilber 已提交
604 605
    VLOG(3) << "-------------------- op_type " << op_type << ", phi_type "
            << phi::TransToPhiKernelName(op_type);
606 607 608 609 610 611 612 613 614 615 616
    // 1. set input dtype.
    if (op_type == "feed") {
      auto feed_var = op_node->outputs[0]->Var();
      if (!keep_io_types &&
          feed_var->GetDataType() == framework::proto::VarType::FP32) {
        feed_var->SetDataType(to_type);
      }
    } else if (op_type == "fetch") {
      auto* fetch_var = op_node->inputs[0];
      output_nodes.push_back(fetch_var);
      continue;
617 618
    } else if (op_type == "cast") {
      continue;
619 620
    }

W
Wilber 已提交
621 622 623 624 625
    else if (op_node->Op()->HasAttr("sub_block")) {  // NOLINT
      // sub_block op's output dtype should be same as input dtype, if have the
      // same name.
      std::unordered_map<std::string, framework::ir::Node*> in_name_to_node;
      for (auto* in : op_node->inputs) {
626 627 628
        auto* real_node =
            GetRealNode(graphes, block_idx, in, vars_in_multi_block_map);
        if (NodeVarHasDtype(real_node)) {
W
Wilber 已提交
629 630 631 632 633
          in_name_to_node[in->Name()] = in;
        }
      }

      for (auto out : op_node->outputs) {
634 635 636
        auto* real_node =
            GetRealNode(graphes, block_idx, out, vars_in_multi_block_map);
        if (NodeVarHasDtype(real_node)) {
W
Wilber 已提交
637
          if (in_name_to_node.count(out->Name()))
638
            real_node->Var()->SetDataType(
W
Wilber 已提交
639 640 641 642 643 644 645
                in_name_to_node[out->Name()]->Var()->GetDataType());
        }
      }

      continue;
    }

646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662
    // A strange case found in multi block.
    else if (op_type == "assign" &&  // NOLINT
             op_node->inputs[0]->Name() == op_node->outputs[0]->Name()) {
      VLOG(2) << " in out are same, continue";
      continue;
    }

    // Handle tensor array.
    else if (OpInOutHasTensorArray(  // NOLINT
                 graphes,
                 block_idx,
                 op_node,
                 vars_in_multi_block_map)) {
      VLOG(2) << "  in or out has tensor array, continue";
      continue;
    }

663 664 665 666
    // 2. if op support fp16/bf16 and not in blacklist.
    //      - cast weight to fp16/bf16.
    //      - add cast op if the input dtype is not fp16/bf16.
    //      - set output dtype.
667 668 669 670 671 672 673 674 675
    //
    // If a var(op's out var) appears multiple times in a block, we should not
    // convert to fp16.
    else if (blacklist.count(op_type) == 0 &&  // NOLINT
             !VarIsMultiOpsOut(graphes,
                               block_idx,
                               op_node,
                               vars_in_multi_block_map,
                               vars_appear_multi_in_one_block)) {
676
      bool support_precision =
W
Wilber 已提交
677
          OpSupportPrecision(op_type, backend, tensor_dtype, blacklist);
678
      VLOG(2) << " support low precision " << support_precision;
679

W
Wilber 已提交
680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699
      // if op not has float input, we will not choose the low precision kernel.
      {
        bool has_float_input{false};
        for (auto in_node : op_node->inputs) {
          auto* real_node =
              GetRealNode(graphes, block_idx, in_node, vars_in_multi_block_map);
          if (real_node->Var()->GetDataType() == proto::VarType::FP16 ||
              real_node->Var()->GetDataType() == proto::VarType::FP32 ||
              real_node->Var()->GetDataType() == proto::VarType::FP64 ||
              real_node->Var()->GetDataType() == proto::VarType::BF16) {
            has_float_input = true;
            break;
          }
        }
        if (!has_float_input) {
          support_precision = false;
          VLOG(2) << " op doesn't has float input, just skip.";
        }
      }

700
      if (support_precision) {
701
        HandleSpecialOps(op_node->Op());
702 703
        ++num_low_precision;
        auto inputs = op_node->inputs;
W
Wilber 已提交
704
        // Process inputs.
705
        for (auto* in_node : inputs) {
W
Wilber 已提交
706
          ProcessInputNode(true,
707
                           graphes,
W
Wilber 已提交
708 709 710 711 712 713
                           in_node,
                           op_node,
                           &suffix,
                           block_desc,
                           &cast_map,
                           to_type,
714
                           block_idx,
W
Wilber 已提交
715
                           vars_in_multi_block_map);
716
        }
W
Wilber 已提交
717
        // Process outputs.
718
        for (auto* out_node : op_node->outputs) {
719 720
          ProcessOutputNode(
              graphes, block_idx, out_node, to_type, vars_in_multi_block_map);
721 722 723 724
        }
      } else {
        auto inputs = op_node->inputs;
        for (auto* in_node : inputs) {
W
Wilber 已提交
725
          ProcessInputNode(false,
726
                           graphes,
W
Wilber 已提交
727 728 729 730 731 732
                           in_node,
                           op_node,
                           &suffix,
                           block_desc,
                           &cast_map,
                           framework::proto::VarType::FP32,
733
                           block_idx,
W
Wilber 已提交
734
                           vars_in_multi_block_map);
735 736 737 738 739 740 741
        }
      }
    }

    // 3. check op not support fp16/bf16 or in blacklist.
    //      - add cast op if the input dtype is not fp32.
    else {  // NOLINT
742 743
      auto ins = op_node->inputs;
      for (auto* in_node : ins) {
W
Wilber 已提交
744
        if (in_node->IsCtrlVar()) continue;
745 746 747 748 749 750 751 752 753 754 755 756 757 758 759
        auto* in_var = in_node->Var();
        if (in_var->GetDataType() == to_type) {
          AddCastOp(graph,
                    in_node,
                    op_node,
                    to_type,
                    framework::proto::VarType::FP32,
                    &suffix,
                    block_desc,
                    &cast_map);
        }
      }
    }
  }

W
Wilber 已提交
760 761
  // 4. if output_op's dtype is not compatible to output dtype, then just
  // insert cast.
762
  for (auto* node : output_nodes) {
W
Wilber 已提交
763
    if (node->IsCtrlVar()) continue;
764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788
    auto var = node->Var();
    if (keep_io_types && var->GetDataType() == to_type) {
      // fp16/bf16 -> fp32.
      AddCastOp(graph,
                node,
                node->outputs[0],
                to_type,
                framework::proto::VarType::FP32,
                &suffix,
                block_desc,
                &cast_map);
    } else if (!keep_io_types &&
               var->GetDataType() == framework::proto::VarType::FP32) {
      // fp32 -> fp16/bf16
      AddCastOp(graph,
                node,
                node->outputs[0],
                framework::proto::VarType::FP32,
                to_type,
                &suffix,
                block_desc,
                &cast_map);
    }
  }

789 790 791 792 793 794 795 796 797
  for (auto node : graph->Nodes()) {
    auto* real_node =
        GetRealNode(graphes, block_idx, node, vars_in_multi_block_map);
    if (!NodeVarHasDtype(real_node)) continue;

    if (vars_in_multi_block_map->count(real_node->Name()) &&
        vars_in_multi_block_map->at(real_node->Name()).second == block_idx) {
      vars_in_multi_block_map->at(real_node->Name()).first =
          real_node->Var()->GetDataType();
W
Wilber 已提交
798 799 800
    }
  }

801
  if (num_low_precision)
802 803
    LOG(INFO) << "---  detected " << num_low_precision
              << " low precision ops in " << block_idx << " subgraph";
804 805 806
}
}  // namespace

W
Wilber 已提交
807
bool OpSupportPrecision(const std::string& op_type,
808 809 810
                        phi::Backend backend,
                        phi::DataType precision,
                        const std::unordered_set<std::string>& blacklist) {
W
Wilber 已提交
811
  auto phi_op_type = phi::TransToPhiKernelName(op_type);
812
  bool support_precision = false;
W
Wilber 已提交
813
  if (blacklist.count(op_type) == 0) {
814
    if (backend == phi::Backend::GPU)
W
Wilber 已提交
815
      support_precision = GpuKernelSupportPrecision(op_type, precision);
816 817
    else
      support_precision =
W
Wilber 已提交
818
          PhiKernelSupportPrecision(phi_op_type, backend, precision);
819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885
  }
  return support_precision;
}

void AddCastOp(
    framework::ir::Graph* graph,
    framework::ir::Node* node,
    framework::ir::Node* next_op,
    framework::proto::VarType::Type from_type,
    framework::proto::VarType::Type to_type,
    int* suffix,
    framework::BlockDesc* block_desc,
    std::unordered_map<framework::ir::Node*, framework::ir::Node*>* map) {
  auto update_cast_desc = [&](framework::OpDesc& desc,
                              const std::string& x_name,
                              const std::string& out_name,
                              const int in_dtype,
                              const int out_dtype) {
    desc.SetType("cast");
    desc.SetInput("X", {x_name});
    desc.SetOutput("Out", {out_name});
    desc.SetAttr("in_dtype", in_dtype);
    desc.SetAttr("out_dtype", out_dtype);
    desc.SetAttr("use_mkldnn", false);
    desc.SetAttr("with_quant_attr", false);
    desc.Flush();
  };

  if (map->count(node) == 0) {
    // insert cast op before node.
    std::string cast_input_name = node->Var()->Name();
    std::string cast_output_name =
        node->Var()->Name() + "_cast.tmp_" + std::to_string((*suffix)++);
    CHECK_NOTNULL(block_desc);
    framework::OpDesc cast_op_desc(block_desc);
    update_cast_desc(cast_op_desc,
                     cast_input_name,
                     cast_output_name,
                     static_cast<int>(from_type),
                     static_cast<int>(to_type));
    auto* cast_op_node = graph->CreateOpNode(&cast_op_desc);
    auto* cast_output_vardesc = block_desc->Var(cast_output_name);
    cast_output_vardesc->SetPersistable(false);
    cast_output_vardesc->SetDataType(to_type);
    cast_output_vardesc->SetShape(node->Var()->GetShape());
    auto* cast_output_node = graph->CreateVarNode(cast_output_vardesc);
    IR_NODE_LINK_TO(cast_op_node, cast_output_node);
    (*map)[node] = cast_output_node;
  }
  next_op->Op()->RenameInput(node->Name(), map->at(node)->Name());
  IR_NODE_LINK_TO(node, map->at(node)->inputs[0]);
  IR_NODE_LINK_TO(map->at(node), next_op);
}

void ConvertToMixedPrecision(const std::string& model_file,
                             const std::string& params_file,
                             const std::string& mixed_model_file,
                             const std::string& mixed_params_file,
                             phi::DataType mixed_precision,
                             phi::Backend backend,
                             bool keep_io_types,
                             std::unordered_set<std::string> black_list) {
  paddle::CPUPlace place;
  framework::Executor executor(place);
  framework::Scope scope;
  auto program_desc =
      inference::Load(&executor, &scope, model_file, params_file);
W
Wilber 已提交
886
  auto main_graph = std::unique_ptr<framework::ir::Graph>(
887 888
      new framework::ir::Graph(*program_desc));

889 890
  std::unordered_map<std::string,
                     std::pair<framework::proto::VarType::Type, int>>
W
Wilber 已提交
891
      vars_in_multi_block_map;
892 893 894 895 896
  std::vector<std::set<std::string>> vars_appear_multi_in_one_block(
      program_desc->Size());
  FindVarsInMultiBlock(program_desc.get(),
                       &vars_in_multi_block_map,
                       &vars_appear_multi_in_one_block);
W
Wilber 已提交
897

898
  std::vector<framework::ir::Graph*> graphes;
W
Wilber 已提交
899 900
  for (size_t i = 0; i < main_graph->SubGraphsSize(); ++i) {
    auto graph = main_graph->GetSubGraph(i);
901
    graphes.push_back(graph);
W
Wilber 已提交
902
    VLOG(2) << " --------  handle subgraph " << i << ", has "
903
            << graph->Nodes().size() << " nodes --------";
W
Wilber 已提交
904 905 906

    ConvertAllFp64ToFp32(graph);
    ConvertTensorDtype(program_desc.get(),
907
                       graphes,
W
Wilber 已提交
908 909 910 911
                       black_list,
                       keep_io_types,
                       backend,
                       mixed_precision,
912 913 914
                       i,
                       &vars_in_multi_block_map,
                       vars_appear_multi_in_one_block);
W
Wilber 已提交
915
    FixCastAttr(graph);
916 917
  }

W
Wilber 已提交
918 919 920 921 922 923 924 925
  framework::ProgramDesc mixed_program_desc;
  framework::ir::GraphToProgram(*main_graph, &mixed_program_desc);

  SaveMixedModel(main_graph.get(),
                 &scope,
                 &mixed_program_desc,
                 mixed_model_file,
                 mixed_params_file,
926 927
                 mixed_precision,
                 vars_in_multi_block_map);
928 929 930 931 932
}

}  // namespace analysis
}  // namespace inference
}  // namespace paddle