convert_to_mixed_precision.cc 32.6 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h"

W
Wilber 已提交
17 18
#include <algorithm>
#include <iterator>
19
#include <string>
W
Wilber 已提交
20
#include <unordered_map>
21
#include <unordered_set>
22
#include <utility>
23 24 25

#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/executor.h"
26
#include "paddle/fluid/framework/framework.pb.h"
27 28 29
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
30
#include "paddle/fluid/framework/ir/node.h"
31 32
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
33
#include "paddle/fluid/framework/var_desc.h"
34 35 36 37 38 39 40 41 42 43 44 45 46
#include "paddle/fluid/inference/io.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/tensor_meta.h"

using namespace paddle::framework;  // NOLINT

namespace paddle {
namespace inference {
namespace analysis {

namespace {

W
Wilber 已提交
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
// Serializes the tensors named in `params` into a single byte string.
//
// Each name is looked up in `scope` (an enforce failure is raised when the
// variable is missing) and its tensor is appended to the output stream in the
// order the names are given.
inline std::string SerializeParams(framework::Scope* scope,
                                   const std::vector<std::string>& params) {
  phi::CPUContext cpu_ctx;
  std::ostringstream buffer;
  for (const auto& name : params) {
    VLOG(3) << "Serialize param: " << name;
    auto* var = scope->FindVar(name);
    PADDLE_ENFORCE_NOT_NULL(
        var,
        platform::errors::NotFound("Block should already have a '%s' variable",
                                   name));
    framework::SerializeToStream(
        buffer, *var->GetMutable<framework::LoDTensor>(), cpu_ctx);
  }
  return buffer.str();
}

// Writes the raw bytes of `str` to the file at `path`, opened in binary mode.
// Open or write failures are not reported to the caller.
inline void StrToBinary(const std::string& path, const std::string& str) {
  std::ofstream out(path.c_str(), std::ios::binary);
  out.write(str.data(), static_cast<std::streamsize>(str.size()));
  // The stream is flushed and closed when `out` goes out of scope.
}
68

W
Wilber 已提交
69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
// Tells whether `node` is a (non control-dependency) variable node whose
// variable type is one of SELECTED_ROWS, LOD_TENSOR, LOD_TENSOR_ARRAY, STRINGS
// or VOCAB. Callers in this file use it as a guard before reading or writing
// the variable's data type.
inline bool NodeVarHasDtype(framework::ir::Node* node) {
  if (node->IsCtrlVar() || !node->IsVar()) return false;

  switch (node->Var()->GetType()) {
    case paddle::framework::proto::VarType::SELECTED_ROWS:
    case paddle::framework::proto::VarType::LOD_TENSOR:
    case paddle::framework::proto::VarType::LOD_TENSOR_ARRAY:
    case paddle::framework::proto::VarType::STRINGS:
    case paddle::framework::proto::VarType::VOCAB:
      return true;
    default:
      return false;
  }
}
86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142

// Returns the node that stands for `node`'s variable in the block recorded as
// its origin in `vars_in_multi_block_map`.
//
// `node` itself is returned when the variable is not listed in the map, when
// its origin block is `block_idx`, or when the origin graph holds no node with
// the same name.
framework::ir::Node* GetRealNode(
    const std::vector<framework::ir::Graph*>& graphes,
    int block_idx,
    framework::ir::Node* node,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map) {
  const auto entry = vars_in_multi_block_map->find(node->Name());
  if (entry == vars_in_multi_block_map->end()) return node;

  const int origin_block = entry->second.second;
  if (origin_block == block_idx) return node;

  for (auto* candidate : graphes[origin_block]->Nodes()) {
    if (candidate->Name() == node->Name()) return candidate;
  }
  return node;
}

// Reports whether any output of `op_node` is a non-persistable variable that
// is listed in `vars_appear_multi_in_one_block[block_idx]`. Persistability is
// read from the node returned by GetRealNode. Control-dependency outputs are
// ignored. `op_node` must be an op node.
inline bool VarIsMultiOpsOut(
    const std::vector<framework::ir::Graph*>& graphes,
    int block_idx,
    framework::ir::Node* op_node,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map,
    const std::vector<std::set<std::string>>& vars_appear_multi_in_one_block) {
  CHECK_EQ(op_node->IsOp(), true);
  for (auto* out_node : op_node->outputs) {
    if (out_node->IsCtrlVar()) continue;

    auto* origin =
        GetRealNode(graphes, block_idx, out_node, vars_in_multi_block_map);
    if (origin->Var()->Persistable()) continue;
    if (vars_appear_multi_in_one_block[block_idx].count(out_node->Name()) ==
        0) {
      continue;
    }

    VLOG(2) << out_node->Name()
            << " is multi op's out, so we skip convert to fp16";
    return true;
  }
  return false;
}

// Converts the FP32 tensors stored in `scope` to `mixed_precision` and writes
// the model to disk.
//
// - graph: var nodes that are persistable and still typed FP32 in the graph
//   name the weights whose data is left untouched.
// - scope: every local variable is visited in sorted name order; tensors whose
//   dtype is FLOAT32 are converted in place to FLOAT16 / BFLOAT16.
// - mixed_program_desc: serialized to `mixed_model_file`.
// - mixed_params_file: receives all local variables of `scope`, serialized
//   with SerializeParams.
// - mixed_precision: only FLOAT16 and BFLOAT16 trigger a conversion.
// - vars_in_multi_block_map: not used by this function.
void SaveMixedModel(
    framework::ir::Graph* graph,
    framework::Scope* scope,
    framework::ProgramDesc* mixed_program_desc,
    const std::string& mixed_model_file,
    const std::string& mixed_params_file,
    phi::DataType mixed_precision,
    const std::unordered_map<std::string,
                             std::pair<framework::proto::VarType::Type, int>>&
        vars_in_multi_block_map) {
  paddle::CPUPlace place;
  auto parameters = scope->LocalVarNames();
  std::sort(parameters.begin(), parameters.end());

  // NodeVarHasDtype already rejects op nodes and control-dependency vars.
  std::unordered_set<std::string> weights_should_be_fp32;
  for (auto* node : graph->Nodes()) {
    if (!NodeVarHasDtype(node)) continue;
    if (node->Var()->Persistable() &&
        node->Var()->GetDataType() ==
            paddle::framework::proto::VarType::FP32) {
      VLOG(2) << "weights keep to fp32: " << node->Name();
      weights_should_be_fp32.insert(node->Name());
    }
  }

  for (const auto& param_name : parameters) {
    auto* var = scope->FindLocalVar(param_name);
    if (!var->IsType<framework::LoDTensor>() &&
        !var->IsType<framework::Tensor>()) {
      continue;
    }
    auto* t = var->GetMutable<framework::LoDTensor>();
    if (t->dtype() != phi::DataType::FLOAT32) continue;

    framework::Tensor mixed_tensor;
    mixed_tensor.Resize(t->dims());
    auto* data = t->mutable_data<float>(platform::CPUPlace());
    if (weights_should_be_fp32.count(param_name)) continue;

    // numel() is 64-bit; an `int` index would overflow on very large tensors.
    // It is read before t->clear() because `data` dangles afterwards.
    const int64_t numel = t->numel();
    if (mixed_precision == phi::DataType::FLOAT16) {
      mixed_tensor.set_type(paddle::experimental::DataType::FLOAT16);
      auto* mixed_data =
          mixed_tensor.mutable_data<float16>(platform::CPUPlace());
      for (int64_t i = 0; i < numel; ++i) {
        mixed_data[i] = static_cast<float16>(data[i]);
      }
      t->clear();
      paddle::framework::TensorCopySync(mixed_tensor, place, t);
    } else if (mixed_precision == phi::DataType::BFLOAT16) {
      mixed_tensor.set_type(paddle::experimental::DataType::BFLOAT16);
      auto* mixed_data =
          mixed_tensor.mutable_data<bfloat16>(platform::CPUPlace());
      for (int64_t i = 0; i < numel; ++i) {
        mixed_data[i] = static_cast<bfloat16>(data[i]);
      }
      t->clear();
      paddle::framework::TensorCopySync(mixed_tensor, place, t);
    }
  }

  StrToBinary(mixed_model_file,
              mixed_program_desc->Proto()->SerializeAsString());
  StrToBinary(mixed_params_file, SerializeParams(scope, parameters));
}

bool PhiKernelSupportPrecision(
201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
    const std::string& op_type,
    phi::Backend backend,
    phi::DataType data_type,
    phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) {
  auto kernels = phi::KernelFactory::Instance().kernels();
  if (kernels.find(op_type) == kernels.end()) {
    return false;
  }
  phi::KernelKey kernel_key(backend, layout, data_type);
  return phi::KernelFactory::Instance().HasKernel(op_type, kernel_key);
}

bool GpuKernelSupportPrecision(
    const std::string& op_type,
    phi::DataType data_type,
    phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) {
W
Wilber 已提交
217 218 219 220 221 222 223 224 225 226 227
  auto phi_op_type = phi::TransToPhiKernelName(op_type);
  bool res = PhiKernelSupportPrecision(
      phi_op_type, phi::Backend::GPU, data_type, layout);
  res |= PhiKernelSupportPrecision(
      phi_op_type, phi::Backend::GPUDNN, data_type, layout);

  if (!res) {
    auto& all_kernels = OperatorWithKernel::AllOpKernels();
    auto it = all_kernels.find(op_type);
    if (it != all_kernels.end()) {
      for (auto& kern_pair : it->second) {
228 229
        if (platform::is_gpu_place(kern_pair.first.place_) &&
            kern_pair.first.data_type_ == framework::proto::VarType::FP16) {
W
Wilber 已提交
230 231 232 233 234
          res = true;
        }
      }
    }
  }
235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
  return res;
}

// Special-case filter for output variables that must keep their data type.
//
// Looks at the first input of `var_node` (its producing op). When that op is
// batch_norm and `var_node` is bound to one of its MeanOut, VarianceOut,
// SavedMean or SavedVariance outputs, the variable must not be converted and
// true is returned.
bool OutShouldNotConvert(ir::Node* var_node) {
  auto* producer_desc = var_node->inputs[0]->Op();
  if (producer_desc->Type() != "batch_norm") return false;

  // batch_norm's input and output (variance and mean) are the same.
  const char* const fp32_slots[] = {
      "MeanOut", "VarianceOut", "SavedMean", "SavedVariance"};
  for (const char* slot : fp32_slots) {
    const auto bound_names = producer_desc->Output(slot);
    if (std::find(bound_names.begin(), bound_names.end(), var_node->Name()) !=
        bound_names.end()) {
      return true;
    }
  }
  return false;
}
265 266 267 268 269 270 271 272 273 274 275 276
// Switches an op output variable from FP32 to `to_type`.
//
// The variable description is taken from the node returned by GetRealNode.
// Nothing happens when that node carries no dtype. An FP32 variable is
// retyped unless OutShouldNotConvert(var_node) vetoes it, in which case the
// function returns before logging.
void ProcessOutputNode(
    const std::vector<framework::ir::Graph*>& graphes,
    int block_idx,
    ir::Node* var_node,
    framework::proto::VarType::Type to_type,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map) {
  auto* origin =
      GetRealNode(graphes, block_idx, var_node, vars_in_multi_block_map);
  if (!NodeVarHasDtype(origin)) return;

  auto* var_desc = origin->Var();
  const bool is_fp32 =
      var_desc->GetDataType() == framework::proto::VarType::FP32;
  if (is_fp32) {
    if (OutShouldNotConvert(var_node)) return;
    var_desc->SetDataType(to_type);
  }
  VLOG(3) << " out_node name " << var_node->Name() << " data_type "
          << var_desc->GetDataType();
}

285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308
// Special-case filter for weights that must stay in their current data type.
//
// Returns true when `var_node` feeds
//   - a batch_norm op through Bias, Mean, Scale or Variance (these just be
//     float32, so the dtype can not be converted), or
//   - a fused_multi_transformer op through LnScale, LnBias, FFNLnScale or
//     FFNLnBias.
bool WeightsShouldNotConvert(ir::Node* var_node) {
  const std::vector<std::string> batch_norm_slots = {
      "Bias", "Mean", "Scale", "Variance"};
  const std::vector<std::string> transformer_slots = {
      "LnScale", "LnBias", "FFNLnScale", "FFNLnBias"};

  auto consumers = var_node->outputs;
  for (auto* consumer : consumers) {
    auto* desc = consumer->Op();

    const std::vector<std::string>* slots = nullptr;
    if (desc->Type() == "batch_norm") {
      slots = &batch_norm_slots;
    } else if (desc->Type() == "fused_multi_transformer") {
      slots = &transformer_slots;
    }
    if (slots == nullptr) continue;

    for (const auto& slot : *slots) {
      const auto bound_names = desc->Input(slot);
      if (std::find(bound_names.begin(),
                    bound_names.end(),
                    var_node->Name()) != bound_names.end()) {
        return true;
      }
    }
  }

  return false;
}
W
Wilber 已提交
334 335 336
// True for the floating-point var types this pass casts between: FP16, FP32
// and BF16.
inline bool IsFloatVarType(framework::proto::VarType::Type type) {
  return type == framework::proto::VarType::FP16 ||
         type == framework::proto::VarType::FP32 ||
         type == framework::proto::VarType::BF16;
}
W
Wilber 已提交
341 342
void ProcessInputNode(
    bool support_precision,
343
    std::vector<framework::ir::Graph*> graphes,
W
Wilber 已提交
344 345 346 347 348 349
    ir::Node* in_node,
    ir::Node* op_node,
    int* suffix,
    framework::BlockDesc* block_desc,
    std::unordered_map<framework::ir::Node*, framework::ir::Node*>* cast_map,
    framework::proto::VarType::Type to_type,
350 351 352
    int block_idx,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
W
Wilber 已提交
353
        vars_in_multi_block_map) {
354 355 356 357 358 359
  auto* real_node =
      GetRealNode(graphes, block_idx, in_node, vars_in_multi_block_map);
  if (!NodeVarHasDtype(real_node)) return;
  auto graph = graphes[block_idx];
  bool is_main_block = block_idx == 0;
  auto* in_var = real_node->Var();
W
Wilber 已提交
360
  auto in_var_type = in_var->GetDataType();
361 362 363 364
  bool is_in_multi_block = vars_in_multi_block_map->count(in_var->Name());

  if (!is_main_block && is_in_multi_block) {
    in_var_type = vars_in_multi_block_map->at(in_var->Name()).first;
W
Wilber 已提交
365 366 367 368 369 370
  }
  if (support_precision) {
    if (in_var->Persistable() &&
        in_var_type == framework::proto::VarType::FP32) {
      if (WeightsShouldNotConvert(in_node)) return;
      in_var->SetDataType(to_type);
371
      in_var_type = to_type;
W
Wilber 已提交
372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395
    } else if (!in_var->Persistable() && IsFloatVarType(in_var_type) &&
               in_var_type != to_type) {
      AddCastOp(graph,
                in_node,
                op_node,
                in_var_type,
                to_type,
                suffix,
                block_desc,
                cast_map);
    }
  } else {
    if (!in_var->Persistable() && IsFloatVarType(in_var_type) &&
        in_var_type != to_type) {
      AddCastOp(graph,
                in_node,
                op_node,
                in_var_type,
                to_type,
                suffix,
                block_desc,
                cast_map);
    }
  }
396
  VLOG(3) << " in_node name " << in_var->Name() << " data_type " << in_var_type;
W
Wilber 已提交
397
}
W
Wilber 已提交
398

399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471
// Rewrites FP64 to FP32 throughout `graph`.
//
// For every op except feed/fetch:
//   - fill_constant, assign_value, eye and fill_any_like: an FP64 "dtype"
//     attribute becomes FP32;
//   - cast: FP64 "in_dtype" / "out_dtype" attributes become FP32;
//   - every non-persistable input variable typed FP64 is retyped to FP32.
// Input nodes that carry no dtype (see NodeVarHasDtype) are skipped instead of
// being queried for a data type.
void ConvertAllFp64ToFp32(framework::ir::Graph* graph) {
  const int fp64 = static_cast<int>(framework::proto::VarType::FP64);
  const int fp32 = static_cast<int>(framework::proto::VarType::FP32);

  // Replaces the integer dtype attribute `attr` when it currently says FP64.
  auto demote_attr = [fp64, fp32](framework::OpDesc* desc,
                                  const std::string& attr) {
    if (PADDLE_GET_CONST(int, desc->GetAttr(attr)) == fp64) {
      desc->SetAttr(attr, fp32);
    }
  };

  const std::unordered_set<std::string> dtype_attr_ops = {
      "fill_constant", "assign_value", "eye", "fill_any_like"};

  auto op_nodes = framework::ir::TopologySortOperations(*graph);
  for (auto* op_node : op_nodes) {
    if (!op_node->IsOp()) continue;
    auto op_type = op_node->Op()->Type();
    if (op_type == "feed" || op_type == "fetch") continue;

    if (dtype_attr_ops.count(op_type) != 0) {
      demote_attr(op_node->Op(), "dtype");
    } else if (op_type == "cast") {
      demote_attr(op_node->Op(), "in_dtype");
      demote_attr(op_node->Op(), "out_dtype");
    }

    auto inputs = op_node->inputs;
    for (auto* in_node : inputs) {
      // Also filters control-dependency vars; avoids calling GetDataType() on
      // variable kinds that have no tensor description.
      if (!NodeVarHasDtype(in_node)) continue;
      auto* in_var = in_node->Var();
      if (!in_var->Persistable() &&
          in_var->GetDataType() == framework::proto::VarType::FP64) {
        in_var->SetDataType(framework::proto::VarType::FP32);
      }
    }
  }
}

// Handles ops that carry their output precision in a "dtype" attribute:
// fill_constant, assign_value, eye, fill_any_like and
// fill_constant_batch_size_like. When the attribute currently says FP32 it is
// rewritten to `to_type`.
//
// `to_type` defaults to FP16, which is what this helper always wrote before
// the parameter existed; callers converting to another precision (e.g. BF16)
// should pass it explicitly. Ops of any other type are left untouched.
void HandleSpecialOps(
    framework::OpDesc* op_desc,
    framework::proto::VarType::Type to_type = framework::proto::VarType::FP16) {
  static const std::unordered_set<std::string> kDtypeAttrOps = {
      "fill_constant",
      "assign_value",
      "eye",
      "fill_any_like",
      "fill_constant_batch_size_like"};
  if (kDtypeAttrOps.count(op_desc->Type()) == 0) return;

  if (PADDLE_GET_CONST(int, op_desc->GetAttr("dtype")) ==
      static_cast<int>(framework::proto::VarType::FP32)) {
    op_desc->SetAttr("dtype", static_cast<int>(to_type));
  }
}

// Re-synchronises cast ops with their variables: after the pass has changed
// the precision of op inputs and outputs, every cast op's in_dtype and
// out_dtype attributes are overwritten with the data types of its first input
// and first output variable.
void FixCastAttr(framework::ir::Graph* graph) {
  for (auto* op_node : framework::ir::TopologySortOperations(*graph)) {
    if (!op_node->IsOp()) continue;
    if (op_node->Op()->Type() != "cast") continue;

    auto* src = op_node->inputs[0];
    auto* dst = op_node->outputs[0];
    auto* cast_desc = op_node->Op();
    cast_desc->SetAttr("in_dtype",
                       static_cast<int>(src->Var()->GetDataType()));
    cast_desc->SetAttr("out_dtype",
                       static_cast<int>(dst->Var()->GetDataType()));
  }
}

W
Wilber 已提交
498 499
void FindVarsInMultiBlock(
    framework::ProgramDesc* program_desc,
500 501 502 503 504 505
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map,
    std::vector<std::set<std::string>>* vars_appear_multi_in_one_block) {
  std::vector<std::set<std::string>> block_var_names_set(program_desc->Size());
  for (size_t i = 0; i < program_desc->Size(); ++i) {
W
Wilber 已提交
506 507
    for (auto op : program_desc->Block(i).AllOps()) {
      auto in_names = op->InputArgumentNames();
508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534
      block_var_names_set[i].insert(in_names.begin(), in_names.end());
      auto out_names = op->OutputArgumentNames();
      if (op->HasAttr("sub_block") == false) {
        for (auto& n : out_names) {
          if (block_var_names_set[i].count(n)) {
            (*vars_appear_multi_in_one_block)[i].insert(n);
          }
        }
      }
      block_var_names_set[i].insert(out_names.begin(), out_names.end());
    }
  }

  for (size_t i = 0; i < program_desc->Size() - 1; ++i) {
    for (size_t j = i + 1; j < program_desc->Size(); ++j) {
      std::set<std::string> vars_in_multi_block;
      std::set_intersection(
          block_var_names_set[i].begin(),
          block_var_names_set[i].end(),
          block_var_names_set[j].begin(),
          block_var_names_set[j].end(),
          std::inserter(vars_in_multi_block, vars_in_multi_block.begin()));

      for (auto name : vars_in_multi_block) {
        vars_in_multi_block_map->emplace(
            name, std::make_pair(framework::proto::VarType::FP32, i));
      }
535
    }
536 537
  }
}
W
Wilber 已提交
538

539 540 541 542 543 544 545 546 547 548 549 550 551 552 553
bool OpInOutHasTensorArray(
    std::vector<framework::ir::Graph*> graphes,
    int block_idx,
    framework::ir::Node* op_node,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map) {
  CHECK_EQ(op_node->IsOp(), true);
  for (auto in : op_node->inputs) {
    auto* real_node =
        GetRealNode(graphes, block_idx, in, vars_in_multi_block_map);
    if (!NodeVarHasDtype(real_node)) continue;
    if (real_node->Var()->GetType() ==
        framework::proto::VarType::LOD_TENSOR_ARRAY)
      return true;
W
Wilber 已提交
554 555
  }

556 557 558 559 560 561 562 563
  for (auto out : op_node->outputs) {
    auto* real_node =
        GetRealNode(graphes, block_idx, out, vars_in_multi_block_map);
    if (!NodeVarHasDtype(real_node)) continue;

    if (real_node->Var()->GetType() ==
        framework::proto::VarType::LOD_TENSOR_ARRAY)
      return true;
564
  }
565
  return false;
566 567
}

W
Wilber 已提交
568 569
void ConvertTensorDtype(
    framework::ProgramDesc* program_desc,
570
    std::vector<framework::ir::Graph*> graphes,
W
Wilber 已提交
571 572 573 574
    const std::unordered_set<std::string>& blacklist,
    bool keep_io_types,
    phi::Backend backend,
    phi::DataType tensor_dtype,
575 576 577 578 579 580
    int block_idx,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map,
    const std::vector<std::set<std::string>>& vars_appear_multi_in_one_block) {
  auto graph = graphes[block_idx];
581 582 583 584 585 586 587
  framework::proto::VarType::Type to_type;
  if (tensor_dtype == phi::DataType::FLOAT16) {
    to_type = framework::proto::VarType::FP16;
  } else if (tensor_dtype == phi::DataType::BFLOAT16) {
    to_type = framework::proto::VarType::BF16;
  } else {
    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
W
Wilber 已提交
588
        "mixed_precision currently not supported dtype %d, we now only "
589
        "support fp16 and bf16.",
590 591 592
        static_cast<int>(tensor_dtype)));
  }

W
Wilber 已提交
593 594 595
  auto* block_desc =
      framework::ir::TopologySortOperations(*graph)[0]->Op()->Block();

596 597 598 599
  int num_low_precision = 0;
  int suffix = 0;
  std::vector<framework::ir::Node*> output_nodes;
  std::unordered_map<framework::ir::Node*, framework::ir::Node*> cast_map;
600 601
  auto op_nodes = framework::ir::TopologySortOperations(*graph);
  for (auto* op_node : op_nodes) {
602 603
    if (!op_node->IsOp()) continue;
    auto op_type = op_node->Op()->Type();
W
Wilber 已提交
604 605
    VLOG(3) << "-------------------- op_type " << op_type << ", phi_type "
            << phi::TransToPhiKernelName(op_type);
606 607 608 609 610 611 612 613 614 615 616
    // 1. set input dtype.
    if (op_type == "feed") {
      auto feed_var = op_node->outputs[0]->Var();
      if (!keep_io_types &&
          feed_var->GetDataType() == framework::proto::VarType::FP32) {
        feed_var->SetDataType(to_type);
      }
    } else if (op_type == "fetch") {
      auto* fetch_var = op_node->inputs[0];
      output_nodes.push_back(fetch_var);
      continue;
617 618
    } else if (op_type == "cast") {
      continue;
619 620
    }

W
Wilber 已提交
621 622 623 624 625
    else if (op_node->Op()->HasAttr("sub_block")) {  // NOLINT
      // sub_block op's output dtype should be same as input dtype, if have the
      // same name.
      std::unordered_map<std::string, framework::ir::Node*> in_name_to_node;
      for (auto* in : op_node->inputs) {
626 627 628
        auto* real_node =
            GetRealNode(graphes, block_idx, in, vars_in_multi_block_map);
        if (NodeVarHasDtype(real_node)) {
W
Wilber 已提交
629 630 631 632 633
          in_name_to_node[in->Name()] = in;
        }
      }

      for (auto out : op_node->outputs) {
634 635 636
        auto* real_node =
            GetRealNode(graphes, block_idx, out, vars_in_multi_block_map);
        if (NodeVarHasDtype(real_node)) {
W
Wilber 已提交
637
          if (in_name_to_node.count(out->Name()))
638
            real_node->Var()->SetDataType(
W
Wilber 已提交
639 640 641 642 643 644 645
                in_name_to_node[out->Name()]->Var()->GetDataType());
        }
      }

      continue;
    }

646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662
    // A strange case found in multi block.
    else if (op_type == "assign" &&  // NOLINT
             op_node->inputs[0]->Name() == op_node->outputs[0]->Name()) {
      VLOG(2) << " in out are same, continue";
      continue;
    }

    // Handle tensor array.
    else if (OpInOutHasTensorArray(  // NOLINT
                 graphes,
                 block_idx,
                 op_node,
                 vars_in_multi_block_map)) {
      VLOG(2) << "  in or out has tensor array, continue";
      continue;
    }

663 664 665 666
    // 2. if op support fp16/bf16 and not in blacklist.
    //      - cast weight to fp16/bf16.
    //      - add cast op if the input dtype is not fp16/bf16.
    //      - set output dtype.
667 668 669 670 671 672 673 674 675
    //
    // If a var(op's out var) appears multiple times in a block, we should not
    // convert to fp16.
    else if (blacklist.count(op_type) == 0 &&  // NOLINT
             !VarIsMultiOpsOut(graphes,
                               block_idx,
                               op_node,
                               vars_in_multi_block_map,
                               vars_appear_multi_in_one_block)) {
676
      bool support_precision =
W
Wilber 已提交
677
          OpSupportPrecision(op_type, backend, tensor_dtype, blacklist);
678
      VLOG(2) << " support low precision " << support_precision;
679

680
      if (support_precision) {
681
        HandleSpecialOps(op_node->Op());
682 683
        ++num_low_precision;
        auto inputs = op_node->inputs;
W
Wilber 已提交
684
        // Process inputs.
685
        for (auto* in_node : inputs) {
W
Wilber 已提交
686
          ProcessInputNode(true,
687
                           graphes,
W
Wilber 已提交
688 689 690 691 692 693
                           in_node,
                           op_node,
                           &suffix,
                           block_desc,
                           &cast_map,
                           to_type,
694
                           block_idx,
W
Wilber 已提交
695
                           vars_in_multi_block_map);
696
        }
W
Wilber 已提交
697
        // Process outputs.
698
        for (auto* out_node : op_node->outputs) {
699 700
          ProcessOutputNode(
              graphes, block_idx, out_node, to_type, vars_in_multi_block_map);
701 702 703 704
        }
      } else {
        auto inputs = op_node->inputs;
        for (auto* in_node : inputs) {
W
Wilber 已提交
705
          ProcessInputNode(false,
706
                           graphes,
W
Wilber 已提交
707 708 709 710 711 712
                           in_node,
                           op_node,
                           &suffix,
                           block_desc,
                           &cast_map,
                           framework::proto::VarType::FP32,
713
                           block_idx,
W
Wilber 已提交
714
                           vars_in_multi_block_map);
715 716 717 718 719 720 721
        }
      }
    }

    // 3. check op not support fp16/bf16 or in blacklist.
    //      - add cast op if the input dtype is not fp32.
    else {  // NOLINT
722 723
      auto ins = op_node->inputs;
      for (auto* in_node : ins) {
W
Wilber 已提交
724
        if (in_node->IsCtrlVar()) continue;
725 726 727 728 729 730 731 732 733 734 735 736 737 738 739
        auto* in_var = in_node->Var();
        if (in_var->GetDataType() == to_type) {
          AddCastOp(graph,
                    in_node,
                    op_node,
                    to_type,
                    framework::proto::VarType::FP32,
                    &suffix,
                    block_desc,
                    &cast_map);
        }
      }
    }
  }

W
Wilber 已提交
740 741
  // 4. if output_op's dtype is not compatible to output dtype, then just
  // insert cast.
742
  for (auto* node : output_nodes) {
W
Wilber 已提交
743
    if (node->IsCtrlVar()) continue;
744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768
    auto var = node->Var();
    if (keep_io_types && var->GetDataType() == to_type) {
      // fp16/bf16 -> fp32.
      AddCastOp(graph,
                node,
                node->outputs[0],
                to_type,
                framework::proto::VarType::FP32,
                &suffix,
                block_desc,
                &cast_map);
    } else if (!keep_io_types &&
               var->GetDataType() == framework::proto::VarType::FP32) {
      // fp32 -> fp16/bf16
      AddCastOp(graph,
                node,
                node->outputs[0],
                framework::proto::VarType::FP32,
                to_type,
                &suffix,
                block_desc,
                &cast_map);
    }
  }

769 770 771 772 773 774 775 776 777
  for (auto node : graph->Nodes()) {
    auto* real_node =
        GetRealNode(graphes, block_idx, node, vars_in_multi_block_map);
    if (!NodeVarHasDtype(real_node)) continue;

    if (vars_in_multi_block_map->count(real_node->Name()) &&
        vars_in_multi_block_map->at(real_node->Name()).second == block_idx) {
      vars_in_multi_block_map->at(real_node->Name()).first =
          real_node->Var()->GetDataType();
W
Wilber 已提交
778 779 780
    }
  }

781
  if (num_low_precision)
782 783
    LOG(INFO) << "---  detected " << num_low_precision
              << " low precision ops in " << block_idx << " subgraph";
784 785 786
}
}  // namespace

W
Wilber 已提交
787
bool OpSupportPrecision(const std::string& op_type,
788 789 790
                        phi::Backend backend,
                        phi::DataType precision,
                        const std::unordered_set<std::string>& blacklist) {
W
Wilber 已提交
791
  auto phi_op_type = phi::TransToPhiKernelName(op_type);
792
  bool support_precision = false;
W
Wilber 已提交
793
  if (blacklist.count(op_type) == 0) {
794
    if (backend == phi::Backend::GPU)
W
Wilber 已提交
795
      support_precision = GpuKernelSupportPrecision(op_type, precision);
796 797
    else
      support_precision =
W
Wilber 已提交
798
          PhiKernelSupportPrecision(phi_op_type, backend, precision);
799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865
  }
  return support_precision;
}

// Routes the input `node` of `next_op` through a `cast` op that converts it
// from `from_type` to `to_type`.
//
// Parameters:
//   graph      - graph in which the cast op node and its output var node are
//                created.
//   node       - variable node that is currently consumed by `next_op`.
//   next_op    - op node whose input named `node->Name()` is renamed to the
//                cast output.
//   from_type  - value written to the cast op's `in_dtype` attribute.
//   to_type    - value written to the cast op's `out_dtype` attribute and
//                used as the data type of the cast output variable.
//   suffix     - counter used to build the cast output name
//                (`<var>_cast.tmp_<suffix>`); incremented each time a new
//                cast is created.
//   block_desc - block that owns the new cast output VarDesc; must not be
//                null when a new cast has to be created.
//   map        - cache from a variable node to the cast output node already
//                created for it, so several consumers of the same variable
//                share one cast op.
//
// NOTE: the existing edge between `node` and `next_op` is left in place, as
// in the original implementation.
void AddCastOp(
    framework::ir::Graph* graph,
    framework::ir::Node* node,
    framework::ir::Node* next_op,
    framework::proto::VarType::Type from_type,
    framework::proto::VarType::Type to_type,
    int* suffix,
    framework::BlockDesc* block_desc,
    std::unordered_map<framework::ir::Node*, framework::ir::Node*>* map) {
  auto cached = map->find(node);
  if (cached == map->end()) {
    // No cast exists for this variable yet: insert one right after `node`.
    const std::string cast_input_name = node->Var()->Name();
    const std::string cast_output_name =
        cast_input_name + "_cast.tmp_" + std::to_string((*suffix)++);
    CHECK_NOTNULL(block_desc);

    framework::OpDesc cast_op_desc(block_desc);
    cast_op_desc.SetType("cast");
    cast_op_desc.SetInput("X", {cast_input_name});
    cast_op_desc.SetOutput("Out", {cast_output_name});
    cast_op_desc.SetAttr("in_dtype", static_cast<int>(from_type));
    cast_op_desc.SetAttr("out_dtype", static_cast<int>(to_type));
    cast_op_desc.SetAttr("use_mkldnn", false);
    cast_op_desc.SetAttr("with_quant_attr", false);
    cast_op_desc.Flush();
    auto* cast_op_node = graph->CreateOpNode(&cast_op_desc);

    // The cast output mirrors the source variable's shape, is never
    // persistable, and carries the target data type.
    auto* cast_output_vardesc = block_desc->Var(cast_output_name);
    cast_output_vardesc->SetPersistable(false);
    cast_output_vardesc->SetDataType(to_type);
    cast_output_vardesc->SetShape(node->Var()->GetShape());
    auto* new_cast_output_node = graph->CreateVarNode(cast_output_vardesc);

    IR_NODE_LINK_TO(cast_op_node, new_cast_output_node);
    // Link the source variable to the cast op exactly once, at creation
    // time. Doing it on every call (including cache hits) would append a
    // duplicate edge for each additional consumer that reuses this cast.
    IR_NODE_LINK_TO(node, cast_op_node);

    cached = map->emplace(node, new_cast_output_node).first;
  }

  // Redirect the consumer to read the cast result instead of the original
  // variable.
  framework::ir::Node* cast_output_node = cached->second;
  next_op->Op()->RenameInput(node->Name(), cast_output_node->Name());
  IR_NODE_LINK_TO(cast_output_node, next_op);
}

void ConvertToMixedPrecision(const std::string& model_file,
                             const std::string& params_file,
                             const std::string& mixed_model_file,
                             const std::string& mixed_params_file,
                             phi::DataType mixed_precision,
                             phi::Backend backend,
                             bool keep_io_types,
                             std::unordered_set<std::string> black_list) {
  paddle::CPUPlace place;
  framework::Executor executor(place);
  framework::Scope scope;
  auto program_desc =
      inference::Load(&executor, &scope, model_file, params_file);
W
Wilber 已提交
866
  auto main_graph = std::unique_ptr<framework::ir::Graph>(
867 868
      new framework::ir::Graph(*program_desc));

869 870
  std::unordered_map<std::string,
                     std::pair<framework::proto::VarType::Type, int>>
W
Wilber 已提交
871
      vars_in_multi_block_map;
872 873 874 875 876
  std::vector<std::set<std::string>> vars_appear_multi_in_one_block(
      program_desc->Size());
  FindVarsInMultiBlock(program_desc.get(),
                       &vars_in_multi_block_map,
                       &vars_appear_multi_in_one_block);
W
Wilber 已提交
877

878
  std::vector<framework::ir::Graph*> graphes;
W
Wilber 已提交
879 880
  for (size_t i = 0; i < main_graph->SubGraphsSize(); ++i) {
    auto graph = main_graph->GetSubGraph(i);
881
    graphes.push_back(graph);
W
Wilber 已提交
882
    VLOG(2) << " --------  handle subgraph " << i << ", has "
883
            << graph->Nodes().size() << " nodes --------";
W
Wilber 已提交
884 885 886

    ConvertAllFp64ToFp32(graph);
    ConvertTensorDtype(program_desc.get(),
887
                       graphes,
W
Wilber 已提交
888 889 890 891
                       black_list,
                       keep_io_types,
                       backend,
                       mixed_precision,
892 893 894
                       i,
                       &vars_in_multi_block_map,
                       vars_appear_multi_in_one_block);
W
Wilber 已提交
895
    FixCastAttr(graph);
896 897
  }

W
Wilber 已提交
898 899 900 901 902 903 904 905
  framework::ProgramDesc mixed_program_desc;
  framework::ir::GraphToProgram(*main_graph, &mixed_program_desc);

  SaveMixedModel(main_graph.get(),
                 &scope,
                 &mixed_program_desc,
                 mixed_model_file,
                 mixed_params_file,
906 907
                 mixed_precision,
                 vars_in_multi_block_map);
908 909 910 911 912
}

}  // namespace analysis
}  // namespace inference
}  // namespace paddle