// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h"

#include <algorithm>
#include <iterator>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>

#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/tensor_meta.h"

using namespace paddle::framework;  // NOLINT

namespace paddle {
namespace inference {
namespace analysis {

namespace {

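// Serialize all persistable parameters named in |params| from |scope| into a
// single binary stream (the combined params-file format).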
inline std::string SerializeParams(framework::Scope* scope,
                                   const std::vector<std::string>& params) {
  std::ostringstream os;
  phi::CPUContext ctx;
  for (const auto& param : params) {
    VLOG(3) << "Serialize param: " << param;
    PADDLE_ENFORCE_NOT_NULL(
        scope->FindVar(param),
        platform::errors::NotFound("Block should already have a '%s' variable",
                                   param));
    auto* tensor = scope->FindVar(param)->GetMutable<phi::DenseTensor>();
    framework::SerializeToStream(os, *tensor, ctx);
  }
  return os.str();
}

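// Write the raw bytes of |str| to the file at |path|.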
inline void StrToBinary(const std::string& path, const std::string& str) {
  std::ofstream file(path.c_str(), std::ios::binary);
  file.write(str.c_str(), str.size());
  file.close();
}
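
// Return true if |node| is a non-control variable node whose type carries a
// data type (selected rows, lod tensor, lod tensor array, strings or vocab).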
inline bool NodeVarHasDtype(framework::ir::Node* node) {
  if (node->IsCtrlVar()) return false;

  if (node->IsVar() &&
      (node->Var()->GetType() ==
           paddle::framework::proto::VarType::SELECTED_ROWS ||
       node->Var()->GetType() ==
           paddle::framework::proto::VarType::LOD_TENSOR ||
       node->Var()->GetType() ==
           paddle::framework::proto::VarType::LOD_TENSOR_ARRAY ||
       node->Var()->GetType() == paddle::framework::proto::VarType::STRINGS ||
       node->Var()->GetType() == paddle::framework::proto::VarType::VOCAB)) {
    return true;
  }

  return false;
}

// Return the Node* for this variable from the block where it first appears.
framework::ir::Node* GetRealNode(
    const std::vector<framework::ir::Graph*>& graphes,
    int block_idx,
    framework::ir::Node* node,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map) {
  if (vars_in_multi_block_map->count(node->Name())) {
    int var_origin_block_id = vars_in_multi_block_map->at(node->Name()).second;
    if (block_idx != var_origin_block_id) {
      auto graph = graphes[var_origin_block_id];
      for (auto nd : graph->Nodes()) {
        if (nd->Name() == node->Name()) {
          return nd;
        }
      }
    }
  }

  return node;
}

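// Return true if any non-persistable output of |op_node| appears as the
// output of more than one op in its block; such vars are not converted.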
inline bool VarIsMultiOpsOut(
    const std::vector<framework::ir::Graph*>& graphes,
    int block_idx,
    framework::ir::Node* op_node,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map,
    const std::vector<std::set<std::string>>& vars_appear_multi_in_one_block) {
  CHECK_EQ(op_node->IsOp(), true);
  for (auto* out : op_node->outputs) {
    if (out->IsCtrlVar()) continue;
    auto* real_node =
        GetRealNode(graphes, block_idx, out, vars_in_multi_block_map);
    if (!real_node->Var()->Persistable() &&
        vars_appear_multi_in_one_block[block_idx].count(out->Name())) {
      VLOG(2) << out->Name()
              << " is an output of multiple ops, skip converting it to fp16";
      return true;
    }
  }
  return false;
}

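// Cast fp32 weight tensors in |scope| to |mixed_precision|, except weights
// whose var desc is still fp32 (those must stay fp32), then write the program
// and the serialized params to |mixed_model_file| / |mixed_params_file|.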
void SaveMixedModel(
    framework::ir::Graph* graph,
    framework::Scope* scope,
    framework::ProgramDesc* mixed_program_desc,
    const std::string& mixed_model_file,
    const std::string& mixed_params_file,
    phi::DataType mixed_precision,
    const std::unordered_map<std::string,
                             std::pair<framework::proto::VarType::Type, int>>&
        vars_in_multi_block_map) {
  paddle::CPUPlace place;
  auto parameters = scope->LocalVarNames();
  std::sort(parameters.begin(), parameters.end());

  std::unordered_set<std::string> weights_should_be_fp32;
  for (auto* node : graph->Nodes()) {
    if (!(node->IsVar() && !node->IsCtrlVar())) continue;
    if (NodeVarHasDtype(node)) {
      if (node->Var()->Persistable() &&
          node->Var()->GetDataType() ==
              paddle::framework::proto::VarType::FP32) {
        VLOG(2) << "weights keep to fp32: " << node->Name();
        weights_should_be_fp32.insert(node->Name());
      }
    }
  }

  for (const auto& param_name : parameters) {
    auto* var = scope->FindLocalVar(param_name);
    if (var->IsType<phi::DenseTensor>()) {
      auto* t = var->GetMutable<phi::DenseTensor>();
      if (t->dtype() != phi::DataType::FLOAT32) continue;

      phi::DenseTensor mixed_tensor;
      mixed_tensor.Resize(t->dims());
      auto* data = t->mutable_data<float>(platform::CPUPlace());

      if (mixed_precision == phi::DataType::FLOAT16 &&
          !weights_should_be_fp32.count(param_name)) {
        mixed_tensor.set_type(paddle::experimental::DataType::FLOAT16);
        auto* mixed_data =
            mixed_tensor.mutable_data<float16>(platform::CPUPlace());
        for (int i = 0; i < t->numel(); i++) {
          mixed_data[i] = static_cast<float16>(data[i]);
        }
        t->clear();
        paddle::framework::TensorCopySync(mixed_tensor, place, t);
      } else if (mixed_precision == phi::DataType::BFLOAT16 &&
                 !weights_should_be_fp32.count(param_name)) {
        mixed_tensor.set_type(paddle::experimental::DataType::BFLOAT16);
        auto* mixed_data =
            mixed_tensor.mutable_data<bfloat16>(platform::CPUPlace());
        for (int i = 0; i < t->numel(); i++) {
          mixed_data[i] = static_cast<bfloat16>(data[i]);
        }
        t->clear();
        paddle::framework::TensorCopySync(mixed_tensor, place, t);
      }
    }
  }

  StrToBinary(mixed_model_file,
              mixed_program_desc->Proto()->SerializeAsString());
  StrToBinary(mixed_params_file, SerializeParams(scope, parameters));
}

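// Return true if a phi kernel of |op_type| is registered for the given
// backend, data type and layout.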
bool PhiKernelSupportPrecision(
    const std::string& op_type,
    phi::Backend backend,
    phi::DataType data_type,
    phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) {
  auto kernels = phi::KernelFactory::Instance().kernels();
  if (kernels.find(op_type) == kernels.end()) {
    return false;
  }
  phi::KernelKey kernel_key(backend, layout, data_type);
  return phi::KernelFactory::Instance().HasKernel(op_type, kernel_key);
}

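// Return true if |op_type| has a GPU kernel with |data_type|: first check the
// phi GPU/GPUDNN kernels, then fall back to the fluid kernel registry (which
// is only checked for fp16 GPU kernels).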
bool GpuKernelSupportPrecision(
    const std::string& op_type,
    phi::DataType data_type,
    phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) {
  auto phi_op_type = phi::TransToPhiKernelName(op_type);
  bool res = PhiKernelSupportPrecision(
      phi_op_type, phi::Backend::GPU, data_type, layout);
  res |= PhiKernelSupportPrecision(
      phi_op_type, phi::Backend::GPUDNN, data_type, layout);

  if (!res) {
    auto& all_kernels = OperatorWithKernel::AllOpKernels();
    auto it = all_kernels.find(op_type);
    if (it != all_kernels.end()) {
      for (auto& kern_pair : it->second) {
        if (platform::is_gpu_place(kern_pair.first.place_) &&
            kern_pair.first.data_type_ == framework::proto::VarType::FP16) {
          res = true;
        }
      }
    }
  }
  return res;
}

// Only handle special cases: some op outputs must not be converted.
bool OutShouldNotConvert(ir::Node* var_node) {
  auto op_node = var_node->inputs[0];
  auto* op_desc = op_node->Op();

  // batch_norm's mean/variance outputs reuse the corresponding input vars,
  // so these outputs must not be converted.
  if (op_desc->Type() == "batch_norm") {
    auto vecs = op_desc->Output("MeanOut");
    if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
      return true;
    }
    vecs = op_desc->Output("VarianceOut");
    if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
      return true;
    }
    vecs = op_desc->Output("SavedMean");
    if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
      return true;
    }
    vecs = op_desc->Output("SavedVariance");
    if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
      return true;
    }
  }

  return false;
}
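
// Set an op output var to |to_type| if it is currently fp32 and is not a
// special-case output (e.g. batch_norm statistics).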
void ProcessOutputNode(
    const std::vector<framework::ir::Graph*>& graphes,
    int block_idx,
    ir::Node* var_node,
    framework::proto::VarType::Type to_type,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map) {
  auto* real_node =
      GetRealNode(graphes, block_idx, var_node, vars_in_multi_block_map);
  if (!NodeVarHasDtype(real_node)) return;
  auto* out_var = real_node->Var();
  if (out_var->GetDataType() == framework::proto::VarType::FP32) {
    if (OutShouldNotConvert(var_node)) return;
    out_var->SetDataType(to_type);
  }
  VLOG(3) << " out_node name " << var_node->Name() << " data_type "
          << out_var->GetDataType();
}

// Just process special cases for weights conversion.
bool WeightsShouldNotConvert(ir::Node* var_node) {
  auto op_nodes = var_node->outputs;
  for (auto* op_node : op_nodes) {
    auto* op_desc = op_node->Op();
    // batch_norm op's bias, mean, scale and variance must be float32, so we
    // cannot convert their dtype.
    if (op_desc->Type() == "batch_norm") {
      auto vecs = op_desc->Input("Bias");
      if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
        return true;
      }
      vecs = op_desc->Input("Mean");
      if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
        return true;
      }
      vecs = op_desc->Input("Scale");
      if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
        return true;
      }
      vecs = op_desc->Input("Variance");
      if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
        return true;
      }
    } else if (op_desc->Type() == "fused_multi_transformer") {
      auto vecs = op_desc->Input("LnScale");
      if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
        return true;
      }

      vecs = op_desc->Input("LnBias");
      if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
        return true;
      }

      vecs = op_desc->Input("FFNLnScale");
      if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
        return true;
      }

      vecs = op_desc->Input("FFNLnBias");
      if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) {
        return true;
      }
    }
  }

  return false;
}
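
// Return true for the float var types handled by this pass (fp16/fp32/bf16).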
inline bool IsFloatVarType(framework::proto::VarType::Type type) {
  if (type == framework::proto::VarType::FP16 ||
      type == framework::proto::VarType::FP32 ||
      type == framework::proto::VarType::BF16)
    return true;
  return false;
}
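
// Convert one op input: if the op runs in low precision, fp32 weights are
// rewritten to |to_type| in place (unless special-cased); non-persistable
// float inputs of a different type get a cast op inserted before |op_node|.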
void ProcessInputNode(
    bool support_precision,
    std::vector<framework::ir::Graph*> graphes,
    ir::Node* in_node,
    ir::Node* op_node,
    int* suffix,
    framework::BlockDesc* block_desc,
    std::unordered_map<framework::ir::Node*, framework::ir::Node*>* cast_map,
    framework::proto::VarType::Type to_type,
    int block_idx,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map) {
  auto* real_node =
      GetRealNode(graphes, block_idx, in_node, vars_in_multi_block_map);
  if (!NodeVarHasDtype(real_node)) return;
  auto graph = graphes[block_idx];
  bool is_main_block = block_idx == 0;
  auto* in_var = real_node->Var();
  auto in_var_type = in_var->GetDataType();
  bool is_in_multi_block = vars_in_multi_block_map->count(in_var->Name());

  if (!is_main_block && is_in_multi_block) {
    in_var_type = vars_in_multi_block_map->at(in_var->Name()).first;
  }
  if (support_precision) {
    if (in_var->Persistable() &&
        in_var_type == framework::proto::VarType::FP32) {
      if (WeightsShouldNotConvert(in_node)) return;
      in_var->SetDataType(to_type);
      in_var_type = to_type;
    } else if (!in_var->Persistable() && IsFloatVarType(in_var_type) &&
               in_var_type != to_type) {
      AddCastOp(graph,
                in_node,
                op_node,
                in_var_type,
                to_type,
                suffix,
                block_desc,
                cast_map);
    }
  } else {
    if (!in_var->Persistable() && IsFloatVarType(in_var_type) &&
        in_var_type != to_type) {
      AddCastOp(graph,
                in_node,
                op_node,
                in_var_type,
                to_type,
                suffix,
                block_desc,
                cast_map);
    }
  }
  VLOG(3) << " in_node name " << in_var->Name() << " data_type " << in_var_type;
}
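
// Replace fp64 with fp32 throughout the graph: rewrite the dtype attributes
// of fill_constant/assign_value/eye/fill_any_like/cast ops and set
// non-persistable fp64 input vars to fp32.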
void ConvertAllFp64ToFp32(framework::ir::Graph* graph) {
  auto op_nodes = framework::ir::TopologySortOperations(*graph);
  for (auto* op_node : op_nodes) {
    if (!op_node->IsOp()) continue;
    auto op_type = op_node->Op()->Type();
    if (op_type == "feed" || op_type == "fetch") continue;

    if (op_type == "fill_constant") {
      if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) ==
          static_cast<int>(framework::proto::VarType::FP64))
        op_node->Op()->SetAttr(
            "dtype", static_cast<int>(framework::proto::VarType::FP32));
    } else if (op_type == "assign_value") {
      if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) ==
          static_cast<int>(framework::proto::VarType::FP64))
        op_node->Op()->SetAttr(
            "dtype", static_cast<int>(framework::proto::VarType::FP32));
    } else if (op_type == "eye") {
      if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) ==
          static_cast<int>(framework::proto::VarType::FP64))
        op_node->Op()->SetAttr(
            "dtype", static_cast<int>(framework::proto::VarType::FP32));
    } else if (op_type == "fill_any_like") {
      if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) ==
          static_cast<int>(framework::proto::VarType::FP64))
        op_node->Op()->SetAttr(
            "dtype", static_cast<int>(framework::proto::VarType::FP32));
    } else if (op_type == "cast") {
      if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("in_dtype")) ==
          static_cast<int>(framework::proto::VarType::FP64))
        op_node->Op()->SetAttr(
            "in_dtype", static_cast<int>(framework::proto::VarType::FP32));
      if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("out_dtype")) ==
          static_cast<int>(framework::proto::VarType::FP64))
        op_node->Op()->SetAttr(
            "out_dtype", static_cast<int>(framework::proto::VarType::FP32));
    }

    auto inputs = op_node->inputs;
    for (auto* in_node : inputs) {
      if (in_node->IsCtrlVar()) continue;
      auto* in_var = in_node->Var();
      if (!in_var->Persistable() &&
          in_var->GetDataType() == framework::proto::VarType::FP64) {
        in_var->SetDataType(framework::proto::VarType::FP32);
      }
    }
  }
}

// Handle special ops which contain a dtype attribute, e.g., fill_constant,
// assign_value.
void HandleSpecialOps(framework::OpDesc* op_desc) {
  if (op_desc->Type() == "fill_constant") {
    if (PADDLE_GET_CONST(int, op_desc->GetAttr("dtype")) ==
        static_cast<int>(framework::proto::VarType::FP32))
      op_desc->SetAttr("dtype",
                       static_cast<int>(framework::proto::VarType::FP16));
  } else if (op_desc->Type() == "assign_value") {
    if (PADDLE_GET_CONST(int, op_desc->GetAttr("dtype")) ==
        static_cast<int>(framework::proto::VarType::FP32))
      op_desc->SetAttr("dtype",
                       static_cast<int>(framework::proto::VarType::FP16));
  } else if (op_desc->Type() == "eye") {
    if (PADDLE_GET_CONST(int, op_desc->GetAttr("dtype")) ==
        static_cast<int>(framework::proto::VarType::FP32))
      op_desc->SetAttr("dtype",
                       static_cast<int>(framework::proto::VarType::FP16));
  } else if (op_desc->Type() == "fill_any_like") {
    if (PADDLE_GET_CONST(int, op_desc->GetAttr("dtype")) ==
        static_cast<int>(framework::proto::VarType::FP32))
      op_desc->SetAttr("dtype",
                       static_cast<int>(framework::proto::VarType::FP16));
  } else if (op_desc->Type() == "fill_constant_batch_size_like") {
    if (PADDLE_GET_CONST(int, op_desc->GetAttr("dtype")) ==
        static_cast<int>(framework::proto::VarType::FP32))
      op_desc->SetAttr("dtype",
                       static_cast<int>(framework::proto::VarType::FP16));
  }
}

// Ops' input/output precision may have been modified, so the cast ops'
// in_dtype and out_dtype attributes need to be fixed to match their vars.
void FixCastAttr(framework::ir::Graph* graph) {
  auto op_nodes = framework::ir::TopologySortOperations(*graph);
  for (auto* op_node : op_nodes) {
    if (!op_node->IsOp()) continue;
    auto op_type = op_node->Op()->Type();
    if (op_type != "cast") continue;

    auto input = op_node->inputs[0];
    auto output = op_node->outputs[0];
    op_node->Op()->SetAttr("in_dtype",
                           static_cast<int>(input->Var()->GetDataType()));
    op_node->Op()->SetAttr("out_dtype",
                           static_cast<int>(output->Var()->GetDataType()));
  }
}

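// Collect (1) vars that are shared between blocks, recorded in
// |vars_in_multi_block_map| with the index of the block where they first
// appear, and (2) vars produced by more than one op within a single block,
// recorded in |vars_appear_multi_in_one_block|.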
void FindVarsInMultiBlock(
    framework::ProgramDesc* program_desc,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map,
    std::vector<std::set<std::string>>* vars_appear_multi_in_one_block) {
  std::vector<std::set<std::string>> block_var_names_set(program_desc->Size());
  for (size_t i = 0; i < program_desc->Size(); ++i) {
    for (auto op : program_desc->Block(i).AllOps()) {
      auto in_names = op->InputArgumentNames();
      block_var_names_set[i].insert(in_names.begin(), in_names.end());
      auto out_names = op->OutputArgumentNames();
      if (op->HasAttr("sub_block") == false) {
        for (auto& n : out_names) {
          if (block_var_names_set[i].count(n)) {
            (*vars_appear_multi_in_one_block)[i].insert(n);
          }
        }
      }
      block_var_names_set[i].insert(out_names.begin(), out_names.end());
    }
  }

  for (size_t i = 0; i < program_desc->Size() - 1; ++i) {
    for (size_t j = i + 1; j < program_desc->Size(); ++j) {
      std::set<std::string> vars_in_multi_block;
      std::set_intersection(
          block_var_names_set[i].begin(),
          block_var_names_set[i].end(),
          block_var_names_set[j].begin(),
          block_var_names_set[j].end(),
          std::inserter(vars_in_multi_block, vars_in_multi_block.begin()));

      for (auto name : vars_in_multi_block) {
        vars_in_multi_block_map->emplace(
            name, std::make_pair(framework::proto::VarType::FP32, i));
      }
    }
  }
}
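
// Return true if any input or output var of |op_node| is a LoDTensorArray.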
bool OpInOutHasTensorArray(
    std::vector<framework::ir::Graph*> graphes,
    int block_idx,
    framework::ir::Node* op_node,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map) {
  CHECK_EQ(op_node->IsOp(), true);
  for (auto in : op_node->inputs) {
    auto* real_node =
        GetRealNode(graphes, block_idx, in, vars_in_multi_block_map);
    if (!NodeVarHasDtype(real_node)) continue;
    if (real_node->Var()->GetType() ==
        framework::proto::VarType::LOD_TENSOR_ARRAY)
      return true;
  }

  for (auto out : op_node->outputs) {
    auto* real_node =
        GetRealNode(graphes, block_idx, out, vars_in_multi_block_map);
    if (!NodeVarHasDtype(real_node)) continue;

    if (real_node->Var()->GetType() ==
        framework::proto::VarType::LOD_TENSOR_ARRAY)
      return true;
  }
  return false;
}

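// Convert one block's graph to low precision: set var dtypes to fp16/bf16 for
// ops whose kernels support it, insert cast ops around unsupported ops and at
// graph outputs, and record the final dtype of vars shared across blocks.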
void ConvertTensorDtype(
    framework::ProgramDesc* program_desc,
    std::vector<framework::ir::Graph*> graphes,
    const std::unordered_set<std::string>& blacklist,
    bool keep_io_types,
    phi::Backend backend,
    phi::DataType tensor_dtype,
    int block_idx,
    std::unordered_map<std::string,
                       std::pair<framework::proto::VarType::Type, int>>*
        vars_in_multi_block_map,
    const std::vector<std::set<std::string>>& vars_appear_multi_in_one_block) {
  auto graph = graphes[block_idx];
  framework::proto::VarType::Type to_type;
  if (tensor_dtype == phi::DataType::FLOAT16) {
    to_type = framework::proto::VarType::FP16;
  } else if (tensor_dtype == phi::DataType::BFLOAT16) {
    to_type = framework::proto::VarType::BF16;
  } else {
    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
        "mixed_precision does not support dtype %d; only fp16 and bf16 are "
        "supported now.",
        static_cast<int>(tensor_dtype)));
  }

  auto* block_desc =
      framework::ir::TopologySortOperations(*graph)[0]->Op()->Block();

  int num_low_precision = 0;
  int suffix = 0;
  std::vector<framework::ir::Node*> output_nodes;
  std::unordered_map<framework::ir::Node*, framework::ir::Node*> cast_map;
  auto op_nodes = framework::ir::TopologySortOperations(*graph);
  for (auto* op_node : op_nodes) {
    if (!op_node->IsOp()) continue;
    auto op_type = op_node->Op()->Type();
    VLOG(3) << "-------------------- op_type " << op_type << ", phi_type "
            << phi::TransToPhiKernelName(op_type);
    // 1. Set input (feed) dtype and collect output (fetch) vars.
    if (op_type == "feed") {
      auto feed_var = op_node->outputs[0]->Var();
      if (!keep_io_types &&
          feed_var->GetDataType() == framework::proto::VarType::FP32) {
        feed_var->SetDataType(to_type);
      }
    } else if (op_type == "fetch") {
      auto* fetch_var = op_node->inputs[0];
      output_nodes.push_back(fetch_var);
      continue;
    } else if (op_type == "cast") {
      continue;
    }

    else if (op_node->Op()->HasAttr("sub_block")) {  // NOLINT
      // A sub_block op's output dtype should be the same as the input dtype
      // when they share the same name.
      std::unordered_map<std::string, framework::ir::Node*> in_name_to_node;
      for (auto* in : op_node->inputs) {
        auto* real_node =
            GetRealNode(graphes, block_idx, in, vars_in_multi_block_map);
        if (NodeVarHasDtype(real_node)) {
          in_name_to_node[in->Name()] = in;
        }
      }

      for (auto out : op_node->outputs) {
        auto* real_node =
            GetRealNode(graphes, block_idx, out, vars_in_multi_block_map);
        if (NodeVarHasDtype(real_node)) {
          if (in_name_to_node.count(out->Name()))
            real_node->Var()->SetDataType(
                in_name_to_node[out->Name()]->Var()->GetDataType());
        }
      }

      continue;
    }

    // A strange case found in multi-block graphs: an assign op whose input
    // and output are the same var.
    else if (op_type == "assign" &&  // NOLINT
             op_node->inputs[0]->Name() == op_node->outputs[0]->Name()) {
      VLOG(2) << " in out are same, continue";
      continue;
    }

    // Handle tensor array.
    else if (OpInOutHasTensorArray(  // NOLINT
                 graphes,
                 block_idx,
                 op_node,
                 vars_in_multi_block_map)) {
      VLOG(2) << "  in or out has tensor array, continue";
      continue;
    }

    // 2. If the op supports fp16/bf16 and is not in the blacklist:
    //      - cast weight to fp16/bf16.
    //      - add cast op if the input dtype is not fp16/bf16.
    //      - set output dtype.
    //
    // If a var (an op's output var) appears multiple times in a block, we
    // should not convert it to fp16.
    else if (blacklist.count(op_type) == 0 &&  // NOLINT
             !VarIsMultiOpsOut(graphes,
                               block_idx,
                               op_node,
                               vars_in_multi_block_map,
                               vars_appear_multi_in_one_block)) {
      bool support_precision =
          OpSupportPrecision(op_type, backend, tensor_dtype, blacklist);
      VLOG(2) << " support low precision " << support_precision;

      // If the op has no float input, we will not choose the low precision
      // kernel.
      {
        bool has_float_input{false};
        for (auto in_node : op_node->inputs) {
          auto* real_node =
              GetRealNode(graphes, block_idx, in_node, vars_in_multi_block_map);
          if (real_node->Var()->GetDataType() == proto::VarType::FP16 ||
              real_node->Var()->GetDataType() == proto::VarType::FP32 ||
              real_node->Var()->GetDataType() == proto::VarType::FP64 ||
              real_node->Var()->GetDataType() == proto::VarType::BF16) {
            has_float_input = true;
            break;
          }
        }
        if (!has_float_input) {
          support_precision = false;
          VLOG(2) << " op doesn't have float input, just skip.";
        }
      }

      if (support_precision) {
        HandleSpecialOps(op_node->Op());
        ++num_low_precision;
        auto inputs = op_node->inputs;
        // Process inputs.
        for (auto* in_node : inputs) {
          ProcessInputNode(true,
                           graphes,
                           in_node,
                           op_node,
                           &suffix,
                           block_desc,
                           &cast_map,
                           to_type,
                           block_idx,
                           vars_in_multi_block_map);
        }
        // Process outputs.
        for (auto* out_node : op_node->outputs) {
          ProcessOutputNode(
              graphes, block_idx, out_node, to_type, vars_in_multi_block_map);
        }
      } else {
        auto inputs = op_node->inputs;
        for (auto* in_node : inputs) {
          ProcessInputNode(false,
                           graphes,
                           in_node,
                           op_node,
                           &suffix,
                           block_desc,
                           &cast_map,
                           framework::proto::VarType::FP32,
                           block_idx,
                           vars_in_multi_block_map);
        }
      }
    }

    // 3. The op does not support fp16/bf16, or it is in the blacklist:
    //      - add cast op if the input dtype is not fp32.
    else {  // NOLINT
      auto ins = op_node->inputs;
      for (auto* in_node : ins) {
        if (in_node->IsCtrlVar()) continue;
        auto* in_var = in_node->Var();
        if (in_var->GetDataType() == to_type) {
          AddCastOp(graph,
                    in_node,
                    op_node,
                    to_type,
                    framework::proto::VarType::FP32,
                    &suffix,
                    block_desc,
                    &cast_map);
        }
      }
    }
  }

  // 4. If an output var's dtype does not match the desired output dtype,
  // insert a cast op.
  for (auto* node : output_nodes) {
    if (node->IsCtrlVar()) continue;
    auto var = node->Var();
    if (keep_io_types && var->GetDataType() == to_type) {
      // fp16/bf16 -> fp32.
      AddCastOp(graph,
                node,
                node->outputs[0],
                to_type,
                framework::proto::VarType::FP32,
                &suffix,
                block_desc,
                &cast_map);
    } else if (!keep_io_types &&
               var->GetDataType() == framework::proto::VarType::FP32) {
      // fp32 -> fp16/bf16
      AddCastOp(graph,
                node,
                node->outputs[0],
                framework::proto::VarType::FP32,
                to_type,
                &suffix,
                block_desc,
                &cast_map);
    }
  }

  for (auto node : graph->Nodes()) {
    auto* real_node =
        GetRealNode(graphes, block_idx, node, vars_in_multi_block_map);
    if (!NodeVarHasDtype(real_node)) continue;

    if (vars_in_multi_block_map->count(real_node->Name()) &&
        vars_in_multi_block_map->at(real_node->Name()).second == block_idx) {
      vars_in_multi_block_map->at(real_node->Name()).first =
          real_node->Var()->GetDataType();
    }
  }

  if (num_low_precision)
    LOG(INFO) << "---  detected " << num_low_precision
              << " low precision ops in subgraph " << block_idx;
}
}  // namespace

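// Return true if |op_type| is not in |blacklist| and has a kernel registered
// for |precision| on |backend|.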
bool OpSupportPrecision(const std::string& op_type,
                        phi::Backend backend,
                        phi::DataType precision,
                        const std::unordered_set<std::string>& blacklist) {
  auto phi_op_type = phi::TransToPhiKernelName(op_type);
  bool support_precision = false;
  if (blacklist.count(op_type) == 0) {
    if (backend == phi::Backend::GPU)
      support_precision = GpuKernelSupportPrecision(op_type, precision);
    else
      support_precision =
          PhiKernelSupportPrecision(phi_op_type, backend, precision);
  }
  return support_precision;
}

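// Insert a cast op that converts |node| from |from_type| to |to_type| before
// |next_op|. The created cast output node is cached in |map| so that each var
// is cast only once; later consumers reuse the cached output.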
void AddCastOp(
    framework::ir::Graph* graph,
    framework::ir::Node* node,
    framework::ir::Node* next_op,
    framework::proto::VarType::Type from_type,
    framework::proto::VarType::Type to_type,
    int* suffix,
    framework::BlockDesc* block_desc,
    std::unordered_map<framework::ir::Node*, framework::ir::Node*>* map) {
  auto update_cast_desc = [&](framework::OpDesc& desc,
                              const std::string& x_name,
                              const std::string& out_name,
                              const int in_dtype,
                              const int out_dtype) {
    desc.SetType("cast");
    desc.SetInput("X", {x_name});
    desc.SetOutput("Out", {out_name});
    desc.SetAttr("in_dtype", in_dtype);
    desc.SetAttr("out_dtype", out_dtype);
    desc.SetAttr("use_mkldnn", false);
    desc.SetAttr("with_quant_attr", false);
    desc.Flush();
  };

  if (map->count(node) == 0) {
    // insert cast op before node.
    std::string cast_input_name = node->Var()->Name();
    std::string cast_output_name =
        node->Var()->Name() + "_cast.tmp_" + std::to_string((*suffix)++);
    CHECK_NOTNULL(block_desc);
    framework::OpDesc cast_op_desc(block_desc);
    update_cast_desc(cast_op_desc,
                     cast_input_name,
                     cast_output_name,
                     static_cast<int>(from_type),
                     static_cast<int>(to_type));
    auto* cast_op_node = graph->CreateOpNode(&cast_op_desc);
    auto* cast_output_vardesc = block_desc->Var(cast_output_name);
    cast_output_vardesc->SetPersistable(false);
    cast_output_vardesc->SetDataType(to_type);
    cast_output_vardesc->SetShape(node->Var()->GetShape());
    auto* cast_output_node = graph->CreateVarNode(cast_output_vardesc);
    IR_NODE_LINK_TO(cast_op_node, cast_output_node);
    (*map)[node] = cast_output_node;
  }
  next_op->Op()->RenameInput(node->Name(), map->at(node)->Name());
  IR_NODE_LINK_TO(node, map->at(node)->inputs[0]);
  IR_NODE_LINK_TO(map->at(node), next_op);
}

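// Load the model from |model_file| / |params_file|, convert every block's
// graph to |mixed_precision|, and save the converted model and params to
// |mixed_model_file| / |mixed_params_file|.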
void ConvertToMixedPrecision(const std::string& model_file,
                             const std::string& params_file,
                             const std::string& mixed_model_file,
                             const std::string& mixed_params_file,
                             phi::DataType mixed_precision,
                             phi::Backend backend,
                             bool keep_io_types,
                             std::unordered_set<std::string> black_list) {
  paddle::CPUPlace place;
  framework::Executor executor(place);
  framework::Scope scope;
  auto program_desc =
      inference::Load(&executor, &scope, model_file, params_file);
  auto main_graph = std::unique_ptr<framework::ir::Graph>(
      new framework::ir::Graph(*program_desc));

  std::unordered_map<std::string,
                     std::pair<framework::proto::VarType::Type, int>>
      vars_in_multi_block_map;
  std::vector<std::set<std::string>> vars_appear_multi_in_one_block(
      program_desc->Size());
  FindVarsInMultiBlock(program_desc.get(),
                       &vars_in_multi_block_map,
                       &vars_appear_multi_in_one_block);

  std::vector<framework::ir::Graph*> graphes;
  for (size_t i = 0; i < main_graph->SubGraphsSize(); ++i) {
    auto graph = main_graph->GetSubGraph(i);
    graphes.push_back(graph);
    VLOG(2) << " --------  handle subgraph " << i << ", has "
            << graph->Nodes().size() << " nodes --------";

    ConvertAllFp64ToFp32(graph);
    ConvertTensorDtype(program_desc.get(),
                       graphes,
                       black_list,
                       keep_io_types,
                       backend,
                       mixed_precision,
                       i,
                       &vars_in_multi_block_map,
                       vars_appear_multi_in_one_block);
    FixCastAttr(graph);
  }

  framework::ProgramDesc mixed_program_desc;
  framework::ir::GraphToProgram(*main_graph, &mixed_program_desc);

  SaveMixedModel(main_graph.get(),
                 &scope,
                 &mixed_program_desc,
                 mixed_model_file,
                 mixed_params_file,
                 mixed_precision,
                 vars_in_multi_block_map);
}

}  // namespace analysis
}  // namespace inference
}  // namespace paddle