paddle_pass_builder.cc 18.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/api/paddle_pass_builder.h"
16 17 18
#ifdef PADDLE_WITH_CUDA
#include <cudnn.h>
#endif
19 20 21
#ifdef PADDLE_WITH_HIP
#include <miopen/miopen.h>
#endif
22
#include <glog/logging.h>
23

24
#include <algorithm>
25
#include <sstream>
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54

namespace paddle {

// Adds `pass_type` as the new last entry of the pass list.
void PaddlePassBuilder::AppendPass(const std::string &pass_type) {
  passes_.emplace_back(pass_type);
}

void PaddlePassBuilder::TurnOnDebug() {
  std::vector<std::string> passes;
  auto it = std::begin(passes_);
  while (it != std::end(passes_)) {
    if (*it != "graph_viz_pass") {
      it = passes_.insert(it + 1, "graph_viz_pass");
    } else {
      ++it;
    }
  }
}

std::string PaddlePassBuilder::DebugString() {
  std::stringstream ss;
  ss << "Passes to apply:\n";
  for (auto &pass : passes_) {
    ss << "  - " << pass << '\n';
  }
  return ss.str();
}

void PaddlePassBuilder::DeletePass(const std::string &pass_type) {
55
  deleted_passes_.insert(pass_type);
56 57 58 59 60 61 62 63 64 65
  auto it = std::begin(passes_);
  while (it != std::end(passes_)) {
    if (*it == pass_type) {
      it = passes_.erase(it);
    } else {
      ++it;
    }
  }
}

66 67 68 69 70 71
// Returns the zero-based position of the first occurrence of `pass_type` in
// the pass list.
//
// When the pass is not in the list the result is static_cast<size_t>(-1),
// i.e. the largest size_t value. That value is never a valid index, so it
// must be checked before it is used as a position.
size_t PaddlePassBuilder::GetPassIndex(const std::string &pass_type) {
  const auto first = std::begin(passes_);
  const auto last = std::end(passes_);
  const auto pos = std::find(first, last, pass_type);
  if (pos == last) {
    // Explicit cast: the "not found" sentinel is intentionally unsigned.
    return static_cast<size_t>(-1);
  }
  return static_cast<size_t>(std::distance(first, pos));
}

72 73 74 75 76 77 78 79
void PaddlePassBuilder::InsertPass(size_t idx, const std::string &pass_type) {
  passes_.insert(std::begin(passes_) + idx, pass_type);
}

// Removes the pass stored at position `idx` of the pass list.
//
// An `idx` that does not address an existing entry (including the "not found"
// value returned by GetPassIndex) is ignored, in the same way the name-based
// overload leaves the list alone when the name is absent. Erasing at or past
// end() would otherwise be undefined behaviour.
void PaddlePassBuilder::DeletePass(size_t idx) {
  if (idx >= passes_.size()) {
    return;
  }
  passes_.erase(std::begin(passes_) + idx);
}

W
Wojciech Uss 已提交
80 81
// Adds `pass` as the new last entry of the analysis pass list.
void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
  analysis_passes_.emplace_back(pass);
}

W
Wojciech Uss 已提交
84 85
// Empties the pass list. Only passes_ is touched here.
void PaddlePassBuilder::ClearPasses() {
  passes_.clear();
}

86
// Ordered pass names exported as kTRTSubgraphPasses. The order of the entries
// is significant and is kept exactly as listed; commented-out names are
// passes that are currently not part of the list.
const std::vector<std::string> kTRTSubgraphPasses{
    "adaptive_pool2d_convert_global_pass",
    "shuffle_channel_detect_pass",
    "quant_conv2d_dequant_fuse_pass",
    "delete_fill_constant_op_pass",
    "delete_quant_dequant_op_pass",
    "delete_quant_dequant_filter_op_pass",
    "delete_weight_dequant_linear_op_pass",
    "delete_quant_dequant_linear_op_pass",
    "identity_scale_op_clean_pass",
    "add_support_int8_pass",
    // "fc_fuse_pass",
    "simplify_with_basic_ops_pass",

#ifndef _WIN32
    // The next two entries are left out of Windows builds.
    "trt_embedding_eltwise_layernorm_fuse_pass",
    "preln_embedding_eltwise_layernorm_fuse_pass",
#endif

    "delete_c_identity_op_pass",
    "trt_multihead_matmul_fuse_pass_v2",
    "trt_multihead_matmul_fuse_pass_v3",
    "constant_folding_pass",
    "vit_attention_fuse_pass",
    "trt_skip_layernorm_fuse_pass",
    "preln_skip_layernorm_fuse_pass",
    "layernorm_shift_partition_fuse_pass",
    "merge_layernorm_fuse_pass",
    "preln_residual_bias_fuse_pass",
    "preln_layernorm_x_fuse_pass",
    // "set_transformer_input_convert_pass",
    "conv_bn_fuse_pass",
    "unsqueeze2_eltwise_fuse_pass",
    "trt_squeeze2_matmul_fuse_pass",
    "trt_flatten2_matmul_fuse_pass",
    "trt_map_matmul_v2_to_mul_pass",
    "trt_map_matmul_v2_to_matmul_pass",
    "trt_map_matmul_to_mul_pass",
    "fc_fuse_pass",
    "conv_elementwise_add_fuse_pass",
    "remove_padding_recover_padding_pass",
    "delete_remove_padding_recover_padding_pass",
    // "yolo_box_fuse_pass",
    "dense_fc_to_sparse_pass",
    "dense_multihead_matmul_to_sparse_pass",
    "tensorrt_subgraph_pass",
    // conv_bn_fuse_pass is listed a second time, after tensorrt_subgraph_pass.
    "conv_bn_fuse_pass",

// To run conv_fusion, the cuDNN version must be at least 7.1 (7100).
// cuDNN 8.0.x has a memory leak problem in conv + eltwise + act, so the two
// passes below are disabled for versions in [8000, 8100).
#if CUDNN_VERSION >= 7100 && \
    !(CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8100)
    "conv_elementwise_add_act_fuse_pass",
    "conv_elementwise_add2_act_fuse_pass",
#endif

    "transpose_flatten_concat_fuse_pass",
};

D
denglin-github 已提交
146 147
// Ordered pass names exported as kDlnneSubgraphPasses; the list ends with
// dlnne_subgraph_pass.
const std::vector<std::string> kDlnneSubgraphPasses{
    "is_test_pass",
    "delete_dropout_op_pass",
    "simplify_with_basic_ops_pass",
    "conv_bn_fuse_pass",
    "depthwise_conv_bn_fuse_pass",
    "shuffle_channel_detect_pass",
    "dlnne_subgraph_pass",
};

石晓伟 已提交
156 157 158 159 160 161
// Pass names exported as kLiteSubgraphPasses. The list holds
// lite_subgraph_pass when PADDLE_WITH_LITE is defined and is empty otherwise.
const std::vector<std::string> kLiteSubgraphPasses{
#ifdef PADDLE_WITH_LITE
    "lite_subgraph_pass",
#endif
};

162 163 164 165
// TODO(inference): Most of the existing fusion operators produced by passes do
// not support fp16/bf16 precision yet, so this reduced pass list is used
// temporarily for low precision to prevent running errors. Delete it once the
// fusion operators support low precision.
const std::vector<std::string> kGpuLowerPrecisionPasses({
    "simplify_with_basic_ops_pass",                                 //
    "conv_bn_fuse_pass",                                            //
    "conv_eltwiseadd_bn_fuse_pass",                                 //
    "conv_elementwise_add_act_fuse_pass",                           //
    "conv_elementwise_add2_act_fuse_pass",                          //
    "conv_elementwise_add_fuse_pass",                               //
    "multihead_matmul_fuse_pass_v2",                                //
    "fused_multi_transformer_encoder_pass",                         //
    "fused_multi_transformer_decoder_pass",                         //
    "fused_multi_transformer_encoder_fuse_qkv_pass",                //
    "fused_multi_transformer_decoder_fuse_qkv_pass",                //
    "multi_devices_fused_multi_transformer_encoder_fuse_qkv_pass",  //
    "multi_devices_fused_multi_transformer_decoder_fuse_qkv_pass",  //
    "gpu_cpu_map_matmul_v2_to_mul_pass",                            //
    "gpu_cpu_map_matmul_v2_to_matmul_pass",                         //
    "fc_fuse_pass",                                                 //
    "fc_elementwise_layernorm_fuse_pass",                           //
    "embedding_eltwise_layernorm_fuse_pass",                        //
    "runtime_context_cache_pass",                                   //
});
186

187
// Ordered pass names exported as kTrtLowerPrecisionPasses; the list ends with
// tensorrt_subgraph_pass. Commented-out names are currently not in the list.
const std::vector<std::string> kTrtLowerPrecisionPasses({
    "simplify_with_basic_ops_pass",               //
    // "conv_bn_fuse_pass",
    // "conv_eltwiseadd_bn_fuse_pass",
    "trt_embedding_eltwise_layernorm_fuse_pass",  //
    "trt_skip_layernorm_fuse_pass",               //
    "trt_map_matmul_v2_to_mul_pass",              //
    "trt_map_matmul_v2_to_matmul_pass",           //
    "trt_map_matmul_to_mul_pass",                 //
    "fc_fuse_pass",                               //
    "tensorrt_subgraph_pass",                     //
});

200 201
// Builds the default GPU strategy: marks the strategy as GPU (use_gpu_) and
// fills passes_ with the ordered list below. Part of the list depends on the
// cuDNN version the file is compiled against.
GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
  use_gpu_ = true;

  passes_.assign({
      // "identity_scale_op_clean_pass",
      "is_test_pass",
      "simplify_with_basic_ops_pass",
      "conv_bn_fuse_pass",
      "conv_eltwiseadd_bn_fuse_pass",
      "embedding_eltwise_layernorm_fuse_pass",
      "multihead_matmul_fuse_pass_v2",
      "fused_multi_transformer_encoder_pass",
      "fused_multi_transformer_decoder_pass",
      "fused_multi_transformer_encoder_fuse_qkv_pass",
      "fused_multi_transformer_decoder_fuse_qkv_pass",
      "multi_devices_fused_multi_transformer_encoder_fuse_qkv_pass",
      "multi_devices_fused_multi_transformer_decoder_fuse_qkv_pass",
      "gpu_cpu_squeeze2_matmul_fuse_pass",
      "gpu_cpu_reshape2_matmul_fuse_pass",
      "gpu_cpu_flatten2_matmul_fuse_pass",
      "gpu_cpu_map_matmul_v2_to_mul_pass",
      "gpu_cpu_map_matmul_v2_to_matmul_pass",
      "matmul_scale_fuse_pass",
      "multihead_matmul_fuse_pass_v3",
      "gpu_cpu_map_matmul_to_mul_pass",
      "fc_fuse_pass",
      "fc_elementwise_layernorm_fuse_pass",

// To run conv_fusion, the cuDNN version must be at least 7.1 (7100).
#if CUDNN_VERSION >= 7100
// cuDNN 8.0.x has a memory leak problem in conv + eltwise + act, so the two
// *_act_* passes are disabled for versions in [8000, 8100).
#if !(CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8100)
      "conv_elementwise_add_act_fuse_pass",
      "conv_elementwise_add2_act_fuse_pass",
#endif
      "conv_elementwise_add_fuse_pass",
#endif

      "transpose_flatten_concat_fuse_pass",
      "constant_folding_pass",
      // The following pass should be located last, since it works on all
      // fused ops.
      "runtime_context_cache_pass",
  });
}

245 246 247 248 249 250 251
// Turns cuDNN on for this strategy. On the first call cudnn_placement_pass is
// put at the very front of the pass list; later calls change nothing.
void GpuPassStrategy::EnableCUDNN() {
  if (use_cudnn_) {
    return;
  }
  passes_.insert(std::begin(passes_), "cudnn_placement_pass");
  use_cudnn_ = true;
}

W
Wojciech Uss 已提交
252 253
void GpuPassStrategy::EnableMKLDNN() {
  LOG(ERROR) << "GPU not support MKLDNN yet";
254 255
}

W
Wojciech Uss 已提交
256 257
void GpuPassStrategy::EnableMkldnnQuantizer() {
  LOG(ERROR) << "GPU not support MKL-DNN quantization";
Y
Yan Chunwei 已提交
258 259
}

260 261 262 263
void GpuPassStrategy::EnableMkldnnBfloat16() {
  LOG(ERROR) << "GPU not support MKL-DNN bfloat16";
}

B
baoachun 已提交
264 265 266 267
void GpuPassStrategy::EnableMkldnnInt8() {
  LOG(ERROR) << "GPU not support MKL-DNN int8";
}

268 269 270
// Builds the default CPU strategy: marks the strategy as non-GPU (use_gpu_)
// and fills passes_ with the ordered list below.
CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
  use_gpu_ = false;

  // NOTE: the large fusions should be located in the front, so that they will
  // not be damaged by smaller ones.
  passes_.assign({
      "simplify_with_basic_ops_pass",
      "layer_norm_fuse_pass",
      "attention_lstm_fuse_pass",
      "seqconv_eltadd_relu_fuse_pass",
      // "seqpool_concat_fuse_pass",
      "seqpool_cvm_concat_fuse_pass",
      // "embedding_fc_lstm_fuse_pass",
      // TODO(wilber): fix correctness problem.
      // "fc_lstm_fuse_pass",
      "mul_lstm_fuse_pass",
      "fc_gru_fuse_pass",
      "mul_gru_fuse_pass",
      "seq_concat_fc_fuse_pass",
      "gpu_cpu_squeeze2_matmul_fuse_pass",
      "gpu_cpu_reshape2_matmul_fuse_pass",
      "gpu_cpu_flatten2_matmul_fuse_pass",
      "matmul_v2_scale_fuse_pass",
      "gpu_cpu_map_matmul_v2_to_mul_pass",
      "gpu_cpu_map_matmul_v2_to_matmul_pass",
      "matmul_scale_fuse_pass",
      "gpu_cpu_map_matmul_to_mul_pass",
      "fc_fuse_pass",
      "repeated_fc_relu_fuse_pass",
      "squared_mat_sub_fuse_pass",
      "conv_bn_fuse_pass",
      "conv_eltwiseadd_bn_fuse_pass",
      "conv_transpose_bn_fuse_pass",
      "conv_transpose_eltwiseadd_bn_fuse_pass",
      "is_test_pass",
      "constant_folding_pass",
      // The following pass should be located last, since it works on all
      // fused ops.
      "runtime_context_cache_pass",
  });
}
W
Wojciech Uss 已提交
307

308 309
void CpuPassStrategy::EnableCUDNN() { LOG(ERROR) << "CPU not support cuDNN"; }

W
Wojciech Uss 已提交
310 311 312 313 314 315
// Turns MKLDNN on for this strategy.
//
// With PADDLE_WITH_MKLDNN defined, the first call puts mkldnn_placement_pass at
// the front of the pass list and appends the MKLDNN passes listed below, in
// order; later calls leave the list alone. Without PADDLE_WITH_MKLDNN the call
// only resets use_mkldnn_ to false.
void CpuPassStrategy::EnableMKLDNN() {
// TODO(Superjomn) Consider the way to mix CPU with GPU.
#ifndef PADDLE_WITH_MKLDNN
  use_mkldnn_ = false;
#else
  if (!use_mkldnn_) {
    static const char *const kMkldnnPasses[] = {
        "squeeze2_transpose2_onednn_fuse_pass",
        "depthwise_conv_mkldnn_pass",
        // Execute the BN passes again to preserve correct pass order.
        "conv_bn_fuse_pass",
        "conv_eltwiseadd_bn_fuse_pass",
        "conv_affine_channel_mkldnn_fuse_pass",
        "conv_transpose_bn_fuse_pass",
        "conv_transpose_eltwiseadd_bn_fuse_pass",
        "conv_bias_mkldnn_fuse_pass",
        "conv_transpose_bias_mkldnn_fuse_pass",
        // TODO(baoachun): Need to support 5-dimensional input.
        // "conv3d_bias_mkldnn_fuse_pass",
        "conv_elementwise_add_mkldnn_fuse_pass",
        "conv_activation_mkldnn_fuse_pass",
        "scale_matmul_fuse_pass",
        "reshape_transpose_matmul_mkldnn_fuse_pass",
        "matmul_transpose_reshape_mkldnn_fuse_pass",
        "matmul_elementwise_add_mkldnn_fuse_pass",
        "matmul_activation_mkldnn_fuse_pass",
        // Disabled due to topology-dependent speed-up:
        //   "fc_mkldnn_pass",
        //   "fc_act_mkldnn_fuse_pass",
        "fc_elementwise_add_mkldnn_fuse_pass",
        "batch_norm_act_fuse_pass",
        "softplus_activation_mkldnn_fuse_pass",
        "shuffle_channel_mkldnn_detect_pass",
        "elt_act_mkldnn_fuse_pass",
        "operator_scale_onednn_fuse_pass",
        "operator_unsqueeze2_onednn_fuse_pass",
        "operator_reshape2_onednn_fuse_pass",
        // TODO(intel): Please fix the bug on windows.
        // https://github.com/PaddlePaddle/Paddle/issues/29710
        // "mkldnn_inplace_pass",  // This pass should be activated after
        // fuses. Disabled by default due to little gain and lots of problems.
    };

    passes_.insert(std::begin(passes_), "mkldnn_placement_pass");
    for (const char *name : kMkldnnPasses) {
      passes_.push_back(name);
    }
  }
  use_mkldnn_ = true;
#endif
}

// Turns the MKLDNN quantizer on for this strategy.
//
// With PADDLE_WITH_MKLDNN defined, the first call appends
// cpu_quantize_placement_pass to the pass list; later calls change nothing.
// Without PADDLE_WITH_MKLDNN the call only resets use_mkldnn_quantizer_ to
// false.
void CpuPassStrategy::EnableMkldnnQuantizer() {
#ifndef PADDLE_WITH_MKLDNN
  use_mkldnn_quantizer_ = false;
#else
  if (use_mkldnn_quantizer_) {
    return;
  }
  passes_.emplace_back("cpu_quantize_placement_pass");
  use_mkldnn_quantizer_ = true;
#endif
}

372 373
// Turns MKLDNN bfloat16 on for this strategy.
//
// With PADDLE_WITH_MKLDNN defined, the first call appends the passes listed
// below to the pass list, in order; later calls change nothing. Without
// PADDLE_WITH_MKLDNN the call only resets use_mkldnn_bfloat16_ to false.
void CpuPassStrategy::EnableMkldnnBfloat16() {
#ifndef PADDLE_WITH_MKLDNN
  use_mkldnn_bfloat16_ = false;
#else
  if (!use_mkldnn_bfloat16_) {
    static const char *const kBfloat16Passes[] = {
        "fc_mkldnn_pass",
        "fc_act_mkldnn_fuse_pass",
        "fc_elementwise_add_mkldnn_fuse_pass",

        "cpu_bfloat16_placement_pass",
        "cpu_bfloat16_pass",
        "cpu_quantize_squash_pass",
    };
    for (const char *name : kBfloat16Passes) {
      passes_.push_back(name);
    }
  }
  use_mkldnn_bfloat16_ = true;
#endif
}

B
baoachun 已提交
389 390 391 392 393
// Turns MKLDNN int8 on for this strategy.
//
// With PADDLE_WITH_MKLDNN defined, the first call discards the current pass
// list and replaces it with the list below, in order; later calls change
// nothing. Without PADDLE_WITH_MKLDNN the call only resets use_mkldnn_int8_ to
// false.
void CpuPassStrategy::EnableMkldnnInt8() {
#ifndef PADDLE_WITH_MKLDNN
  use_mkldnn_int8_ = false;
#else
  if (!use_mkldnn_int8_) {
    // NOTE: repeated_fc_relu_fuse_pass appears twice on purpose of fidelity;
    // the list reproduces the original sequence entry for entry.
    static const char *const kInt8Passes[] = {
        "quant_dequant_mkldnn_pass",
        "mkldnn_placement_pass",
        "simplify_with_basic_ops_pass",
        "constant_folding_pass",
        "squeeze2_transpose2_onednn_fuse_pass",
        "layer_norm_fuse_pass",
        "attention_lstm_fuse_pass",
        "seqconv_eltadd_relu_fuse_pass",
        "fc_lstm_fuse_pass",
        "mul_lstm_fuse_pass",
        "fc_gru_fuse_pass",
        "mul_gru_fuse_pass",
        "multi_gru_fuse_pass",
        "multi_gru_seq_fuse_pass",
        "seq_concat_fc_fuse_pass",
        "gpu_cpu_squeeze2_matmul_fuse_pass",
        "gpu_cpu_reshape2_matmul_fuse_pass",
        "gpu_cpu_flatten2_matmul_fuse_pass",
        "matmul_v2_scale_fuse_pass",
        "squared_mat_sub_fuse_pass",
        "is_test_pass",
        "gpu_cpu_map_matmul_v2_to_mul_pass",
        "gpu_cpu_map_matmul_v2_to_matmul_pass",
        "matmul_scale_fuse_pass",
        "gpu_cpu_map_matmul_to_mul_pass",
        "repeated_fc_relu_fuse_pass",
        "depthwise_conv_mkldnn_pass",
        "conv_bn_fuse_pass",
        "conv_eltwiseadd_bn_fuse_pass",
        "conv_affine_channel_mkldnn_fuse_pass",
        "conv_transpose_bn_fuse_pass",
        "conv_transpose_eltwiseadd_bn_fuse_pass",
        "conv_bias_mkldnn_fuse_pass",
        "conv_transpose_bias_mkldnn_fuse_pass",
        "conv_elementwise_add_mkldnn_fuse_pass",
        "conv_activation_mkldnn_fuse_pass",
        "fc_fuse_pass",
        "repeated_fc_relu_fuse_pass",
        "fc_mkldnn_pass",
        "fc_act_mkldnn_fuse_pass",
        "matmul_transpose_reshape_mkldnn_fuse_pass",
        "batch_norm_act_fuse_pass",
        "softplus_activation_mkldnn_fuse_pass",
        "compute_propagate_scales_mkldnn_pass",
        "scale_matmul_fuse_pass",
        "reshape_transpose_matmul_mkldnn_fuse_pass",
        "matmul_elementwise_add_mkldnn_fuse_pass",
        "operator_scale_onednn_fuse_pass",
        "operator_unsqueeze2_onednn_fuse_pass",
        "operator_reshape2_onednn_fuse_pass",
        "cpu_quantize_placement_pass",
        "cpu_quantize_pass",
        "cpu_quantize_squash_pass",
        "int8_scale_calculation_mkldnn_pass",
        "params_quantization_mkldnn_pass",
        "mkldnn_inplace_pass",
        "runtime_context_cache_pass",
    };

    passes_.clear();
    for (const char *name : kInt8Passes) {
      passes_.push_back(name);
    }
  }
  use_mkldnn_int8_ = true;
#endif
}

J
jianghaicheng 已提交
457 458 459 460
// Builds the IPU strategy: the pass list consists of the single entry
// inference_process_pass.
IpuPassStrategy::IpuPassStrategy() : PassStrategy({}) {
  passes_.assign({
      "inference_process_pass",
  });
}

461
}  // namespace paddle