ir_pass_manager.cc 16.9 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
16

17
#include <map>
18
#include <memory>
19
#include <string>
20
#include <unordered_map>
21 22
#include <unordered_set>
#include <utility>
L
luotao1 已提交
23
#include <vector>
24

Y
Yan Chunwei 已提交
25
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
26 27
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/scope.h"
28
#include "paddle/fluid/inference/analysis/argument.h"
Y
Yan Chunwei 已提交
29
#include "paddle/fluid/string/pretty_log.h"
30
#include "paddle/phi/common/data_type.h"
31
#include "paddle/phi/core/errors.h"
32 33 34 35

namespace paddle {
namespace inference {
namespace analysis {
Y
Yan Chunwei 已提交
36
using string::PrettyLog;
37
using string::PrettyLogEndl;
Y
Yan Chunwei 已提交
38
using string::Style;
39

40
// Builds the pass pipeline described by `argument`.
//
// Caches the log-suppression flag, verifies that the `ir_analysis_passes`
// field of `argument` has been set, then creates and configures one pass per
// entry of that list.
IRPassManager::IRPassManager(Argument *argument)
    : disable_logs_(argument->disable_logs()) {
  // The field check has to run before the accessor below is touched.
  ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes);

  const auto &requested_passes = argument->ir_analysis_passes();
  CreatePasses(argument, requested_passes);
}

47 48
// Creates one pass per entry of `passes` through the PassRegistry, attaches the
// attributes each pass reads from `argument`, and appends the configured pass
// to passes_ in the same order as `passes`.
//
// A common group of attributes (TensorRT switches, dynamic-shape maps, mixed
// precision settings, device ids, ...) is attached to every pass. Attributes
// that only one pass needs are attached in the per-name branches further down.
//
// Preconditions enforced here (tensorrt_subgraph_pass only):
//   - INT8 calibration with a model loaded from memory requires
//     optim_cache_dir to be set.
//   - A static engine with a model loaded from memory requires
//     optim_cache_dir to be set.
//   - If optim_cache_dir is set but does not exist, it must be creatable.
void IRPassManager::CreatePasses(Argument *argument,
                                 const std::vector<std::string> &passes) {
  // For graph_viz_pass: name of the pass that ran just before it, and the
  // number of graph_viz_pass instances configured so far.
  std::string pre_pass;
  int pass_num = 0;

  for (const std::string &pass_name : passes) {
    auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);

    // ---- Attributes attached to every pass --------------------------------
    pass->Set("use_varseqlen", new bool(argument->tensorrt_use_varseqlen()));
    pass->Set("use_cutlass", new bool(argument->use_cutlass()));
    pass->Set("with_interleaved",
              new bool(argument->tensorrt_with_interleaved()));
    pass->Set("tensorrt_transformer_posid",
              new std::string(argument->tensorrt_transformer_posid()));
    pass->Set("tensorrt_transformer_maskid",
              new std::string(argument->tensorrt_transformer_maskid()));
    pass->Set("disable_logs", new bool(argument->disable_logs()));
    auto trt_precision_mode = argument->tensorrt_precision_mode();
    bool enable_int8 =
        trt_precision_mode == static_cast<int>(phi::DataType::INT8);
    pass->Set("enable_int8", new bool(enable_int8));
    pass->Set("max_input_shape",
              new std::map<std::string, std::vector<int>>(
                  argument->max_input_shape()));
    pass->Set("min_input_shape",
              new std::map<std::string, std::vector<int>>(
                  argument->min_input_shape()));
    pass->Set("optim_input_shape",
              new std::map<std::string, std::vector<int>>(
                  argument->optim_input_shape()));
    // Now, shape tensor value is not explicit set by user,
    // it is collected through API CollectShapeRangeInfo.
    pass->Set("max_shape_tensor",
              new std::map<std::string, std::vector<int>>());
    pass->Set("min_shape_tensor",
              new std::map<std::string, std::vector<int>>());
    pass->Set("optim_shape_tensor",
              new std::map<std::string, std::vector<int>>());

    // This gpu_device_id is used by some fp16 precision passes, so move it
    // here.
    pass->Set("gpu_device_id", new int(argument->gpu_device_id()));

    // tuned trt dynamic_shape
    pass->Set("trt_tuned_dynamic_shape",
              new bool(argument->tensorrt_tuned_dynamic_shape()));
    // Dynamic shape is on when all three shape maps are non-empty, or when the
    // tuned dynamic-shape switch is set.
    bool with_dynamic_shape = (argument->max_input_shape().size() > 0 &&
                               argument->min_input_shape().size() > 0 &&
                               argument->optim_input_shape().size() > 0) ||
                              argument->tensorrt_tuned_dynamic_shape();
    pass->Set("with_dynamic_shape", new bool(with_dynamic_shape));

    // Mixed precision related.
    pass->Set(
        "mixed_black_list",
        new std::unordered_set<std::string>(argument->mixed_black_list()));
    pass->Set("enable_gpu_mixed", new bool(argument->enable_gpu_mixed()));
    pass->Set("enable_custom_device_mixed",
              new bool(argument->enable_custom_device_mixed()));
    pass->Set("mixed_precision_mode",
              new int(argument->mixed_precision_mode()));
    pass->Set("model_precision", new int(argument->model_precision()));
    pass->Set("enable_low_precision_io",
              new bool(argument->enable_low_precision_io()));

    // "use_xpu" is used for passes in subgraphs.
    pass->Set("use_xpu", new bool(argument->use_xpu()));

    // ---- Attributes specific to one pass ----------------------------------
    if (pass_name == "graph_viz_pass") {
      std::string optim_cache_dir = argument->optim_cache_dir();
      // File name is "<index>_ir_<previous pass, or origin>.dot"; it is placed
      // under optim_cache_dir when that directory is configured.
      std::string dot_file_path = std::to_string(pass_num) + "_ir_" +
                                  (pre_pass.empty() ? "origin" : pre_pass) +
                                  ".dot";
      if (!optim_cache_dir.empty()) {
        dot_file_path = optim_cache_dir + "/" + dot_file_path;
      }
      pass->Set("graph_viz_path", new std::string(std::move(dot_file_path)));
      pass->Set("optim_cache_dir", new std::string(std::move(optim_cache_dir)));
      pass_num++;
    } else if (pass_name == "mkldnn_placement_pass") {
      pass->Set("mkldnn_enabled_op_types",
                new std::unordered_set<std::string>(
                    argument->mkldnn_enabled_op_types()));
    } else if (pass_name == "cudnn_placement_pass") {
      pass->Set("cudnn_enabled_op_types",
                new std::unordered_set<std::string>());
#ifdef PADDLE_WITH_MKLDNN
    } else if (pass_name == "cpu_quantize_placement_pass") {
      pass->Set("quantize_enabled_op_types",
                new std::unordered_set<std::string>(
                    argument->quantize_enabled_op_types()));
      pass->Set(
          "quantize_excluded_op_ids",
          new std::unordered_set<int>(argument->quantize_excluded_op_ids()));
    } else if (pass_name == "cpu_quantize_pass") {
      // data_layout is only set when a convolution op type is enabled for
      // quantization.
      if (argument->quantize_enabled_op_types().count("conv2d") ||
          argument->quantize_enabled_op_types().count("fused_conv2d") ||
          argument->quantize_enabled_op_types().count("depthwise_conv2d")) {
        pass->Set("data_layout", new std::string("NHWC"));
      }
      pass->Set("quant_var_scales",
                new VarQuantScale(argument->quant_var_scales()));
    } else if (pass_name == "cpu_bfloat16_placement_pass") {
      pass->Set("bfloat16_enabled_op_types",
                new std::unordered_set<std::string>(
                    argument->bfloat16_enabled_op_types()));
#endif
    } else if (pass_name == "tensorrt_subgraph_pass") {
      pass->Set("workspace_size",
                new int64_t(argument->tensorrt_workspace_size()));
      pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
      pass->Set("min_subgraph_size",
                new int(argument->tensorrt_min_subgraph_size()));
      pass->Set("program",
                new framework::ProgramDesc *(&argument->main_program()));
      pass->Set("predictor_id", new int(argument->predictor_id()));
      bool use_calib_mode = argument->tensorrt_use_calib_mode();
      pass->Set("use_calib_mode", new bool(use_calib_mode));
      pass->Set("trt_precision_mode", new int(trt_precision_mode));
      pass->Set("context_memory_sharing",
                new bool(argument->trt_engine_memory_sharing()));
      pass->Set("use_cuda_graph",
                new bool(argument->tensorrt_use_cuda_graph()));
      bool use_static_engine = argument->tensorrt_use_static_engine();
      bool model_from_memory = argument->model_from_memory();
      std::string optim_cache_dir = argument->optim_cache_dir();
      // INT8 calibration on an in-memory model is rejected unless a cache
      // directory was configured.
      bool int8_valid = !(model_from_memory && optim_cache_dir.empty() &&
                          enable_int8 && use_calib_mode);
      PADDLE_ENFORCE_EQ(
          int8_valid,
          true,
          platform::errors::PreconditionNotMet(
              "When you are in TRT INT8 mode, and load model from "
              "memory, you should set optim_cache_dir using "
              "config.SetOptimCacheDir()"));
      if (model_from_memory && use_static_engine) {
        PADDLE_ENFORCE_EQ(
            optim_cache_dir.empty(),
            false,
            platform::errors::PreconditionNotMet(
                "When you are using Paddle-TRT, and using load model "
                "from memory, and also set the use_static to true. "
                "you must set optim_cache_dir using "
                "config.SetOptimCacheDir()."));
      }

      if (!optim_cache_dir.empty()) {
        // Use the user-provided cache directory, creating it if necessary.
        if (!PathExists(optim_cache_dir)) {
          PADDLE_ENFORCE_NE(
              MKDIR(optim_cache_dir.c_str()),
              -1,
              platform::errors::PreconditionNotMet(
                  "Can not create optimize cache directory: %s, Make sure you "
                  "have permission to write",
                  optim_cache_dir));
        }
        pass->Set("model_opt_cache_dir", new std::string(optim_cache_dir));
      } else if (use_static_engine || enable_int8 || with_dynamic_shape) {
        // No cache directory configured: derive one from the model directory,
        // or from the directory of the program file when model_dir is unset.
        std::string model_opt_cache_dir =
            argument->Has("model_dir")
                ? argument->model_dir()
                : GetDirRoot(argument->model_program_path());
        pass->Set(
            "model_opt_cache_dir",
            new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
      }
      pass->Set("use_static_engine", new bool(use_static_engine));
      pass->Set("model_from_memory", new bool(model_from_memory));
      pass->Set("use_inspector", new bool(argument->tensorrt_use_inspector()));

      // tuned trt dynamic_shape
      pass->Set("trt_shape_range_info_path",
                new std::string(argument->tensorrt_shape_range_info_path()));
      pass->Set("trt_allow_build_at_runtime",
                new bool(argument->tensorrt_allow_build_at_runtime()));
      pass->Set(
          "trt_disabled_ops",
          new std::vector<std::string>(argument->tensorrt_disabled_ops()));
      pass->Set("trt_use_dla", new bool(argument->tensorrt_use_dla()));
      pass->Set("trt_dla_core", new int(argument->tensorrt_dla_core()));

      // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will
      // not run fp16.
      pass->Set("disable_trt_plugin_fp16",
                new bool(argument->disable_trt_plugin_fp16()));
    } else if (pass_name == "dlnne_subgraph_pass") {
      auto precision_mode = argument->dlnne_precision_mode();
      pass->Set("min_subgraph_size",
                new int(argument->dlnne_min_subgraph_size()));
      pass->Set("max_batch_size", new int(argument->dlnne_max_batch_size()));
      pass->Set("use_static_batch",
                new bool(argument->dlnne_use_static_batch()));
      pass->Set("weight_share_mode",
                new std::string(argument->dlnne_weight_share_mode()));
      pass->Set("disable_nodes_by_outputs",
                new std::unordered_set<std::string>(
                    argument->dlnne_disable_nodes_by_outputs()));
      pass->Set("use_calib_mode", new bool(argument->dlnne_use_calib_mode()));
      pass->Set("dlnne_precision_mode", new int(precision_mode));
      pass->Set("input_shape_dict",
                new std::map<std::string, std::vector<int64_t>>(
                    argument->dlnne_input_shape_dict()));
      pass->Set("program",
                new framework::ProgramDesc *(&argument->main_program()));
    } else if (pass_name == "memory_optimize_pass") {
      pass->Set("root_predictor_id", new int(argument->root_predictor_id()));
    } else if (pass_name == "build_cinn_pass") {
      pass->Set("is_inference_stage", new bool(argument->use_cinn_compiler()));
    } else if (pass_name == "lite_subgraph_pass") {
      bool lite_enable_int8 = argument->lite_precision_mode() ==
                              static_cast<int>(phi::DataType::INT8);
      pass->Set("program",
                new framework::ProgramDesc *(&argument->main_program()));
      pass->Set("lite_ops_filter",
                new std::vector<std::string>(argument->lite_ops_filter()));
      pass->Set("predictor_id", new int(argument->predictor_id()));
      // "enable_int8" was already attached above from the TensorRT precision
      // mode; replace it with the value derived from the Lite precision mode.
      pass->Erase("enable_int8");
      pass->Set("enable_int8", new bool(lite_enable_int8));
      pass->Set("use_gpu", new bool(argument->use_gpu()));
      pass->Set("zero_copy", new bool(argument->lite_zero_copy()));
      pass->Set("xpu_device_id", new int(argument->xpu_device_id()));
      pass->Set("xpu_l3_size", new size_t(argument->xpu_l3_size()));
      pass->Set("xpu_l3_ptr", new void *(argument->xpu_l3_ptr()));
      pass->Set("xpu_l3_autotune_size",
                new size_t(argument->xpu_l3_autotune_size()));
      pass->Set("xpu_context", new void *(argument->xpu_context()));
      pass->Set("xpu_stream", new void *(argument->xpu_stream()));
      pass->Set("xpu_conv_autotune_level",
                new int(argument->xpu_conv_autotune_level()));
      pass->Set("xpu_conv_autotune_file",
                new std::string(argument->xpu_conv_autotune_file()));
      pass->Set("xpu_conv_autotune_file_writeback",
                new bool(argument->xpu_conv_autotune_file_writeback()));
      pass->Set("xpu_fc_autotune_level",
                new int(argument->xpu_fc_autotune_level()));
      pass->Set("xpu_fc_autotune_file",
                new std::string(argument->xpu_fc_autotune_file()));
      pass->Set("xpu_fc_autotune_file_writeback",
                new bool(argument->xpu_fc_autotune_file_writeback()));
      pass->Set("xpu_gemm_compute_precision",
                new int(argument->xpu_gemm_compute_precision()));
      pass->Set("xpu_transformer_softmax_optimize_level",
                new int(argument->xpu_transformer_softmax_optimize_level()));
      pass->Set("xpu_transformer_encoder_adaptive_seqlen",
                new bool(argument->xpu_transformer_encoder_adaptive_seqlen()));
      pass->Set(
          "xpu_quant_post_static_gelu_out_threshold",
          new float(argument->xpu_quant_post_static_gelu_out_threshold()));
      pass->Set("xpu_quant_post_dynamic_activation_method",
                new int(argument->xpu_quant_post_dynamic_activation_method()));
      pass->Set("xpu_l3_locked", new bool(argument->xpu_lite_l3_locked()));
      pass->Set("xpu_enable_multi_stream",
                new bool(argument->xpu_lite_enable_multi_stream()));
      pass->Set("use_opencl", new bool(argument->use_opencl()));
      pass->Set("cpu_math_library_num_threads",
                new int(argument->cpu_math_library_num_threads()));
      // NNAdapter Related
      pass->Set("use_nnadapter", new bool(argument->use_nnadapter()));
      pass->Set("nnadapter_model_cache_dir",
                new std::string(argument->nnadapter_model_cache_dir()));
      pass->Set(
          "nnadapter_device_names",
          new std::vector<std::string>(argument->nnadapter_device_names()));
      pass->Set("nnadapter_context_properties",
                new std::string(argument->nnadapter_context_properties()));
      pass->Set("nnadapter_subgraph_partition_config_buffer",
                new std::string(
                    argument->nnadapter_subgraph_partition_config_buffer()));
      pass->Set("nnadapter_subgraph_partition_config_path",
                new std::string(
                    argument->nnadapter_subgraph_partition_config_path()));
      pass->Set("nnadapter_model_cache_buffer",
                new std::vector<std::vector<char>>(
                    argument->nnadapter_model_cache_buffer()));
      pass->Set("nnadapter_model_cache_token",
                new std::vector<std::string>(
                    argument->nnadapter_model_cache_token()));
    } else if (pass_name == "fc_fuse_pass") {
      pass->Set("use_gpu", new bool(argument->use_gpu()));
      // FC padding is turned off whenever fc_mkldnn_pass is part of the
      // requested pipeline.
      bool fc_mkldnn_pass = false;
      for (const std::string &pass_n : passes) {
        if (pass_n == "fc_mkldnn_pass") {
          fc_mkldnn_pass = true;
          break;
        }
      }
      bool use_fc_padding = !fc_mkldnn_pass && argument->use_fc_padding();
      pass->Set("use_fc_padding", new bool(use_fc_padding));
    } else if (pass_name == "fused_multi_transformer_xpu_pass") {
      int quant_post_dynamic_weight_precision =
          argument->xpu_quant_post_dynamic_weight_precision();
      if (quant_post_dynamic_weight_precision == 0) {
        // This attribute used to be registered only under a name with a
        // trailing space, so (assuming attribute lookup is an exact string
        // match) a reader asking for the plain name would never find it.
        // Register the plain name, and keep the old spelling as well so that a
        // reader which copied the typo keeps working.
        // TODO(review): drop the legacy key once no reader depends on it.
        pass->Set("quant_post_dynamic_weight_precision", new int(0));
        pass->Set("quant_post_dynamic_weight_precision ", new int(0));
      }
    }
    pre_pass = pass_name;

    passes_.emplace_back(std::move(pass));
  }
}

350
std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
W
Wilber 已提交
351
  PADDLE_ENFORCE_NOT_NULL(
352
      graph.get(), platform::errors::InvalidArgument("Graph cannot be null."));
353 354
  // Apply all the passes
  for (const auto &pass : passes_) {
355
    if (pass->Type() != "graph_viz_pass" && !disable_logs_) {
Y
Yan Chunwei 已提交
356 357
      PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type());
    }
358
    graph.reset(pass->Apply(graph.release()));
359
  }
G
Gabor Buella 已提交
360
  return graph;
361 362
}

363 364 365
}  // namespace analysis
}  // namespace inference
}  // namespace paddle