Unverified · Commit d9cd9898 · authored by Tao Luo · committed by GitHub

Merge pull request #17048 from luotao1/fix_runtime_cache_bug

fix runtime_context_cache bug when a gpu model has an op that runs only on cpu
@@ -1095,6 +1095,17 @@ Scope* OperatorWithKernel::PrepareData(
     if (!new_scope) {
       new_scope = &scope.NewScope();
     }
+    // For inference, if a gpu model has an op that can only run on CPU,
+    // the result for every input after the first will be identical to
+    // the first one. The reason is that when a gpu tensor is the input
+    // of a cpu kernel, we create a new cpu tensor in the new scope.
+    // However, if enable_cache_runtime_context is set, we keep getting
+    // that cached cpu tensor each time instead of the new gpu tensor.
+    // Thus, we set pre_scope_ = nullptr to trigger `new RuntimeContext()`
+    // in RunImpl().
+    if (enable_cache_runtime_context) {
+      pre_scope_ = nullptr;
+    }
     auto* trans_var = new_scope->Var(var_name);
     input_vars[i] = trans_var;
...
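For context, the following is a self-contained toy model of the cache interaction this hunk fixes. It is a sketch, not Paddle's actual classes: Scope, RuntimeContext, and Op below are stand-ins, but the invalidation pattern mirrors the diff, and it shows why resetting pre_scope_ matters when input data must be transferred.

// Toy model of the runtime-context cache bug (stand-in types, not Paddle's).
#include <iostream>
#include <memory>
#include <string>

struct Scope { std::string tensor; };          // stand-in for a variable scope
struct RuntimeContext { std::string input; };  // stand-in for cached op inputs

struct Op {
  bool enable_cache_runtime_context = true;
  const Scope* pre_scope_ = nullptr;
  std::unique_ptr<RuntimeContext> runtime_ctx_;

  // Models PrepareData(): a gpu->cpu transfer produces fresh data, so the
  // cached context must be invalidated, exactly as the hunk above does.
  void PrepareData(bool needs_transfer) {
    if (needs_transfer && enable_cache_runtime_context) pre_scope_ = nullptr;
  }

  // Models RunImpl(): rebuild the context only when the cache is invalid.
  void Run(const Scope& scope) {
    if (!runtime_ctx_ || pre_scope_ != &scope) {
      runtime_ctx_.reset(new RuntimeContext{scope.tensor});
      pre_scope_ = &scope;
    }
    std::cout << "op consumed: " << runtime_ctx_->input << "\n";
  }
};

int main() {
  Op op;
  Scope s{"input-1"};
  op.PrepareData(true);
  op.Run(s);             // prints "input-1"
  s.tensor = "input-2";
  op.PrepareData(true);  // without this reset, the next Run would reuse
  op.Run(s);             // the stale "input-1"; with it, prints "input-2"
}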
@@ -98,7 +98,10 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
       "conv_elementwise_add_fuse_pass",  //
 #endif                                   //
       "transpose_flatten_concat_fuse_pass",
+      // The following two passes should come last, since they work on
+      // all fused ops.
       "expected_kernel_cache_pass",  //
+      "runtime_context_cache_pass"
   });
   use_gpu_ = true;
@@ -115,25 +118,26 @@ void GpuPassStrategy::EnableMkldnnQuantizer() {
 CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
   // NOTE the large fusions should be located in the front, so that they will
   // not be damaged by smaller ones.
-  passes_.assign({
-      "infer_clean_graph_pass",         //
-      "attention_lstm_fuse_pass",       //
-      "seqconv_eltadd_relu_fuse_pass",  //
-      // "seqpool_concat_fuse_pass",    //
-      // "embedding_fc_lstm_fuse_pass", //
-      "fc_lstm_fuse_pass",              //
-      "mul_lstm_fuse_pass",             //
-      "fc_gru_fuse_pass",               //
-      "mul_gru_fuse_pass",              //
-      "seq_concat_fc_fuse_pass",        //
-      "fc_fuse_pass",                   //
-      "repeated_fc_relu_fuse_pass",     //
-      "squared_mat_sub_fuse_pass",      //
-      "conv_bn_fuse_pass",              //
-      "conv_eltwiseadd_bn_fuse_pass",   //
-      "is_test_pass",                   //
-      "expected_kernel_cache_pass",     //
-  });
+  passes_.assign({"infer_clean_graph_pass",         //
+                  "attention_lstm_fuse_pass",       //
+                  "seqconv_eltadd_relu_fuse_pass",  //
+                  // "seqpool_concat_fuse_pass",    //
+                  // "embedding_fc_lstm_fuse_pass", //
+                  "fc_lstm_fuse_pass",              //
+                  "mul_lstm_fuse_pass",             //
+                  "fc_gru_fuse_pass",               //
+                  "mul_gru_fuse_pass",              //
+                  "seq_concat_fc_fuse_pass",        //
+                  "fc_fuse_pass",                   //
+                  "repeated_fc_relu_fuse_pass",     //
+                  "squared_mat_sub_fuse_pass",      //
+                  "conv_bn_fuse_pass",              //
+                  "conv_eltwiseadd_bn_fuse_pass",   //
+                  "is_test_pass",                   //
+                  // The following two passes should come last, since
+                  // they work on all fused ops.
+                  "expected_kernel_cache_pass",  //
+                  "runtime_context_cache_pass"});
   use_gpu_ = false;
 }
...
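Since both strategies now end with the two cache passes, a quick way to verify the final ordering is to dump the assembled list. A minimal sketch, assuming AllPasses() on PaddlePassBuilder and the usual inference header; the include path and model directory are illustrative:

#include <iostream>
#include "paddle_inference_api.h"  // header/include path may differ per install

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./mobilenet");  // hypothetical model directory
  // The two cache passes should print last, after every fusion pass,
  // since they must see the graph in its final, fully fused form.
  for (const std::string& pass : config.pass_builder()->AllPasses()) {
    std::cout << pass << "\n";
  }
}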
@@ -110,11 +110,6 @@ void SetConfig(AnalysisConfig *cfg) {
   if (FLAGS_zero_copy) {
     cfg->SwitchUseFeedFetchOps(false);
   }
-  // Enable runtime_context_cache_pass, disabled by default since it doesn't
-  // cover all the cases.
-  // See detail: https://github.com/PaddlePaddle/Paddle/issues/16609
-  // https://github.com/PaddlePaddle/Paddle/issues/16841
-  cfg->pass_builder()->AppendPass("runtime_context_cache_pass");
 }

 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
...
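With the pass on by default, the test no longer appends it by hand; the hunk above simply deletes that call. Going the other way, a caller whose model still trips over the cache (see the linked issues) could drop the pass explicitly. A sketch, assuming PaddlePassBuilder's DeletePass and the same config setup the test uses:

// Sketch: opting back out of runtime_context_cache_pass for one config.
paddle::AnalysisConfig cfg;
cfg.SetModel(FLAGS_infer_model);  // FLAGS_infer_model as defined by the test
cfg.pass_builder()->DeletePass("runtime_context_cache_pass");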