diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index cb77637d67df5a412af3f0dcefd70f7099601922..78387c407398b58d3fab6eab12445c4198f809b5 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -31,7 +31,9 @@ pass_library(fc_fuse_pass inference) pass_library(attention_lstm_fuse_pass inference) pass_library(infer_clean_graph_pass inference) pass_library(fc_lstm_fuse_pass inference) +pass_library(fc_gru_fuse_pass inference) pass_library(seq_concat_fc_fuse_pass inference) + set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..90d8d5c042fccd8ca5ddf4f1303b2ce766786732 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc @@ -0,0 +1,203 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h" +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace ir { + +static void BuildPattern(PDPattern* pattern, const std::string& name_scope, + bool with_fc_bias) { + PDNode* x = pattern->NewNode(name_scope, "x") + ->assert_is_op_input("mul") + ->assert_var_not_persistable(); + auto* fc_out = patterns::FC(pattern, name_scope, x, with_fc_bias); + fc_out->AsIntermediate(); // fc_out is a tmp var, will be removed after fuse. + patterns::GRU(pattern, name_scope, fc_out); + VLOG(3) << "fc_gru pattern \n" << pattern->DotString(); +} + +static int BuildFusion(Graph* graph, const std::string& name_scope, + Scope* scope, bool with_fc_bias) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + BuildPattern(pattern, name_scope, with_fc_bias); + + // Create New OpDesc + auto gru_creater = [&](int gru, int x, int weight_x, int weight_h, int bias, + int hidden, int fc_bias) { +#define GET_NODE(x) auto* x##_n = graph->RetriveNode(x); + GET_NODE(x); + GET_NODE(weight_x); + GET_NODE(weight_h); + GET_NODE(bias); + GET_NODE(hidden); + GET_NODE(gru); + + OpDesc op_desc; + op_desc.SetType("fusion_gru"); + +#define NEW_NAME(x) name_scope + "/at." #x ".new" +#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__##_n->Name()}); + SET_IN(X, x); + SET_IN(WeightX, weight_x); + SET_IN(WeightH, weight_h); + if (with_fc_bias) { + op_desc.SetInput("Bias", {NEW_NAME(bias) + bias_n->Name()}); + } else { + SET_IN(Bias, bias); + } +#undef SET_IN + op_desc.SetInput("H0", {}); + op_desc.SetOutput("Hidden", {hidden_n->Name()}); + op_desc.SetAttr("is_reverse", gru_n->Op()->GetAttr("is_reverse")); + // TODO(TJ): This should be a option for infer + op_desc.SetAttr("use_seq", true); + +#define SET_IMTERMEDIATE_OUT(key) op_desc.SetOutput(#key, {NEW_NAME(key)}) + SET_IMTERMEDIATE_OUT(ReorderedH0); + SET_IMTERMEDIATE_OUT(XX); + SET_IMTERMEDIATE_OUT(BatchedInput); + SET_IMTERMEDIATE_OUT(BatchedOut); +#undef SET_IMTERMEDIATE_OUT + + auto* op = graph->CreateOpNode(&op_desc); + PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + auto* scope = graph->Get(kParamScopeAttr); + PADDLE_ENFORCE(scope); + if (with_fc_bias) { + // Fusion GRU bias = fcbias + grubias + auto* fusion_bias_var = scope->Var(NEW_NAME(bias) + bias_n->Name()); + auto* out_bias_tensor = + fusion_bias_var->GetMutable(); + PADDLE_ENFORCE(fusion_bias_var); + GET_NODE(fc_bias); + PADDLE_ENFORCE(fc_bias_n); + auto* gru_bias_var = scope->FindVar(bias_n->Name()); + auto* fc_bias_var = scope->FindVar(fc_bias_n->Name()); + PADDLE_ENFORCE(gru_bias_var); + PADDLE_ENFORCE(fc_bias_var); + const auto& gru_bias_tenosr = gru_bias_var->Get(); + const auto& fc_bias_tensor = fc_bias_var->Get(); + // new bias = fc bias + gru bias + out_bias_tensor->Resize(gru_bias_tenosr.dims()); + auto* data = out_bias_tensor->mutable_data(platform::CPUPlace()); + for (int i = 0; i < out_bias_tensor->numel(); i++) { + data[i] = + fc_bias_tensor.data()[i] + gru_bias_tenosr.data()[i]; + } + } +#undef GET_NODE + +#define NEW_IMTERMEDIATE_OUT(key) \ + scope->Var(NEW_NAME(key))->GetMutable() + NEW_IMTERMEDIATE_OUT(ReorderedH0); + NEW_IMTERMEDIATE_OUT(XX); + NEW_IMTERMEDIATE_OUT(BatchedInput); + NEW_IMTERMEDIATE_OUT(BatchedOut); +#undef NEW_NAME +#undef NEW_IMTERMEDIATE_OUT + + IR_NODE_LINK_TO(x_n, op); + IR_NODE_LINK_TO(weight_x_n, op); + IR_NODE_LINK_TO(weight_h_n, op); + IR_NODE_LINK_TO(bias_n, op); // actually should link to new bias if have + IR_NODE_LINK_TO(op, hidden_n); + // h0? + return op; + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { +#define GET_NODE(name__) \ + std::string name__##key = name_scope + "/" + #name__; \ + auto* name__##n = pattern->RetrieveNode(name__##key); \ + PADDLE_ENFORCE(name__##n); \ + PADDLE_ENFORCE(subgraph.count(name__##n)); \ + Node* name__##_n = subgraph.at(name__##n); \ + int name__ __attribute__((unused)) = name__##_n->id(); + + GET_NODE(x); + GET_NODE(w); // fc weight + GET_NODE(mul); + GET_NODE(fc_out); + GET_NODE(Weight); + GET_NODE(gru); + GET_NODE(Bias); + GET_NODE(Hidden); + // nodes need be removed + GET_NODE(BatchGate); + GET_NODE(BatchResetHiddenPrev); + GET_NODE(BatchHidden); + + if (with_fc_bias) { + GET_NODE(mul_out); + GET_NODE(fc_bias); + GET_NODE(elementwise_add); + gru_creater(gru, x, w, Weight, Bias, Hidden, fc_bias); + // Remove unneeded nodes. + std::unordered_set marked_nodes( + {mul_n, gru_n, elementwise_add_n, fc_bias_n, fc_out_n, mul_out_n, + BatchGate_n, BatchResetHiddenPrev_n, BatchHidden_n}); + GraphSafeRemoveNodes(graph, marked_nodes); + } else { + gru_creater(gru, x, w, Weight, Bias, Hidden, -1); + // Remove unneeded nodes. + std::unordered_set marked_nodes( + {mul_n, gru_n, BatchGate_n, BatchResetHiddenPrev_n, BatchHidden_n}); + GraphSafeRemoveNodes(graph, marked_nodes); + } +#undef GET_NODE + + ++fusion_count; + }; + + gpd(graph, handler); + + return fusion_count; +} + +std::unique_ptr MulGRUFusePass::ApplyImpl( + std::unique_ptr graph) const { + FusePassBase::Init(name_scope_, graph.get()); + + int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(), + false /*with_fc_bias*/); + + AddStatis(fusion_count); + return graph; +} + +std::unique_ptr FCGRUFusePass::ApplyImpl( + std::unique_ptr graph) const { + FusePassBase::Init(name_scope_, graph.get()); + + int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(), + true /*with_fc_bias*/); + + AddStatis(fusion_count); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass); +REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass); diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..63e1c72bfb2e2641ae5d44858b342d5e427e9045 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +// The MulGRUFusePass and MulGRUFusePass will fuse to the same FusionGRU op. + +class FCGRUFusePass : public FusePassBase { + public: + virtual ~FCGRUFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + + const std::string name_scope_{"fc_gru_fuse"}; +}; + +// Just FC without bias +class MulGRUFusePass : public FusePassBase { + public: + virtual ~MulGRUFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + const std::string name_scope_{"fc_nobias_gru_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index 9512fd056e73836cdc34a9e409ab2d7809a6aff6..3e09613699e04bc05abf19e81e9a4ea5b41a6733 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -20,12 +20,13 @@ namespace paddle { namespace framework { namespace ir { -std::string GenNodeName(const std::string& prefix, const std::string& name) { +static std::string GenNodeName(const std::string& prefix, + const std::string& name) { return prefix + "/" + name; } -void BuildPattern(PDPattern* pattern, const std::string& name_scope, - bool with_fc_bias) { +static void BuildPattern(PDPattern* pattern, const std::string& name_scope, + bool with_fc_bias) { PDNode* x = pattern->NewNode(name_scope, "x") ->assert_is_op_input("mul") ->assert_var_not_persistable(); @@ -35,8 +36,8 @@ void BuildPattern(PDPattern* pattern, const std::string& name_scope, // LOG(INFO) << "\n" << pattern->DotString(); } -int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, - bool with_fc_bias) { +static int BuildFusion(Graph* graph, const std::string& name_scope, + Scope* scope, bool with_fc_bias) { GraphPatternDetector gpd; auto* pattern = gpd.mutable_pattern(); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 731b89423354532f684e19305dfa87e8eb75d4b1..5ca75095158649c95371248c115054ff68faab9d 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -519,76 +519,96 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) { PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope, PDNode* x, bool with_bias) { - // Create Operators - PDNode* elementwise_add_op{nullptr}; + // mul op auto* mul_op = pattern->NewNode(name_scope, "mul")->assert_is_op("mul"); - if (with_bias) { - elementwise_add_op = pattern->NewNode(name_scope, "elementwise_add") - ->assert_is_op("elementwise_add"); - } - // Create variables - // w auto* mul_weight_var = pattern->NewNode(name_scope, "w") ->AsInput() ->assert_is_persistable_var() - ->assert_is_op_nth_input("mul", "Y", 0); - PDNode* mul_out_var{nullptr}; + ->assert_is_op_input("mul", "Y"); + + PDNode* fc_out{nullptr}; if (with_bias) { + PDNode* elementwise_add_op{nullptr}; + PDNode *mul_out_var{nullptr}, *bias{nullptr}; + elementwise_add_op = pattern->NewNode(name_scope, "elementwise_add") + ->assert_is_op("elementwise_add"); // intermediate variable, will be removed in the IR after fuse. mul_out_var = pattern->NewNode(name_scope, "mul_out") ->AsIntermediate() ->assert_is_only_output_of_op("mul") ->assert_is_op_input("elementwise_add"); - } - PDNode *bias{nullptr}, *fc_out{nullptr}; - if (with_bias) { // bias bias = pattern->NewNode(name_scope, "fc_bias") - ->assert_is_op_input("elementwise_add") - ->AsInput(); + ->AsInput() + ->assert_is_op_input("elementwise_add"); // output fc_out = pattern->NewNode(name_scope, "fc_out") ->AsOutput() ->assert_is_op_output("elementwise_add"); + mul_op->LinksFrom({x, mul_weight_var}).LinksTo({mul_out_var}); + elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out}); } else { fc_out = pattern->NewNode(name_scope, "fc_out") ->AsOutput() ->assert_is_op_output("mul"); - } - - if (with_bias) { - mul_op->LinksFrom({mul_weight_var, x}).LinksTo({mul_out_var}); - elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out}); - } else { mul_op->LinksFrom({mul_weight_var, x}).LinksTo({fc_out}); } - return fc_out; } + +#define NEW_NODE(op__, arg__, io__) \ + auto* arg__ = pattern->NewNode(name_scope, #arg__) \ + ->assert_is_op_##io__(#op__, #arg__); + PDNode* patterns::LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x) { x->assert_is_op_input("lstm", "Input"); auto* lstm_op = pattern->NewNode(name_scope, "lstm")->assert_is_op("lstm"); -#define NEW_NODE(arg__, io__) \ - auto* arg__ = pattern->NewNode(name_scope, #arg__) \ - ->assert_is_op_##io__("lstm", #arg__); // Currently, the H0 and C0 are optional // TODO(Superjomn) upgrade the fuse framework to support optional. // NEW_NODE(H0, input); // NEW_NODE(C0, input); - NEW_NODE(Weight, input); - NEW_NODE(Bias, input); + NEW_NODE(lstm, Weight, input); + NEW_NODE(lstm, Bias, input); - NEW_NODE(Hidden, output); - NEW_NODE(Cell, output); - NEW_NODE(BatchGate, output); - NEW_NODE(BatchCellPreAct, output); + NEW_NODE(lstm, Hidden, output); + NEW_NODE(lstm, Cell, output); + NEW_NODE(lstm, BatchGate, output); + NEW_NODE(lstm, BatchCellPreAct, output); lstm_op->LinksFrom({x, Weight, Bias}); lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct}); return Hidden; } + +PDNode* patterns::GRU(PDPattern* pattern, const std::string& name_scope, + PDNode* x) { + x->assert_is_op_input("gru", "Input"); + auto* gru_op = pattern->NewNode(name_scope, "gru")->assert_is_op("gru"); + + NEW_NODE(gru, Weight, input); + // TODO(Superjomn): upgrade the fuse framework to support optional. + // H0 and bias are optional + NEW_NODE(gru, Bias, input); // also optional + // NEW_NODE(H0, input); + + NEW_NODE(gru, Hidden, output); + // below are intermediate + NEW_NODE(gru, BatchGate, output); + NEW_NODE(gru, BatchResetHiddenPrev, output); + NEW_NODE(gru, BatchHidden, output); + + BatchGate->AsIntermediate(); + BatchResetHiddenPrev->AsIntermediate(); + BatchHidden->AsIntermediate(); + + gru_op->LinksFrom({x, Weight, Bias}); + gru_op->LinksTo({Hidden, BatchGate, BatchResetHiddenPrev, BatchHidden}); + return Hidden; +} +#undef NEW_NODE + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index eacea1750f6f1e86a8fe79637c3bd757a7275398..71e4c36d9b6327ff419179ca7ed10332f448e245 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -298,6 +298,8 @@ PDNode* FC(PDPattern* pattern, const std::string& name_scope, PDNode* x, PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x); +PDNode* GRU(PDPattern* pattern, const std::string& name_scope, PDNode* x); + } // namespace patterns #define IR_NODE_LINK_TO(a, b) \ diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 765f8a4486bb94792e198dea481ba3b6d153767a..a115bc8f4a3326502762afb0d4f399d1f9674694 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -81,7 +81,7 @@ if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE) endif() inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api + EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt) @@ -94,7 +94,7 @@ if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE) endif() inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api + EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor ARGS --infer_model=${LAC_INSTALL_DIR}/model --infer_data=${LAC_INSTALL_DIR}/data.txt) diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index abc3021e7ec3f0f970d786b782ad17510b8bdbd8..399afbe64a56393176795ecdd1ac70bfedd5c91a 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -38,7 +38,6 @@ limitations under the License. */ #include #include #include - #include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/inference/analysis/flags.h" #include "paddle/fluid/inference/analysis/pass_manager.h" @@ -69,6 +68,8 @@ class Analyzer : public OrderedRegistry { "attention_lstm_fuse_pass", // "fc_lstm_fuse_pass", // "mul_lstm_fuse_pass", // + "fc_gru_fuse_pass", // + "mul_gru_fuse_pass", // "seq_concat_fc_fuse_pass", // "fc_fuse_pass", // }}; diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc index 3bb5d9462f5d53e9b600186ac2b4a027489097cf..522d870db8583aac4006e8cdb7909625c3feb34b 100644 --- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc @@ -11,13 +11,14 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include "paddle/fluid/inference/analysis/analyzer.h" -#include #include -#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/platform/profiler.h" DEFINE_string(infer_model, "", "model path for LAC"); @@ -102,6 +103,7 @@ struct DataRecord { return data; } }; + void GetOneBatch(std::vector *input_slots, DataRecord *data, int batch_size) { auto one_batch = data->NextBatch(); @@ -114,6 +116,7 @@ void GetOneBatch(std::vector *input_slots, DataRecord *data, PADDLE_ENFORCE_EQ(batch_size, static_cast(one_batch.lod.size() - 1)); input_slots->assign({input_tensor}); } + void BenchAllData(const std::string &model_path, const std::string &data_file, const int batch_size, const int repeat) { NativeConfig config; @@ -141,17 +144,16 @@ void BenchAllData(const std::string &model_path, const std::string &data_file, } PrintTime(batch_size, repeat, 1, 0, sum / repeat); } + const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39, 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23}; + void TestLACPrediction(const std::string &model_path, const std::string &data_file, const int batch_size, - const int repeat, bool test_all_data) { - if (test_all_data) { - BenchAllData(model_path, data_file, batch_size, repeat); - return; - } + const int repeat, bool test_all_data, + bool use_analysis = false) { NativeConfig config; config.model_dir = model_path; config.use_gpu = false; @@ -160,17 +162,47 @@ void TestLACPrediction(const std::string &model_path, std::vector input_slots, outputs_slots; DataRecord data(data_file, batch_size); GetOneBatch(&input_slots, &data, batch_size); - auto predictor = - CreatePaddlePredictor(config); + std::unique_ptr predictor; + if (use_analysis) { + AnalysisConfig cfg; + cfg.model_dir = model_path; + cfg.use_gpu = false; + cfg.device = 0; + cfg.specify_input_name = true; + cfg.enable_ir_optim = true; + predictor = + CreatePaddlePredictor(cfg); + } else { + predictor = + CreatePaddlePredictor(config); + } for (int i = 0; i < FLAGS_burning; i++) { predictor->Run(input_slots, &outputs_slots); } Timer timer; + if (test_all_data) { + double sum = 0; + LOG(INFO) << "Total number of samples: " << data.datasets.size(); + for (int i = 0; i < repeat; i++) { + for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) { + GetOneBatch(&input_slots, &data, batch_size); + timer.tic(); + predictor->Run(input_slots, &outputs_slots); + sum += timer.toc(); + } + } + PrintTime(batch_size, repeat, 1, 0, sum / repeat); + LOG(INFO) << "Average latency of each sample: " + << sum / repeat / data.datasets.size() << " ms"; + return; + } timer.tic(); for (int i = 0; i < repeat; i++) { predictor->Run(input_slots, &outputs_slots); } PrintTime(batch_size, repeat, 1, 0, timer.toc() / repeat); + + // check result EXPECT_EQ(outputs_slots.size(), 1UL); auto &out = outputs_slots[0]; size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, @@ -182,12 +214,60 @@ void TestLACPrediction(const std::string &model_path, for (size_t i = 0; i < batch1_size; ++i) { EXPECT_EQ(pdata[i], lac_ref_data[i]); } + + if (use_analysis) { + // run once for comparion as reference + auto ref_predictor = + CreatePaddlePredictor(config); + std::vector ref_outputs_slots; + ref_predictor->Run(input_slots, &ref_outputs_slots); + EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size()); + auto &ref_out = ref_outputs_slots[0]; + size_t ref_size = + std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1, + [](int a, int b) { return a * b; }); + EXPECT_EQ(size, ref_size); + int64_t *pdata_ref = static_cast(ref_out.data.data()); + for (size_t i = 0; i < size; ++i) { + EXPECT_EQ(pdata_ref[i], pdata[i]); + } + + AnalysisPredictor *analysis_predictor = + dynamic_cast(predictor.get()); + auto &fuse_statis = analysis_predictor->analysis_argument() + .Get>( + framework::ir::kFuseStatisAttr); + for (auto &item : fuse_statis) { + LOG(INFO) << "fused " << item.first << " " << item.second; + } + int num_ops = 0; + for (auto &node : + analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { + if (node->IsFunction()) { + ++num_ops; + } + } + LOG(INFO) << "has num ops: " << num_ops; + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); + EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 4); + EXPECT_EQ(num_ops, 11); + } } + TEST(Analyzer_LAC, native) { LOG(INFO) << "LAC with native"; TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, FLAGS_repeat, FLAGS_test_all_data); } + +TEST(Analyzer_LAC, analysis) { + LOG(INFO) << "LAC with analysis"; + TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, + FLAGS_repeat, FLAGS_test_all_data, true); +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer_ner_tester.cc b/paddle/fluid/inference/analysis/analyzer_ner_tester.cc index eaae09b051f6d2d6c90b25312a07c50c4019e120..661b047ed7cb70545267e468d8c2c48596a2994c 100644 --- a/paddle/fluid/inference/analysis/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_ner_tester.cc @@ -13,12 +13,12 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/analyzer.h" -#include #include -#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/platform/profiler.h" DEFINE_string(infer_model, "", "model path"); @@ -112,7 +112,7 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26, 48, 39, 38, 16, 25}; -void TestChineseNERPrediction() { +void TestChineseNERPrediction(bool use_analysis) { NativeConfig config; config.prog_file = FLAGS_infer_model + "/__model__"; config.param_file = FLAGS_infer_model + "/param"; @@ -120,11 +120,23 @@ void TestChineseNERPrediction() { config.device = 0; config.specify_input_name = true; - auto predictor = - CreatePaddlePredictor(config); - std::vector input_slots; - std::vector outputs; + std::vector input_slots, outputs; + std::unique_ptr predictor; Timer timer; + if (use_analysis) { + AnalysisConfig cfg; + cfg.prog_file = FLAGS_infer_model + "/__model__"; + cfg.param_file = FLAGS_infer_model + "/param"; + cfg.use_gpu = false; + cfg.device = 0; + cfg.specify_input_name = true; + cfg.enable_ir_optim = true; + predictor = + CreatePaddlePredictor(cfg); + } else { + predictor = + CreatePaddlePredictor(config); + } if (FLAGS_test_all_data) { LOG(INFO) << "test all data"; @@ -165,10 +177,51 @@ void TestChineseNERPrediction() { for (size_t i = 0; i < std::min(11UL, size); i++) { PADDLE_ENFORCE(result[i], chinese_ner_result_data[i]); } + + if (use_analysis) { + // run once for comparion as reference + auto ref_predictor = + CreatePaddlePredictor(config); + std::vector ref_outputs_slots; + ref_predictor->Run(input_slots, &ref_outputs_slots); + EXPECT_EQ(ref_outputs_slots.size(), outputs.size()); + auto &ref_out = ref_outputs_slots[0]; + size_t ref_size = + std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1, + [](int a, int b) { return a * b; }); + EXPECT_EQ(size, ref_size); + int64_t *pdata_ref = static_cast(ref_out.data.data()); + for (size_t i = 0; i < size; ++i) { + EXPECT_EQ(pdata_ref[i], result[i]); + } + + AnalysisPredictor *analysis_predictor = + dynamic_cast(predictor.get()); + auto &fuse_statis = analysis_predictor->analysis_argument() + .Get>( + framework::ir::kFuseStatisAttr); + for (auto &item : fuse_statis) { + LOG(INFO) << "fused " << item.first << " " << item.second; + } + int num_ops = 0; + for (auto &node : + analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { + if (node->IsFunction()) { + ++num_ops; + } + } + LOG(INFO) << "has num ops: " << num_ops; + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); + EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 2); + EXPECT_EQ(num_ops, 14); + } } -// Directly infer with the original model. -TEST(Analyzer, Chinese_ner) { TestChineseNERPrediction(); } +TEST(Analyzer_Chinese_ner, native) { TestChineseNERPrediction(false); } + +TEST(Analyzer_Chinese_ner, analysis) { TestChineseNERPrediction(true); } } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 4cf26d3c70eafd951d14c26335416ec2c71c001d..a496ae41aa0b5c3bed1e1b372f9270a528b23516 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -283,7 +283,6 @@ void TestDituRNNPrediction(bool use_analysis, bool activate_ir, base_predictor->Run(input_slots, &base_outputs); - LOG(INFO) << "===========profile result==========="; if (num_threads == 1) { // Prepare inputs. Timer timer; @@ -324,7 +323,6 @@ void TestDituRNNPrediction(bool use_analysis, bool activate_ir, threads[i].join(); } } - LOG(INFO) << "====================================="; if (use_analysis && activate_ir) { AnalysisPredictor *analysis_predictor = diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index b69948f40ab524e40e72f2c6858f77db79bcfa03..5df486f345a98d7737d326c94e4854d24535ff61 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -45,7 +45,6 @@ endfunction(inference_api_test) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor) cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis) - cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 79eeea88ea83ad862b5e2ac1390dae377b676685..2a9a7aed480e76edbac4d5ba6d7bc3b8b2dc5006 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -22,12 +22,25 @@ #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/platform/profiler.h" + +DECLARE_bool(profile); namespace paddle { bool AnalysisPredictor::Init( const std::shared_ptr& parent_scope) { VLOG(3) << "Predictor::init()"; +#if !defined(_WIN32) + if (FLAGS_profile) { + LOG(WARNING) << "Profiler is actived, might affect the performance"; + LOG(INFO) << "You can turn off by set gflags '-profile false'"; + auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll + : platform::ProfilerState::kCPU; + platform::EnableProfiler(tracking_device); + } +#endif + if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); LOG(WARNING) << "ir optimize only supports CPU currently"; diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 2c2ac656e8005369bb0e9033236a431cb09caa15..f6893be428feacbba85bab380e22972848eaeb93 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -124,9 +124,9 @@ std::string DescribeTensor(const PaddleTensor &tensor) { void PrintTime(int batch_size, int repeat, int num_threads, int tid, double latency) { - LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << repeat + LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat << ", threads: " << num_threads << ", thread id: " << tid - << ", latency: " << latency << "ms"; + << ", latency: " << latency << "ms ======"; } } // namespace inference