diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index e4c471d86b7bff1bfb3b697ab24219144b4667f5..ce429fefa77b81dff9bf997ba092e92d97cb0dc0 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -129,11 +129,13 @@ cc_test(version_test SRCS version_test.cc DEPS version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) -if(NOT WIN32) -cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) -cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog - shape_inference data_transform lod_tensor profiler) -endif(NOT WIN32) +if(WITH_NGRAPH) + if(NOT WIN32) + cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) + cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog + shape_inference data_transform lod_tensor profiler ngraph) + endif(NOT WIN32) +endif(WITH_NGRAPH) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) @@ -169,11 +171,15 @@ if(WITH_DISTRIBUTE) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() - if(NOT WIN32) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper) - else(NOT WIN32) + if(WITH_NGRAPH) + if(NOT WIN32) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph ngraph_operator variable_helper) + else(NOT WIN32) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) + endif(NOT WIN32) + else(WITH_NGRAPH) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) - endif(NOT WIN32) + endif(WITH_NGRAPH) cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 73cec21e20f2fd26e144872f1f7b5bb7065adb74..e97cf44c75cfdc2e7df22aa870916866b18b3b5a 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/ngraph_operator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/transfer_scope_cache.h" @@ -26,6 +25,10 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" +#ifdef PADDLE_WITH_NGRAPH +#include "paddle/fluid/framework/ngraph_operator.h" +#endif + DECLARE_bool(benchmark); DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); DEFINE_bool(use_ngraph, false, "Use NGRAPH to run"); @@ -88,11 +91,11 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, static void EnableFusedOp(ExecutorPrepareContext* ctx) { #ifdef PADDLE_WITH_NGRAPH VLOG(3) << "use_ngraph=True"; - auto intervals = FusedOperator::FusedOpIntervals(&ctx->ops_); + auto intervals = NgraphOperator::NgraphOpIntervals(&ctx->ops_); for (auto& interval : intervals) { - auto* fused_op = new FusedOperator(ctx->prog_, ctx->block_id_, - interval.at(0), interval.at(1)); - *interval[0] = std::unique_ptr(fused_op); + auto* ng_op = new NgraphOperator(ctx->prog_, ctx->block_id_, interval.at(0), + interval.at(1)); + *interval[0] = std::unique_ptr(ng_op); } for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { ctx->ops_.erase(it->at(0) + 1, it->at(1)); diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc index e22c29037718a60ff7f24404d7749600e2edb80b..a5acfd70449e92663cb66ef90a141c087ff6ec88 100644 --- a/paddle/fluid/framework/ngraph_bridge.cc +++ b/paddle/fluid/framework/ngraph_bridge.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_NGRAPH #include #include #include @@ -27,14 +26,15 @@ namespace paddle { namespace framework { static std::shared_ptr GetNode( - const std::shared_ptr& op, const std::string prm, + const std::shared_ptr& op, const std::string name, const VariableNameMap& var_map, std::shared_ptr< std::unordered_map>> ngb_node_map) { - auto& var_names = var_map.at(prm); + auto& var_names = var_map.at(name); PADDLE_ENFORCE_EQ(var_names.size(), 1, - "op %s prm %s expects one associated var", op->Type(), prm); + "op %s name %s expects one associated var", op->Type(), + name); if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) { return (*ngb_node_map)[var_names[0]]; } else { @@ -43,42 +43,42 @@ static std::shared_ptr GetNode( } static std::shared_ptr GetInputNode( - const std::shared_ptr& op, const std::string prm, + const std::shared_ptr& op, const std::string name, std::shared_ptr< std::unordered_map>> ngb_node_map) { - return GetNode(op, prm, op->Inputs(), ngb_node_map); + return GetNode(op, name, op->Inputs(), ngb_node_map); } static std::shared_ptr GetOutputNode( - const std::shared_ptr& op, const std::string prm, + const std::shared_ptr& op, const std::string name, std::shared_ptr< std::unordered_map>> ngb_node_map) { - return GetNode(op, prm, op->Outputs(), ngb_node_map); + return GetNode(op, name, op->Outputs(), ngb_node_map); } static void SetOutputNode( - const std::shared_ptr& op, const std::string prm, + const std::shared_ptr& op, const std::string name, std::shared_ptr node, std::shared_ptr< std::unordered_map>> ngb_node_map) { - auto& var_names = op->Outputs().at(prm); + auto& var_names = op->Outputs().at(name); if (var_names.size() == 1) { (*ngb_node_map)[var_names[0]] = node; } else if (var_names.size() == 0) { (*ngb_node_map)[""] = node; } else { - PADDLE_THROW("prm %s has more than 1 var_names.", prm); + PADDLE_THROW("name %s has more than 1 var_names.", name); } } static bool HasOutput(const std::shared_ptr& op, - const std::string prm) { + const std::string name) { auto& outputs = op->Outputs(); - if (outputs.find(prm) == outputs.end()) return false; - return outputs.at(prm).size() > 0; + if (outputs.find(name) == outputs.end()) return false; + return outputs.at(name).size() > 0; } template @@ -118,4 +118,3 @@ void NgraphBridge::BuildNgNode(const std::shared_ptr& op) { } // namespace framework } // namespace paddle -#endif diff --git a/paddle/fluid/framework/ngraph_bridge.h b/paddle/fluid/framework/ngraph_bridge.h index 9ed6b9510942136a61faa5755fd8fa74286939a8..5ad7b8daeb6a782515e50fc87ca7188b46308390 100644 --- a/paddle/fluid/framework/ngraph_bridge.h +++ b/paddle/fluid/framework/ngraph_bridge.h @@ -14,8 +14,6 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_NGRAPH - #include #include #include @@ -53,4 +51,3 @@ class NgraphBridge { } // namespace framework } // namespace paddle -#endif diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index 3fea753f0659019395c9b214e52a7912058c501c..253de4c61160e52202a0192215a93284f27e5896 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_NGRAPH #include #include @@ -58,16 +57,16 @@ typedef enum { /* nGraph support state on ops */ } op_state; // perform graph build through bridge and execute computation -class NgraphOperator { +class NgraphEngine { public: - explicit NgraphOperator(const Scope& scope, const platform::Place& place, - const std::vector>& ops, - const std::unordered_map< - std::string, ngraph::element::Type>& var_type_map, - const std::unordered_set& persist, - const std::unordered_set& fetches, - const std::unordered_set& post_op_inputs, - op_state ng_op_state) + explicit NgraphEngine(const Scope& scope, const platform::Place& place, + const std::vector>& ops, + const std::unordered_map< + std::string, ngraph::element::Type>& var_type_map, + const std::unordered_set& persist, + const std::unordered_set& fetches, + const std::unordered_set& post_op_inputs, + op_state ng_op_state) : scope_(scope), place_(place), fused_ops_(ops), @@ -132,7 +131,7 @@ class NgraphOperator { }; std::vector>::iterator>> -FusedOperator::FusedOpIntervals( +NgraphOperator::NgraphOpIntervals( std::vector>* ops) { std::vector>::iterator>> intervals; @@ -185,7 +184,7 @@ FusedOperator::FusedOpIntervals( return intervals; } -FusedOperator::FusedOperator( +NgraphOperator::NgraphOperator( const ProgramDesc& prog, size_t block_id, std::vector>::iterator start, std::vector>::iterator end, @@ -215,7 +214,7 @@ FusedOperator::FusedOperator( Process(); } -void FusedOperator::Process() { +void NgraphOperator::Process() { auto& bdesc = pdesc_.Block(block_); for (auto& var : bdesc.AllVars()) { if (!(var->GetType() == proto::VarType::SELECTED_ROWS || @@ -251,8 +250,8 @@ void FusedOperator::Process() { } } -void FusedOperator::RunImpl(const Scope& scope, - const platform::Place& place) const { +void NgraphOperator::RunImpl(const Scope& scope, + const platform::Place& place) const { op_state ng_op_state = PARTIAL_TEST; auto& bdesc = pdesc_.Block(block_); for (auto* op : bdesc.AllOps()) { @@ -266,19 +265,19 @@ void FusedOperator::RunImpl(const Scope& scope, ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN; } - NgraphOperator ngraph_op(scope, place, fused_ops_, var_type_map_, - persistables_, fetches_, post_op_inputs_, - ng_op_state); - ngraph_op.Run(scope, place); + NgraphEngine ngraph_engine(scope, place, fused_ops_, var_type_map_, + persistables_, fetches_, post_op_inputs_, + ng_op_state); + ngraph_engine.Run(scope, place); } std::unordered_map> - NgraphOperator::func_cache_ = {}; + NgraphEngine::func_cache_ = {}; -std::shared_ptr NgraphOperator::backend_ = +std::shared_ptr NgraphEngine::backend_ = ngraph::runtime::Backend::create("CPU"); -void NgraphOperator::GetNgInputShape(std::shared_ptr op) { +void NgraphEngine::GetNgInputShape(std::shared_ptr op) { op->RuntimeInferShape(scope_, place_); for (auto& var_name_item : op->Inputs()) { for (auto& var_name : var_name_item.second) { @@ -301,7 +300,7 @@ void NgraphOperator::GetNgInputShape(std::shared_ptr op) { } } -void NgraphOperator::BuildNgNodes() { +void NgraphEngine::BuildNgNodes() { for (auto& var_name : var_out_) { if (var_node_map_->find(var_name) == var_node_map_->end()) { auto* var = scope_.FindVar(var_name); @@ -323,7 +322,7 @@ void NgraphOperator::BuildNgNodes() { } } -void NgraphOperator::BuildNgIO() { +void NgraphEngine::BuildNgIO() { std::unordered_set inputs; std::unordered_set outputs; @@ -395,7 +394,7 @@ void NgraphOperator::BuildNgIO() { } } -void NgraphOperator::BuildNgFunction() { +void NgraphEngine::BuildNgFunction() { BuildNgNodes(); ngraph_function_ = nullptr; ngraph::NodeVector func_outputs; @@ -416,7 +415,7 @@ void NgraphOperator::BuildNgFunction() { std::make_shared(func_outputs, func_inputs); } -std::shared_ptr NgraphOperator::GetCacheKey() { +std::shared_ptr NgraphEngine::GetCacheKey() { auto cache_key = std::make_shared(""); *cache_key += std::to_string(fused_ops_.size()); for (auto& op : fused_ops_) { @@ -444,7 +443,7 @@ std::shared_ptr NgraphOperator::GetCacheKey() { return cache_key; } -void NgraphOperator::GetNgFunction() { +void NgraphEngine::GetNgFunction() { bool cache_on = true; if (cache_on) { std::string cache_key_val = *GetCacheKey(); @@ -459,8 +458,7 @@ void NgraphOperator::GetNgFunction() { } } -void NgraphOperator::Run(const Scope& scope, - const platform::Place& place) const { +void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const { std::vector> t_in; std::vector> t_out; @@ -545,7 +543,6 @@ void NgraphOperator::Run(const Scope& scope, } backend_->call(ngraph_function_, t_out, t_in); -} // NgraphOperator::RunImpl +} // NgraphEngine::RunImpl } // namespace framework } // namespace paddle -#endif diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h index 3ca023e11111c5b447b2cabbfb8bb29877297f65..ede80f44bea208b66acc3b3f4bc0f4adee4fb860 100644 --- a/paddle/fluid/framework/ngraph_operator.h +++ b/paddle/fluid/framework/ngraph_operator.h @@ -14,8 +14,6 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_NGRAPH - #include #include #include @@ -34,14 +32,14 @@ limitations under the License. */ namespace paddle { namespace framework { -class FusedOperator : public OperatorBase { +class NgraphOperator : public OperatorBase { public: static std::vector< std::vector>::iterator>> - FusedOpIntervals( + NgraphOpIntervals( std::vector>* ops); - explicit FusedOperator( + explicit NgraphOperator( const ProgramDesc& prog, size_t block_id, std::vector>::iterator start, std::vector>::iterator end, @@ -64,4 +62,3 @@ class FusedOperator : public OperatorBase { }; } // namespace framework } // namespace paddle -#endif diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 36673e48c2047bca54f604b082dfec123f1e2c82..6d39bb3c524b4725dfebd6ef07594b0b45c65463 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -319,7 +319,7 @@ struct OpKernelRegistrarFunctorExSetMainProgram(program.release()); } else if (argument->model_program_path_valid() && argument->model_params_path_valid()) { - auto program = - LoadModel(argument->model_program_path(), argument->model_params_path(), - argument->scope_ptr(), place, argument->model_from_memory()); + auto program = LoadModel( + argument->model_program_path(), argument->model_params_path(), + argument->scope_ptr(), place, + argument->model_from_memory_valid() && argument->model_from_memory()); argument->SetMainProgram(program.release()); } else { PADDLE_THROW( diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index a07626a10315a6206f8c1ebc9a19df90663a88ee..8a4bc04b67879918c6ac8d1b40dae68a107034d4 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -1,4 +1,4 @@ -set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor) +set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor benchmark) if(WITH_GPU AND TENSORRT_FOUND) set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor) diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index d572ea0177c1e398229a02718ca31cc78a7059ef..8209a049f4614fe31c22c4e83c1968411b749b49 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -30,8 +30,10 @@ #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/tests/api/config_printer.h" #include "paddle/fluid/inference/tests/test_helper.h" +#include "paddle/fluid/inference/utils/benchmark.h" #include "paddle/fluid/platform/profiler.h" +DEFINE_string(model_name, "", "model name"); DEFINE_string(infer_model, "", "model path"); DEFINE_string(infer_data, "", "data file"); DEFINE_int32(batch_size, 1, "batch size."); @@ -40,6 +42,8 @@ DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); DEFINE_bool(use_analysis, true, "Running the inference program in analysis mode."); +DEFINE_bool(record_benchmark, false, + "Record benchmark after profiling the model"); DECLARE_bool(profile); DECLARE_int32(paddle_num_threads); @@ -192,8 +196,16 @@ void TestOneThreadPrediction( predictor->Run(inputs[j], outputs, batch_size); } } - PrintTime(batch_size, num_times, 1, 0, run_timer.toc() / num_times, - inputs.size()); + + double latency = run_timer.toc() / num_times; + PrintTime(batch_size, num_times, 1, 0, latency, inputs.size()); + if (FLAGS_record_benchmark) { + Benchmark benchmark; + benchmark.SetName(FLAGS_model_name); + benchmark.SetBatchSize(batch_size); + benchmark.SetLatency(latency); + benchmark.PersistToFile("benchmark_record.txt"); + } } } diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index ef612ce6148329c33f194842945bb5438afcf645..9eb3fb5da1065f14d9ad1c3520f6415fbadfdca1 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -135,6 +135,9 @@ TEST(TensorRT_resnext50, compare) { TEST(TensorRT_resnext50, profile) { std::string model_dir = FLAGS_infer_model + "/resnext50"; + // Set FLAGS_record_benchmark to true to record benchmark to file. + // FLAGS_record_benchmark=true; + FLAGS_model_name = "resnext50"; profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt); } diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc index d03aa11b75ee58524746212e43a5796773f47932..0bd526bcac2d9ceda95730dc3c5210aed8ccfb5c 100644 --- a/paddle/fluid/inference/utils/benchmark.cc +++ b/paddle/fluid/inference/utils/benchmark.cc @@ -30,7 +30,7 @@ std::string Benchmark::SerializeToString() const { ss << '\n'; ss << name_ << "\t"; - ss << batch_size_ << "\t"; + ss << batch_size_ << "\t\t"; ss << num_threads_ << "\t"; ss << latency_ << "\t"; ss << 1000.0 / latency_; diff --git a/paddle/fluid/inference/utils/visualizer.cc b/paddle/fluid/inference/utils/visualizer.cc index 040b6476fb4febc5ca1912c8db72dc63c3bced08..7c0dd64dea88e51b24c4bc04818d633ee0d2f722 100644 --- a/paddle/fluid/inference/utils/visualizer.cc +++ b/paddle/fluid/inference/utils/visualizer.cc @@ -26,9 +26,6 @@ DEFINE_string(model_dir, "", "model directory"); DEFINE_string(model_program_path, "", "model program path"); DEFINE_string(model_params_path, "", "model params path"); -USE_PASS(graph_viz_pass); -USE_PASS(graph_to_program_pass); - using paddle::inference::analysis::Argument; namespace paddle { @@ -40,7 +37,6 @@ void Visualizer::SetArgument(Argument *argument) { argument_ = argument; } bool Visualizer::Run() { paddle::framework::InitDevices(false); paddle::inference::analysis::Analyzer().Run(argument_); - return true; } @@ -77,7 +73,7 @@ int main(int argc, char *argv[]) { // Only 1 pass, default filename is 0_ir_origin.dot // For more details, looking for paddle::inference::analysis::IRPassManager - argument.SetIrAnalysisPasses({"graph_viz_pass"}); + argument.SetIrAnalysisPasses({"infer_clean_graph_pass", "graph_viz_pass"}); std::unique_ptr scope{ new paddle::framework::Scope()}; @@ -90,3 +86,7 @@ int main(int argc, char *argv[]) { return 0; } + +USE_PASS(infer_clean_graph_pass); +USE_PASS(graph_viz_pass); +USE_PASS(graph_to_program_pass); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 87d549678a0e6c183aac89539cf1f6331729de2c..c7df3ea58a91579e35ff0d486516271a6daf054f 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -301,23 +301,22 @@ template struct GeluFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { - auto temp = - ((x * static_cast(M_SQRT1_2)).erf()).template cast().eval(); + auto temp = (x * static_cast(M_SQRT1_2)).erf(); out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); } }; template struct GeluGradFunctor : BaseActivationFunctor { - bool Inplace() const { return IsInplace("gelu"); } template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp = (static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * x * - ((-static_cast(0.5) * x.square()).exp())) - .template cast() - .eval(); - dx.device(d) = dout * (out / x + temp); + auto first = static_cast(0.5) * + (static_cast(1) + ((x * static_cast(M_SQRT1_2)).erf())); + + auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * x * + (-static_cast(0.5) * x.square()).exp(); + dx.device(d) = dout * (first + second); } }; diff --git a/paddle/fluid/operators/concat_mkldnn_op.cc b/paddle/fluid/operators/concat_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..7ad674056f0d753d79408a11eff1aca47a84998a --- /dev/null +++ b/paddle/fluid/operators/concat_mkldnn_op.cc @@ -0,0 +1,152 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/operators/concat_op.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +using framework::DataLayout; +using framework::Tensor; +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::concat; +using mkldnn::stream; +using platform::to_void_cast; + +static void EnforceLayouts(const std::vector inputs) { + for (auto* input : inputs) { + const bool is_layout_correct = input->layout() == DataLayout::kMKLDNN; + const bool is_format_defined = + input->format() != memory::format::format_undef; + PADDLE_ENFORCE(is_layout_correct && is_format_defined, + "Wrong layout/format set for Input tensor"); + } +} + +static memory::primitive_desc CreateMemPrimDesc(const Tensor& input, + const mkldnn::engine& engine) { + constexpr auto data_type = mkldnn::memory::f32; + const auto dims = paddle::framework::vectorize2int(input.dims()); + const auto format = input.format(); + auto description = memory::desc(dims, data_type, format); + auto mem_prim_desc = memory::primitive_desc(description, engine); + return mem_prim_desc; +} + +static mkldnn::memory::format GetDstMemFormat( + const concat::primitive_desc& concat_pd) { + return (memory::format)concat_pd.dst_primitive_desc().desc().data.format; +} + +static platform::CPUPlace GetCpuPlace( + const paddle::framework::ExecutionContext& ctx) { + auto place = ctx.GetPlace(); + PADDLE_ENFORCE(paddle::platform::is_cpu_place(place), + "It must use CPUPlace."); + return boost::get(place); +} + +static const mkldnn::engine& GetMKLDNNEngine( + const paddle::framework::ExecutionContext& ctx) { + auto& dev_ctx = ctx.template device_context(); + return dev_ctx.GetEngine(); +} + +template +class ConcatPrimitiveFactory { + public: + concat::primitive_desc CreateConcatPrimDescriptor( + const std::vector multi_input, Tensor* output, + int concat_axis, const mkldnn::engine& mkldnn_engine) { + CreateSourcesDescriptors(multi_input, mkldnn_engine); + auto dst_desc = CreateDstMemDescriptor(output); + return concat::primitive_desc(dst_desc, concat_axis, srcs_pd); + } + + concat CreateConcatPrimitive(const concat::primitive_desc& concat_pd, + Tensor* output, platform::CPUPlace place) { + CreateSourcePrimitiveAts(); + dst_mem = CreateDstMemory(concat_pd, output, place); + return concat(concat_pd, inputs, dst_mem.get()); + } + + private: + memory::desc CreateDstMemDescriptor(Tensor* output) { + auto dst_dims = paddle::framework::vectorize2int(output->dims()); + return memory::desc(dst_dims, platform::MKLDNNGetDataType(), + memory::format::any); + } + + mkldnn::memory CreateDstMemory(const concat::primitive_desc& concat_pd, + Tensor* output, platform::CPUPlace place) { + return memory(concat_pd.dst_primitive_desc(), + output->mutable_data(place)); + } + + void CreateSourcesDescriptors(const std::vector multi_input, + const mkldnn::engine& mkldnn_engine) { + for (size_t i = 0; i < multi_input.size(); i++) { + auto mem_prim_desc = CreateMemPrimDesc(*multi_input[i], mkldnn_engine); + srcs_pd.push_back(mem_prim_desc); + srcs.push_back( + memory(mem_prim_desc, to_void_cast(multi_input[i]->data()))); + } + } + + void CreateSourcePrimitiveAts() { + inputs.reserve(srcs.size()); + for (size_t i = 0; i < srcs.size(); i++) { + inputs.push_back(srcs[i]); + } + } + + private: + std::vector srcs_pd; + std::vector srcs; + std::vector inputs; + boost::optional dst_mem; // TODO(mgallus): change to std::optional +}; // upon introduction of C++17 to paddle + +template +class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + auto place = GetCpuPlace(ctx); + const auto& mkldnn_engine = GetMKLDNNEngine(ctx); + + auto multi_input = ctx.MultiInput("X"); + EnforceLayouts(multi_input); + Tensor* output = ctx.Output("Out"); + int64_t concat_axis = static_cast(ctx.Attr("axis")); + + ConcatPrimitiveFactory prim_creator; + auto concat_pd = prim_creator.CreateConcatPrimDescriptor( + multi_input, output, static_cast(concat_axis), mkldnn_engine); + auto concat = prim_creator.CreateConcatPrimitive(concat_pd, output, place); + stream(stream::kind::eager).submit({concat}).wait(); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetDstMemFormat(concat_pd)); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace, + ops::ConcatMKLDNNOpKernel) diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 57817da71adfd80faad29a48b05ba2f326de6c07..194f9cf5033a3a73afeb8e92ddbdcc7b316fcd35 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -13,10 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/concat_op.h" - #include #include +#ifdef PADDLE_WITH_MKLDNN +#include +#endif + namespace paddle { namespace operators { using framework::Tensor; @@ -59,6 +62,22 @@ class ConcatOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", out_dims); ctx->ShareLoD("X", /*->*/ "Out"); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::GetDataTypeOfVar(ctx.MultiInputVar("X")[0]); + +#ifdef PADDLE_WITH_MKLDNN + if (platform::CanMKLDNNBeUsed(ctx)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class ConcatOpMaker : public framework::OpProtoAndCheckerMaker { @@ -66,6 +85,10 @@ class ConcatOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "Input tensors of concat operator.").AsDuplicable(); AddOutput("Out", "Output tensor of concat operator."); + AddAttr( + "use_mkldnn", + "(bool, default false) Indicates if MKL-DNN kernel will be used") + .SetDefault(false); AddAttr("axis", "The axis along which the input tensors will be concatenated.") .SetDefault(0); diff --git a/paddle/fluid/operators/distributed/brpc_client.cc b/paddle/fluid/operators/distributed/brpc_client.cc index b394c678fb6503eb73a1e11e6feb814251e9e940..350969f74be258ffbfef687b56083a9c6508bc81 100644 --- a/paddle/fluid/operators/distributed/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc_client.cc @@ -158,7 +158,7 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { for (int i = 0; i < FLAGS_brpc_channel_num; ++i) { std::shared_ptr c(new ChannelContext()); if (c->channel.Init(ep.c_str(), &options) != 0) { - LOG(ERROR) << "Fail to initialize channel"; + LOG(FATAL) << "Fail to initialize channel"; return nullptr; } diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index 857214aa211aee0251571e46049c66c084b470f1..f14dfcdb238a9580affde96e4d5a0093743eb6c8 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -390,8 +390,7 @@ void GRPCClient::Proceed() { VLOG(3) << c->GetVarHandlePtr()->String() << " process"; c->Process(); } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) { - // FIXME(gongwb): parse error_details? - LOG(ERROR) << c->GetVarHandlePtr()->String() + LOG(FATAL) << c->GetVarHandlePtr()->String() << " meets grpc error, error_code:" << c->status_.error_code() << " error_message:" << c->status_.error_message() << " error_details:" << c->status_.error_details(); diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py index 1cda2711f765622b0bda6f4c688f69352bbd2a6f..1c45a10a9ddde743dce9b343e4d18f568bb05e72 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist.py @@ -93,7 +93,7 @@ class TestDistMnist2x2(TestDistRunnerBase): # TODO(typhoonzero): fix distributed adam optimizer # opt = fluid.optimizer.AdamOptimizer( # learning_rate=0.001, beta1=0.9, beta2=0.999) - opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9) + opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9) # Reader train_reader = paddle.batch( diff --git a/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..0f2130f9049c7ee294444282e59c654551f76603 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py @@ -0,0 +1,61 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +from test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3 + + +class TestMKLDNNConcatOp(TestConcatOp): + def setUp(self): + super(TestMKLDNNConcatOp, self).setUp() + self.attrs["use_mkldnn"] = True + self._cpu_only = True + + def test_check_grad(self): + pass + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNConcatOp2(TestConcatOp2): + def setUp(self): + super(TestMKLDNNConcatOp2, self).setUp() + self.attrs["use_mkldnn"] = True + self._cpu_only = True + + def test_check_grad(self): + pass + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNConcatOp3(TestConcatOp3): + def setUp(self): + super(TestMKLDNNConcatOp3, self).setUp() + self.attrs["use_mkldnn"] = True + self._cpu_only = True + + def test_check_grad(self): + pass + + def init_kernel_type(self): + self.use_mkldnn = True + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 26fa20291b52e469066e23b5c29a8e11b40a1270..cedb3383ed4728306c61d7f987850000506457c7 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -32,7 +32,7 @@ DEFAULT_BATCH_SIZE = 2 class TestDistRunnerBase(object): - def get_model(self, batch_size=DEFAULT_BATCH_SIZE): + def get_model(self, batch_size=DEFAULT_BATCH_SIZE, lr=0.1): raise NotImplementedError( "get_model should be implemented by child classes.") @@ -56,6 +56,7 @@ class TestDistRunnerBase(object): return t def run_pserver(self, args): + self.lr = args.lr self.get_model(batch_size=args.batch_size) # NOTE: pserver should not call memory optimize t = self.get_transpiler(args.trainer_id, @@ -71,6 +72,7 @@ class TestDistRunnerBase(object): exe.run(pserver_prog) def run_trainer(self, args): + self.lr = args.lr test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ self.get_model(batch_size=args.batch_size) @@ -189,6 +191,7 @@ def runtime_main(test_class): parser.add_argument( '--use_reader_alloc', action='store_true', required=False) parser.add_argument('--batch_size', required=False, type=int, default=2) + parser.add_argument('--lr', required=False, type=float, default=0.001) parser.add_argument( '--batch_merge_repeat', required=False, type=int, default=1) @@ -234,6 +237,7 @@ class TestDistBase(unittest.TestCase): self._dc_asgd = False # must use with async mode self._use_reader_alloc = True self._nccl2_mode = False + self._lr = 0.001 self._setup_config() self._after_setup_config() @@ -284,7 +288,8 @@ class TestDistBase(unittest.TestCase): batch_size=DEFAULT_BATCH_SIZE, batch_merge_repeat=1): - cmd = "%s %s --role trainer" % (self._python_interp, model) + cmd = "%s %s --role trainer --lr %f" % (self._python_interp, model, + self._lr) if batch_size != DEFAULT_BATCH_SIZE: cmd += " --batch_size %d" % batch_size if batch_merge_repeat > 1: @@ -330,13 +335,13 @@ class TestDistBase(unittest.TestCase): ps0_ep, ps1_ep = self._ps_endpoints.split(",") - tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --update_method pserver" + tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --update_method pserver --lr %f" tr0_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 0, ps0_ep, self._trainers) + 0, ps0_ep, self._trainers, self._lr) tr1_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 1, ps1_ep, self._trainers) + 1, ps1_ep, self._trainers, self._lr) if self._sync_mode: tr0_cmd += " --sync_mode" @@ -425,13 +430,13 @@ class TestDistBase(unittest.TestCase): worker_endpoints = self._ps_endpoints.split(",") w0_ep, w1_ep = worker_endpoints - tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2" + tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2 --lr %f" tr0_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 0, w0_ep) + 0, w0_ep, self._lr / 2) tr1_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 1, w1_ep) + 1, w1_ep, self._lr / 2) if self._mem_opt: tr0_cmd += " --mem_opt" diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index 630bed198f4fc382d716373ea872e24b1b45bbf3..49a2ca40e3cb1dd35027345e9c38eb8b6912d2cd 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -36,7 +36,7 @@ class TestDistMnistNCCL2(TestDistBase): def test_dist_train(self): import paddle.fluid as fluid if fluid.core.is_compiled_with_cuda(): - self.check_with_place("dist_mnist.py", delta=1) + self.check_with_place("dist_mnist.py", delta=1e-5) class TestDistMnist2x2Lars(TestDistBase): diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py index 20f91cf4485f2e79c20fe90143c8b7deebb9fc49..62994eec7e7f56267a0990d9a5e3b5c62d7d5fe4 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer.py @@ -15,7 +15,12 @@ from __future__ import print_function import unittest - +from functools import partial +import contextlib +import numpy as np +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid import paddle.fluid.framework as framework import paddle.fluid.optimizer as optimizer import paddle.fluid.regularizer as regularizer @@ -97,5 +102,134 @@ class TestL1DecayRegularizer(unittest.TestCase): self.assertEqual(block.ops[-3].type, 'sign') +def bow_net(data, + label, + dict_dim, + is_sparse=False, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + BOW net + This model is from https://github.com/PaddlePaddle/models: + fluid/PaddleNLP/text_classification/nets.py + """ + emb = fluid.layers.embedding( + input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bow_tanh = fluid.layers.tanh(bow) + fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + return avg_cost + + +class TestRegularizer(unittest.TestCase): + def setUp(self): + self.word_dict = paddle.dataset.imdb.word_dict() + reader = paddle.batch( + paddle.dataset.imdb.train(self.word_dict), batch_size=8)() + self.train_data = [next(reader) for _ in range(5)] + + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + @contextlib.contextmanager + def scope_prog_guard(self, main_prog, startup_prog): + scope = fluid.core.Scope() + with fluid.unique_name.guard(): + with fluid.scope_guard(scope): + with fluid.program_guard(main_prog, startup_prog): + yield + + def run_program(self, place, feed_list): + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=feed_list, place=place) + exe.run(fluid.default_startup_program()) + + main_prog = fluid.default_main_program() + param_list = [var.name for var in main_prog.block(0).all_parameters()] + + param_sum = [] + for data in self.train_data: + out = exe.run(main_prog, + feed=feeder.feed(data), + fetch_list=param_list) + p_sum = 0 + for v in out: + p_sum += np.sum(np.abs(v)) + param_sum.append(p_sum) + return param_sum + + def check_l2decay_regularizer(self, place, model): + main_prog = fluid.framework.Program() + startup_prog = fluid.framework.Program() + startup_prog.random_seed = 1 + with self.scope_prog_guard( + main_prog=main_prog, startup_prog=startup_prog): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost = model(data, label, len(self.word_dict)) + + optimizer = fluid.optimizer.Adagrad( + learning_rate=0.1, + regularization=fluid.regularizer.L2Decay(1.0)) + optimizer.minimize(avg_cost) + param_sum = self.run_program(place, [data, label]) + return param_sum + + def check_l2decay(self, place, model): + main_prog = fluid.framework.Program() + startup_prog = fluid.framework.Program() + startup_prog.random_seed = 1 + with self.scope_prog_guard( + main_prog=main_prog, startup_prog=startup_prog): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost_l2 = model(data, label, len(self.word_dict)) + + param_list = fluid.default_main_program().block(0).all_parameters() + para_sum = [] + for para in param_list: + para_mul = fluid.layers.square(x=para) + para_sum.append(fluid.layers.reduce_sum(input=para_mul)) + avg_cost_l2 += fluid.layers.sums(para_sum) * .5 + + optimizer = fluid.optimizer.Adagrad(learning_rate=0.1) + optimizer.minimize(avg_cost_l2) + param_sum = self.run_program(place, [data, label]) + return param_sum + + def test_l2(self): + for place in self.get_places(): + dense_sparse_p_sum = [] + for sparse in [True, False]: + model = partial(bow_net, is_sparse=sparse) + framework_l2 = self.check_l2decay_regularizer(place, model) + l2 = self.check_l2decay(place, model) + assert len(l2) == len(framework_l2) + for i in range(len(l2)): + assert np.isclose(a=framework_l2[i], b=l2[i], rtol=5e-5) + dense_sparse_p_sum.append(framework_l2) + + assert len(dense_sparse_p_sum[0]) == len(dense_sparse_p_sum[1]) + for i in range(len(dense_sparse_p_sum[0])): + assert np.isclose( + a=dense_sparse_p_sum[0][i], + b=dense_sparse_p_sum[1][i], + rtol=5e-5) + + if __name__ == '__main__': unittest.main()