diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index c662ae8d85d280114e9bc1c19544789bd6508675..fd02372ae8f85586f6ca50aec9933ea874beec6c 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -64,7 +64,15 @@ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope d cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass buffer_shared_inplace_op_pass buffer_shared_cross_op_memory_reuse_pass set_reader_device_count_pass) +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto + sequential_execution_pass + modify_op_lock_and_record_event_pass + all_reduce_deps_pass + reference_count_pass + eager_deletion_pass + buffer_shared_inplace_op_pass + buffer_shared_cross_op_memory_reuse_pass + set_reader_device_count_pass) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope @@ -91,23 +99,22 @@ cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executo DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle) +set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass + multi_devices_graph_print_pass multi_devices_graph_check_pass + fuse_elewise_add_act_pass fuse_bn_act_pass + multi_batch_merge_pass + fuse_relu_depthwise_conv_pass + lock_free_optimize_pass + coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass + fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass + sync_batch_norm_pass runtime_context_cache_pass) +if(WITH_GPU) + set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass) +endif() if(WITH_NGRAPH) - set(NGRAPH_BS_DEPS ngraph) -else() - set(NGRAPH_BS_DEPS) + set(IR_PASS_DEPS ${IR_PASS_DEPS} ngraph) endif() - -cc_library(build_strategy SRCS build_strategy.cc DEPS - graph_viz_pass multi_devices_graph_pass - multi_devices_graph_print_pass multi_devices_graph_check_pass - fuse_elewise_add_act_pass fuse_bn_act_pass multi_batch_merge_pass - fuse_relu_depthwise_conv_pass - lock_free_optimize_pass - coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass - fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass - sync_batch_norm_pass runtime_context_cache_pass - pass_builder - ${NGRAPH_BS_DEPS}) +cc_library(build_strategy SRCS build_strategy.cc DEPS pass_builder ${IR_PASS_DEPS}) if (WITH_MKLDNN) target_link_libraries(build_strategy mkldnn_placement_pass) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index d7e066357f2cb99256db223350b4134b2bbc697d..7a0b2199c093688ba18630f5459d91da9fe18e41 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -166,9 +166,12 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { void AppendOpFusePasses() { AppendPassWithCheck(strategy_.fuse_relu_depthwise_conv_, "fuse_relu_depthwise_conv_pass"); + AppendPassWithCheck(strategy_.fuse_bn_act_ops_, "fuse_bn_act_pass"); +#ifdef PADDLE_WITH_CUDA + 
AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); +#endif AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_, "fuse_elewise_add_act_pass"); - AppendPassWithCheck(strategy_.fuse_bn_act_ops_, "fuse_bn_act_pass"); // for single card training, fuse_all_reduce_ops is unnecessary. // coalesce_grad_tensor_pass should be before of MultiDevPass. AppendPassWithCheck(strategy_.fuse_all_reduce_ops_, @@ -375,6 +378,12 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, "GPU, skipped."; continue; } + } else if (pass->Type() == "fusion_group_pass") { + pass->Set("use_gpu", new bool(use_cuda)); + if (!use_cuda) { + LOG(WARNING) << "fusion_group_pass is only supported on GPU, skipped."; + continue; + } } else if (pass->Type() == "fuse_bn_act_pass") { if (!use_cuda) { LOG(WARNING) << "fuse_bn_act_pass is only supported on " @@ -435,3 +444,6 @@ USE_PASS(mkldnn_placement_pass); #ifdef PADDLE_WITH_NGRAPH USE_PASS(ngraph_subgraph_pass); #endif +#ifdef PADDLE_WITH_CUDA +USE_PASS(fusion_group_pass); +#endif diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 0b12f16727f1ccc2c572cf6929e273c25d6545af..0e59969989868b088e117b63aeb28f7ca7dd1754 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -86,8 +86,9 @@ struct BuildStrategy { // Operator fusion // TODO(dev-paddle): fuse_elewise_add_act_ops may cause some models have // cycle. - bool fuse_elewise_add_act_ops_{false}; bool fuse_bn_act_ops_{false}; + bool fuse_elewise_add_act_ops_{false}; + bool enable_auto_fusion_{false}; // Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients // should not be sparse types boost::optional fuse_all_optimizer_ops_{false}; diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index fe2e1ab30dd7411545400f3ee7f5737ecc866888..b88fa857067cd0d605aaa9bb58a6b1aac9e6aba9 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -6,7 +6,7 @@ file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n") add_subdirectory(fuse_optimizer_ops_pass) add_subdirectory(memory_optimize_pass) add_subdirectory(multi_devices_graph_pass) -if(NOT APPLE AND NOT WIN32) +if(NOT APPLE AND NOT WIN32 AND WITH_GPU) add_subdirectory(fusion_group) endif() diff --git a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt index 1887f425d12a054efad1c5cafd27072778bdb6dc..fe2bd27524fbfc7f3d2b68f5e3da8feefd46557a 100644 --- a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt +++ b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt @@ -1,9 +1,11 @@ -cc_library(code_generator SRCS operation.cc code_generator.cc code_generator_helper.cc DEPS graph) +cc_library(code_generator + SRCS operation.cc code_generator.cc code_generator_helper.cc + DEPS graph subgraph_detector) if(WITH_GPU) cc_test(test_code_generator SRCS code_generator_tester.cc DEPS code_generator device_code lod_tensor graph_viz_pass) endif() cc_library(fusion_group_pass SRCS fusion_group_pass.cc elementwise_group_detector.cc - DEPS graph_pattern_detector pass code_generator) + DEPS subgraph_detector fuse_pass_base code_generator device_code) cc_test(test_fusion_group_pass SRCS fusion_group_pass_tester.cc DEPS fusion_group_pass graph_viz_pass) diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.cc b/paddle/fluid/framework/ir/fusion_group/code_generator.cc 
index c41a2ed83591459289647ddad59db1636aabd3f7..0f9ee83a411084a25eb22ebe227087448493a345 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator.cc @@ -33,7 +33,7 @@ CodeGenerator::CodeGenerator() { std::string CodeGenerator::Generate(SubGraph* subgraph) { std::vector expressions = ConvertToExpressions(subgraph); - return Generate(subgraph->func_name, expressions); + return Generate(subgraph->GetFuncName(), expressions); } static bool HasInput(Node* n, std::string name) { diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc index 9515237f964409adc083f83a03acde85983b32a4..a5409cb9d6abf9525cedf6851cdf8377f1879fe6 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc @@ -227,7 +227,7 @@ std::vector TestMain( std::string code_str = code_generator.Generate(subgraph); VLOG(3) << code_str; - TestMainImpl(subgraph->func_name, code_str, cpu_tensors, n, input_ids, + TestMainImpl(subgraph->GetFuncName(), code_str, cpu_tensors, n, input_ids, output_ids); // Need to check the accuracy according to expressions. diff --git a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc index fb6f7b8d746505c0f52298ece980fffe880b1cb8..970d97e8e3bf05b22a08786f1b21042a12f4ebb0 100644 --- a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc +++ b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc @@ -13,8 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h" +#include +#include +#include #include "paddle/fluid/framework/ir/fusion_group/operation.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/subgraph_detector.h" namespace paddle { namespace framework { @@ -26,20 +29,22 @@ static std::unordered_set unary_op_types; static std::unordered_set& GetBinaryOpTypes() { if (binary_op_types.empty()) { - binary_op_types = OperationMap::Instance().Find(0, 2); + binary_op_types = + OperationMap::Instance().Find(/* type= */ 0, /* num_operands= */ 2); } return binary_op_types; } static std::unordered_set& GetUnaryOpTypes() { if (unary_op_types.empty()) { - unary_op_types = OperationMap::Instance().Find(0, 1); + unary_op_types = + OperationMap::Instance().Find(/* type= */ 0, /* num_operands= */ 1); } return unary_op_types; } static bool IsSpecifiedOp(const std::unordered_set& op_types, - Node* n) { + const Node* n) { if (n && n->IsOp() && n->Op() && n->outputs.size() > 0U) { auto iter = op_types.find(n->Op()->Type()); if (iter != op_types.end()) { @@ -49,114 +54,63 @@ static bool IsSpecifiedOp(const std::unordered_set& op_types, return false; } -static bool IsBinaryOp(Node* n) { - if (IsSpecifiedOp(GetBinaryOpTypes(), n) && n->inputs.size() == 2U) { - auto* x = n->inputs[0]; - auto* y = n->inputs[1]; +static bool IsGradOp(const Node* n) { + PADDLE_ENFORCE_EQ(n && n->IsOp() && n->Op(), true, + platform::errors::InvalidArgument( + "Expected node %p to be an operator node.", n)); + std::string suffix = "_grad"; + std::string op_type = n->Op()->Type(); + size_t pos = op_type.rfind(suffix); + return pos != std::string::npos && + pos == (op_type.length() - suffix.length()); +} - std::vector x_shape; - std::vector y_shape; - if (x && x->IsVar() && x->Var()) { - x_shape = x->Var()->GetShape(); - } - if (y && y->IsVar() && y->Var()) { - y_shape = y->Var()->GetShape(); - } - if (x_shape.size() == 0U || x_shape.size() != y_shape.size()) { +static bool IsEqualAndNotEmpty(const std::vector& l, + const std::vector& r) { + return l.size() != 0U && r.size() != 0U && l == r; +} + +static bool IsBinaryOp(const Node* n) { + if (IsSpecifiedOp(GetBinaryOpTypes(), n)) { + if ((!IsGradOp(n) && n->inputs.size() != 2U) || n->inputs.size() == 0U) { return false; } - for (size_t i = 0; i < x_shape.size(); ++i) { - if (x_shape[i] != y_shape[i]) { + + // The shape of all inputs should be the same. 
+ std::vector shape_0; + for (size_t i = 0; i < n->inputs.size(); ++i) { + auto* in_i = n->inputs[i]; + if (!(in_i && in_i->IsVar() && in_i->Var())) { return false; } - } - return true; - } - return false; -} - -static bool IsUnaryOp(Node* n) { return IsSpecifiedOp(GetUnaryOpTypes(), n); } - -bool ElementwiseGroupDetector::IsElementwiseOp(Node* n) { - return IsBinaryOp(n) || IsUnaryOp(n); -} -bool ElementwiseGroupDetector::IsInputOfElementwiseOp(Node* n, - std::string name) { - if (n && n->IsVar() && n->Var()) { - for (auto* op : n->outputs) { - if (IsElementwiseOp(op)) { - if (name.empty()) { - return true; - } else if (IsNthInput(n, op, name, 0)) { - return true; + std::vector shape_i = in_i->Var()->GetShape(); + if (i == 0U) { + shape_0 = shape_i; + } else { + if (!IsEqualAndNotEmpty(shape_0, shape_i)) { + return false; } } } + return true; } return false; } -bool ElementwiseGroupDetector::IsOutputOfElementwiseOp(Node* n) { - if (n && n->IsVar() && n->Var()) { - for (auto* op : n->inputs) { - if (IsElementwiseOp(op)) { - return true; - } - } - } - return false; +static bool IsUnaryOp(const Node* n) { + return IsSpecifiedOp(GetUnaryOpTypes(), n); } -int ElementwiseGroupDetector::Search(Node* n, std::vector except_nodes) { - std::unordered_set except_nodes_set; - for (size_t i = 0; i < except_nodes.size(); ++i) { - except_nodes_set.insert(except_nodes[i]); - } - - int num_operations = 0; - if (IsElementwiseOp(n)) { - subgraph_.Insert(n); - num_operations += 1; - for (auto* var : n->inputs) { - subgraph_.Insert(var); - if (except_nodes_set.find(var) == except_nodes_set.end()) { - num_operations += Search(var, {n}); - } - } - for (auto* var : n->outputs) { - subgraph_.Insert(var); - if (except_nodes_set.find(var) == except_nodes_set.end()) { - num_operations += Search(var, {n}); - } - } - } else if (n && n->IsVar() && n->Var()) { - for (auto* op : n->inputs) { - if (IsElementwiseOp(op) && - except_nodes_set.find(op) == except_nodes_set.end()) { - num_operations += Search(op, {n}); - } - } - for (auto* op : n->outputs) { - if (IsElementwiseOp(op) && - except_nodes_set.find(op) == except_nodes_set.end()) { - num_operations += Search(op, {n}); - } - } - } - return num_operations; +bool ElementwiseGroupDetector::IsElementwiseOp(const Node* n) { + return IsBinaryOp(n) || IsUnaryOp(n); } -int ElementwiseGroupDetector::operator()(Node* n) { - if (!IsOutputOfElementwiseOp(n) && IsInputOfElementwiseOp(n, "X")) { - name_ = n->Name(); - subgraph_.Insert(n); - num_operations_ = Search(n, n->inputs); - VLOG(4) << "Detect elementwise subgraph begin with " << name_ << ", " - << num_operations_ << " operations, " << GetSubgraph().GetNumNodes() - << " nodes"; - } - return num_operations_; +std::vector> ElementwiseGroupDetector::operator()( + Graph* graph) { + auto teller = [&](const Node* n) -> bool { return IsElementwiseOp(n); }; + + return SubgraphDetector(graph, teller)(); } } // namespace fusion_group diff --git a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h index 49d472eaab870015e2c69b021d3b1f99ff1f8774..ff4db720f5dead2f85d671da1303ab32438803c6 100644 --- a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h +++ b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h @@ -14,10 +14,8 @@ limitations under the License. 
*/ #pragma once -#include -#include #include -#include "paddle/fluid/framework/ir/fusion_group/subgraph.h" +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" namespace paddle { @@ -27,21 +25,10 @@ namespace fusion_group { class ElementwiseGroupDetector { public: - int operator()(Node* n); - - SubGraph GetSubgraph() const { return subgraph_; } - - private: - bool IsElementwiseOp(Node* n); - bool IsInputOfElementwiseOp(Node* n, std::string name = ""); - bool IsOutputOfElementwiseOp(Node* n); - - int Search(Node* n, std::vector except_nodes = {}); + std::vector> operator()(Graph* graph); private: - std::string name_; - int num_operations_{0}; - SubGraph subgraph_; + bool IsElementwiseOp(const Node* n); }; } // namespace fusion_group diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc index 4999acbf7daf2999ffbd00283889efbf00351c6d..1d9d4ab5d232dea26e62ddadf750e69efefbe7c0 100644 --- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc +++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc @@ -13,57 +13,88 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h" +#include +#include #include +#include "paddle/fluid/framework/ir/fusion_group/code_generator.h" #include "paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/platform/device_code.h" namespace paddle { namespace framework { namespace ir { void FusionGroupPass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL(graph); - - int num_elementwise_groups = DetectFusionGroup(graph, 0); - LOG(INFO) << "Detect " << num_elementwise_groups + FusePassBase::Init("fusion_group_pass", graph); + if (Get("use_gpu")) { + fusion_group::OperationMap::Init(); + int num_elementwise_groups = DetectFusionGroup(graph, 0); + VLOG(3) << "Detect " << num_elementwise_groups << " elementwise fusion groups."; + } } int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const { - std::vector subgraphs; - std::unordered_set all_nodes = graph->Nodes(); - for (Node* n : all_nodes) { - bool is_found = false; - for (auto& subgraph : subgraphs) { - if (subgraph.Has(n)) { - is_found = true; - break; - } - } - if (is_found) { - continue; + // TODO(liuyiqun): supported different places + platform::CUDAPlace place = platform::CUDAPlace(0); + int index = platform::DeviceCodePool::Init({place}).size(place); + + std::vector> subgraphs = + fusion_group::ElementwiseGroupDetector()(graph); + + int num_subgraphs = 0; + size_t min_subgraph_size = 2; + bool save_intermediate_out = true; + for (auto& vec : subgraphs) { + if (vec.size() >= min_subgraph_size) { + std::string func_name = "fused_elementwise_" + std::to_string(index++); + fusion_group::SubGraph subgraph( + type, func_name, save_intermediate_out, + std::unordered_set(vec.begin(), vec.end())); + VLOG(3) << "subgraph: {\n" + << DebugString(subgraph.SortedNodes()) << "}\n"; + + GenerateCode(&subgraph); + InsertFusionGroupOp(graph, &subgraph); + num_subgraphs++; } + } + return num_subgraphs; +} - fusion_group::SubGraph subgraph; - if (type == 0) { - fusion_group::ElementwiseGroupDetector detector; - int num_operations = detector(n); - if 
(num_operations >= 2) { - subgraph = detector.GetSubgraph(); - } - } +void FusionGroupPass::GenerateCode(fusion_group::SubGraph* subgraph) const { + fusion_group::CodeGenerator code_generator; + std::string code_str = code_generator.Generate(subgraph); + VLOG(3) << code_str; + + // TODO(liuyiqun): supported different places + platform::CUDAPlace place = platform::CUDAPlace(0); + std::unique_ptr device_code( + new platform::CUDADeviceCode(place, subgraph->GetFuncName(), code_str)); + device_code->Compile(); + + platform::DeviceCodePool& pool = platform::DeviceCodePool::Init({place}); + pool.Set(std::move(device_code)); +} - if (!subgraph.IsEmpty()) { - subgraphs.push_back(subgraph); +static int ExtractOpRole(fusion_group::SubGraph* subgraph) { + std::unordered_set op_roles; + std::string attr_name = OpProtoAndCheckerMaker::OpRoleAttrName(); + for (auto* n : subgraph->Nodes()) { + if (n && n->IsOp() && n->Op()) { + if (n->Op()->HasAttr(attr_name)) { + op_roles.insert(boost::get(n->Op()->GetAttr(attr_name))); + } } } - - // TODO(liuyiqun): check whether there are intersection between subgraphs - for (size_t i = 0; i < subgraphs.size(); ++i) { - InsertFusionGroupOp(graph, &subgraphs[i]); + if (op_roles.size() == 1U) { + return *(op_roles.begin()); + } else { + return static_cast(OpRole::kNotSpecified); } - return subgraphs.size(); } void FusionGroupPass::InsertFusionGroupOp( @@ -90,10 +121,12 @@ void FusionGroupPass::InsertFusionGroupOp( external_nodes.insert(n); } op_desc.SetOutput("Outs", output_names); - op_desc.SetAttr("type", subgraph->type); - op_desc.SetAttr("func_name", subgraph->func_name); + op_desc.SetAttr("type", subgraph->GetType()); + op_desc.SetAttr("func_name", subgraph->GetFuncName()); + op_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + ExtractOpRole(subgraph)); - auto fusion_group_node = graph->CreateOpNode(&op_desc); + Node* fusion_group_node = graph->CreateOpNode(&op_desc); for (auto* in : input_vars_of_subgraph) { IR_NODE_LINK_TO(in, fusion_group_node); } @@ -114,4 +147,5 @@ void FusionGroupPass::InsertFusionGroupOp( } // namespace framework } // namespace paddle -REGISTER_PASS(fusion_group_pass, paddle::framework::ir::FusionGroupPass); +REGISTER_PASS(fusion_group_pass, paddle::framework::ir::FusionGroupPass) + .RequirePassAttr("use_gpu"); diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h index 8bdddf8877c06d15bd3ce23d6dd60af4189f40f6..72c7250e7205eb1abff9723e44e1b36d427729c1 100644 --- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h +++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h @@ -16,19 +16,20 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fusion_group/subgraph.h" -#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -class FusionGroupPass : public Pass { +class FusionGroupPass : public FusePassBase { protected: void ApplyImpl(Graph* graph) const override; private: int DetectFusionGroup(Graph* graph, int type = 0) const; + void GenerateCode(fusion_group::SubGraph* subgraph) const; void InsertFusionGroupOp(Graph* graph, fusion_group::SubGraph* subgraph) const; diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc index 172ec0c0ee84d06440626caad4771199e5771aae..2446716019cda3959e4d976db0cb51b5a23700a8 100644 --- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc @@ -138,19 +138,15 @@ int TestMain(std::unique_ptr graph, std::string prefix) { } TEST(FusionGroupPass, elementwise_list) { - fusion_group::OperationMap::Init(); - - std::unique_ptr graph = BuildElementwiseListGraph(false); + std::unique_ptr graph = BuildElementwiseListGraph(true); int num_fusion_group_ops = TestMain(std::move(graph), "elementwise_list"); - EXPECT_EQ(num_fusion_group_ops, 1); + EXPECT_EQ(num_fusion_group_ops, 2); } TEST(FusionGroupPass, elementwise_tree) { - fusion_group::OperationMap::Init(); - - std::unique_ptr graph = BuildElementwiseTreeGraph(false); + std::unique_ptr graph = BuildElementwiseTreeGraph(true); int num_fusion_group_ops = TestMain(std::move(graph), "elementwise_tree"); - EXPECT_EQ(num_fusion_group_ops, 2); + EXPECT_EQ(num_fusion_group_ops, 4); } } // namespace ir diff --git a/paddle/fluid/framework/ir/fusion_group/subgraph.h b/paddle/fluid/framework/ir/fusion_group/subgraph.h index 1dd9caa10c98a9389d1468fb84975a9890d46109..b9810882e1cc78f55ebb1efe0ac4c7844d9b0188 100644 --- a/paddle/fluid/framework/ir/fusion_group/subgraph.h +++ b/paddle/fluid/framework/ir/fusion_group/subgraph.h @@ -20,48 +20,59 @@ limitations under the License. */ #include #include "paddle/fluid/framework/ir/fusion_group/operation.h" #include "paddle/fluid/framework/ir/node.h" -#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/ir/subgraph_detector.h" namespace paddle { namespace framework { namespace ir { namespace fusion_group { -struct SubGraph { - int type{-1}; - std::string func_name; - bool save_intermediate_out{false}; - +class SubGraph { + public: SubGraph() = default; - SubGraph(int t, std::string f, bool s, const std::unordered_set& n) - : type(t), func_name(f), save_intermediate_out(s), nodes_set(n) {} + explicit SubGraph(int type) : type_(type) {} + SubGraph(int type, std::string func_name, bool save_intermediate_out, + const std::unordered_set& nodes_set) + : type_(type), + func_name_(func_name), + save_intermediate_out_(save_intermediate_out) { + for (auto* n : nodes_set) { + nodes_set_.insert(n); + if (n && n->IsOp() && n->Op()) { + // If the node is an op node, then add its input/output var nodes + // into the subgraph. 
+ for (auto* in : n->inputs) { + nodes_set_.insert(in); + } + for (auto* out : n->outputs) { + nodes_set_.insert(out); + } + } + } + } - bool IsEmpty() { return nodes_set.empty(); } + bool IsEmpty() { return nodes_set_.empty(); } - const std::unordered_set& Nodes() const { return nodes_set; } + int GetType() const { return type_; } + void SetFuncName(std::string func_name) { func_name_ = func_name; } + std::string GetFuncName() const { return func_name_; } + + const std::unordered_set& Nodes() const { return nodes_set_; } const std::vector& SortedNodes() { - if (!is_sorted) { - Sort(); + if (!is_sorted_) { + TopologicalSort(); } - return sorted_nodes; + return sorted_nodes_; } - size_t GetNumNodes() { return nodes_set.size(); } + size_t GetNumNodes() { return nodes_set_.size(); } - bool Has(Node* n) { return nodes_set.find(n) != nodes_set.end(); } - - void Insert(Node* n) { - if (nodes_set.find(n) == nodes_set.end()) { - VLOG(5) << "Insert " << n->Name() << " to subgraph " << this; - nodes_set.insert(n); - is_sorted = false; - } - } + bool Has(Node* n) { return nodes_set_.find(n) != nodes_set_.end(); } int GetNumOperations() { int num_operations = 0; - for (auto* n : nodes_set) { + for (auto* n : nodes_set_) { if (n && n->IsOp() && n->Op()) { num_operations++; } @@ -96,203 +107,108 @@ struct SubGraph { std::vector GetOutputVarNodes() { // The order of output nodes should be consistant anywhere.. - std::vector output_vars; + std::vector output_vars_all; for (auto* n : SortedNodes()) { if (n && n->IsVar() && n->Var()) { - if (save_intermediate_out) { - // If the var_node is the output of some op_node in the subgraph, it - // is considered the output var node of the subgraph. - bool is_found = false; - for (auto* in : n->inputs) { - if (Has(in)) { - is_found = true; - } - } - if (is_found) { - output_vars.push_back(n); - } - } else { - // If one of the var_node's outputs is the input of some operator - // outside the subgraph, it is considered the output var node of the - // subgraph. - bool is_found = true; - if (n->outputs.size() == 0U) { - is_found = false; - } - for (auto* out : n->outputs) { - if (!Has(out)) { - is_found = false; - } - } - if (!is_found) { - output_vars.push_back(n); + // If the var_node is the output of some op_node in the subgraph, it + // is considered the output var node of the subgraph. + bool is_found = false; + for (auto* in : n->inputs) { + if (Has(in)) { + is_found = true; } } + if (is_found) { + output_vars_all.push_back(n); + } } } - return output_vars; - } - private: - int FindIndexInSortedNodes(Node* n) { - for (size_t i = 0; i < sorted_nodes.size(); ++i) { - if (n == sorted_nodes[i]) { - return static_cast(i); - } + if (save_intermediate_out_) { + return output_vars_all; } - return -1; - } - - void SortVarsBasedOnSortedOps() { - // Insert var nodes to sorted_nodes. - std::unordered_map sorted_vars; - for (auto* n : nodes_set) { - if (n && n->IsVar() && n->Var()) { - int from = 0; - int to = sorted_nodes.size(); - - for (auto* in : n->inputs) { - if (in && in->IsOp() && in->Op()) { - int index = FindIndexInSortedNodes(in); - // Insert after input op node - if (index >= 0) { - from = index + 1 > from ? index + 1 : from; - } - } - } - - for (auto* out : n->outputs) { - if (out && out->IsOp() && out->Op()) { - int index = FindIndexInSortedNodes(out); - // Insert before output op node - if (index >= 0) { - to = index < to ? 
index : to; - } - } - } - if (from > to) { - LOG(INFO) << "subgraph: {\n" << DebugString(Nodes()) << "}\n"; - LOG(INFO) << "sorted nodes: {\n" - << DebugString(sorted_nodes) << "}\n"; + std::vector output_vars_outside; + for (auto* n : output_vars_all) { + // If one of the var_node's outputs is the input of some operator + // outside the subgraph, it is considered the output var node of the + // subgraph. + bool is_found = true; + if (n->outputs.size() == 0U) { + is_found = false; + } + for (auto* out : n->outputs) { + if (!Has(out)) { + is_found = false; } - PADDLE_ENFORCE_LE(from, to, "Range [%d, %d] is invalid.", from, to); - sorted_nodes.insert(sorted_nodes.begin() + to, n); - sorted_vars[n->Name()] = n; + } + if (!is_found) { + output_vars_outside.push_back(n); } } + return output_vars_outside; } - std::vector SortedOps() { - Node* start_op_n = nullptr; - std::unordered_set ops; - for (auto* op_n : nodes_set) { - if (op_n && op_n->IsOp() && op_n->Op()) { - // Initialize ops to all ops in the subgraph. - ops.insert(op_n); + private: + void TopologicalSort() { + if (!is_sorted_) { + std::unordered_map> inputs_map; + std::unordered_map> outputs_map; + for (auto* n : nodes_set_) { + inputs_map[n] = n->inputs; + outputs_map[n] = n->outputs; + } - if (!start_op_n) { - // Find start op node whose inputs are produced outside the subgraph. - bool is_found = false; - for (auto* prev_op_n : GetPrevOpNodes(op_n)) { - if (Has(prev_op_n)) { - is_found = true; - break; + for (auto* n : nodes_set_) { + if (n && n->IsVar() && n->Var()) { + // Set the input of subgraph's input var node to null. + std::vector inputs; + for (auto* in : n->inputs) { + if (Has(in)) { + inputs.push_back(in); } } - if (!is_found) { - start_op_n = op_n; + // Set the output of subgraph's output var node to null. + std::vector outputs; + for (auto* out : n->outputs) { + if (Has(out)) { + outputs.push_back(out); + } } + n->inputs = inputs; + n->outputs = outputs; } } - } - - std::vector sorted_ops; - sorted_ops.push_back(start_op_n); - ops.erase(start_op_n); - while (ops.size() > 0U) { - std::unordered_set erased_ops; - for (auto* op_n : ops) { - bool found_connected_ops = false; - int from = 1; - int to = sorted_ops.size(); - std::unordered_set prev_op_nodes = GetPrevOpNodes(op_n); - std::unordered_set next_op_nodes = GetNextOpNodes(op_n); - for (int i = sorted_ops.size(); i >= 0; --i) { - if (prev_op_nodes.find(sorted_ops[i]) != prev_op_nodes.end()) { - // Insert after i (i + 1) - found_connected_ops = true; - from = (i + 1 > from) ? i + 1 : from; - } - if (next_op_nodes.find(sorted_ops[i]) != next_op_nodes.end()) { - // Insert before i - found_connected_ops = true; - to = (i < to) ? i : to; - } - } - if (found_connected_ops) { - if (from > to) { - LOG(INFO) << "subgraph: {\n" << DebugString(Nodes()) << "}\n"; - } - PADDLE_ENFORCE_LE(from, to, "Range [%d, %d] is invalid.", from, to); - sorted_ops.insert(sorted_ops.begin() + to, op_n); - erased_ops.insert(op_n); + // Collect the start points of the subgraph. + std::vector start_points; + for (auto* n : nodes_set_) { + if (n->inputs.empty()) { + start_points.push_back(n); } } - PADDLE_ENFORCE_GT(erased_ops.size(), 0U); - for (auto* op_n : erased_ops) { - ops.erase(op_n); + // Sort the subgraph. 
+ NodesTSIterator x(start_points); + for (auto& n : iterator_range( + NodesTSIterator(start_points), NodesTSIterator())) { + sorted_nodes_.push_back(&n); } - } - return sorted_ops; - } - - std::unordered_set GetPrevOpNodes(Node* op_n) { - PADDLE_ENFORCE_EQ(op_n && op_n->IsOp() && op_n->Op(), true, - "Node %p is not a op node.", op_n); - - std::unordered_set prev_op_nodes; - for (auto* in_var : op_n->inputs) { - if (in_var && in_var->IsVar() && in_var->Var()) { - for (auto* prev_op_n : in_var->inputs) { - if (prev_op_n && prev_op_n->IsOp() && prev_op_n->Op()) { - prev_op_nodes.insert(prev_op_n); - } - } + // Reset the inputs, outputs. + for (auto* n : nodes_set_) { + n->inputs = inputs_map[n]; + n->outputs = outputs_map[n]; } } - return prev_op_nodes; - } - - std::unordered_set GetNextOpNodes(Node* op_n) { - PADDLE_ENFORCE_EQ(op_n && op_n->IsOp() && op_n->Op(), true, - "Node %p is not a op node.", op_n); - - std::unordered_set next_op_nodes; - for (auto* out_var : op_n->outputs) { - if (out_var && out_var->IsVar() && out_var->Var()) { - for (auto* next_op_n : out_var->outputs) { - if (next_op_n && next_op_n->IsOp() && next_op_n->Op()) { - next_op_nodes.insert(next_op_n); - } - } - } - } - return next_op_nodes; - } - - void Sort() { - if (!is_sorted) { - sorted_nodes = SortedOps(); - SortVarsBasedOnSortedOps(); - } - is_sorted = true; + is_sorted_ = true; } private: - std::unordered_set nodes_set; - bool is_sorted{false}; - std::vector sorted_nodes; + int type_{-1}; + std::string func_name_; + bool save_intermediate_out_{true}; + + std::unordered_set nodes_set_; + bool is_sorted_{false}; + std::vector sorted_nodes_; }; } // namespace fusion_group diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 6121389c12864140b4822cb1e6a9bb3ec60b2239..d12a25f360eece7bda47411c5bb4cd8625cdf729 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -9,9 +9,11 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include "paddle/fluid/operators/interpolate_op.h" #include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_launch_config.h" namespace paddle { namespace operators { @@ -586,17 +588,18 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, int out_chw = c * out_hw; int pixelNum = n * out_chw; - int grid_dim = (pixelNum + 512 - 1) / 512; - grid_dim = grid_dim > 8 ? 8 : grid_dim; + + platform::GpuLaunchConfig config = + platform::getGpuLaunchConfig(pixelNum, ctx); if ("nearest" == interp_method) { - KeNearestNeighborInterpFw< - T><<>>( + KeNearestNeighborInterpFw<<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } else if ("bilinear" == interp_method) { - KeBilinearInterpFw< - T><<>>( + KeBilinearInterpFw<<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout); } @@ -696,12 +699,13 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, int out_cdhw = c * out_dhw; int pixelNum = n * out_cdhw; - int grid_dim = (pixelNum + 512 - 1) / 512; - grid_dim = grid_dim > 8 ? 
8 : grid_dim; + + platform::GpuLaunchConfig config = + platform::getGpuLaunchConfig(pixelNum, ctx); if ("trilinear" == interp_method) { - KeTrilinearInterpFw< - T><<>>( + KeTrilinearInterpFw<<>>( input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, align_mode, data_layout); @@ -787,17 +791,18 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, int out_chw = c * out_hw; int pixelNum = n * out_chw; - int grid_dim = (pixelNum + 512 - 1) / 512; - grid_dim = grid_dim > 8 ? 8 : grid_dim; + + platform::GpuLaunchConfig config = + platform::getGpuLaunchConfig(pixelNum, ctx); if ("nearest" == interp_method) { - KeNearestNeighborInterpBw< - T><<>>( + KeNearestNeighborInterpBw<<>>( input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } else if ("bilinear" == interp_method) { - KeBilinearInterpBw< - T><<>>( + KeBilinearInterpBw<<>>( input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout); @@ -892,12 +897,13 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, int out_cdhw = c * out_dhw; int pixelNum = n * out_cdhw; - int grid_dim = (pixelNum + 512 - 1) / 512; - grid_dim = grid_dim > 8 ? 8 : grid_dim; + + platform::GpuLaunchConfig config = + platform::getGpuLaunchConfig(pixelNum, ctx); if ("trilinear" == interp_method) { - KeTrilinearInterpBw< - T><<>>( + KeTrilinearInterpBw<<>>( input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d, out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, align_mode, data_layout); diff --git a/paddle/fluid/operators/math/sequence2batch.h b/paddle/fluid/operators/math/sequence2batch.h index a3186f82d0c0cc6c9585735ddf7e9bb4db7126cb..9d9f7ef00b8a12088225fd3620cb30b43ef9dce9 100644 --- a/paddle/fluid/operators/math/sequence2batch.h +++ b/paddle/fluid/operators/math/sequence2batch.h @@ -50,11 +50,11 @@ class LoDTensor2BatchFunctor { // seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)} // struct SeqInfo { - SeqInfo(int start, int length, int seq_idx) + SeqInfo(size_t start, size_t length, size_t seq_idx) : start(start), length(length), seq_idx(seq_idx) {} - int start; - int length; - int seq_idx; + size_t start; + size_t length; + size_t seq_idx; }; public: @@ -82,7 +82,7 @@ class LoDTensor2BatchFunctor { std::vector seq_info; for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) { - int length = lod[seq_id + 1] - lod[seq_id]; + size_t length = lod[seq_id + 1] - lod[seq_id]; seq_info.emplace_back(lod[seq_id], length, seq_id); } @@ -118,8 +118,8 @@ class LoDTensor2BatchFunctor { batch_lods.emplace_back(std::vector{0}); // batch_lods[0] is the start positions for batch LoDTensor - int max_seqlen = seq_info[0].length; - batch_lods[0].resize(static_cast(max_seqlen + 1)); + size_t max_seqlen = seq_info[0].length; + batch_lods[0].resize(max_seqlen + 1); // batch_lods[1] is the raw index in the input LoDTensor batch_lods[1].resize(static_cast(lod_tensor.dims()[0])); // batch_lods[2] is the sort order for the input LoDTensor. 
@@ -128,11 +128,11 @@ class LoDTensor2BatchFunctor { size_t* batch_starts = batch_lods[0].data(); size_t* seq2batch_idx = batch_lods[1].data(); batch_starts[0] = 0; - for (int n = 0; n < max_seqlen; n++) { - auto batch_id = static_cast(batch_starts[n]); + for (size_t n = 0; n < max_seqlen; n++) { + size_t batch_id = batch_starts[n]; for (size_t i = 0; i < seq_info.size(); ++i) { - int seq_len = seq_info[i].length; - int start = seq_info[i].start; + size_t seq_len = seq_info[i].length; + size_t start = seq_info[i].start; if (n < seq_len) { seq2batch_idx[batch_id] = is_reverse ? start + seq_len - 1 - n : start + n; @@ -141,7 +141,7 @@ class LoDTensor2BatchFunctor { break; } } - batch_starts[n + 1] = static_cast(batch_id); + batch_starts[n + 1] = batch_id; } size_t* seq_order = batch_lods[2].data(); for (size_t i = 0; i < seq_info.size(); ++i) { diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index df2f1ec749377415dea34efde8597a1d4f08e696..7516909f4516b711bd712a5f54489423ca746dbd 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -29,10 +29,11 @@ inline std::vector get_new_shape( auto tensor = list_new_shape_tensor[i]; PADDLE_ENFORCE_EQ( tensor->dims(), framework::make_ddim({1}), - "ShapeError: If the element type of 'shape' in ReshapeOp is Tensor, " - "the element's shape must be [1]. But received the element's shape " - "is [%s]", - tensor->dims()); + platform::errors::InvalidArgument( + "If the element type of 'shape' in ReshapeOp is Tensor, " + "the element's shape must be [1]. But received the element's shape " + "is [%s]", + tensor->dims())); if (platform::is_gpu_place(tensor->place())) { framework::Tensor temp; TensorCopySync(*tensor, platform::CPUPlace(), &temp); @@ -64,10 +65,11 @@ class ReshapeOp : public framework::OperatorWithKernel { auto ShapeTensor = ctx->Inputs("ShapeTensor"); PADDLE_ENFORCE_GT( ShapeTensor.size(), 0, - "ShapeError: When `shape` in ReshapeOp is a list or tuple " - "which contains Tensor, the shape's size can't be zero. " - "But received shape's size is %d.", - ShapeTensor.size()); + platform::errors::InvalidArgument( + "When `shape` in ReshapeOp is a list or tuple " + "which contains Tensor, the shape's size can't be zero. " + "But received shape's size is %d.", + ShapeTensor.size())); auto infer_shape = ctx->Attrs().Get>("shape"); const int64_t copy_dim_val = 0; auto in_dims = ctx->GetInputDim("X"); @@ -75,10 +77,11 @@ class ReshapeOp : public framework::OperatorWithKernel { if (infer_shape[i] == copy_dim_val) { PADDLE_ENFORCE_LT( static_cast(i), in_dims.size(), - "ShapeError: The index of 0 in `shape` must be less than " - "the input tensor X's dimensions. But received shape[%d] " - "= 0, X's dimensions = %d, X's shape = [%s].", - i, in_dims.size(), in_dims); + platform::errors::InvalidArgument( + "The index of 0 in `shape` must be less than " + "the input tensor X's dimensions. But received shape[%d] " + "= 0, X's dimensions = %d, X's shape = [%s].", + i, in_dims.size(), in_dims)); infer_shape[i] = in_dims[i]; } } @@ -108,10 +111,10 @@ class ReshapeOp : public framework::OperatorWithKernel { return; } - PADDLE_ENFORCE_EQ( - !shape.empty(), true, - "ShapeError: The parameter 'shape' in ReshapeOp must be set. " - "But received 'shape' is empty."); + PADDLE_ENFORCE_EQ(!shape.empty(), true, + platform::errors::InvalidArgument( + "The parameter 'shape' in ReshapeOp must be set. 
" + "But received 'shape' is empty.")); auto x_dims = ctx->GetInputDim("X"); auto out_dims = ValidateShape(shape, x_dims); ctx->SetOutputDim("Out", out_dims); @@ -140,25 +143,28 @@ class ReshapeOp : public framework::OperatorWithKernel { if (shape[i] == unk_dim_val) { PADDLE_ENFORCE_EQ( unk_dim_idx, -1, - "ShapeError: Only one dimension value of 'shape' in ReshapeOp can " - "be -1. But received shape = [%s], shape[%d] is also -1.", - framework::make_ddim(shape), i); + platform::errors::InvalidArgument( + "Only one dimension value of 'shape' in ReshapeOp can " + "be -1. But received shape = [%s], shape[%d] is also -1.", + framework::make_ddim(shape), i)); unk_dim_idx = i; } else if (shape[i] == copy_dim_val) { PADDLE_ENFORCE_LT( static_cast(i), in_dims.size(), - "ShapeError: The index of 0 in `shape` must be less than " - "the input tensor X's dimensions. " - "But received shape = [%s], shape[%d] = 0, X's shape = [%s], " - "X's dimensions = %d.", - framework::make_ddim(shape), i, in_dims, in_dims.size()); + platform::errors::InvalidArgument( + "The index of 0 in `shape` must be less than " + "the input tensor X's dimensions. " + "But received shape = [%s], shape[%d] = 0, X's shape = [%s], " + "X's dimensions = %d.", + framework::make_ddim(shape), i, in_dims, in_dims.size())); } else { PADDLE_ENFORCE_GT( shape[i], 0, - "ShapeError: Each dimension value of 'shape' in ReshapeOp must not " - "be negtive except one unknown dimension. " - "But received shape = [%s], shape[%d] = %d.", - framework::make_ddim(shape), i, shape[i]); + platform::errors::InvalidArgument( + "Each dimension value of 'shape' in ReshapeOp must not " + "be negtive except one unknown dimension. " + "But received shape = [%s], shape[%d] = %d.", + framework::make_ddim(shape), i, shape[i])); } capacity *= (shape[i] ? shape[i] : in_dims[i]); @@ -180,8 +186,7 @@ class ReshapeOp : public framework::OperatorWithKernel { "The input tensor X'size must be divisible by known " "capacity of 'shape'. " "But received X's shape = [%s], X's size = %d, " - "'shape' is [%s], known " - "capacity of 'shape' is %d.", + "'shape' is [%s], known capacity of 'shape' is %d.", in_dims, in_size, framework::make_ddim(shape), capacity)); } else { output_shape[unk_dim_idx] = -1; @@ -190,12 +195,13 @@ class ReshapeOp : public framework::OperatorWithKernel { if (all_positive) { PADDLE_ENFORCE_EQ( capacity, in_size, - "ShapeError: The 'shape' in ReshapeOp is invalid. " - "The input tensor X'size must be equal to the capacity of 'shape'. " - "But received X's shape = [%s], X's size = %d, 'shape' is [%s], " - "the " - "capacity of 'shape' is %d.", - in_dims, in_size, framework::make_ddim(shape), capacity); + platform::errors::InvalidArgument( + "The 'shape' in ReshapeOp is invalid. " + "The input tensor X'size must be equal to the capacity of " + "'shape'. 
" + "But received X's shape = [%s], X's size = %d, 'shape' is " + "[%s], the capacity of 'shape' is %d.", + in_dims, in_size, framework::make_ddim(shape), capacity)); } } return framework::make_ddim(output_shape); diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index 09b171fe901c35376f081b5fdc5fbe1bb8687fab..8a41d79433a8dade7bd931b3c68c8c2c40f0250a 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -90,4 +90,5 @@ REGISTER_OP_CPU_KERNEL( ops::SaveOpKernel, ops::SaveOpKernel, ops::SaveOpKernel, + ops::SaveOpKernel, ops::SaveOpKernel); diff --git a/paddle/fluid/platform/gpu_launch_config.h b/paddle/fluid/platform/gpu_launch_config.h new file mode 100644 index 0000000000000000000000000000000000000000..d57478b89781ed073cef0fa73e201784f73dfc6b --- /dev/null +++ b/paddle/fluid/platform/gpu_launch_config.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace platform { + +struct GpuLaunchConfig { + // Number of threads per block. + int threads; + // Number of blocks for GPU kernel launch. + int blocks; + + GpuLaunchConfig(int threads, int blocks) : threads(threads), blocks(blocks) {} +}; + +inline GpuLaunchConfig getGpuLaunchConfig( + const int N, const framework::ExecutionContext& ctx) { + int threads = + std::min(1024, ctx.cuda_device_context().GetMaxThreadsPerBlock()); + int physical_thread_count = + std::min(ctx.cuda_device_context().GetMaxPhysicalThreadCount(), N); + int blocks = std::min((physical_thread_count + threads - 1) / threads, + ctx.cuda_device_context().GetSMCount()); + + GpuLaunchConfig config(threads, blocks); + + return config; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 07eb3ee44decff599a1efe7d55ef4261743c77d8..e31f6e15f3b723ef8036b77ee62debdd3db17964 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1984,6 +1984,27 @@ All parameter, weight, gradient are variables in Paddle. build_strategy = fluid.BuildStrategy() build_strategy.fuse_bn_act_ops = True )DOC") + .def_property( + "enable_auto_fusion", + [](const BuildStrategy &self) { return self.enable_auto_fusion_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, + platform::errors::PreconditionNotMet( + "BuildStrategy is finlaized.")); + self.enable_auto_fusion_ = b; + }, + R"DOC((bool, optional): Whether to enable fusing subgraph to a + fusion_group. Now we only support fusing subgraph that composed + of elementwise-like operators, such as elementwise_add/mul + without broadcast and activations. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + build_strategy = fluid.BuildStrategy() + build_strategy.enable_auto_fusion = True + )DOC") .def_property( "fuse_relu_depthwise_conv", [](const BuildStrategy &self) { diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 0dbec7d672967e8051ba9e8bacbd74636b4ce9dd..966d4611c00d31376213eac70028f2291f2ecfbe 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -1110,6 +1110,26 @@ def _get_son_parent_block_idx_dict(program, current_block_idx): return son_parent_block_idx_dict +def _get_no_grad_set_name(no_grad_set): + no_grad_set_name = set() + if no_grad_set is not None: + if isinstance(no_grad_set, (set, list, tuple)): + for i, no_grad_var in enumerate(no_grad_set): + if isinstance(no_grad_var, framework.Variable): + no_grad_set_name.add(no_grad_var.name) + elif isinstance(no_grad_var, six.string_types): + no_grad_set_name.add(no_grad_var) + else: + raise TypeError( + "The type of no_grad_set's member must be paddle.fluid.Variable or str, but received %s." + % (type(no_grad_var))) + else: + raise TypeError( + "The type of no_grad_set should be set or list or tuple, but received {}". + format(type(no_grad_set))) + return no_grad_set_name + + def append_backward(loss, parameter_list=None, no_grad_set=None, @@ -1133,11 +1153,11 @@ def append_backward(loss, If it is None, all parameters will be updated. Default: None. - no_grad_set(set[str], optional): Variable names in the :ref:`api_guide_Block_en` 0 whose gradients + no_grad_set(set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients should be ignored. All variables with `stop_gradient=True` from all blocks will be automatically added into this set. - If this parameter is not None, the names in this set will be added to the default set. + If this parameter is not None, the Variables or Variable.names in this set will be added to the default set. Default: None. callbacks(list[callable object], optional): List of callback functions. The callbacks are used for @@ -1174,18 +1194,40 @@ def append_backward(loss, .. code-block:: python import paddle.fluid as fluid - x = fluid.data(name='x', shape=[None, 13], dtype='float32') - y = fluid.data(name='y', shape=[None, 1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) + x = fluid.data(name='x', shape=[None, 13], dtype='int64') + y = fluid.data(name='y', shape=[None, 1], dtype='float32') + x_emb = fluid.embedding(x, size=[100, 256]) + y_predict = fluid.layers.fc(input=x_emb, size=1, act=None, name='my_fc') loss = fluid.layers.square_error_cost(input=y_predict, label=y) avg_loss = fluid.layers.mean(loss) - param_grad_list = fluid.backward.append_backward(loss=avg_loss) - p_g_list1 = fluid.backward.append_backward(loss=avg_loss) # len(p_g_list1) == 2 - p_g_list2 = fluid.backward.append_backward(loss=avg_loss, parameter_list=[p_g_list1[0][0].name]) # len(p_g_list1) == 1 - p_g_list3 = fluid.backward.append_backward(loss=avg_loss, no_grad_set=set([p_g_list1[0][0].name])) # len(p_g_list1) == 1 - p_g_list4 = fluid.backward.append_backward(loss=avg_loss, parameter_list=[p_g_list1[0][0].name], no_grad_set=set([p_g_list1[0][0].name])) # len(p_g_list1) == 0 + + # Get all weights in main_program, not including the bias. 
+ all_weights = [param for param in fluid.default_main_program().block(0).all_parameters() if 'w_' in param.name] + all_weights_name = [w.name for w in all_weights] + + # Return all param_grads that need to be updated when parameter_list is left as the default None. + p_g_list1 = fluid.backward.append_backward(loss=avg_loss) + # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD), (my_fc.b_0, my_fc.b_0@GRAD)] + + # Return the param_grads corresponding to parameter_list, which can be a list of param (Variable). + p_g_list2 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights) + # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)] + + # parameter_list can also be a list of param.name (str). + p_g_list3 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights_name) + # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)] + + # no_grad_set can be a set of Variables, which means the gradient will be cut off at these Variables. + p_g_list4 = fluid.backward.append_backward(loss=avg_loss, no_grad_set=set([x_emb])) + # output: [(my_fc.w_0, my_fc.w_0@GRAD), (my_fc.b_0, my_fc.b_0@GRAD)] + + # no_grad_set can be a set of Variable.name when the Variable is created inside layers and can't be specified explicitly. + p_g_list5 = fluid.backward.append_backward(loss=avg_loss, no_grad_set=set(['my_fc.b_0'])) + # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)] + + # Return [] because all param_grads are filtered out by no_grad_set. + p_g_list6 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights, no_grad_set=set(all_weights)) """ assert isinstance(loss, framework.Variable) @@ -1215,7 +1257,8 @@ def append_backward(loss, if no_grad_set is None: no_grad_set = set() - no_grad_set = copy.copy(no_grad_set) + else: + no_grad_set = _get_no_grad_set_name(copy.copy(no_grad_set)) no_grad_dict = _get_stop_gradients_(program) # no_grad_set only contains vars in block 0 # Todo(liym27): support vars in sub block @@ -1501,12 +1544,15 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): Args: targets(Variable|list[Variable]): The target variables inputs(Variable|list[Variable]): The input variables - target_gradients (Variable|list[Variable]|None): The gradient variables + target_gradients (Variable|list[Variable], optional): The gradient variables of targets which has the same shape with targets, If None, ones will be created for them. - no_grad_set(set[string]): The names of variables that have no gradients - in Block 0. All variables with `stop_gradient=True` from all blocks - will be automatically added. + no_grad_set(set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients + should be ignored. All variables with + `stop_gradient=True` from all blocks will + be automatically added into this set. + If this parameter is not None, the Variables or Variable.names in this set will be added to the default set. + Default: None. 
Return: (list[Variable]): A list of gradients for inputs @@ -1532,7 +1578,8 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): if no_grad_set is None: no_grad_set = set() - no_grad_set = copy.copy(no_grad_set) + else: + no_grad_set = _get_no_grad_set_name(copy.copy(no_grad_set)) no_grad_dict = _get_stop_gradients_(prog) no_grad_dict[0].update(list(map(_append_grad_suffix_, no_grad_set))) @@ -1623,12 +1670,13 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None): Args: targets (Variable|list[Variable]): The target variables. inputs (Variable|list[Variable]): The input variables. - target_gradients (Variable|list[Variable]|None): The gradient variables + target_gradients (Variable|list[Variable], optional): The gradient variables of targets which has the same shape with targets, If None, ones will be created for them. - no_grad_set (set[string]): The names of variables that have no gradients - in Block 0. All variables with `stop_gradient=True` from all blocks - will be automatically added. + no_grad_set (set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients + should be ignored. All variables with `stop_gradient=True` from all blocks will + be automatically added into this set. If this parameter is not None, the Variables or Variable.names + in this set will be added to the default set. Default: None. Return: (list[Variable]): A list of gradients for inputs @@ -1640,7 +1688,7 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None): import paddle.fluid as fluid - x = fluid.layers.data(name='x', shape=[2,8,8], dtype='float32') + x = fluid.data(name='x', shape=[None,2,8,8], dtype='float32') x.stop_gradient=False y = fluid.layers.conv2d(x, 4, 1, bias_attr=False) y = fluid.layers.relu(y) diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index d0d69ae91a16b670494f1ec5310a029e52d8894b..ae2298e10a38fe5eb14c8b43cc022383e4bc7ab8 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -16,10 +16,10 @@ import os import re import logging import numpy as np -from ....executor import global_scope from .... import io from .... import core from .... import framework +from ....executor import global_scope, Executor from ....framework import IrGraph from ....log_helper import get_logger from .quantization_pass import QuantizationTransformPass @@ -27,12 +27,31 @@ from .quantization_pass import QuantizationFreezePass from .quantization_pass import AddQuantDequantPass from .quantization_pass import _op_real_in_out_name -__all__ = ['PostTrainingQuantization'] +__all__ = ['PostTrainingQuantization', 'WeightQuantization'] _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') +def _load_variable_data(scope, var_name): + ''' + Load variable value from scope + ''' + return np.array(scope.find_var(var_name).get_tensor()) + + +def _set_variable_data(scope, place, var_name, np_value): + ''' + Set the value of var node by name, if the node exits, + ''' + assert isinstance(np_value, np.ndarray), \ + 'The type of value should be numpy array.' 
+ var_node = scope.find_var(var_name) + if var_node is not None: + tensor = var_node.get_tensor() + tensor.set(np_value, place) + + class PostTrainingQuantization(object): def __init__(self, executor, @@ -297,12 +316,12 @@ class PostTrainingQuantization(object): ''' for var_name in self._quantized_weight_var_name: if var_name not in self._sampling_data: - var_tensor = self._load_var_value(var_name) + var_tensor = _load_variable_data(self._scope, var_name) self._sampling_data[var_name] = var_tensor if self._is_use_cache_file: for var_name in self._quantized_act_var_name: - var_tensor = self._load_var_value(var_name) + var_tensor = _load_variable_data(self._scope, var_name) var_tensor = var_tensor.ravel() save_path = os.path.join(self._cache_dir, var_name + "_" + str(iter) + ".npy") @@ -311,7 +330,7 @@ for var_name in self._quantized_act_var_name: if var_name not in self._sampling_data: self._sampling_data[var_name] = [] - var_tensor = self._load_var_value(var_name) + var_tensor = _load_variable_data(self._scope, var_name) var_tensor = var_tensor.ravel() self._sampling_data[var_name].append(var_tensor) @@ -397,11 +416,17 @@ # save scale factor to scale var node for key, val in self._quantized_var_scale_factor.items(): - self._set_var_node_value( - key + ".scale", np.array( + _set_variable_data( + self._scope, + self._place, + key + ".scale", + np.array( [val], dtype=np.float32)) - self._set_var_node_value( - key + ".quant_dequant.scale", np.array( + _set_variable_data( + self._scope, + self._place, + key + ".quant_dequant.scale", + np.array( [val], dtype=np.float32)) # apply QuantizationFreezePass, and obtain the final quant model @@ -430,23 +455,6 @@ self._quantized_var_scale_factor[ output_var_name]) - def _load_var_value(self, var_name): - ''' - Load variable value from scope - ''' - return np.array(self._scope.find_var(var_name).get_tensor()) - - def _set_var_node_value(self, var_node_name, np_value): - ''' - Set the value of var node by name, if the node exits, - ''' - assert isinstance(np_value, np.ndarray), \ - 'The type of value should be numpy array.' - var_node = self._scope.find_var(var_node_name) - if var_node != None: - tensor = var_node.get_tensor() - tensor.set(np_value, self._place) - def _is_input_all_not_persistable(self, op, persistable_var_names): ''' Analyze the real inputs of the op are all not persistable. @@ -566,3 +574,132 @@ tmp_sum1 += p_idx * (math.log(Q_sum * p_idx)) tmp_sum2 += p_idx * (math.log(P_sum * q_idx)) return (tmp_sum1 - tmp_sum2) / P_sum + + +class WeightQuantization(object): + _supported_quantizable_op_type = ['conv2d', 'depthwise_conv2d', 'mul'] + + def __init__(self, model_dir, model_filename=None, params_filename=None): + ''' + This class quantizes the weight of some ops to reduce the size of the model + or improve the performance. + + Args: + model_dir(str): The path of the fp32 model that will be quantized, + and the model and params files are under the path. + model_filename(str, optional): The name of file to load the inference + program. If it is None, the default filename '__model__' will + be used. Default is 'None'. + params_filename(str, optional): The name of file to load all parameters. + When all parameters were saved in a single binary file, set it + as the real filename. If parameters were saved in separate files, + set it as 'None'. Default is 'None'. 
+        '''
+        self._model_dir = model_dir
+        self._model_filename = model_filename
+        self._params_filename = params_filename
+
+    def quantize_weight_to_int(self,
+                               save_model_dir,
+                               save_model_filename=None,
+                               save_params_filename=None,
+                               quantizable_op_type=["conv2d", "mul"],
+                               quantize_weight_bits=8,
+                               threshold_rate=0.0):
+        '''
+        In order to reduce the size of the model, this API quantizes the
+        weights of some ops from float32 to int8/16. In the inference stage,
+        the quantized weight will be dequantized to float32 again.
+
+        Args:
+            save_model_dir(str): The path to save the quantized model.
+            save_model_filename(str, optional): The name of file to
+                save the inference program. If it is None, the default
+                filename '__model__' will be used. Default is None.
+            save_params_filename(str, optional): The name of file to
+                save all parameters. If it is None, parameters are
+                saved in separate files. If it is not None, all
+                parameters are saved in a single binary file.
+            quantizable_op_type(list[str], optional): The list of ops
+                that will be quantized, and the quantized ops should be
+                contained in ["conv2d", "depthwise_conv2d", "mul"].
+                Default is ["conv2d","mul"].
+            quantize_weight_bits(int, optional): The bits for the quantized
+                weight, and it should be 8 or 16. Default is 8.
+            threshold_rate(float, optional): This API uses the abs_max method
+                to quantize the weight from float32 to int8/16, and the abs_max
+                value is important for the quantization error. When the abs_max
+                value is far away from the center of the numerical distribution,
+                we can set threshold_rate between 1e-6 and 1e-8, so that outliers
+                are clipped to a threshold derived from the weight histogram.
+                Default is 0.0.
+        '''
+        for op_type in quantizable_op_type:
+            assert op_type in self._supported_quantizable_op_type, \
+                "input error:" + op_type + \
+                " is not supported for weight quantization."
+        assert quantize_weight_bits in [8, 16], \
+            "input error: quantize_weight_bits should be 8 or 16."
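+        # For n-bit signed symmetric quantization, the representable integer
+        # range is [-(2^(n-1) - 1), 2^(n-1) - 1], e.g. [-127, 127] for 8 bits.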
+        quantize_range = (1 << (quantize_weight_bits - 1)) - 1
+        save_weight_dtype = np.int8 if quantize_weight_bits == 8 else np.int16
+
+        place = core.CPUPlace()
+        exe = Executor(place)
+        scope = global_scope()
+        [program, feed_list, fetch_list] = \
+            io.load_inference_model(dirname=self._model_dir,
+                                    executor=exe,
+                                    model_filename=self._model_filename,
+                                    params_filename=self._params_filename)
+
+        persistable_var_names = []
+        for var in program.list_vars():
+            if var.persistable:
+                persistable_var_names.append(var.name)
+        for op in program.global_block().ops:
+            if op.type in quantizable_op_type:
+                for var_name in op.input_arg_names:
+                    if var_name in persistable_var_names:
+                        var_tensor_data = _load_variable_data(scope, var_name)
+                        # Use the plain abs_max as the threshold, unless a
+                        # histogram-based threshold is requested.
+                        if abs(threshold_rate) < 1e-10:
+                            threshold_value = np.max(np.abs(var_tensor_data))
+                        else:
+                            threshold_value = self._calculate_threshold(\
+                                var_tensor_data, threshold_rate)
+                        # Clip outliers, then quantize symmetrically into the
+                        # integer range.
+                        var_tensor_data[var_tensor_data >
+                                        threshold_value] = threshold_value
+                        var_tensor_data[var_tensor_data <
+                                        -threshold_value] = -threshold_value
+                        scale = threshold_value / quantize_range
+                        quantized_var_tensor_data = \
+                            np.around(var_tensor_data / scale)
+                        quantized_var_tensor_data = \
+                            quantized_var_tensor_data.astype(save_weight_dtype)
+                        _set_variable_data(scope, place, var_name,
+                                           quantized_var_tensor_data)
+                        # Record the scale on the op so inference can dequantize.
+                        op._set_attr(var_name + "_quant_scale", [scale])
+                        op._set_attr('quantize_weight_bits',
+                                     quantize_weight_bits)
+
+        io.save_inference_model(
+            dirname=save_model_dir,
+            feeded_var_names=feed_list,
+            target_vars=fetch_list,
+            executor=exe,
+            main_program=program,
+            model_filename=save_model_filename,
+            params_filename=save_params_filename)
+
+    def _calculate_threshold(self, input, threshold_rate, histogram_bins=5000):
+        # Return the smallest abs value whose cumulative histogram mass covers
+        # (1 - threshold_rate) of all weight magnitudes.
+        input_abs = np.abs(input)
+        hist, hist_edges = np.histogram(
+            input_abs, bins=histogram_bins, range=(0, np.max(input_abs)))
+        hist = hist / float(sum(hist))
+        hist_sum = 0
+        hist_index = 0
+        for i in range(len(hist)):
+            hist_sum += hist[i]
+            if hist_sum >= 1.0 - threshold_rate:
+                hist_index = i + 1
+                break
+        bin_width = hist_edges[1] - hist_edges[0]
+        return hist_index * bin_width
diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
index 65fd4984d95e9fdf98f86a23b06d52ef477ecd8b..8b80cac9018b71b3bffb5184e308f641dca08f18 100644
--- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
@@ -63,6 +63,7 @@ if(WIN32)
    list(REMOVE_ITEM TEST_OPS test_light_nas)
    list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mobilenetv1)
    list(REMOVE_ITEM TEST_OPS test_post_training_quantization_resnet50)
+    list(REMOVE_ITEM TEST_OPS test_weight_quantization_mobilenetv1)
endif()

# int8 image classification python api test
diff --git a/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6380adf6b63cffbcbcc7d5e75a86926e6bcde8b
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import time
+from paddle.dataset.common import download, DATA_HOME
+from paddle.fluid.contrib.slim.quantization import WeightQuantization
+
+
+class TestWeightQuantization(unittest.TestCase):
+    def setUp(self):
+        self.weight_quantization_dir = 'weight_quantization'
+        self.cache_folder = os.path.join(DATA_HOME,
+                                         self.weight_quantization_dir)
+
+    def download_model(self, model_name, data_url, data_md5):
+        download(data_url, self.weight_quantization_dir, data_md5)
+        file_name = data_url.split('/')[-1]
+        file_path = os.path.join(self.cache_folder, file_name)
+        print(model_name + ' is downloaded at ' + file_path)
+
+        unzipped_path = os.path.join(self.cache_folder, model_name)
+        self.cache_unzipping(unzipped_path, file_path)
+        print(model_name + ' is unzipped at ' + unzipped_path)
+        return unzipped_path
+
+    def cache_unzipping(self, target_folder, zip_path):
+        if not os.path.exists(target_folder):
+            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder,
+                                                          zip_path)
+            os.system(cmd)
+
+    def run_test(self, model_name, model_data_url, model_data_md5,
+                 quantize_weight_bits, quantizable_op_type, threshold_rate):
+
+        model_dir = self.download_model(model_name, model_data_url,
+                                        model_data_md5)
+
+        timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
+        save_model_dir = os.path.join(
+            os.getcwd(),
+            model_name + "_wq_" + str(quantize_weight_bits) + "_" + timestamp)
+        weight_quant = WeightQuantization(model_dir=model_dir + "/model")
+        weight_quant.quantize_weight_to_int(
+            save_model_dir=save_model_dir,
+            quantize_weight_bits=quantize_weight_bits,
+            quantizable_op_type=quantizable_op_type,
+            threshold_rate=threshold_rate)
+        print("finish weight quantization for " + model_name + "\n")
+
+        try:
+            os.system("rm -rf {}".format(save_model_dir))
+        except Exception as e:
+            print("Failed to delete {} due to {}".format(save_model_dir, str(
+                e)))
+
+
+class TestWeightQuantizationMobilenetv1(TestWeightQuantization):
+    model_name = "mobilenetv1"
+    model_data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz"
+    model_data_md5 = "13892b0716d26443a8cdea15b3c6438b"
+
+    def test_weight_quantization_mobilenetv1_8bit(self):
+        quantize_weight_bits = 8
+        quantizable_op_type = ['conv2d', 'depthwise_conv2d', 'mul']
+        threshold_rate = 0.0
+        self.run_test(self.model_name, self.model_data_url, self.model_data_md5,
+                      quantize_weight_bits, quantizable_op_type, threshold_rate)
+
+    def test_weight_quantization_mobilenetv1_16bit(self):
+        quantize_weight_bits = 16
+        quantizable_op_type = ['conv2d', 'depthwise_conv2d', 'mul']
+        threshold_rate = 1e-9
+        self.run_test(self.model_name, self.model_data_url, self.model_data_md5,
+                      quantize_weight_bits, quantizable_op_type, threshold_rate)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 99eb435ff0845aebb22c6b258f5d4d9967d178e0..24ffe8b4ae480db6176a9bb4e7ce813b45cd2dd1 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -23,7 +23,7 @@ from paddle.fluid.framework import Program, Variable, name_scope, default_main_p
from . import framework
from . import layers
from . import unique_name
-from .backward import append_backward, _some_in_set_, _append_grad_suffix_
+from .backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name
from .clip import append_gradient_clip_ops, error_clip_callback
from .framework import program_guard
from .initializer import Constant
@@ -592,7 +592,7 @@ class Optimizer(object):
            parameter_list (list, optional): List of ``Variable`` or ``Variable.name`` to update
                to minimize ``loss``. The default value is None, at this time all parameters
                will be updated.
-            no_grad_set (set, optional): Set of ``Variable`` objects that don't need
+            no_grad_set (set, optional): Set of ``Variable`` or ``Variable.name`` that don't need
                to be updated. The default value is None.
            callbacks (list, optional): list of callable objects to run when appending backward
                operator for one parameter. The default value is None.
@@ -705,14 +705,7 @@ class Optimizer(object):
        return optimize_ops

    def _get_no_grad_set(self, loss, no_grad_set=None):
-        if no_grad_set is None:
-            no_grad_set = set()
-        elif isinstance(no_grad_set, set) or isinstance(
-                no_grad_set, list) or isinstance(no_grad_set, tuple):
-            no_grad_set = set(no_grad_set)
-        else:
-            assert "no_grad_set should be a set, but the passed type is {}".format(
-                type(no_grad_set))
+        no_grad_set = _get_no_grad_set_name(no_grad_set)
        parameters = loss.block.program.global_block().all_parameters()
        param_no_trainable = set(
            [param.name for param in parameters if param.trainable is False])
@@ -770,7 +763,7 @@ class Optimizer(object):
            parameter_list (list, optional): List of ``Variable`` or ``Variable.name`` to update
                to minimize ``loss``. The default value is None, at this time all parameters
                will be updated.
-            no_grad_set (set, optional): Set of ``Variable`` objects that don't need
+            no_grad_set (set, optional): Set of ``Variable`` or ``Variable.name`` that don't need
                to be updated. The default value is None.
            grad_clip (GradClipBase, optional) : Gradient clipping strategy, static
                graph mode does not need to use this argument. Currently, this argument
@@ -3843,8 +3836,8 @@ class RecomputeOptimizer(Optimizer):
            loss (Variable): loss variable to run optimizations.
            startup_program (Program): startup_program for initializing parameters
                in `parameter_list`.
-            parameter_list (list): list of Variables to update.
-            no_grad_set (set|None): set of Variables should be ignored.
+            parameter_list (list): list of Variables or Variable.names to update.
+            no_grad_set (set|None): set of Variables or Variable.names that should be ignored.
            callbacks (list|None): list of callables to run when appending backward
                operator for one parameter.
            checkpoints (list): list of Variables as checkpoints
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index ed3b2469b42cdb705a82a4ce9df93deec746de75..2f118373ccebb804d17199c6c2caad3efd65e12b 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -200,6 +200,10 @@ if (APPLE OR WIN32)
    list(REMOVE_ITEM TEST_OPS test_imperative_signal_handler)
endif()

+if(NOT WITH_GPU OR WIN32 OR APPLE)
+    list(REMOVE_ITEM TEST_OPS test_build_strategy_fusion_group_pass)
+endif()
+
# Some ops need to check results when gc is enabled
# Currently, only ops that register NoNeedBufferVarsInference need to do this test
set(TEST_OPS_WITH_GC
diff --git a/python/paddle/fluid/tests/unittests/test_backward.py b/python/paddle/fluid/tests/unittests/test_backward.py
index ecf3c043f6b72055981fc10460b01efc4c6f8063..090ac1547f3b3a247587aac38243c83de4e2ee95 100644
--- a/python/paddle/fluid/tests/unittests/test_backward.py
+++ b/python/paddle/fluid/tests/unittests/test_backward.py
@@ -142,6 +142,21 @@ class TestBackward(unittest.TestCase):
        exe.run(startup)
        exe.run(feed=net.init_data())

+    def _check_error_no_grad_set(self, net, no_grad_set):
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+
+        main = fluid.Program()
+        startup = fluid.Program()
+
+        with fluid.program_guard(main, startup):
+            loss = net.build_model()
+            optimizer = fluid.optimizer.SGD(learning_rate=0.1)
+            optimizer.minimize(loss, no_grad_set=no_grad_set)
+        exe.run(startup)
+        exe.run(feed=net.init_data())
+

class SimpleNet(BackwardNet):
    def __init__(self):
@@ -233,12 +248,25 @@ class TestSimpleNetWithErrorParamList(TestBackward):
        # The type of parameter_list argument must be list or tuple
        with self.assertRaises(TypeError):
            self._check_error_param_list(self.net, "test")
-        # The type of parameter_list's member must be varable or str
+        # The type of parameter_list's member must be Variable or str
        test = fluid.data(name='test', shape=[None, 90], dtype='float32')
        with self.assertRaises(TypeError):
            self._check_error_param_list(self.net, [test, "test", 3])


+class TestSimpleNetWithErrorNoGradSet(TestBackward):
+    def test_no_grad_set_type_error(self):
+        self.global_block_idx = 0
+        self.net = SimpleNet()
+        # The type of no_grad_set argument must be set or list or tuple
+        with self.assertRaises(TypeError):
+            self._check_error_no_grad_set(self.net, "test")
+        # The type of no_grad_set's member must be Variable or str
+        test = fluid.data(name='test', shape=[None, 90], dtype='float32')
+        with self.assertRaises(TypeError):
+            self._check_error_no_grad_set(self.net, [test, "test", 3])
+
+
# TODO(Aurelius84): add conditional network test
class ConditionalNet(BackwardNet):
    def __init__(self):
diff --git a/python/paddle/fluid/tests/unittests/test_build_strategy_fusion_group_pass.py b/python/paddle/fluid/tests/unittests/test_build_strategy_fusion_group_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..1405bf9d70b0456145728169c25fde6808a24134
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_build_strategy_fusion_group_pass.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from test_eager_deletion_padding_rnn import RNNConfig, PaddingRNNTestBase
+
+
+class FusionGroupPaddingRNNTest(PaddingRNNTestBase):
+    def set_customed_config(self):
+        self.build_strategy.enable_auto_fusion = True
+
+        # Use CUDA executor
+        if core.is_compiled_with_cuda():
+            self.exe = fluid.Executor(fluid.CUDAPlace(0))
+
+    def test_train_enable_fusion_group(self):
+        rnn_model = "static"
+        config = RNNConfig("test", rnn_model)
+        with fluid.scope_guard(fluid.Scope()):
+            self.train(config, parallel=True, use_program_cache=False)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
index c0fd448d433979e4930a31e8118c388c76d81a94..6c0bb97bf6f14b2cef5050a99c02c7843b86be92 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
@@ -21,7 +21,6 @@ import numpy as np
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.layers as layers
-import time
import os

from paddle.fluid import ParamAttr
@@ -118,8 +117,7 @@ def lm_model(hidden_size,
             num_steps=20,
             init_scale=0.1,
             dropout=None,
-             rnn_model='static',
-             use_py_reader=False):
+             rnn_model='static'):
    def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
        weight_1_arr = []
        weight_2_arr = []
@@ -279,38 +277,9 @@ def lm_model(hidden_size,
                gate_input = layers.elementwise_add(gate_input, bias)
                i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)

-                try:
-                    from paddle.fluid.contrib.layers import fused_elemwise_activation
-                    # fluid.contrib.layers.fused_elemwise_activation can do a fused
-                    # operation, like:
-                    # 1) x + sigmoid(y); x + tanh(y)
-                    # 2) tanh(x + y)
-                    # Now the unary operation supported in this fused op is limit, and
-                    # we will extent this operation to support more unary operations and
-                    # do this kind of fusion automitically in future version of paddle.fluid.
-                    # layers.sigmoid(i) * layers.tanh(j)
-                    tmp0 = fused_elemwise_activation(
-                        x=layers.tanh(j),
-                        y=i,
-                        functor_list=['elementwise_mul', 'sigmoid'],
-                        save_intermediate_out=False)
-                    # pre_cell * layers.sigmoid(f)
-                    tmp1 = fused_elemwise_activation(
-                        x=pre_cell,
-                        y=f,
-                        functor_list=['elementwise_mul', 'sigmoid'],
-                        save_intermediate_out=False)
-                    c = tmp0 + tmp1
-                    # layers.tanh(c) * layers.sigmoid(o)
-                    m = fused_elemwise_activation(
-                        x=layers.tanh(c),
-                        y=o,
-                        functor_list=['elementwise_mul', 'sigmoid'],
-                        save_intermediate_out=False)
-                except ImportError:
-                    c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
-                        i) * layers.tanh(j)
-                    m = layers.tanh(c) * layers.sigmoid(o)
+                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
+                    i) * layers.tanh(j)
+                m = layers.tanh(c) * layers.sigmoid(o)

                hidden_array[k] = m
                cell_array[k] = c
@@ -342,23 +311,16 @@ def lm_model(hidden_size,
        return real_res, last_hidden, last_cell

    batch_size_each = batch_size
-    if use_py_reader:
-        feed_shapes = [[batch_size_each, num_steps, 1],
-                       [batch_size_each * num_steps, 1]]
-        py_reader = fluid.layers.py_reader(
-            capacity=16, shapes=feed_shapes, dtypes=['int64', 'int64'])
-        x, y = fluid.layers.read_file(py_reader)
-    else:
-        x = layers.data(
-            name="x",
-            shape=[batch_size_each, num_steps, 1],
-            dtype='int64',
-            append_batch_size=False)
-        y = layers.data(
-            name="y",
-            shape=[batch_size_each * num_steps, 1],
-            dtype='int64',
-            append_batch_size=False)
+    x = layers.data(
+        name="x",
+        shape=[batch_size_each, num_steps, 1],
+        dtype='int64',
+        append_batch_size=False)
+    y = layers.data(
+        name="y",
+        shape=[batch_size_each * num_steps, 1],
+        dtype='int64',
+        append_batch_size=False)

    init_hidden = layers.data(
        name="init_hidden",
@@ -472,10 +434,7 @@ def lm_model(hidden_size,
        layers.assign(input=last_hidden, output=init_hidden)

    feeding_list = ['x', 'y', 'init_hidden', 'init_cell']
-    if use_py_reader:
-        return loss, last_hidden, last_cell, feeding_list, py_reader
-    else:
-        return loss, last_hidden, last_cell, feeding_list
+    return loss, last_hidden, last_cell, feeding_list


class PaddingRNNTestBase(unittest.TestCase):
@@ -483,7 +442,29 @@ class PaddingRNNTestBase(unittest.TestCase):
        self.reader = Reader()
        self.device_count = 1

-    def prepare_program(self, config, parallel=True):
+        # The default exec_strategy used for PaddingRNN.
+        # You can change it in set_customed_config.
+        self.exec_strategy = fluid.ExecutionStrategy()
+        self.exec_strategy.num_threads = self.device_count
+        self.exec_strategy.num_iteration_per_drop_scope = 100
+
+        # The default build_strategy used for PaddingRNN.
+        # You can change it in set_customed_config.
+        self.build_strategy = fluid.BuildStrategy()
+        self.build_strategy.enable_inplace = True
+        self.build_strategy.memory_optimize = False
+        self.build_strategy.fuse_all_optimizer_ops = True
+
+        # A CPU executor is used for PaddingRNN by default.
+        # You can switch to a CUDA executor in set_customed_config.
+        self.exe = Executor(fluid.CPUPlace())
+
+    def set_customed_config(self):
+        # This function will be called before training.
+        # You can override it to set your own config.
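+        # For example, a subclass can turn on the fusion_group pass with:
+        #     self.build_strategy.enable_auto_fusion = True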
+        pass
+
+    def _prepare_program(self, config, parallel=True):
        self.main_program = fluid.Program()
        self.startup_program = fluid.Program()
        self.startup_program.random_seed = config.random_seed
@@ -497,8 +478,7 @@ class PaddingRNNTestBase(unittest.TestCase):
                    num_steps=config.num_steps,
                    init_scale=config.init_scale,
                    dropout=config.dropout,
-                    rnn_model=config.rnn_model,
-                    use_py_reader=False)
+                    rnn_model=config.rnn_model)
                self.loss, self.last_hidden, self.last_cell, self.feed_order = res_vars

                fluid.clip.set_gradient_clip(
@@ -515,28 +495,19 @@ class PaddingRNNTestBase(unittest.TestCase):
                optimizer = fluid.optimizer.SGD(
                    learning_rate=self.learning_rate)
                optimizer.minimize(self.loss)
-        self.exe = Executor(fluid.CPUPlace())
+
        self.exe.run(self.startup_program)

        if parallel:
-            exec_strategy = fluid.ExecutionStrategy()
-            exec_strategy.num_threads = self.device_count
-            exec_strategy.num_iteration_per_drop_scope = 100
-
-            build_strategy = fluid.BuildStrategy()
-            build_strategy.enable_inplace = True
-            build_strategy.memory_optimize = False
-            build_strategy.fuse_all_optimizer_ops = True
-
            self.train_program = fluid.compiler.CompiledProgram(
                self.main_program).with_data_parallel(
                    loss_name=self.loss.name,
-                    build_strategy=build_strategy,
-                    exec_strategy=exec_strategy)
+                    build_strategy=self.build_strategy,
+                    exec_strategy=self.exec_strategy)
        else:
            self.train_program = self.main_program

-    def generate_init_data(self):
+    def _generate_init_data(self):
        init_hidden = np.zeros(
            (self.config.num_layers, self.config.batch_size,
             self.config.hidden_size),
@@ -547,19 +518,19 @@ class PaddingRNNTestBase(unittest.TestCase):
            dtype='float32')
        return init_hidden, init_cell

-    def generate_new_lr(self, epoch_id=0, device_count=1):
+    def _generate_new_lr(self, epoch_id=0, device_count=1):
        new_lr = self.config.base_learning_rate * (self.config.lr_decay**max(
            epoch_id + 1 - self.config.epoch_start_decay, 0.0))
        lr = np.ones((self.device_count), dtype='float32') * new_lr
        return lr

-    def prepare_input(self,
-                      batch,
-                      init_hidden=None,
-                      init_cell=None,
-                      epoch_id=0,
-                      with_lr=True,
-                      device_count=1):
+    def _prepare_input(self,
+                       batch,
+                       init_hidden=None,
+                       init_cell=None,
+                       epoch_id=0,
+                       with_lr=True,
+                       device_count=1):
        x, y = batch
        x = x.reshape((-1, self.config.num_steps, 1))
        y = y.reshape((-1, 1))
@@ -572,19 +543,19 @@ class PaddingRNNTestBase(unittest.TestCase):
        if init_cell is not None:
            res['init_cell'] = init_cell
        if with_lr:
-            res['learning_rate'] = self.generate_new_lr(epoch_id, device_count)
+            res['learning_rate'] = self._generate_new_lr(epoch_id, device_count)
        return res

-    def train_an_epoch(self, epoch_id, batch_times, use_program_cache=True):
+    def _train_an_epoch(self, epoch_id, use_program_cache=True):
        train_data_iter = self.reader.get_data_iter(self.config)

        total_loss = 0
        iters = 0
-        init_hidden, init_cell = self.generate_init_data()
+        init_hidden, init_cell = self._generate_init_data()
        ppl = np.zeros(shape=(0))
        for batch_id, batch in enumerate(train_data_iter):
-            input_data_feed = self.prepare_input(
+            input_data_feed = self._prepare_input(
                batch,
                init_hidden=init_hidden,
                init_cell=init_cell,
@@ -592,7 +563,6 @@ class PaddingRNNTestBase(unittest.TestCase):
                with_lr=True,
                device_count=self.device_count)

-            batch_start_time = time.time()
            fetch_outs = self.exe.run(self.train_program,
                                      feed=input_data_feed,
                                      fetch_list=[
                                          self.loss.name, "learning_rate",
                                          self.last_hidden.name,
                                          self.last_cell.name
                                      ],
                                      use_program_cache=use_program_cache)
-            batch_time = time.time() - batch_start_time
-            batch_times.append(batch_time)

            cost_train = np.array(fetch_outs[0])
            lr = np.array(fetch_outs[1])
@@ -617,17 +585,13 @@ class PaddingRNNTestBase(unittest.TestCase):
        return ppl

    def train(self, config, parallel=True, use_program_cache=True):
+        self.set_customed_config()
+
        self.config = config
-        self.prepare_program(config, parallel)
-        total_time = 0.0
+        self._prepare_program(config, parallel)
        ppl = np.zeros(shape=(0, config.batch_size))
        for epoch_id in range(config.max_epoch):
-            batch_times = []
-            epoch_start_time = time.time()
-            train_ppl = self.train_an_epoch(epoch_id, batch_times,
-                                            use_program_cache)
-            epoch_time = time.time() - epoch_start_time
-            total_time += epoch_time
+            train_ppl = self._train_an_epoch(epoch_id, use_program_cache)
            ppl = np.append(ppl, train_ppl)
        return ppl
diff --git a/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py
index b7ebfc6b9ff7b3b68d2dc64cd1371b0d6d05f4b6..d756394535a9e5ef154b437c5f2764c6af06ff26 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py
@@ -55,7 +55,7 @@ class TestFusedEmbeddingSeqPoolOp(OpTest):
        if ver.mkl() == "ON" and 'Linux' in platform.platform():
            self.attrs = {'is_sparse': False}
            self.check_grad(
-                ['W'], 'Out', no_grad_set=('Ids'), check_dygraph=False)
+                ['W'], 'Out', no_grad_set=['Ids'], check_dygraph=False)


class TestLookupTableOpWithPadding(TestFusedEmbeddingSeqPoolOp):
@@ -89,7 +89,7 @@ class TestLookupTableOpWithPadding(TestFusedEmbeddingSeqPoolOp):
        self.attrs = {'padding_idx': int(padding_idx), 'is_sparse': False}
        # TODO(wangzhongpu): support lod in dygraph mode
        self.check_grad(
-            ['W'], 'Out', no_grad_set=('Ids'), check_dygraph=False)
+            ['W'], 'Out', no_grad_set=['Ids'], check_dygraph=False)


class TestFusedEmbeddingSeqPoolApi(unittest.TestCase):