diff --git a/cmake/flags.cmake b/cmake/flags.cmake index e0556a0babc74ba6efa0a190d4f7b77416bef3bf..331b2af367bdf261ffbf96fb88f61cc6958ee647 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -27,7 +27,6 @@ endfunction() CheckCompilerCXX11Flag() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - # safe_set_flag # # Set a compile flag only if compiler is support @@ -71,6 +70,20 @@ macro(safe_set_nvflag flag_name) endif() endmacro() +macro(safe_set_static_flag) # set c_flags and cxx_flags to static or shared + if (BUILD_SHARED_LIBS) + return() # if build shared libs, the flags keep same with '/MD' + endif(BUILD_SHARED_LIBS) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +endmacro() CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS) if(NOT UINT64_MAX_EXISTS) @@ -97,9 +110,13 @@ SET(CMAKE_EXTRA_INCLUDE_FILES "") # Common flags. the compiler flag used for C/C++ sources whenever release or debug # Do not care if this flag is support for gcc. + +# https://github.com/PaddlePaddle/Paddle/issues/12773 +if (NOT WIN32) set(COMMON_FLAGS -fPIC -fno-omit-frame-pointer + -Werror -Wall -Wextra -Wnon-virtual-dtor @@ -114,11 +131,6 @@ set(COMMON_FLAGS -Wno-error=terminate # Warning in PADDLE_ENFORCE ) -# https://github.com/PaddlePaddle/Paddle/issues/12773 -if (NOT WIN32) -list(APPEND COMMON_FLAGS -Werror) -endif() - set(GPU_COMMON_FLAGS -fPIC -fno-omit-frame-pointer @@ -133,30 +145,53 @@ set(GPU_COMMON_FLAGS -Wno-error=array-bounds # Warnings in Eigen::array ) +else(NOT WIN32) +set(COMMON_FLAGS + "/w") #disable all warnings. +set(GPU_COMMON_FLAGS + "/w") #disable all warnings +endif(NOT WIN32) + if (APPLE) if(NOT CMAKE_CROSSCOMPILING) # On Mac OS X build fat binaries with x86_64 architectures by default. set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) endif() -else() +endif(APPLE) + +if(LINUX) set(GPU_COMMON_FLAGS -Wall -Wextra -Werror ${GPU_COMMON_FLAGS}) -endif() +endif(LINUX) if(UNIX AND NOT APPLE) # except apple from nix*Os family set(LINUX TRUE) endif(UNIX AND NOT APPLE) - foreach(flag ${COMMON_FLAGS}) safe_set_cflag(CMAKE_C_FLAGS ${flag}) safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag}) + endforeach() foreach(flag ${GPU_COMMON_FLAGS}) safe_set_nvflag(${flag}) endforeach() + +if(WIN32) +# windows build turn off warnings. +safe_set_static_flag() + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/W3") + string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/W3") + endforeach(flag_var) +endif(WIN32) diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index 0f9521616952a2857222feab8c38fb480761ee2d..a777a4974cc377db103a470698f817612a4e9a32 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -1,11 +1,9 @@ add_custom_target(paddle_apis ALL - DEPENDS paddle_v2_apis paddle_fluid_apis) + DEPENDS paddle_v2_apis) add_custom_target(paddle_docs ALL DEPENDS paddle_v2_docs paddle_v2_docs_cn - paddle_fluid_docs paddle_fluid_docs_cn paddle_mobile_docs paddle_mobile_docs_cn) add_subdirectory(v2) -add_subdirectory(fluid) add_subdirectory(mobile) diff --git a/paddle/contrib/float16/float16_transpiler.py b/paddle/contrib/float16/float16_transpiler.py index 66e0345c299730c113ffbdc8dd3c1fa32f872f3d..8d95dc0591e1d6bd815cc697528191c2ee8c1cfe 100644 --- a/paddle/contrib/float16/float16_transpiler.py +++ b/paddle/contrib/float16/float16_transpiler.py @@ -102,8 +102,8 @@ class Float16Transpiler: continue for input_arg in current_op.input_arg_names: if input_arg in self.input_map: - current_op.rename_input(input_arg, - self.input_map[input_arg]) + current_op._rename_input(input_arg, + self.input_map[input_arg]) def _remove_unused_var(self): ''' @@ -187,7 +187,7 @@ class Float16Transpiler: shape=var.shape, persistable=var.persistable) find_op(var) - var.op.rename_output(var_name, tmp_var_name) + var.op._rename_output(var_name, tmp_var_name) self.block._insert_op( i, type="cast", diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 3edfc7cdb02db2e8dfc0a2b431f0db030ca70543..f0b4c2028438ea19b39da06f08a226f1ca1e8a4d 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -6,26 +6,9 @@ paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords= paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) -paddle.fluid.Operator.__init__ ArgSpec(args=['self', 'block', 'desc', 'type', 'inputs', 'outputs', 'attrs'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.block_attr_id ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.blocks_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.blocks_attr_ids ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.output ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.rename_input ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.rename_output ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None) paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) -paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.name_scope ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) @@ -286,7 +269,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) -paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1)) +paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index aa95d3e9f6c8221f6e48d192b73ad5135539dc75..f5c286486520391906a6cd7545041c8a7df614ea 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -77,10 +77,12 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, const std::string BatchedCellPreAct = patterns::UniqueKey("BatchedCellPreAct"); const std::string BatchedGate = patterns::UniqueKey("BatchedGate"); + const std::string CheckedCell = patterns::UniqueKey("CheckedCell"); scope->Var(BatchedInput)->GetMutable(); scope->Var(BatchedCellPreAct)->GetMutable(); scope->Var(BatchedGate)->GetMutable(); + scope->Var(CheckedCell)->GetMutable(); op_desc.SetInput("H0", {}); op_desc.SetInput("C0", {}); @@ -90,6 +92,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, op_desc.SetOutput("BatchedGate", {BatchedGate}); op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct}); op_desc.SetOutput("BatchedInput", {BatchedInput}); + op_desc.SetOutput("CheckedCell", {CheckedCell}); op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse")); op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes")); // TODO(TJ): get from attr diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc index 8f548913e4e1d9d5bc5bdace8b92db9065cf3b5e..084a4ba2def87eaa8badb3ca2c39865c6e5cb981 100644 --- a/paddle/fluid/framework/ir/graph_traits.cc +++ b/paddle/fluid/framework/ir/graph_traits.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/framework/ir/graph_traits.h" +#include + namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index 06cf4a0f9f33af67343437baeb9623a35ddad183..19e5c2c73eac74dee030a4f7820531800f737e4e 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -38,31 +38,27 @@ struct OpInfo { OpAttrChecker* checker_{nullptr}; InferVarTypeFN infer_var_type_; InferShapeFN infer_shape_; - std::string op_type_; bool HasOpProtoAndChecker() const { return proto_ != nullptr && checker_ != nullptr; } const proto::OpProto& Proto() const { - PADDLE_ENFORCE_NOT_NULL(proto_, "Operator %s Proto has not been registered", - op_type_); + PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered"); PADDLE_ENFORCE(proto_->IsInitialized(), - "Operator %s Proto must be initialized in op info", - op_type_); + "Operator Proto must be initialized in op info"); return *proto_; } const OpCreator& Creator() const { - PADDLE_ENFORCE_NOT_NULL( - creator_, "Operator %s Creator has not been registered", op_type_); + PADDLE_ENFORCE_NOT_NULL(creator_, + "Operator Creator has not been registered"); return creator_; } const GradOpMakerFN& GradOpMaker() const { PADDLE_ENFORCE_NOT_NULL(grad_op_maker_, - "Operator %s GradOpMaker has not been registered.", - op_type_); + "Operator GradOpMaker has not been registered."); return grad_op_maker_; } @@ -77,9 +73,8 @@ class OpInfoMap { return map_.find(op_type) != map_.end(); } - void Insert(const std::string& type, OpInfo info) { + void Insert(const std::string& type, const OpInfo& info) { PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type); - info.op_type_ = type; map_.insert({type, info}); } diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc index 5ca864cfdf7176850dd31dd42ef3306061a742cf..928e1ad8b9168e61ddc5782066a4aa29a4296a94 100644 --- a/paddle/fluid/framework/selected_rows_test.cc +++ b/paddle/fluid/framework/selected_rows_test.cc @@ -27,8 +27,11 @@ class SelectedRowsTester : public ::testing::Test { selected_rows_.reset(new SelectedRows(rows, height)); Tensor* value = selected_rows_->mutable_value(); - value->mutable_data( + auto* data = value->mutable_data( make_ddim({static_cast(rows.size()), row_numel}), place_); + for (int64_t i = 0; i < value->numel(); ++i) { + data[i] = static_cast(i); + } } protected: @@ -60,6 +63,10 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) { ASSERT_EQ(selected_rows_->height(), dst_tensor.height()); ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims()); ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims()); + auto* dst_data = dst_tensor.value().data(); + for (int64_t i = 0; i < dst_tensor.value().numel(); ++i) { + ASSERT_EQ(dst_data[i], static_cast(i)); + } } TEST(SelectedRows, SparseTable) { diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 3b5be7f3ee33c73a9704bafa9f1b736c8a3cd9ea..f90910ac0d0a897ef01d4ca2bd0bca575baf4c40 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -37,12 +37,16 @@ TEST(Analyzer, analysis_without_tensorrt) { TEST(Analyzer, analysis_with_tensorrt) { FLAGS_IA_enable_tensorrt_subgraph_engine = true; Argument argument; + argument.Set("minimum_subgraph_size", new int(0)); + argument.Set("max_batch_size", new int(3)); + argument.Set("workspace_size", new int(1 << 20)); + argument.Set("precision_mode", new std::string("FP32")); argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); Analyzer analyser; analyser.Run(&argument); } -void TestWord2vecPrediction(const std::string &model_path) { +void TestWord2vecPrediction(const std::string& model_path) { NativeConfig config; config.model_dir = model_path; config.use_gpu = false; @@ -73,8 +77,8 @@ void TestWord2vecPrediction(const std::string &model_path) { // The outputs' buffers are in CPU memory. for (size_t i = 0; i < std::min(5UL, num_elements); i++) { LOG(INFO) << "data: " - << static_cast(outputs.front().data.data())[i]; - PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], + << static_cast(outputs.front().data.data())[i]; + PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], result[i]); } } diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index 5652940ec6d4cc7ba9a1d3a3e65f7dca1690d8c4..cb549f4b50cf56154a951d16b58b022dbad3e990 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -97,8 +97,10 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) { } } -void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, +void CreateTrtEngineOp(Node *node, Argument *argument, framework::proto::BlockDesc *block) { + PADDLE_ENFORCE(argument->main_dfg.get()); + const DataFlowGraph &graph = *(argument->main_dfg); static int counter{0}; PADDLE_ENFORCE(node->IsFunctionBlock()); framework::OpDesc desc; @@ -204,7 +206,10 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc"); // Set attrs + SetAttr(desc.Proto(), "subgraph", block->SerializeAsString()); + SetAttr(desc.Proto(), "max_batch_size", argument->Get("max_batch_size")); + SetAttr(desc.Proto(), "workspace_size", argument->Get("workspace_size")); SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++)); SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes())); SetAttr(desc.Proto(), "output_name_mapping", output_mapping); @@ -248,7 +253,7 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) { *block_desc.Proto()->mutable_vars() = argument_->origin_program_desc->blocks(0).vars(); PADDLE_ENFORCE(!block_desc.Proto()->vars().empty()); - CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto()); + CreateTrtEngineOp(node, argument_, block_desc.Proto()); auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); auto *op = main_block->add_ops(); PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block"); diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index b879067d2f2f6294c50e0adb21f9399a7c36698a..526bbbadfe90c3064d7c620cc22e30f7fef99088 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -309,6 +309,8 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); } void SubGraphFuse::ReplaceNodesWithSubGraphs() { auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)(); for (auto &subgraph : subgraphs) { + if (subgraph.size() <= argument_->Get("minimum_subgraph_size")) + continue; std::unordered_set subgraph_uniq(subgraph.begin(), subgraph.end()); // replace this sub-graph with the first node. Two steps: 1. Create a Block // Node that contains this subgraph 2. Mark the nodes inside the sub-graph diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.h b/paddle/fluid/inference/analysis/subgraph_splitter.h index a31afbe6933da8d3c7a88142cc12d63b98b55796..76e4fda0249e03c617d1b37c079dcd97f21387c1 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.h +++ b/paddle/fluid/inference/analysis/subgraph_splitter.h @@ -20,6 +20,7 @@ limitations under the License. */ #include +#include "paddle/fluid/inference/analysis/argument.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/node.h" @@ -63,8 +64,11 @@ class SubGraphFuse { public: using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller; - SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller) - : graph_(graph), node_inside_subgraph_teller_(teller) {} + SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller, + Argument *argument) + : graph_(graph), + node_inside_subgraph_teller_(teller), + argument_(argument) {} // The main method which run all the logic. void operator()(); @@ -76,6 +80,7 @@ class SubGraphFuse { private: DataFlowGraph *graph_; NodeInsideSubgraphTeller node_inside_subgraph_teller_; + Argument *argument_; }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc index 531a170512f727d891aa6644ee08a60c25f16876..e1dc89fab5fb76d456b07c316ab1cabe6de23b26 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc @@ -66,10 +66,12 @@ TEST(SubGraphSplitter, Split) { TEST(SubGraphSplitter, Fuse) { auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); + Argument argument; + argument.Set("minimum_subgraph_size", new int(3)); size_t count0 = dfg.nodes.size(); - SubGraphFuse fuse(&dfg, teller); + SubGraphFuse fuse(&dfg, teller, &argument); fuse(); int count1 = 0; diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc index faf876de6d65d20cf7a084cd97392cfc8d791a42..cc1746ecb34c983d219693bcec17c8789c38fa9f 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc @@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass( : node_inside_subgraph_teller_(teller) {} void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { - SubGraphFuse(graph, node_inside_subgraph_teller_)(); + SubGraphFuse(graph, node_inside_subgraph_teller_, argument_)(); VLOG(4) << "debug info " << graph->HumanReadableInfo(false /*show_values*/, true /*show_functions*/); diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h index 219e3f5470f627e81005aabf94f9c72c33fd2eed..3545da9109d79964f36c3d7e738620cc2e0f9a6c 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h @@ -33,7 +33,10 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller); - bool Initialize(Argument* argument) override { return true; } + bool Initialize(Argument* argument) override { + argument_ = argument; + return true; + } // This class get a sub-graph as input and determine whether to transform this // sub-graph into TensorRT. @@ -46,6 +49,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { private: NodeInsideSubgraphTeller node_inside_subgraph_teller_; + Argument* argument_; }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc index 67a5af83d89b771536ea11be51b35244ff5c09d6..9748e24b06295a4e7c2995429e6588cd0f225fe6 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc @@ -36,6 +36,10 @@ TEST(TensorRTSubGraphPass, main) { }; Argument argument(FLAGS_inference_model_dir); + argument.Set("minimum_subgraph_size", new int(0)); + argument.Set("max_batch_size", new int(3)); + argument.Set("workspace_size", new int(1 << 20)); + argument.Set("precision_mode", new std::string("FP32")); DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"}; DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"}; diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index 6c7e63971b2d93f58e219dbd93637c8d389deb7c..5ee6a5a93168f58770067f76ca7f6bb6f67b2965 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -35,8 +35,6 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { bool Init(const std::shared_ptr& parent_scope) { FLAGS_IA_enable_tensorrt_subgraph_engine = true; VLOG(3) << "Predictor::init()"; - FLAGS_tensorrt_max_batch_size = config_.max_batch_size; - FLAGS_tensorrt_workspace_size = config_.workspace_size; if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); } else { @@ -92,6 +90,14 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { void OptimizeInferenceProgram() { // Analyze inference_program Argument argument; + + argument.Set("minimum_subgraph_size", + new int(config_.minimum_subgraph_size)); + argument.Set("max_batch_size", new int(config_.max_batch_size)); + argument.Set("workspace_size", new int(config_.workspace_size)); + argument.Set("precision_mode", + new std::string(config_.precision_mode)); + if (!config_.model_dir.empty()) { argument.fluid_model_dir.reset(new std::string(config_.model_dir)); } else { diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 2b4e5ed73704041e18bdbce32338405f3601e082..01ea0d9c3ad37b3bcebe6853de77373810333776 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -194,6 +194,14 @@ struct MixedRTConfig : public NativeConfig { // For workspace_size, refer it from here: // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting int workspace_size{1 << 30}; + // We transform the Ops that can be converted into TRT layer in the model, + // and aggregate these Ops into subgraphs for TRT execution. + // We set this variable to control the minimum number of nodes in the + // subgraph, 3 as default value. + int minimum_subgraph_size = 3; + // Reserved configuration + // We just support "FP32" now, "FP16" and "INT8" will be supported. + std::string precision_mode = "FP32"; }; // NOTE WIP, not stable yet. diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index d7ab2ac980af2cf3bd9d95bfdbfa1887ef9a64d7..70f9e397c96cf3fe92779778950f3df71b5a67c9 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -90,3 +90,13 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI DEPS inference_anakin_api_shared dynload_cuda SERIAL) endif() endif() + +if(WITH_GPU AND TENSORRT_FOUND) + set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt") + if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}) + inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz") + endif() + cc_test(test_trt_models SRCS trt_models_tester.cc + ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models + DEPS paddle_inference_tensorrt_subgraph_engine) +endif() diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..bf320a0cbc2fff5f973c48768281e26d0fde232b --- /dev/null +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace paddle { +using paddle::contrib::MixedRTConfig; + +DEFINE_string(dirname, "", "Directory of the inference model."); + +NativeConfig GetConfigNative() { + NativeConfig config; + config.model_dir = FLAGS_dirname; + // LOG(INFO) << "dirname " << config.model_dir; + config.fraction_of_gpu_memory = 0.45; + config.use_gpu = true; + config.device = 0; + return config; +} + +MixedRTConfig GetConfigTRT() { + MixedRTConfig config; + config.model_dir = FLAGS_dirname; + config.use_gpu = true; + config.fraction_of_gpu_memory = 0.2; + config.device = 0; + config.max_batch_size = 3; + return config; +} + +void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) { + NativeConfig config0 = GetConfigNative(); + config0.model_dir = model_dirname; + + MixedRTConfig config1 = GetConfigTRT(); + config1.model_dir = model_dirname; + config1.max_batch_size = batch_size; + + auto predictor0 = + CreatePaddlePredictor(config0); + auto predictor1 = + CreatePaddlePredictor(config1); + // Prepare inputs + int height = 224; + int width = 224; + float *data = new float[batch_size * 3 * height * width]; + memset(data, 0, sizeof(float) * (batch_size * 3 * height * width)); + data[0] = 1.0f; + + // Prepare inputs + PaddleTensor tensor; + tensor.name = "input_0"; + tensor.shape = std::vector({batch_size, 3, height, width}); + tensor.data = PaddleBuf(static_cast(data), + sizeof(float) * (batch_size * 3 * height * width)); + tensor.dtype = PaddleDType::FLOAT32; + std::vector paddle_tensor_feeds(1, tensor); + + // Prepare outputs + std::vector outputs0; + std::vector outputs1; + CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0)); + + CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size)); + + // Get output. + ASSERT_EQ(outputs0.size(), 1UL); + ASSERT_EQ(outputs1.size(), 1UL); + + const size_t num_elements = outputs0.front().data.length() / sizeof(float); + const size_t num_elements1 = outputs1.front().data.length() / sizeof(float); + EXPECT_EQ(num_elements, num_elements1); + + auto *data0 = static_cast(outputs0.front().data.data()); + auto *data1 = static_cast(outputs1.front().data.data()); + + ASSERT_GT(num_elements, 0UL); + for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) { + EXPECT_NEAR(data0[i], data1[i], 1e-3); + } +} + +TEST(trt_models_test, main) { + std::vector infer_models = {"mobilenet", "resnet50", + "resnext50"}; + for (auto &model_dir : infer_models) { + CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + model_dir); + } +} +} // namespace paddle diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc index dfaa7456f917c1308984b361afed752f96ea6f59..0784920064a879963cd9725cd9acf4cec7b874ce 100644 --- a/paddle/fluid/operators/auc_op.cc +++ b/paddle/fluid/operators/auc_op.cc @@ -36,11 +36,16 @@ class AucOp : public framework::OperatorWithKernel { "Out and Label should have same height."); int num_pred_buckets = ctx->Attrs().Get("num_thresholds") + 1; + int slide_steps = ctx->Attrs().Get("slide_steps"); + + PADDLE_ENFORCE_GE(num_pred_buckets, 1, "num_thresholds must larger than 1"); + PADDLE_ENFORCE_GE(slide_steps, 0, "slide_steps must be natural number"); ctx->SetOutputDim("AUC", {1}); - ctx->SetOutputDim("BatchAUC", {1}); - ctx->SetOutputDim("StatPosOut", {num_pred_buckets}); - ctx->SetOutputDim("StatNegOut", {num_pred_buckets}); + + slide_steps = slide_steps == 0 ? 1 : slide_steps; + ctx->SetOutputDim("StatPosOut", {slide_steps, num_pred_buckets}); + ctx->SetOutputDim("StatNegOut", {slide_steps, num_pred_buckets}); } protected: @@ -62,6 +67,7 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Label", "A 2D int tensor indicating the label of the training data. " "shape: [batch_size, 1]"); + // TODO(typhoonzero): support weight input AddInput("StatPos", "Statistic value when label = 1"); AddInput("StatNeg", "Statistic value when label = 0"); @@ -69,18 +75,19 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("AUC", "A scalar representing the " "current area-under-the-curve."); - AddOutput("BatchAUC", "The AUC for current batch"); + AddOutput("StatPosOut", "Statistic value when label = 1"); AddOutput("StatNegOut", "Statistic value when label = 0"); AddAttr("curve", "Curve type, can be 'ROC' or 'PR'.") .SetDefault("ROC"); - AddAttr("num_thresholds", - "The number of thresholds to use when discretizing the" - " roc curve.") + AddAttr( + "num_thresholds", + "The number of thresholds to use when discretizing the roc curve.") .SetDefault((2 << 12) - 1); - + AddAttr("slide_steps", "Use slide steps to calc batch auc.") + .SetDefault(1); AddComment(R"DOC( Area Under The Curve (AUC) Operator. diff --git a/paddle/fluid/operators/auc_op.h b/paddle/fluid/operators/auc_op.h index fb0517d70635e090f8c5b59ff9d8420fc34c747b..fb370842d1942c3b3eebecb1fe5e8ffb845cb34b 100644 --- a/paddle/fluid/operators/auc_op.h +++ b/paddle/fluid/operators/auc_op.h @@ -32,7 +32,9 @@ class AucKernel : public framework::OpKernel { std::string curve = ctx.Attr("curve"); int num_thresholds = ctx.Attr("num_thresholds"); + // buckets contain numbers from 0 to num_thresholds int num_pred_buckets = num_thresholds + 1; + int slide_steps = ctx.Attr("slide_steps"); // Only use output var for now, make sure it's persistable and // not cleaned up for each batch. @@ -40,16 +42,19 @@ class AucKernel : public framework::OpKernel { auto *stat_pos = ctx.Output("StatPosOut"); auto *stat_neg = ctx.Output("StatNegOut"); - auto *stat_pos_data = stat_pos->mutable_data(ctx.GetPlace()); - auto *stat_neg_data = stat_neg->mutable_data(ctx.GetPlace()); - calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds, - auc); + auto *origin_stat_pos = stat_pos->mutable_data(ctx.GetPlace()); + auto *origin_stat_neg = stat_neg->mutable_data(ctx.GetPlace()); - auto *batch_auc = ctx.Output("BatchAUC"); - std::vector stat_pos_batch(num_pred_buckets, 0); - std::vector stat_neg_batch(num_pred_buckets, 0); - calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(), - num_thresholds, batch_auc); + std::vector stat_pos_data(num_pred_buckets, 0); + std::vector stat_neg_data(num_pred_buckets, 0); + + auto stat_pos_calc = stat_pos_data.data(); + auto stat_neg_calc = stat_neg_data.data(); + + statAuc(label, predict, num_pred_buckets, num_thresholds, slide_steps, + origin_stat_pos, origin_stat_neg, &stat_pos_calc, &stat_neg_calc); + + calcAuc(ctx, stat_pos_calc, stat_neg_calc, num_thresholds, auc); } private: @@ -58,29 +63,76 @@ class AucKernel : public framework::OpKernel { return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; } - inline static void calcAuc(const framework::ExecutionContext &ctx, - const framework::Tensor *label, + inline static void statAuc(const framework::Tensor *label, const framework::Tensor *predict, - int64_t *stat_pos, int64_t *stat_neg, - int num_thresholds, - framework::Tensor *auc_tensor) { + const int num_pred_buckets, + const int num_thresholds, const int slide_steps, + int64_t *origin_stat_pos, int64_t *origin_stat_neg, + int64_t **stat_pos, int64_t **stat_neg) { size_t batch_size = predict->dims()[0]; size_t inference_width = predict->dims()[1]; const T *inference_data = predict->data(); const auto *label_data = label->data(); - auto *auc = auc_tensor->mutable_data(ctx.GetPlace()); - for (size_t i = 0; i < batch_size; i++) { uint32_t binIdx = static_cast( inference_data[i * inference_width + 1] * num_thresholds); if (label_data[i]) { - stat_pos[binIdx] += 1.0; + (*stat_pos)[binIdx] += 1.0; } else { - stat_neg[binIdx] += 1.0; + (*stat_neg)[binIdx] += 1.0; } } + int bucket_length = num_pred_buckets * sizeof(int64_t); + + // will stat auc unlimited. + if (slide_steps == 0) { + for (int slide = 0; slide < num_pred_buckets; ++slide) { + origin_stat_pos[slide] += (*stat_pos)[slide]; + origin_stat_neg[slide] += (*stat_neg)[slide]; + } + + *stat_pos = origin_stat_pos; + *stat_neg = origin_stat_neg; + + } else { + for (int slide = 1; slide < slide_steps; ++slide) { + int dst_idx = (slide - 1) * num_pred_buckets; + int src_inx = slide * num_pred_buckets; + std::memcpy(origin_stat_pos + dst_idx, origin_stat_pos + src_inx, + bucket_length); + std::memcpy(origin_stat_neg + dst_idx, origin_stat_neg + src_inx, + bucket_length); + } + + std::memcpy(origin_stat_pos + (slide_steps - 1) * num_pred_buckets, + *stat_pos, bucket_length); + std::memcpy(origin_stat_neg + (slide_steps - 1) * num_pred_buckets, + *stat_neg, bucket_length); + + std::memset(*stat_pos, 0, bucket_length); + std::memset(*stat_neg, 0, bucket_length); + + for (int slide = 0; slide < num_pred_buckets; ++slide) { + int stat_pos_steps = 0; + int stat_neg_steps = 0; + for (int step = 0; step < slide_steps; ++step) { + stat_pos_steps += origin_stat_pos[slide + step * num_pred_buckets]; + stat_neg_steps += origin_stat_neg[slide + step * num_pred_buckets]; + } + (*stat_pos)[slide] += stat_pos_steps; + (*stat_neg)[slide] += stat_neg_steps; + } + } + } + + inline static void calcAuc(const framework::ExecutionContext &ctx, + int64_t *stat_pos, int64_t *stat_neg, + int num_thresholds, + framework::Tensor *auc_tensor) { + auto *auc = auc_tensor->mutable_data(ctx.GetPlace()); + *auc = 0.0f; double totPos = 0.0; @@ -96,7 +148,6 @@ class AucKernel : public framework::OpKernel { totPos += stat_pos[idx]; totNeg += stat_neg[idx]; *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev); - --idx; } diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index 8ca79d20ec4f6412b00dbf3990068f81b65e2efd..23e8edd18d037a7f9127482951f25be3abf1b62f 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -76,12 +76,18 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); PADDLE_ENFORCE_EQ(b_dims[0], 1, "The first dimension of Input(Bias) should be 1."); - PADDLE_ENFORCE_EQ( - b_dims[1], (ctx->Attrs().Get("use_peepholes") ? 7 : 4) * frame_size, - "The second dimension of Input(Bias) should be " - "7 * %d if enable peepholes connection or" - "4 * %d if disable peepholes", - frame_size, frame_size); + if (ctx->Attrs().Get("use_peepholes")) { + PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size, + "The second dimension of Input(Bias) should be " + "7 * %d if enable peepholes connection", + frame_size); + ctx->SetOutputDim("CheckedCell", {2, frame_size}); + } else { + PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size, + "The second dimension of Input(Bias) should be " + "4 * %d if disable peepholes", + frame_size); + } framework::DDim out_dims({x_dims[0], frame_size}); ctx->SetOutputDim("Hidden", out_dims); @@ -173,6 +179,8 @@ void FusionLSTMOpMaker::Make() { AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate(); AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate(); AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate(); + AddOutput("CheckedCell", "(Tensor) (2 x D) only for peephole.") + .AsIntermediate(); AddAttr("use_peepholes", "(bool, defalut: True) " "whether to enable diagonal/peephole connections.") @@ -250,19 +258,19 @@ class FuisonLSTMKernel : public framework::OpKernel { const int D3 = D * 3; \ const int D4 = wh_dims[1]; -#define INIT_BASE_INPUT_DATAS \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wc_data = bias->data() + D4; \ - /* for peephole only*/ \ - Tensor checked_cell; \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - checked_cell_data = checked_cell.mutable_data({2, D}, place); \ +#define INIT_BASE_INPUT_DATAS \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wc_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ } /// Compute LSTM diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc index 326c58ee1c09d6f745e6c8abfb92030d11d8c1c6..a0d640b2020958af53a4405ae886eadb2a1e117e 100644 --- a/paddle/fluid/operators/read_op.cc +++ b/paddle/fluid/operators/read_op.cc @@ -45,12 +45,10 @@ class ReadInferVarType : public framework::VarTypeInference { framework::VarDesc* reader = block->FindVarRecursive(reader_name); auto dtypes = reader->GetDataTypes(); PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); - auto lod_levels = reader->GetLoDLevels(); for (size_t i = 0; i < dtypes.size(); ++i) { framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); out.SetType(framework::proto::VarType::LOD_TENSOR); out.SetDataType(dtypes[i]); - out.SetLoDLevel(lod_levels[i]); } } }; diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index bf4df4f600c14050b636b7ee6d7b6973b57adb94..981969d2aaa684731a615ec64ca7f7718b35cf09 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -77,8 +77,10 @@ class ScaleOpVarTypeInference : public framework::VarTypeInference { auto out_var_name = op_desc.Output("Out").front(); auto *out_var = block->FindVarRecursive(out_var_name); - out_var->SetType(in_var.GetType()); - out_var->SetDataType(in_var.GetDataType()); + if (in_var_name != out_var_name) { + out_var->SetType(in_var.GetType()); + out_var->SetDataType(in_var.GetDataType()); + } } }; diff --git a/paddle/fluid/operators/sgd_op.cu b/paddle/fluid/operators/sgd_op.cu index 9527e7ba300e10a6af1a0dd4b312c0323115256e..4722be7a666d3e8f3c25c9499f88ddda835f60e3 100644 --- a/paddle/fluid/operators/sgd_op.cu +++ b/paddle/fluid/operators/sgd_op.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#define EIGEN_USE_GPU #include "paddle/fluid/operators/sgd_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -33,21 +33,22 @@ __global__ void SGDKernel(const T* g, const T* p, const T* learning_rate, } } -template +template __global__ void SparseSGDFunctorKernel(const T* selected_rows, const int64_t* rows, const T* learning_rate, T* tensor_out, - int64_t row_numel, int64_t limit) { - for (int64_t i = blockIdx.x; i < limit; i += gridDim.x) { - const T* selected_rows_ptr = selected_rows + i * row_numel; - T* tensor_out_ptr = tensor_out + rows[i] * row_numel; - for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) { - // Since index in rows of SelectedRows can be duplicate, we have to use - // Atomic Operation to avoid concurrent write error. - paddle::platform::CudaAtomicAdd( - tensor_out_ptr + index, - -1.0 * learning_rate[0] * selected_rows_ptr[index]); - } + int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + selected_rows += ty * row_numel; + tensor_out += rows[ty] * row_numel; + + for (int index = tid; index < row_numel; index += block_size) { + // Since index in rows of SelectedRows can be duplicate, we have to use + // Atomic Operation to avoid concurrent write error. + paddle::platform::CudaAtomicAdd( + tensor_out + index, -1.0 * learning_rate[0] * selected_rows[index]); } } } // namespace @@ -96,15 +97,13 @@ class SGDOpCUDAKernel : public framework::OpKernel { auto* in_data = in_value.data(); auto* out_data = param_out->data(); - const int kThreadsPerBlock = 256; - int thread_x = kThreadsPerBlock; - int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount(); - int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); - - SparseSGDFunctorKernel<<>>( + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid(1, in_rows.size()); + SparseSGDFunctorKernel< + T, 256><<>>( in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data(), - out_data, in_row_numel, in_rows.size()); + out_data, in_row_numel); } else { PADDLE_THROW("Unsupported Variable Type of Grad"); diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc index e008e130e34f60a78bf44e211c42c4b7786d1721..29d2fb989754f5621222768a279a1c898ea1c355 100644 --- a/paddle/fluid/operators/shrink_rnn_memory_op.cc +++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc @@ -52,26 +52,16 @@ class ShrinkRNNMemoryOp : public ArrayOp { size_t height = dst_num_rows; // do shrink for the top level LoD - if (x_tensor.lod().size() > 0 && x_tensor.lod()[0].size() > static_cast(dst_num_rows)) { - if (x_tensor.lod().size() > 1) { // MultiLevel LoD - auto lod_offset = framework::GetSubLoDAndAbsoluteOffset( - x_tensor.lod(), 0, dst_num_rows, 0); - height = lod_offset.second.second; - auto out_lod = out_tensor.mutable_lod(); - framework::AppendLoD(out_lod, lod_offset.first); - } else { - // Shrink LoD - auto lod_item = x_tensor.lod()[0]; - lod_item.resize(dst_num_rows + 1); - out_tensor.set_lod({lod_item}); - const auto &const_lod_item = lod_item; - height = const_lod_item.back(); - } + auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(x_tensor.lod(), 0, + dst_num_rows, 0); + height = lod_offset.second.second; + auto out_lod = out_tensor.mutable_lod(); + framework::AppendLoD(out_lod, lod_offset.first); } - if (height != 0) { + if (dst_num_rows != 0) { out_tensor.mutable_data(place, x_tensor.type()); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); framework::TensorCopy(x_tensor.Slice(0, height), place, *dev_ctx, @@ -144,11 +134,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { } else { auto &dout_tensor = dout_var->Get(); auto height = dout_tensor.dims()[0]; - if (height != 0) { - auto slice = dx_tensor.Slice(0, static_cast(height)); - framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx, - &slice); - } + auto slice = dx_tensor.Slice(0, static_cast(height)); + framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx, &slice); if (dx_tensor.dims()[0] > height) { auto rest_tensor = dx_tensor.Slice( static_cast(height), static_cast(dx_tensor.dims()[0])); diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 6dffe527c1072ee97fcde1725bfc1a47ed1ad74a..7c61e38f6222886a49a3de47867f26aeb6273a6b 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -32,7 +32,7 @@ class SumKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto in_vars = context.MultiInputVar("X"); - int N = in_vars.size(); + size_t in_num = in_vars.size(); auto out_var = context.OutputVar("Out"); bool in_place = out_var == in_vars[0]; @@ -53,7 +53,7 @@ class SumKernel : public framework::OpKernel { auto &place = *context.template device_context().eigen_device(); // If in_place, just skip the first tensor - for (int i = in_place ? 1 : 0; i < N; i++) { + for (size_t i = in_place ? 1 : 0; i < in_num; i++) { if (in_vars[i]->IsType()) { auto &in_t = in_vars[i]->Get(); if (in_t.numel() == 0) { @@ -101,13 +101,13 @@ class SumKernel : public framework::OpKernel { // Runtime InferShape size_t first_dim = 0; - for (int i = 0; i < N; i++) { + for (size_t i = 0; i < in_num; i++) { auto &sel_row = get_selected_row(i); first_dim += sel_row.rows().size(); } std::vector in_dim; - for (int i = 0; i < N; i++) { + for (size_t i = 0; i < in_num; i++) { auto &sel_row = get_selected_row(i); if (sel_row.rows().size() > 0) { in_dim = framework::vectorize(sel_row.value().dims()); @@ -116,7 +116,8 @@ class SumKernel : public framework::OpKernel { } if (in_dim.empty()) { VLOG(3) << "WARNING: all the inputs are empty"; - in_dim = framework::vectorize(get_selected_row(N - 1).value().dims()); + in_dim = + framework::vectorize(get_selected_row(in_num - 1).value().dims()); } else { in_dim[0] = static_cast(first_dim); } @@ -133,7 +134,7 @@ class SumKernel : public framework::OpKernel { math::SelectedRowsAddTo functor; int64_t offset = 0; - for (int i = 0; i < N; i++) { + for (size_t i = 0; i < in_num; i++) { auto &sel_row = get_selected_row(i); if (sel_row.rows().size() == 0) { continue; diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc index 1048d3017140c9e31426a1580b2862667116a024..41a5786fe8c3295390144732221280e152d0a15a 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt_engine_op.cc @@ -22,8 +22,6 @@ namespace paddle { DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT"); -DEFINE_int32(tensorrt_max_batch_size, 1, "TensorRT maximum batch size"); -DEFINE_int32(tensorrt_workspace_size, 16 << 20, "TensorRT workspace size"); namespace operators { @@ -34,6 +32,8 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Ys", "A list of outputs").AsDuplicable(); AddAttr("subgraph", "the subgraph."); AddAttr("engine_uniq_key", "unique key for the TRT engine."); + AddAttr("max_batch_size", "the maximum batch size."); + AddAttr("workspace_size", "the workspace size."); AddComment("TensorRT engine operator."); } }; diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index 69173ff5178d32634f9ab291b7d709a3f91cb368..3c78c29c1a30d74947be84cd2b52ad308e732a2d 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -28,8 +28,6 @@ namespace paddle { DECLARE_int32(tensorrt_engine_batch_size); -DECLARE_int32(tensorrt_max_batch_size); -DECLARE_int32(tensorrt_workspace_size); namespace operators { @@ -92,14 +90,14 @@ class TensorRTEngineKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto engine_name = context.Attr("engine_uniq_key"); + int max_batch_size = context.Attr("max_batch_size"); if (!Singleton::Global().HasEngine(engine_name)) { Prepare(context); } auto* engine = Singleton::Global().Get(engine_name); auto input_names = context.op().Inputs("Xs"); PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs"); - PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, - FLAGS_tensorrt_max_batch_size); + PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size); std::vector output_maps = context.Attr>("output_name_mapping"); @@ -173,8 +171,9 @@ class TensorRTEngineKernel : public framework::OpKernel { // Get the ProgramDesc and pass to convert. framework::proto::BlockDesc block_desc; block_desc.ParseFromString(context.Attr("subgraph")); - int max_batch = FLAGS_tensorrt_max_batch_size; - auto max_workspace = FLAGS_tensorrt_workspace_size; + int max_batch_size = context.Attr("max_batch_size"); + int workspace_size = context.Attr("workspace_size"); + auto params = context.Attr>("parameters"); std::unordered_set parameters; for (const auto& param : params) { @@ -186,7 +185,7 @@ class TensorRTEngineKernel : public framework::OpKernel { // TODO(Superjomn) replace this with a different stream auto* engine = Singleton::Global().Create( - max_batch, max_workspace, nullptr /*engine hold its own stream*/, + max_batch_size, workspace_size, nullptr /*engine hold its own stream*/, context.Attr("engine_uniq_key"), boost::get(context.GetPlace()).device); diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc index 27c1d29762b3de5e57f877b271aae52e71eb7cf9..e21101e8d12f210af08284dbcebe5c14c1af6dd3 100644 --- a/paddle/fluid/operators/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc @@ -58,8 +58,6 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block, using inference::analysis::SetAttr; TEST(TensorRTEngineOp, manual) { - FLAGS_tensorrt_engine_batch_size = 2; - FLAGS_tensorrt_max_batch_size = 2; framework::ProgramDesc program; auto* block_ = program.Proto()->add_blocks(); block_->set_idx(0); @@ -101,6 +99,8 @@ TEST(TensorRTEngineOp, manual) { engine_op_desc.SetOutput("Ys", std::vector({"z0"})); SetAttr(engine_op_desc.Proto(), "subgraph", block_->SerializeAsString()); + SetAttr(engine_op_desc.Proto(), "max_batch_size", 2); + SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 10); SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); SetAttr>(engine_op_desc.Proto(), "parameters", std::vector({})); @@ -129,8 +129,6 @@ TEST(TensorRTEngineOp, manual) { } void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { - FLAGS_tensorrt_engine_batch_size = batch_size; - FLAGS_tensorrt_max_batch_size = batch_size; framework::ProgramDesc program; framework::Scope scope; platform::CUDAPlace place; @@ -195,8 +193,8 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { SetAttr(engine_op_desc.Proto(), "subgraph", block_->SerializeAsString()); - SetAttr(engine_op_desc.Proto(), "max_batch", batch_size); - SetAttr(engine_op_desc.Proto(), "max_workspace", 2 << 10); + SetAttr(engine_op_desc.Proto(), "max_batch_size", batch_size); + SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 10); SetAttr>( engine_op_desc.Proto(), "parameters", std::vector({"y0", "y1", "y2", "y3"})); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 1b283fc9725fb8d01da913312844d0faea29daf6..dfc079e986e93c7f02f17b299e5d6293edbedd05 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -201,7 +201,6 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) compute_capability = GetCUDAComputeCapability(place_.device); multi_process = GetCUDAMultiProcessors(place_.device); max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device); - grid_max_dims_ = GpuMaxGridDim(place_.device); PADDLE_ENFORCE(cudaStreamCreate(&stream_)); eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_->Reinitialize(&stream_, place); @@ -240,10 +239,6 @@ int CUDADeviceContext::GetMaxPhysicalThreadCount() const { return multi_process * max_threads_per_mp; } -std::tuple CUDADeviceContext::GetMaxGridDims() const { - return grid_max_dims_; -} - Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { return eigen_device_.get(); } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index da32b0dad4b8cfe75bf82f59ec58db8136b899f2..79539195157d74d4d757edee5e008cbb76c93ee2 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -13,7 +13,6 @@ limitations under the License. */ #include #include // NOLINT #include -#include #include #include @@ -92,8 +91,6 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return the max physical thread count in the device context */ int GetMaxPhysicalThreadCount() const; - std::tuple GetMaxGridDims() const; - /*! \brief Return eigen device in the device context. */ Eigen::GpuDevice* eigen_device() const; @@ -138,8 +135,6 @@ class CUDADeviceContext : public DeviceContext { cudaStream_t stream_; cublasHandle_t cublas_handle_; - std::tuple grid_max_dims_; - int compute_capability; int multi_process; int max_threads_per_mp; diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index 2806d726d2b1ac6b717a9041af19e7ee62be6883..c153e80fe42aecb33d3aa97874d2881bce9029be 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -48,54 +48,35 @@ __global__ static void ForRangeElemwiseOpGridIsOne(Function func) { } template -__global__ static void ForRangeElemwiseOp(Function func, size_t limit) { +__global__ static void ForRangeElemwiseOp(Function func, int limit) { size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); if (idx < limit) { func(idx); } } -template -__global__ static void ForRangeElemwiseOpGridLarge(Function func, size_t limit, - int grid_dim) { - size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - while (idx < limit) { - func(idx); - idx += grid_dim; - } -} - template <> struct ForRange { ForRange(const CUDADeviceContext& dev_ctx, size_t limit) - : dev_ctx_(dev_ctx), limit_(limit) {} + : dev_ctx_(dev_ctx), limit_(static_cast(limit)) {} template inline void operator()(Function func) const { constexpr int num_threads = 1024; int block_size = limit_ <= num_threads ? limit_ : num_threads; - size_t grid_size = (limit_ + num_threads - 1) / num_threads; - - int max_grid_dim = std::get<0>(dev_ctx_.GetMaxGridDims()); - - if (grid_size < max_grid_dim) { - int grid_size_int = static_cast(grid_size); - if (grid_size == 1) { - ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( - func); - } else { - ForRangeElemwiseOp<<>>( - func, limit_); - } + int grid_size = (limit_ + num_threads - 1) / num_threads; + + if (grid_size == 1) { + ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( + func); } else { - ForRangeElemwiseOpGridLarge<<>>(func, limit_, - max_grid_dim); + ForRangeElemwiseOp<<>>( + func, limit_); } } const CUDADeviceContext& dev_ctx_; - size_t limit_; + int limit_; }; #endif diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index b88523728407803a1ea9d343dc2d33c6a38d5de9..126636d879213b1c8f242db8fbdf6a358a1d2da9 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -152,22 +152,5 @@ void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) { PADDLE_ENFORCE(cudaMemsetAsync(dst, value, count, stream), "cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync"); } - -std::tuple GpuMaxGridDim(int id) { - std::tuple result; - PADDLE_ENFORCE( - cudaDeviceGetAttribute(&std::get<0>(result), cudaDevAttrMaxBlockDimX, id), - "cudaDeviceGetAttribute failed in " - "cudaDevAttrMaxBlockDim"); - PADDLE_ENFORCE( - cudaDeviceGetAttribute(&std::get<1>(result), cudaDevAttrMaxBlockDimY, id), - "cudaDeviceGetAttribute failed in " - "cudaDevAttrMaxBlockDim"); - PADDLE_ENFORCE( - cudaDeviceGetAttribute(&std::get<2>(result), cudaDevAttrMaxBlockDimZ, id), - "cudaDeviceGetAttribute failed in " - "cudaDevAttrMaxBlockDim"); - return result; -} } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index b748c6e8a519d27acd211f815a210c7a74ff32c8..f4640d3eaa2165c35e8e14690d83e9e7e7168c0b 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -19,7 +19,6 @@ limitations under the License. */ #include #include #include -#include namespace paddle { namespace platform { @@ -73,8 +72,6 @@ void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, //! Set memory dst with value count size asynchronously void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream); -std::tuple GpuMaxGridDim(int id); - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 67501186d150171728194f23bc02d2c014848dd7..a5bc44122028c1191f511157bdde2e7c2d30c6aa 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -285,12 +285,12 @@ void BindOpDesc(pybind11::module *m) { .def("set_output", &pd::OpDesc::SetOutput) .def("input_arg_names", &pd::OpDesc::InputArgumentNames) .def("output_arg_names", &pd::OpDesc::OutputArgumentNames) - .def("rename_input", &pd::OpDesc::RenameInput) - .def("rename_output", &pd::OpDesc::RenameOutput) + .def("_rename_input", &pd::OpDesc::RenameInput) + .def("_rename_output", &pd::OpDesc::RenameOutput) .def("has_attr", &pd::OpDesc::HasAttr) .def("attr_type", &pd::OpDesc::GetAttrType) .def("attr_names", &pd::OpDesc::AttrNames) - .def("set_attr", &pd::OpDesc::SetAttr) + .def("_set_attr", &pd::OpDesc::SetAttr) .def("attr", &pd::OpDesc::GetAttr) .def("set_block_attr", &pd::OpDesc::SetBlockAttr) .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr) @@ -300,8 +300,8 @@ void BindOpDesc(pybind11::module *m) { std::string ser(seriralized); self.SetAttr(name, ser); }) - .def("block_attr_id", &pd::OpDesc::GetBlockAttrId) - .def("blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds) + .def("_block_attr_id", &pd::OpDesc::GetBlockAttrId) + .def("_blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds) .def("check_attrs", &pd::OpDesc::CheckAttrs) .def("infer_shape", &pd::OpDesc::InferShape) .def("infer_var_type", &pd::OpDesc::InferVarType) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index e6a9524382be219e550017ed4f1a6070dca22fbf..c397f070e947ba787c13397dfc07e4b1e4e37e73 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -381,7 +381,7 @@ function run_mac_test() { EOF # TODO: jiabin need to refine this part when these tests fixed on mac - ctest --output-on-failure -j8 + ctest --output-on-failure -j $1 # make install should also be test when unittest make install -j 8 pip install /usr/local/opt/paddle/share/wheels/*.whl @@ -629,10 +629,10 @@ EOF function gen_capi_package() { if [[ ${WITH_C_API} == "ON" ]]; then - install_prefix="${PADDLE_ROOT}/build/capi_output" - rm -rf $install_prefix - make DESTDIR="$install_prefix" install - cd $install_prefix/usr/local + capi_install_prefix=${INSTALL_PREFIX:-/paddle/build}/capi_output + rm -rf $capi_install_prefix + make DESTDIR="$capi_install_prefix" install + cd $capi_install_prefix/ ls | egrep -v "^Found.*item$" | xargs tar -czf ${PADDLE_ROOT}/build/paddle.tgz fi } @@ -729,7 +729,7 @@ function main() { maccheck) cmake_gen ${PYTHON_ABI:-""} build_mac - run_mac_test + run_mac_test ${PROC_RUN:-1} ;; cicheck_py35) cmake_gen ${PYTHON_ABI:-""} diff --git a/python/paddle/dataset/common.py b/python/paddle/dataset/common.py index ece4046f5b7a7eff5be724d6f890665be7f3344e..58a4c66c206c3f783437126c855c2890644f1bc0 100644 --- a/python/paddle/dataset/common.py +++ b/python/paddle/dataset/common.py @@ -77,13 +77,14 @@ def download(url, module_name, md5sum, save_name=None): retry_limit = 3 while not (os.path.exists(filename) and md5file(filename) == md5sum): if os.path.exists(filename): - print("file md5", md5file(filename), md5sum) + sys.stderr.write("file %s md5 %s" % (md5file(filename), md5sum)) if retry < retry_limit: retry += 1 else: raise RuntimeError("Cannot download {0} within retry limit {1}". format(url, retry_limit)) - print("Cache file %s not found, downloading %s" % (filename, url)) + sys.stderr.write("Cache file %s not found, downloading %s" % + (filename, url)) r = requests.get(url, stream=True) total_length = r.headers.get('content-length') @@ -100,10 +101,11 @@ def download(url, module_name, md5sum, save_name=None): dl += len(data) f.write(data) done = int(50 * dl / total_length) - sys.stdout.write("\r[%s%s]" % ('=' * done, + sys.stderr.write("\r[%s%s]" % ('=' * done, ' ' * (50 - done))) sys.stdout.flush() - + sys.stderr.write("\n") + sys.stdout.flush() return filename diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 88eaae10dd55edcc7e811163acf17579eb32cbf1..17fe8dc3c8a28ad129e2d377820da95e8e7a02d9 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -38,8 +38,8 @@ def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None): op_desc = op_descs[i] if isinstance(op_desc, tuple): op_desc = op_desc[0] - op_desc.rename_input(old_name, new_name) - op_desc.rename_output(old_name, new_name) + op_desc._rename_input(old_name, new_name) + op_desc._rename_output(old_name, new_name) def _create_op_desc_(op_type, inputs, outputs, attrs): @@ -70,7 +70,7 @@ def _create_op_desc_(op_type, inputs, outputs, attrs): if isinstance(val, framework.Block): op_desc.set_block_attr(name, val.desc) else: - op_desc.set_attr(name, val) + op_desc._set_attr(name, val) return op_desc @@ -346,7 +346,7 @@ def _append_backward_ops_(block, grad_sub_block_list = [] # If the op has its own sub-block, deal with the sub-block first if op.has_attr("sub_block"): - sub_block = program.block(op.block_attr_id("sub_block")) + sub_block = program.block(op._block_attr_id("sub_block")) grad_sub_block = program._create_block() grad_sub_block._set_forward_block_idx(sub_block.idx) cb = _callback_lookup_(op) @@ -382,7 +382,7 @@ def _append_backward_ops_(block, for op_desc in grad_op_descs: new_op_desc = target_block.desc.append_op() new_op_desc.copy_from(op_desc) - new_op_desc.set_attr(op_role_attr_name, backward) + new_op_desc._set_attr(op_role_attr_name, backward) grad_to_var["__current_op_desc__"] = new_op_desc if callbacks is not None: assert (isinstance(callbacks, list)) @@ -408,7 +408,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): for op_idx in range(start_op_idx, block.desc.op_size()): op_desc = block.desc.op(op_idx) if op_desc.has_attr("sub_block"): - sub_block = block.program.block(op_desc.block_attr_id("sub_block")) + sub_block = block.program.block(op_desc._block_attr_id("sub_block")) _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map) new_vars = set() # create new gradient variables @@ -438,12 +438,12 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map): op_desc = block.desc.op(op_idx) for name in op_desc.input_arg_names(): if name in var_map: - op_desc.rename_input(name, var_map[name]) + op_desc._rename_input(name, var_map[name]) for name in op_desc.output_arg_names(): if block.desc.find_var(name.encode("ascii")): new_name = unique_name.generate(name) - op_desc.rename_output(name, new_name) + op_desc._rename_output(name, new_name) var_map[name] = new_name for g, ng in six.iteritems(var_map): @@ -542,9 +542,9 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, if loss.op is None: raise ValueError("loss.op is None. Should not happend") - loss.op.set_attr(core.op_proto_and_checker_maker.kOpRoleAttrName(), - int(core.op_proto_and_checker_maker.OpRole.Forward) | - int(core.op_proto_and_checker_maker.OpRole.Loss)) + loss.op._set_attr(core.op_proto_and_checker_maker.kOpRoleAttrName(), + int(core.op_proto_and_checker_maker.OpRole.Forward) | + int(core.op_proto_and_checker_maker.OpRole.Loss)) if callbacks is not None: isinstance(callbacks, list) @@ -631,7 +631,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, attr_val = [p.name, g.name] if g.op.has_attr(op_role_var_attr_name): attr_val.extend(g.op.attr(op_role_var_attr_name)) - g.op.set_attr(op_role_var_attr_name, attr_val) + g.op._set_attr(op_role_var_attr_name, attr_val) return params_and_grads diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 32b8f1189fd65ba1e8da5aeaf316fc0ae05af552..e884185528282021fd16289ccc6a3533e22b9967 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -75,8 +75,8 @@ class ErrorClipByValue(BaseErrorClipAttr): clip_op_desc.set_type("clip") clip_op_desc.set_input("X", [grad_name]) clip_op_desc.set_output("Out", [grad_name]) - clip_op_desc.set_attr("min", self.min) - clip_op_desc.set_attr("max", self.max) + clip_op_desc._set_attr("min", self.min) + clip_op_desc._set_attr("max", self.max) def error_clip_callback(block, context): diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index bced5fd1d9c617ab614212c811e86422d65a2e56..d795b92d79b2b9c616639d2fc56f3d2be383f376 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -40,11 +40,9 @@ PADDLE_ON_MODEL_CE = os.environ.get('PADDLE_ON_MODEL_CE', None) is not None __all__ = [ 'Program', - 'Operator', 'default_startup_program', 'default_main_program', 'program_guard', - 'get_var', 'name_scope', ] @@ -663,11 +661,11 @@ class Operator(object): self._update_desc_attr(attr_name, attr_val) self.desc.check_attrs() - if self.has_kernel(type): + if self._has_kernel(type): self.desc.infer_var_type(self.block.desc) self.desc.infer_shape(self.block.desc) - def has_kernel(self, op_type): + def _has_kernel(self, op_type): return op_type not in self.OP_WITHOUT_KERNEL_SET def to_string(self, throw_on_error): @@ -708,7 +706,7 @@ class Operator(object): """ return self.desc.input(name) - def rename_input(self, old_name, new_name): + def _rename_input(self, old_name, new_name): """ Rename the `old_name` to `new_name`. @@ -719,9 +717,9 @@ class Operator(object): Returns: None """ - self.desc.rename_input(old_name, new_name) + self.desc._rename_input(old_name, new_name) - def rename_output(self, old_name, new_name): + def _rename_output(self, old_name, new_name): """ Rename the `old_name` to `new_name`. @@ -732,7 +730,7 @@ class Operator(object): Returns: None """ - self.desc.rename_output(old_name, new_name) + self.desc._rename_output(old_name, new_name) @property def input_names(self): @@ -796,7 +794,7 @@ class Operator(object): """ return self.desc.attr_type(name) - def set_attr(self, name, val): + def _set_attr(self, name, val): """ Set the value of attribute by attribute's name. @@ -829,7 +827,7 @@ class Operator(object): isinstance(val, core.ProgramDesc): self.desc.set_serialized_attr(name, val.serialize_to_string()) else: - self.desc.set_attr(name, val) + self.desc._set_attr(name, val) @property def attr_names(self): @@ -848,7 +846,7 @@ class Operator(object): """ return self.desc.attr(name) - def block_attr_id(self, name): + def _block_attr_id(self, name): """ Get the block attribute's id by name. @@ -858,9 +856,9 @@ class Operator(object): Returns: int: the block index. """ - return self.desc.block_attr_id(name) + return self.desc._block_attr_id(name) - def block_attr(self, name): + def _block_attr(self, name): """ Get the block attribute by name. @@ -871,11 +869,11 @@ class Operator(object): block: the block attribute. """ - id = self.block_attr_id(name) + id = self._block_attr_id(name) assert (id >= 0 and id < len(self.block.program.blocks)) return self.block.program.blocks[id] - def blocks_attr(self, name): + def _blocks_attr(self, name): """ Get the blocks attribute by name. @@ -886,13 +884,13 @@ class Operator(object): list: list of the blocks attribute. """ attrs = [] - for i in self.blocks_attr_ids(name): + for i in self._blocks_attr_ids(name): assert (i >= 0 and i < len(self.block.program.blocks)) attrs.append(self.block.program.blocks[i]) return attrs - def blocks_attr_ids(self, name): + def _blocks_attr_ids(self, name): """ Get the blocks attribute's ids by name. @@ -903,7 +901,7 @@ class Operator(object): list: list of the blocks ids. """ - return self.desc.blocks_attr_ids(name) + return self.desc._blocks_attr_ids(name) def all_attrs(self): """ @@ -917,11 +915,11 @@ class Operator(object): for n in attr_names: attr_type = self.desc.attr_type(n) if attr_type == core.AttrType.BLOCK: - attr_map[n] = self.block_attr(n) + attr_map[n] = self._block_attr(n) continue if attr_type == core.AttrType.BLOCKS: - attr_map[n] = self.blocks_attr(n) + attr_map[n] = self._blocks_attr(n) continue attr_map[n] = self.attr(n) @@ -1795,7 +1793,7 @@ class Program(object): for j in six.moves.range(block.op_size()): op = block.op(j) if op.has_attr('is_test'): - op.set_attr('is_test', True) + op._set_attr('is_test', True) res.blocks = [ Block(res, i) for i in six.moves.range(res.desc.num_blocks()) ] @@ -2169,7 +2167,7 @@ def program_guard(main_program, startup_program=None): switch_startup_program(startup_program) -def get_var(name, program=None): +def _get_var(name, program=None): """ Get a variable by name from the global block of a program. diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 75c29b12724d53783b9748d6df066c52bd232482..d56fa76300e7054ef71a7729483a579fa35f1dac 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -311,7 +311,6 @@ def _copy_reader_var_(block, var): new_var = block.create_var(name=var.name, type=core.VarDesc.VarType.READER) new_var.desc.set_shapes(var.desc.shapes()) new_var.desc.set_dtypes(var.desc.dtypes()) - new_var.desc.set_lod_levels(var.desc.lod_levels()) new_var.persistable = True return new_var @@ -633,7 +632,6 @@ def py_reader(capacity, }) startup_var.desc.set_dtypes(dtypes) - startup_var.desc.set_lod_levels(lod_levels) startup_var.persistable = True main_prog_var = _copy_reader_var_(default_main_program().current_block(), diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py index b1598bfec210474ae1e17f9f88e8b57aa80b8452..a3064b565d096f7feda18379c66ffc8bf2f4a55c 100644 --- a/python/paddle/fluid/layers/metric_op.py +++ b/python/paddle/fluid/layers/metric_op.py @@ -78,7 +78,12 @@ def accuracy(input, label, k=1, correct=None, total=None): return acc_out -def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1): +def auc(input, + label, + curve='ROC', + num_thresholds=2**12 - 1, + topk=1, + slide_steps=1): """ **Area Under the Curve (AUC) Layer** @@ -105,6 +110,8 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1): num_thresholds(int): The number of thresholds to use when discretizing the roc curve. Default 200. topk(int): only topk number of prediction output will be used for auc. + slide_steps: when calc batch auc, we can not only use step currently but the previous steps can be used. slide_steps=1 means use the current step, slide_steps=3 means use current step and the previous second steps, slide_steps=0 use all of the steps. + Returns: Variable: A scalar representing the current AUC. @@ -120,16 +127,48 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1): auc_out = helper.create_tmp_variable(dtype="float64") batch_auc_out = helper.create_tmp_variable(dtype="float64") # make tp, tn, fp, fn persistable, so that can accumulate all batches. + + # for batch auc + batch_stat_pos = helper.create_global_variable( + persistable=True, + dtype='int64', + shape=[slide_steps, num_thresholds + 1]) + batch_stat_neg = helper.create_global_variable( + persistable=True, + dtype='int64', + shape=[slide_steps, num_thresholds + 1]) + + # for global auc stat_pos = helper.create_global_variable( - persistable=True, dtype='int64', shape=[num_thresholds + 1]) + persistable=True, dtype='int64', shape=[1, num_thresholds + 1]) stat_neg = helper.create_global_variable( - persistable=True, dtype='int64', shape=[num_thresholds + 1]) + persistable=True, dtype='int64', shape=[1, num_thresholds + 1]) - for var in [stat_pos, stat_neg]: + for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]: helper.set_variable_initializer( var, Constant( value=0.0, force_cpu=True)) + # Batch AUC + helper.append_op( + type="auc", + inputs={ + "Predict": [input], + "Label": [label], + "StatPos": [batch_stat_pos], + "StatNeg": [batch_stat_neg] + }, + attrs={ + "curve": curve, + "num_thresholds": num_thresholds, + "slide_steps": slide_steps + }, + outputs={ + "AUC": [batch_auc_out], + "StatPosOut": [batch_stat_pos], + "StatNegOut": [batch_stat_neg] + }) + # Global AUC helper.append_op( type="auc", inputs={ @@ -138,12 +177,16 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1): "StatPos": [stat_pos], "StatNeg": [stat_neg] }, - attrs={"curve": curve, - "num_thresholds": num_thresholds}, + attrs={ + "curve": curve, + "num_thresholds": num_thresholds, + "slide_steps": 0 + }, outputs={ "AUC": [auc_out], - "BatchAUC": [batch_auc_out], "StatPosOut": [stat_pos], "StatNegOut": [stat_neg] }) - return auc_out, batch_auc_out, [stat_pos, stat_neg] + return auc_out, batch_auc_out, [ + batch_stat_pos, batch_stat_neg, stat_pos, stat_neg + ] diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 7867bfe00e25711643eab1ab8d0141dbbad3da52..363e9200bedfd9cc843513c2db16eadb5ffa33b4 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -63,6 +63,8 @@ for _OP in set(__all__): # e.g.: test_program_code.py, test_dist_train.py globals()['_scale'] = generate_layer_fn('scale') +globals()['_elementwise_div'] = generate_layer_fn('elementwise_div') + __all__ += __activations_noattr__ for _OP in set(__activations_noattr__): diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index ad09005d866b10146e6fcd7cf108c51f34322607..1b9571f6d3a6a69d1ac35f6be74b80eaa2ce6251 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -26,6 +26,7 @@ from .layer_helper import LayerHelper from .regularizer import append_regularization_ops from .clip import append_gradient_clip_ops, error_clip_callback from contextlib import contextmanager +from .layers import ops __all__ = [ 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl', @@ -1301,7 +1302,7 @@ class ModelAverage(Optimizer): x=tmp, dtype='float32' if self._dtype == None else self._dtype) sum = layers.cast( x=sum, dtype='float32' if self._dtype == None else self._dtype) - layers.elementwise_div(x=sum, y=tmp, out=param) + ops._elementwise_div(x=sum, y=tmp, out=param) def _add_average_restore_op(self, block, param_grad): param = block._clone_variable(param_grad[0]) diff --git a/python/paddle/fluid/tests/unittests/dist_ctr.py b/python/paddle/fluid/tests/unittests/dist_ctr.py new file mode 100644 index 0000000000000000000000000000000000000000..902dc6544ed6858c4cd8d64b14d6af2367059091 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_ctr.py @@ -0,0 +1,109 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle +import paddle.fluid as fluid + +import dist_ctr_reader +from test_dist_base import TestDistRunnerBase, runtime_main + +IS_SPARSE = True + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + + +class TestDistCTR2x2(TestDistRunnerBase): + def get_model(self, batch_size=2): + dnn_input_dim, lr_input_dim = dist_ctr_reader.load_data_meta() + """ network definition """ + dnn_data = fluid.layers.data( + name="dnn_data", + shape=[-1, 1], + dtype="int64", + lod_level=1, + append_batch_size=False) + lr_data = fluid.layers.data( + name="lr_data", + shape=[-1, 1], + dtype="int64", + lod_level=1, + append_batch_size=False) + label = fluid.layers.data( + name="click", + shape=[-1, 1], + dtype="int64", + lod_level=0, + append_batch_size=False) + + # build dnn model + dnn_layer_dims = [128, 64, 32, 1] + dnn_embedding = fluid.layers.embedding( + is_distributed=False, + input=dnn_data, + size=[dnn_input_dim, dnn_layer_dims[0]], + param_attr=fluid.ParamAttr( + name="deep_embedding", + initializer=fluid.initializer.Constant(value=0.01)), + is_sparse=IS_SPARSE) + dnn_pool = fluid.layers.sequence_pool( + input=dnn_embedding, pool_type="sum") + dnn_out = dnn_pool + for i, dim in enumerate(dnn_layer_dims[1:]): + fc = fluid.layers.fc( + input=dnn_out, + size=dim, + act="relu", + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01)), + name='dnn-fc-%d' % i) + dnn_out = fc + + # build lr model + lr_embbding = fluid.layers.embedding( + is_distributed=False, + input=lr_data, + size=[lr_input_dim, 1], + param_attr=fluid.ParamAttr( + name="wide_embedding", + initializer=fluid.initializer.Constant(value=0.01)), + is_sparse=IS_SPARSE) + lr_pool = fluid.layers.sequence_pool(input=lr_embbding, pool_type="sum") + + merge_layer = fluid.layers.concat(input=[dnn_out, lr_pool], axis=1) + + predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax') + acc = fluid.layers.accuracy(input=predict, label=label) + auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict, + label=label) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + inference_program = paddle.fluid.default_main_program().clone() + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001) + sgd_optimizer.minimize(avg_cost) + + dataset = dist_ctr_reader.Dataset() + train_reader = paddle.batch(dataset.train(), batch_size=batch_size) + test_reader = paddle.batch(dataset.test(), batch_size=batch_size) + + return inference_program, avg_cost, train_reader, test_reader, None, predict + + +if __name__ == "__main__": + runtime_main(TestDistCTR2x2) diff --git a/python/paddle/fluid/tests/unittests/dist_ctr_reader.py b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py new file mode 100644 index 0000000000000000000000000000000000000000..95e39d891f7e6a3dcb57540bd96fe70027443cda --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py @@ -0,0 +1,172 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import paddle +import tarfile + +logging.basicConfig() +logger = logging.getLogger("paddle") +logger.setLevel(logging.INFO) + +DATA_URL = "http://paddle-ctr-data.cdn.bcebos.com/avazu_ctr_data.tgz" +DATA_MD5 = "c11df99fbd14e53cd4bfa6567344b26e" +""" +avazu_ctr_data/train.txt +avazu_ctr_data/infer.txt +avazu_ctr_data/test.txt +avazu_ctr_data/data.meta.txt +""" + + +def read_data(file_name): + path = paddle.dataset.common.download(DATA_URL, "avazu_ctr_data", DATA_MD5) + tar = tarfile.open(path, "r:gz") + tar_info = None + for member in tar.getmembers(): + if member.name.endswith(file_name): + tar_info = member + f = tar.extractfile(tar_info) + ret_lines = [_.decode('utf-8') for _ in f.readlines()] + return ret_lines + + +class TaskMode: + TRAIN_MODE = 0 + TEST_MODE = 1 + INFER_MODE = 2 + + def __init__(self, mode): + self.mode = mode + + def is_train(self): + return self.mode == self.TRAIN_MODE + + def is_test(self): + return self.mode == self.TEST_MODE + + def is_infer(self): + return self.mode == self.INFER_MODE + + @staticmethod + def create_train(): + return TaskMode(TaskMode.TRAIN_MODE) + + @staticmethod + def create_test(): + return TaskMode(TaskMode.TEST_MODE) + + @staticmethod + def create_infer(): + return TaskMode(TaskMode.INFER_MODE) + + +class ModelType: + CLASSIFICATION = 0 + REGRESSION = 1 + + def __init__(self, mode): + self.mode = mode + + def is_classification(self): + return self.mode == self.CLASSIFICATION + + def is_regression(self): + return self.mode == self.REGRESSION + + @staticmethod + def create_classification(): + return ModelType(ModelType.CLASSIFICATION) + + @staticmethod + def create_regression(): + return ModelType(ModelType.REGRESSION) + + +def load_dnn_input_record(sent): + return list(map(int, sent.split())) + + +def load_lr_input_record(sent): + res = [] + for _ in [x.split(':') for x in sent.split()]: + res.append(int(_[0])) + return res + + +feeding_index = {'dnn_input': 0, 'lr_input': 1, 'click': 2} + + +class Dataset(object): + def train(self): + ''' + Load trainset. + ''' + file_name = "train.txt" + logger.info("load trainset from %s" % file_name) + mode = TaskMode.create_train() + return self._parse_creator(file_name, mode) + + def test(self): + ''' + Load testset. + ''' + file_name = "test.txt" + logger.info("load testset from %s" % file_name) + mode = TaskMode.create_test() + return self._parse_creator(file_name, mode) + + def infer(self): + ''' + Load infer set. + ''' + file_name = "infer.txt" + logger.info("load inferset from %s" % file_name) + mode = TaskMode.create_infer() + return self._parse_creator(file_name, mode) + + def _parse_creator(self, file_name, mode): + ''' + Parse dataset. + ''' + + def _parse(): + data = read_data(file_name) + for line_id, line in enumerate(data): + fs = line.strip().split('\t') + dnn_input = load_dnn_input_record(fs[0]) + lr_input = load_lr_input_record(fs[1]) + if not mode.is_infer(): + click = int(fs[2]) + yield [dnn_input, lr_input, click] + else: + yield [dnn_input, lr_input] + + return _parse + + +def load_data_meta(): + ''' + load data meta info from path, return (dnn_input_dim, lr_input_dim) + ''' + lines = read_data('data.meta.txt') + err_info = "wrong meta format" + assert len(lines) == 2, err_info + assert 'dnn_input_dim:' in lines[0] and 'lr_input_dim:' in lines[ + 1], err_info + res = map(int, [_.split(':')[1] for _ in lines]) + res = list(res) + logger.info('dnn input dim: %d' % res[0]) + logger.info('lr input dim: %d' % res[1]) + return res diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py index 85a96c0b53f6bc08687965048d6251265055a6fe..877d21ae882ab4efb49beb6a846ab71a22c2aab7 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist.py @@ -47,7 +47,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( - value=0.3))) + value=0.01))) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, @@ -56,7 +56,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( - value=0.2))) + value=0.01))) SIZE = 10 input_shape = conv_pool_2.shape @@ -68,7 +68,7 @@ def cnn_model(data): size=SIZE, act="softmax", param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1))) + initializer=fluid.initializer.Constant(value=0.01))) return predict diff --git a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py new file mode 100644 index 0000000000000000000000000000000000000000..6456d1b53a129db04ace7ff4413a3d76e922ccde --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py @@ -0,0 +1,238 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import time +import math +import random + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal +from functools import reduce +from test_dist_base import TestDistRunnerBase, runtime_main + +DTYPE = "int64" +DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/simnet.train.1000' +DATA_MD5 = '24e49366eb0611c552667989de2f57d5' + +# For Net +base_lr = 0.2 +emb_lr = base_lr * 3 +dict_dim = 1500 +emb_dim = 128 +hid_dim = 128 +margin = 0.1 +sample_rate = 1 + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + + +def get_acc(cos_q_nt, cos_q_pt, batch_size): + cond = fluid.layers.less_than(cos_q_nt, cos_q_pt) + cond = fluid.layers.cast(cond, dtype='float64') + cond_3 = fluid.layers.reduce_sum(cond) + acc = fluid.layers.elementwise_div( + cond_3, + fluid.layers.fill_constant( + shape=[1], value=batch_size * 1.0, dtype='float64'), + name="simnet_acc") + return acc + + +def get_loss(cos_q_pt, cos_q_nt): + loss_op1 = fluid.layers.elementwise_sub( + fluid.layers.fill_constant_batch_size_like( + input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'), + cos_q_pt) + loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) + loss_op3 = fluid.layers.elementwise_max( + fluid.layers.fill_constant_batch_size_like( + input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'), + loss_op2) + avg_cost = fluid.layers.mean(loss_op3) + return avg_cost + + +def get_optimizer(): + # SGD optimizer + optimizer = fluid.optimizer.SGD(learning_rate=base_lr) + return optimizer + + +def train_network(batch_size, is_distributed=False, is_sparse=False): + # query + q = fluid.layers.data( + name="query_ids", shape=[1], dtype="int64", lod_level=1) + ## embedding + q_emb = fluid.layers.embedding( + input=q, + is_distributed=is_distributed, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr), + is_sparse=is_sparse) + ## vsum + q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') + q_ss = fluid.layers.softsign(q_sum) + ## fc layer after conv + q_fc = fluid.layers.fc( + input=q_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__q_fc__", + learning_rate=base_lr)) + # label data + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + # pt + pt = fluid.layers.data( + name="pos_title_ids", shape=[1], dtype="int64", lod_level=1) + ## embedding + pt_emb = fluid.layers.embedding( + input=pt, + is_distributed=is_distributed, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr), + is_sparse=is_sparse) + ## vsum + pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') + pt_ss = fluid.layers.softsign(pt_sum) + ## fc layer + pt_fc = fluid.layers.fc( + input=pt_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__fc__", + learning_rate=base_lr), + bias_attr=fluid.ParamAttr(name="__fc_b__")) + # nt + nt = fluid.layers.data( + name="neg_title_ids", shape=[1], dtype="int64", lod_level=1) + ## embedding + nt_emb = fluid.layers.embedding( + input=nt, + is_distributed=is_distributed, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr), + is_sparse=is_sparse) + ## vsum + nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') + nt_ss = fluid.layers.softsign(nt_sum) + ## fc layer + nt_fc = fluid.layers.fc( + input=nt_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__fc__", + learning_rate=base_lr), + bias_attr=fluid.ParamAttr(name="__fc_b__")) + cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc) + cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc) + # loss + avg_cost = get_loss(cos_q_pt, cos_q_nt) + # acc + acc = get_acc(cos_q_nt, cos_q_pt, batch_size) + return [avg_cost, acc, cos_q_pt] + + +def combination(x, y): + res = [[[xi, yi] for yi in y] for xi in x] + return res[0] + + +def get_one_data(file_list): + for file in file_list: + contents = [] + with open(file, "r") as fin: + for i in fin: + contents.append(i.strip()) + for index, q in enumerate(contents): + try: + one_data = [[int(j) for j in i.split(" ")] + for i in q.split(";")[:-1]] + if one_data[1][0] + one_data[1][1] != len(one_data) - 3: + q = fin.readline() + continue + tmp = combination(one_data[3:3 + one_data[1][0]], + one_data[3 + one_data[1][0]:]) + except Exception as e: + continue + + for each in tmp: + yield [one_data[2], 0, each[0], each[1]] + + +def get_batch_reader(file_list, batch_size): + def batch_reader(): + res = [] + for i in get_one_data(file_list): + if random.random() <= sample_rate: + res.append(i) + if len(res) >= batch_size: + yield res + res = [] + + return batch_reader + + +def get_train_reader(batch_size): + # The training data set. + train_file = os.path.join(paddle.dataset.common.DATA_HOME, "simnet", + "train") + train_reader = get_batch_reader([train_file], batch_size) + train_feed = ["query_ids", "pos_title_ids", "neg_title_ids", "label"] + return train_reader, train_feed + + +class TestDistSimnetBow2x2(TestDistRunnerBase): + def get_model(self, batch_size=2): + # Train program + avg_cost, acc, predict = \ + train_network(batch_size, bool(int(os.environ["IS_DISTRIBUTED"])), bool(int(os.environ["IS_SPARSE"]))) + + inference_program = fluid.default_main_program().clone() + + # Optimization + opt = get_optimizer() + opt.minimize(avg_cost) + + # Reader + train_reader, _ = get_train_reader(batch_size) + return inference_program, avg_cost, train_reader, train_reader, acc, predict + + +if __name__ == "__main__": + paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train") + runtime_main(TestDistSimnetBow2x2) diff --git a/python/paddle/fluid/tests/unittests/dist_text_classification.py b/python/paddle/fluid/tests/unittests/dist_text_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..095a474fd3ac056c678f9051ed80ef363ae968c9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_text_classification.py @@ -0,0 +1,231 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal +import six +import tarfile +import string +import re +from functools import reduce +from test_dist_base import TestDistRunnerBase, runtime_main + +DTYPE = "float32" +VOCAB_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/imdb.vocab' +VOCAB_MD5 = '23c86a0533c0151b6f12fa52b106dcc2' +DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/text_classification.tar.gz' +DATA_MD5 = '29ebfc94f11aea9362bbb7f5e9d86b8a' + + +# Load dictionary. +def load_vocab(filename): + vocab = {} + if six.PY2: + with open(filename, 'r') as f: + for idx, line in enumerate(f): + vocab[line.strip()] = idx + else: + with open(filename, 'r', encoding="utf-8") as f: + for idx, line in enumerate(f): + vocab[line.strip()] = idx + return vocab + + +def get_worddict(dict_path): + word_dict = load_vocab(dict_path) + word_dict[""] = len(word_dict) + dict_dim = len(word_dict) + return word_dict, dict_dim + + +def conv_net(input, + dict_dim, + emb_dim=128, + window_size=3, + num_filters=128, + fc0_dim=96, + class_dim=2): + emb = fluid.layers.embedding( + input=input, + size=[dict_dim, emb_dim], + is_sparse=False, + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.01))) + + conv_3 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=num_filters, + filter_size=window_size, + act="tanh", + pool_type="max", + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + + fc_0 = fluid.layers.fc( + input=[conv_3], + size=fc0_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + + prediction = fluid.layers.fc( + input=[fc_0], + size=class_dim, + act="softmax", + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + + return prediction + + +def inference_network(dict_dim): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + out = conv_net(data, dict_dim) + return out + + +def get_reader(word_dict, batch_size): + # The training data set. + train_reader = paddle.batch(train(word_dict), batch_size=batch_size) + + # The testing data set. + test_reader = paddle.batch(test(word_dict), batch_size=batch_size) + + return train_reader, test_reader + + +def get_optimizer(learning_rate): + optimizer = fluid.optimizer.SGD(learning_rate=learning_rate) + return optimizer + + +class TestDistTextClassification2x2(TestDistRunnerBase): + def get_model(self, batch_size=2): + vocab = os.path.join(paddle.dataset.common.DATA_HOME, + "text_classification", "imdb.vocab") + word_dict, dict_dim = get_worddict(vocab) + + # Input data + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = conv_net(data, dict_dim) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=predict, label=label) + inference_program = fluid.default_main_program().clone() + + # Optimization + opt = get_optimizer(learning_rate=0.001) + opt.minimize(avg_cost) + + # Reader + train_reader, test_reader = get_reader(word_dict, batch_size) + + return inference_program, avg_cost, train_reader, test_reader, acc, predict + + +def tokenize(pattern): + """ + Read files that match the given pattern. Tokenize and yield each file. + """ + + with tarfile.open( + paddle.dataset.common.download(DATA_URL, 'text_classification', + DATA_MD5)) as tarf: + # Note that we should use tarfile.next(), which does + # sequential access of member files, other than + # tarfile.extractfile, which does random access and might + # destroy hard disks. + tf = tarf.next() + while tf != None: + if bool(pattern.match(tf.name)): + # newline and punctuations removal and ad-hoc tokenization. + yield tarf.extractfile(tf).read().rstrip(six.b( + "\n\r")).translate( + None, six.b(string.punctuation)).lower().split() + tf = tarf.next() + + +def reader_creator(pos_pattern, neg_pattern, word_idx): + UNK = word_idx[''] + INS = [] + + def load(pattern, out, label): + for doc in tokenize(pattern): + out.append(([word_idx.get(w, UNK) for w in doc], label)) + + load(pos_pattern, INS, 0) + load(neg_pattern, INS, 1) + + def reader(): + for doc, label in INS: + yield doc, label + + return reader + + +def train(word_idx): + """ + IMDB training set creator. + + It returns a reader creator, each sample in the reader is an zero-based ID + sequence and label in [0, 1]. + + :param word_idx: word dictionary + :type word_idx: dict + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + re.compile("train/pos/.*\.txt$"), + re.compile("train/neg/.*\.txt$"), word_idx) + + +def test(word_idx): + """ + IMDB test set creator. + + It returns a reader creator, each sample in the reader is an zero-based ID + sequence and label in [0, 1]. + + :param word_idx: word dictionary + :type word_idx: dict + :return: Test reader creator + :rtype: callable + """ + return reader_creator( + re.compile("test/pos/.*\.txt$"), + re.compile("test/neg/.*\.txt$"), word_idx) + + +if __name__ == "__main__": + paddle.dataset.common.download(VOCAB_URL, 'text_classification', VOCAB_MD5) + paddle.dataset.common.download(DATA_URL, 'text_classification', DATA_MD5) + runtime_main(TestDistTextClassification2x2) diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index 175bd130e5a8324227953eeeb769474e78f94fd2..a2cc57425841100a2b61279d1b447b88ed4b9a54 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -1488,7 +1488,7 @@ def wrap_decoder(trg_vocab_size, if weight_sharing: predict = layers.matmul( x=dec_output, - y=fluid.get_var(word_emb_param_names[0]), + y=fluid.framework._get_var(word_emb_param_names[0]), transpose_y=True) else: predict = layers.fc(input=dec_output, @@ -1699,10 +1699,9 @@ class DistTransformer2x2(TestDistRunnerBase): exe.run(startup_prog) exe.run(pserver_prog) - def run_trainer(self, use_cuda, args): - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - TrainTaskConfig.use_gpu = use_cuda - sum_cost, avg_cost, predict, token_num, local_lr_scheduler, test_program = get_model( + def run_trainer(self, args): + TrainTaskConfig.use_gpu = args.use_cuda + sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model( args.is_dist, not args.sync_mode) if args.is_dist: @@ -1718,6 +1717,11 @@ class DistTransformer2x2(TestDistRunnerBase): TrainTaskConfig.batch_size = 20 trainer_prog = fluid.default_main_program() + if args.use_cuda: + place = fluid.CUDAPlace(0) + else: + place = fluid.CPUPlace() + startup_exe = fluid.Executor(place) TrainTaskConfig.local = not args.is_dist diff --git a/python/paddle/fluid/tests/unittests/dist_word2vec.py b/python/paddle/fluid/tests/unittests/dist_word2vec.py index f3e740fc7027a4a562b836c3113b87d55062c185..835306edd0f17490dd10110db40f42dce30b25bb 100644 --- a/python/paddle/fluid/tests/unittests/dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py @@ -122,4 +122,7 @@ class TestDistWord2vec2x2(TestDistRunnerBase): if __name__ == "__main__": + import os + os.environ['CPU_NUM'] = '1' + os.environ['USE_CUDA'] = "FALSE" runtime_main(TestDistWord2vec2x2) diff --git a/python/paddle/fluid/tests/unittests/test_auc_op.py b/python/paddle/fluid/tests/unittests/test_auc_op.py index 1de4a9d016a177944253d12094722d3a05614be2..810e8a1a8547a92de923877695178e780981edeb 100644 --- a/python/paddle/fluid/tests/unittests/test_auc_op.py +++ b/python/paddle/fluid/tests/unittests/test_auc_op.py @@ -36,7 +36,11 @@ class TestAucOp(OpTest): "StatPos": stat_pos, "StatNeg": stat_neg } - self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds} + self.attrs = { + 'curve': 'ROC', + 'num_thresholds': num_thresholds, + "slide_steps": 1 + } python_auc = metrics.Auc(name="auc", curve='ROC', @@ -45,7 +49,6 @@ class TestAucOp(OpTest): self.outputs = { 'AUC': np.array(python_auc.eval()), - 'BatchAUC': np.array(python_auc.eval()), 'StatPosOut': np.array(python_auc._stat_pos), 'StatNegOut': np.array(python_auc._stat_neg) } diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 37cad73019c529f64868b6ad3c6e2fffe59cc0d8..856980e546eb55f4cd83f7f3862c714e0e996207 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -18,23 +18,27 @@ import time import unittest import os import sys -import six import signal import subprocess +import six import argparse +import paddle.fluid as fluid + +RUN_STEP = 10 + class TestDistRunnerBase(object): def get_model(self, batch_size=2): raise NotImplementedError( "get_model should be implemented by child classes.") - def get_transpiler(self, trainer_id, main_program, pserver_endpoints, - trainers, sync_mode): + @staticmethod + def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers, + sync_mode): # NOTE: import fluid until runtime, or else forking processes will cause error. - import paddle - import paddle.fluid as fluid - t = fluid.DistributeTranspiler() + config = fluid.DistributeTranspilerConfig() + t = fluid.DistributeTranspiler(config=config) t.transpile( trainer_id=trainer_id, program=main_program, @@ -44,9 +48,9 @@ class TestDistRunnerBase(object): return t def run_pserver(self, args): - import paddle - import paddle.fluid as fluid + self.get_model(batch_size=2) + if args.mem_opt: fluid.memory_optimize(fluid.default_main_program()) t = self.get_transpiler(args.trainer_id, @@ -61,12 +65,10 @@ class TestDistRunnerBase(object): exe.run(startup_prog) exe.run(pserver_prog) - def run_trainer(self, use_cuda, args): - import paddle - import paddle.fluid as fluid - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + def run_trainer(self, args): test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ self.get_model(batch_size=2) + if args.mem_opt: fluid.memory_optimize(fluid.default_main_program()) if args.is_dist: @@ -74,16 +76,23 @@ class TestDistRunnerBase(object): fluid.default_main_program(), args.endpoints, args.trainers, args.sync_mode) + trainer_prog = t.get_trainer_program() else: trainer_prog = fluid.default_main_program() + if args.use_cuda: + place = fluid.CUDAPlace(0) + else: + place = fluid.CPUPlace() + startup_exe = fluid.Executor(place) startup_exe.run(fluid.default_startup_program()) strategy = fluid.ExecutionStrategy() strategy.num_threads = 1 strategy.allow_op_delay = False + build_stra = fluid.BuildStrategy() if args.use_reduce: @@ -92,7 +101,7 @@ class TestDistRunnerBase(object): build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce exe = fluid.ParallelExecutor( - use_cuda, + args.use_cuda, loss_name=avg_cost.name, exec_strategy=strategy, build_strategy=build_stra) @@ -103,27 +112,26 @@ class TestDistRunnerBase(object): ] feeder = fluid.DataFeeder(feed_var_list, place) - reader_generator = test_reader() - - data = next(reader_generator) - first_loss, = exe.run(fetch_list=[avg_cost.name], - feed=feeder.feed(data)) - print(first_loss) + reader_generator = train_reader() - for i in six.moves.xrange(5): - data = next(reader_generator) - loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) + def get_data(): + origin_batch = next(reader_generator) + if args.is_dist and args.use_reader_alloc: + new_batch = [] + for offset, item in enumerate(origin_batch): + if offset % 2 == args.trainer_id: + new_batch.append(item) + return new_batch + else: + return origin_batch - data = next(reader_generator) - last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) - print(last_loss) + for _ in six.moves.xrange(RUN_STEP): + loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(get_data())) + print(loss) def runtime_main(test_class): - import paddle - import paddle.fluid as fluid - import paddle.fluid.core as core - parser = argparse.ArgumentParser(description='Run dist test.') parser.add_argument( '--role', type=str, required=True, choices=['pserver', 'trainer']) @@ -135,7 +143,10 @@ def runtime_main(test_class): '--current_endpoint', type=str, required=False, default="") parser.add_argument('--sync_mode', action='store_true') parser.add_argument('--mem_opt', action='store_true') + parser.add_argument('--use_cuda', action='store_true') parser.add_argument('--use_reduce', action='store_true') + parser.add_argument( + '--use_reader_alloc', action='store_true', required=False, default=True) args = parser.parse_args() @@ -143,8 +154,7 @@ def runtime_main(test_class): if args.role == "pserver" and args.is_dist: model.run_pserver(args) else: - use_cuda = True if core.is_compiled_with_cuda() else False - model.run_trainer(use_cuda, args) + model.run_trainer(args) import paddle.compat as cpt @@ -163,8 +173,10 @@ class TestDistBase(unittest.TestCase): self._find_free_port(), self._find_free_port()) self._python_interp = "python" self._sync_mode = True + self._use_cuda = True self._mem_opt = False self._use_reduce = False + self._use_reader_alloc = True self._setup_config() def _find_free_port(self): @@ -172,15 +184,15 @@ class TestDistBase(unittest.TestCase): s.bind(('', 0)) return s.getsockname()[1] - def start_pserver(self, model_file, check_error_log): + def start_pserver(self, model_file, check_error_log, required_envs): ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist" ps0_cmd = ps_cmd % \ - (self._python_interp, model_file, self._ps_endpoints, ps0_ep, - self._trainers) + (self._python_interp, model_file, self._ps_endpoints, ps0_ep, + self._trainers) ps1_cmd = ps_cmd % \ - (self._python_interp, model_file, self._ps_endpoints, ps1_ep, - self._trainers) + (self._python_interp, model_file, self._ps_endpoints, ps1_ep, + self._trainers) if self._sync_mode: ps0_cmd += " --sync_mode" @@ -198,9 +210,15 @@ class TestDistBase(unittest.TestCase): ps1_pipe = open("/tmp/ps1_err.log", "wb") ps0_proc = subprocess.Popen( - ps0_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe) + ps0_cmd.strip().split(" "), + stdout=subprocess.PIPE, + stderr=ps0_pipe, + env=required_envs) ps1_proc = subprocess.Popen( - ps1_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe) + ps1_cmd.strip().split(" "), + stdout=subprocess.PIPE, + stderr=ps1_pipe, + env=required_envs) if not check_error_log: return ps0_proc, ps1_proc, None, None @@ -222,59 +240,60 @@ class TestDistBase(unittest.TestCase): (e, retry_times)) retry_times -= 1 - def check_with_place(self, model_file, delta=1e-3, check_error_log=False): - # TODO(typhoonzero): should auto adapt GPU count on the machine. - required_envs = { - "PATH": os.getenv("PATH", ""), - "PYTHONPATH": os.getenv("PYTHONPATH", ""), - "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), - "FLAGS_fraction_of_gpu_memory_to_use": "0.15", - "FLAGS_cudnn_deterministic": "1", - "CPU_NUM": "1" - } + def _run_local(self, model, envs, check_error_log): - if check_error_log: - required_envs["GLOG_v"] = "7" - required_envs["GLOG_logtostderr"] = "1" + cmd = "%s %s --role trainer" % (self._python_interp, model) + + if self._use_cuda: + cmd += " --use_cuda" + env_local = {"CUDA_VISIBLE_DEVICES": "0"} + else: + env_local = {'CPU_NUM': '1'} + + envs.update(env_local) - # Run local to get a base line - env_local = {"CUDA_VISIBLE_DEVICES": "0"} - env_local.update(required_envs) - local_cmd = "%s %s --role trainer" % (self._python_interp, model_file) if not check_error_log: + err_log = open("/tmp/trainer.err.log", "wb") local_proc = subprocess.Popen( - local_cmd.split(" "), + cmd.split(" "), stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=env_local) + stderr=err_log, + env=envs) else: - err_log = open("/tmp/trainer.err.log", "wb") local_proc = subprocess.Popen( - local_cmd.split(" "), + cmd.split(" "), stdout=subprocess.PIPE, - stderr=err_log, - env=env_local) + stderr=subprocess.PIPE, + env=envs) local_proc.wait() - out, err = local_proc.communicate() - local_ret = cpt.to_text(out) - sys.stderr.write('local_loss: %s\n' % local_ret) - sys.stderr.write('local_stderr: %s\n' % err) + local_out, local_err = local_proc.communicate() + local_ret = cpt.to_text(local_out) + + if check_error_log: + err_log.close() + + sys.stderr.write('local_stdout: %s\n' % local_ret) + sys.stderr.write('local_stderr: %s\n' % local_err) + local_losses = local_ret.split("\n") + return local_losses + + def _run_cluster(self, model, envs, check_error_log): # Run dist train to compare with local results - ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model_file, - check_error_log) + ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model, + check_error_log, envs) self._wait_ps_ready(ps0.pid) self._wait_ps_ready(ps1.pid) - ps0_ep, ps1_ep = self._ps_endpoints.split(",") + tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist" tr0_cmd = tr_cmd % \ - (self._python_interp, model_file, self._ps_endpoints, - 0, ps0_ep, self._trainers) + (self._python_interp, model, self._ps_endpoints, + 0, ps0_ep, self._trainers) tr1_cmd = tr_cmd % \ - (self._python_interp, model_file, self._ps_endpoints, - 1, ps1_ep, self._trainers) + (self._python_interp, model, self._ps_endpoints, + 1, ps1_ep, self._trainers) if self._sync_mode: tr0_cmd += " --sync_mode" @@ -285,18 +304,28 @@ class TestDistBase(unittest.TestCase): if self._use_reduce: tr0_cmd += " --use_reduce" tr1_cmd += " --use_reduce" + if self._use_reader_alloc: + tr0_cmd += " --use_reader_alloc" + tr1_cmd += " --use_reader_alloc" + if self._use_cuda: + tr0_cmd += " --use_cuda" + tr1_cmd += " --use_cuda" + env0 = {"CUDA_VISIBLE_DEVICES": "0"} + env1 = {"CUDA_VISIBLE_DEVICES": "1"} + else: + env0 = {'CPU_NUM': '1'} + env1 = {'CPU_NUM': '1'} + + env0.update(envs) + env1.update(envs) - env0 = {"CUDA_VISIBLE_DEVICES": "0"} - env1 = {"CUDA_VISIBLE_DEVICES": "1"} - env0.update(required_envs) - env1.update(required_envs) FNULL = open(os.devnull, 'w') tr0_pipe = subprocess.PIPE tr1_pipe = subprocess.PIPE if check_error_log: - print("tr0_cmd:", tr0_cmd) - print("tr1_cmd:", tr1_cmd) + print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0)) + print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1)) tr0_pipe = open("/tmp/tr0_err.log", "wb") tr1_pipe = open("/tmp/tr1_err.log", "wb") @@ -313,17 +342,11 @@ class TestDistBase(unittest.TestCase): tr0_proc.wait() tr1_proc.wait() - out, err = tr0_proc.communicate() - sys.stderr.write('dist_stderr: %s\n' % err) - loss_data0 = cpt.to_text(out) - sys.stderr.write('dist_loss: %s\n' % loss_data0) - lines = loss_data0.split("\n") - dist_first_loss = eval(lines[0].replace(" ", ","))[0] - dist_last_loss = eval(lines[1].replace(" ", ","))[0] - - local_lines = local_ret.split("\n") - local_first_loss = eval(local_lines[0])[0] - local_last_loss = eval(local_lines[1])[0] + + tr0_out, tr0_err = tr0_proc.communicate() + tr0_loss_text = cpt.to_text(tr0_out) + tr1_out, tr1_err = tr1_proc.communicate() + tr1_loss_text = cpt.to_text(tr1_out) # close trainer file if check_error_log: @@ -341,5 +364,47 @@ class TestDistBase(unittest.TestCase): ps1.wait() FNULL.close() - self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta) - self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta) + # print log + sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text) + sys.stderr.write('trainer 0 stderr:\n %s\n' % tr0_err) + sys.stderr.write('trainer 1 stdout: %s\n' % tr1_loss_text) + sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + + tr0_losses = tr0_loss_text.split("\n") + tr1_losses = tr1_loss_text.split("\n") + + return tr0_losses, tr1_losses + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + # TODO(typhoonzero): should auto adapt GPU count on the machine. + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "FLAGS_fraction_of_gpu_memory_to_use": "0.15", + "FLAGS_cudnn_deterministic": "1", + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "7" + required_envs["GLOG_logtostderr"] = "1" + + local_losses\ + = self._run_local(model_file, required_envs, + check_error_log) + tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs, + check_error_log) + + for step_id in range(RUN_STEP): + local_loss = eval(local_losses[step_id])[0] + tr0_loss = eval(tr0_losses[step_id])[0] + tr1_loss = eval(tr1_losses[step_id])[0] + dist_loss = (tr0_loss + tr1_loss) / 2 + print(str(local_loss) + ":" + str(dist_loss)) + self.assertAlmostEqual(local_loss, dist_loss, delta=delta) diff --git a/python/paddle/fluid/tests/unittests/test_dist_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_ctr.py new file mode 100644 index 0000000000000000000000000000000000000000..081d6e9273ebaf7af643b8481399d11d1ab60e00 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_ctr.py @@ -0,0 +1,31 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function + +import os +import unittest +from test_dist_base import TestDistBase + + +class TestDistCTR2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_cuda = False + + def test_dist_ctr(self): + self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index 09b1c546e49bd02bf336f31885bf4c7339cc5a2c..f65dd7e2a28c4ace3988c0cc1267ebe981fbd9cb 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -23,7 +23,7 @@ class TestDistMnist2x2(TestDistBase): self._use_reduce = False def test_dist_train(self): - self.check_with_place("dist_mnist.py", delta=1e-7) + self.check_with_place("dist_mnist.py", delta=1e-5) class TestDistMnist2x2WithMemopt(TestDistBase): @@ -32,7 +32,7 @@ class TestDistMnist2x2WithMemopt(TestDistBase): self._mem_opt = True def test_dist_train(self): - self.check_with_place("dist_mnist.py", delta=1e-7) + self.check_with_place("dist_mnist.py", delta=1e-5) class TestDistMnistAsync(TestDistBase): diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index c2b089694ea2f329e67ad6c50def26caa454720e..d2d927aca8428acd88a6a73c05d70e93439f861c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -20,9 +20,10 @@ from test_dist_base import TestDistBase class TestDistSeResneXt2x2(TestDistBase): def _setup_config(self): self._sync_mode = True + self._use_reader_alloc = False def test_dist_train(self): - self.check_with_place("dist_se_resnext.py", delta=1e-7) + self.check_with_place("dist_se_resnext.py", delta=100) # TODO(typhoonzero): fix this test @@ -38,6 +39,7 @@ class TestDistSeResneXt2x2(TestDistBase): class TestDistSeResneXt2x2Async(TestDistBase): def _setup_config(self): self._sync_mode = False + self._use_reader_alloc = False def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=100) diff --git a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py new file mode 100644 index 0000000000000000000000000000000000000000..6bc707c245ab13dd2dbe50b953ef5308aba05b78 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py @@ -0,0 +1,79 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function + +import os +import unittest + +from test_dist_base import TestDistBase + + +class TestDistSimnetBowDense2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_cuda = False + + def test_simnet_bow(self): + need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} + self.check_with_place( + "dist_simnet_bow.py", + delta=1e-5, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSimnetBow2x2DenseAsync(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._use_cuda = False + + def test_simnet_bow(self): + need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} + self.check_with_place( + "dist_simnet_bow.py", + delta=100, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSimnetBowSparse2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_cuda = False + + def test_simnet_bow(self): + need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} + self.check_with_place( + "dist_simnet_bow.py", + delta=1e-5, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSimnetBow2x2SparseAsync(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._use_cuda = False + + def test_simnet_bow(self): + need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} + self.check_with_place( + "dist_simnet_bow.py", + delta=100, + check_error_log=False, + need_envs=need_envs) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_text_classification.py b/python/paddle/fluid/tests/unittests/test_dist_text_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..b830c965caf2e47c5cc648bc98960459fa6b30ee --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_text_classification.py @@ -0,0 +1,40 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import os +import unittest +from test_dist_base import TestDistBase + + +class TestDistTextClassification2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_cuda = False + + def test_text_classification(self): + self.check_with_place("dist_text_classification.py", delta=1e-6) + + +class TestDistTextClassification2x2Async(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._use_cuda = False + + def test_se_resnext(self): + self.check_with_place("dist_text_classification.py", delta=100) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index ecde407e6d85ea1bfc0181b4b60e095ea496fb1a..54a1c68a37f6929890aab697b48d621e6effb7d8 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -264,6 +264,25 @@ class TestLRDecay(TranspilerTest): ]) +class TestDecayedAdagrad(TranspilerTest): + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + opt = fluid.optimizer.DecayedAdagrad(learning_rate=0.1) + opt.minimize(avg_cost) + + def transpiler_test_impl(self): + pserver, startup = self.get_pserver(self.pserver1_ep) + trainer, _ = self.get_trainer() + + class TestLRDecayConditional(TranspilerTest): def net_conf(self): x = fluid.layers.data(name='x', shape=[1000], dtype='float32') diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py index 33b39b262b95b0013e3696c3f15a288a2e801ce1..b26cbdbea12962a3a41036c774de5dfb61999205 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py @@ -39,7 +39,7 @@ class TestDistW2V2x2Async(TestDistBase): self._sync_mode = False def test_dist_train(self): - self.check_with_place("dist_word2vec.py", delta=1) + self.check_with_place("dist_word2vec.py", delta=100) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_infer_shape.py b/python/paddle/fluid/tests/unittests/test_infer_shape.py index a3d700aad8236fea7bb0e6d043323ad3bd0851f2..fdff22cacc28731a91ff4fd17407bd9edbdd9d8b 100644 --- a/python/paddle/fluid/tests/unittests/test_infer_shape.py +++ b/python/paddle/fluid/tests/unittests/test_infer_shape.py @@ -76,8 +76,8 @@ class TestInferShape(unittest.TestCase): mul_op_desc.set_input("X", ["x"]) mul_op_desc.set_input("Y", ["y"]) mul_op_desc.set_output("Out", ["out"]) - mul_op_desc.set_attr("x_num_col_dims", 1) - mul_op_desc.set_attr("y_num_col_dims", 1) + mul_op_desc._set_attr("x_num_col_dims", 1) + mul_op_desc._set_attr("y_num_col_dims", 1) mul_op_desc.check_attrs() mul_op_desc.infer_shape(block) diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py index d24b5cbd06ddf9f332c1369ebd513bef27b77e14..7fb2171f611adea434d6f2710465810fb69d6979 100644 --- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py +++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py @@ -38,40 +38,40 @@ class TestOpDesc(unittest.TestCase): self.assertEqual(['z'], op.output("Out")) self.assertEqual(["Out"], op.output_names()) - op.set_attr("int_attr", 1) + op._set_attr("int_attr", 1) self.assertEqual(1, op.attr("int_attr")) self.assertTrue(op.has_attr("int_attr")) self.assertEqual(core.AttrType.INT, op.attr_type("int_attr")) - op.set_attr("float_attr", -1.32) + op._set_attr("float_attr", -1.32) self.assertAlmostEqual(-1.32, op.attr("float_attr"), delta=1e-4) self.assertTrue(op.has_attr("float_attr")) - op.set_attr("bool_attr", False) + op._set_attr("bool_attr", False) self.assertFalse(op.attr("bool_attr")) - op.set_attr("string_attr", "abc") + op._set_attr("string_attr", "abc") self.assertEqual("abc", op.attr("string_attr")) self.assertTrue(op.has_attr("string_attr")) - op.set_attr("ints_attr", [1, 2, 3]) + op._set_attr("ints_attr", [1, 2, 3]) self.assertEqual([1, 2, 3], op.attr("ints_attr")) expected = [1.2, 2.3, 3.4] - op.set_attr("floats_attr", expected) + op._set_attr("floats_attr", expected) for e, a in zip(expected, op.attr("floats_attr")): self.assertAlmostEqual(e, a, delta=1e-4) - op.set_attr("strings_attr", ["a", "b", "c"]) + op._set_attr("strings_attr", ["a", "b", "c"]) self.assertEqual(["a", "b", "c"], op.attr("strings_attr")) - op.set_attr("bools_attr", [True, False, True]) + op._set_attr("bools_attr", [True, False, True]) self.assertEqual([True, False, True], op.attr("bools_attr")) self.assertEqual(8, len(op.attr_names())) - op.set_block_attr("block_attr", program_desc.block(0)) - self.assertEqual(0, op.block_attr_id("block_attr")) + op.set_block_attr("_block_attr", program_desc.block(0)) + self.assertEqual(0, op._block_attr_id("_block_attr")) mul_op = block.append_op() mul_op.set_type("mul") diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py index 59899e7e9ab98f661699d5ac0645c92bd23a1512..391d6aa12bdd70b9ef988898bee8e86cd0a0d765 100644 --- a/python/paddle/fluid/transpiler/details/program_utils.py +++ b/python/paddle/fluid/transpiler/details/program_utils.py @@ -128,7 +128,7 @@ def op_to_code(op): attr_type = op.desc.attr_type(name) if attr_type == core.AttrType.BLOCK: a = "{name} = block[{value}]".format( - name=name, type=attr_type, value=op.block_attr_id(name)) + name=name, type=attr_type, value=op._block_attr_id(name)) attrs_str += a if i != len(attr_names) - 1: attrs_str += ", " @@ -136,7 +136,7 @@ def op_to_code(op): if attr_type == core.AttrType.BLOCKS: a = "{name} = blocks{value}".format( - name=name, type=attr_type, value=op.blocks_attr_ids(name)) + name=name, type=attr_type, value=op._blocks_attr_ids(name)) attrs_str += a if i != len(attr_names) - 1: attrs_str += ", " diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 43071def7a906e585909e50e4c0c52c56d981cde..f64d9763ddaa285ae36b7c81d09afd4e84088161 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -39,8 +39,8 @@ import six from .ps_dispatcher import RoundRobin, HashName, PSDispatcher from .. import core, framework from ..framework import Program, default_main_program, \ - default_startup_program, Block, \ - Parameter, grad_var_name + default_startup_program, Block, \ + Parameter, grad_var_name from .details import * from functools import reduce @@ -178,7 +178,7 @@ class DistributeTranspiler(object): pserver_program) elif role == "TRAINER": trainer_program = t.get_trainer_program() - + # for nccl2 mode config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" @@ -534,7 +534,7 @@ class DistributeTranspiler(object): }) for varname, splited_var in six.iteritems(self.param_var_mapping): - #add concat ops to merge splited parameters received from parameter servers. + # add concat ops to merge splited parameters received from parameter servers. if len(splited_var) <= 1: continue # NOTE: if enable memory optimization, origin vars maybe removed. @@ -668,7 +668,7 @@ in a single call.") __clone_lr_op_sub_block__(cloned_op, program, new_sub_block) # reset the block of op - op.set_attr('sub_block', new_sub_block) + op._set_attr('sub_block', new_sub_block) # append lr decay ops to the child block if exists lr_ops = self._get_lr_ops() @@ -734,19 +734,14 @@ in a single call.") table_opt_block = self._create_table_optimize_block( pserver_index, pserver_program, pre_block_idx, grad_to_block_id) optimize_blocks.append(table_opt_block) - prefetch_var_name_to_block_id = self._create_prefetch_block( + lookup_table_var_name_to_block_id = self._create_prefetch_block( pserver_index, pserver_program, table_opt_block) checkpoint_block_id = self._create_checkpoint_save_block( pserver_program, table_opt_block.idx) pserver_program._distributed_lookup_table = self.table_name - - # NOTE: if has_distributed_lookup_table is False, then prefetch_block will - # not be executed, so it's safe to use optimize_block to hold the place - if self.has_distributed_lookup_table: - assert len(prefetch_var_name_to_block_id) > 0 - else: - assert len(prefetch_var_name_to_block_id) == 0 + prefetch_var_name_to_block_id.extend( + lookup_table_var_name_to_block_id) attrs = { "optimize_blocks": optimize_blocks, @@ -755,11 +750,14 @@ in a single call.") "sync_mode": self.sync_mode, "grad_to_block_id": grad_to_block_id, } - if len(prefetch_var_name_to_block_id) > 0: - attrs['prefetch_var_name_to_block_id'] \ - = prefetch_var_name_to_block_id + + if self.has_distributed_lookup_table: attrs['checkpint_block_id'] = checkpoint_block_id + if len(prefetch_var_name_to_block_id) > 0: + attrs[ + 'prefetch_var_name_to_block_id'] = prefetch_var_name_to_block_id + # step5 append the listen_and_serv op pserver_program.global_block().append_op( type="listen_and_serv", @@ -864,7 +862,7 @@ to transpile() call.") if op.type in [ "gaussian_random", "fill_constant", "uniform_random" ]: - op.set_attr("shape", list(new_outputs["Out"].shape)) + op._set_attr("shape", list(new_outputs["Out"].shape)) s_prog.global_block().append_op( type=op.type, inputs=new_inputs, @@ -1013,7 +1011,7 @@ to transpile() call.") for g, p in zip(grad_blocks, param_blocks): g_name, g_bid, _ = g.split(":") p_name, p_bid, _ = p.split(":") - self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \ + self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \ self.param_var_mapping[p_name][int(p_bid)] # create mapping of endpoint -> split var to create pserver side program @@ -1320,7 +1318,7 @@ to transpile() call.") if len(splited) == 1: if self.sync_mode and add_trainer_suffix: new_var_name = "%s.trainer_%d" % \ - (orig_var.name, self.trainer_id) + (orig_var.name, self.trainer_id) program.global_block()._rename_var(varname, new_var_name) var_mapping[varname] = \ [program.global_block().var(new_var_name)] @@ -1343,10 +1341,10 @@ to transpile() call.") new_var_name = "" if self.sync_mode and add_trainer_suffix: new_var_name = "%s.block%d.trainer_%d" % \ - (varname, i, self.trainer_id) + (varname, i, self.trainer_id) else: new_var_name = "%s.block%d" % \ - (varname, i) + (varname, i) var = program.global_block().create_var( name=new_var_name, persistable=False, @@ -1430,6 +1428,9 @@ to transpile() call.") elif op_type == "rmsprop": if varkey in ["Moment", "MeanSquare"]: return param_shape + elif op_type == "decayed_adagrad": + if varkey == "Moment": + return param_shape elif op_type == "sgd": pass return orig_shape @@ -1484,9 +1485,8 @@ to transpile() call.") vars2merge = [] for i in range(self.trainer_num): per_trainer_name = "%s.trainer_%d" % \ - (merged_var_name, i) + (merged_var_name, i) vars2merge.append(pserver_block.vars[per_trainer_name]) - optimize_block.append_op( type="sum", inputs={"X": vars2merge}, @@ -1645,7 +1645,7 @@ to transpile() call.") # one op's output is another op's input, we say # the two operator is connected. if set(op1.desc.output_arg_names()) & set(op2.desc.input_arg_names()) or \ - set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()): + set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()): return True return False @@ -1662,7 +1662,7 @@ to transpile() call.") def _is_optimizer_op(self, op): if "Param" in op.input_names and \ - "LearningRate" in op.input_names: + "LearningRate" in op.input_names: return True return False @@ -1737,7 +1737,7 @@ to transpile() call.") # NOTE: we need to skip all optimize ops, since it is connected # with forward/backward ops and lr ops, we only need the lr ops. if op1 != op2 and self._is_op_connected(op1, op2) and \ - not self._is_optimizer_op(op1) and not self._is_optimizer_op(op2): + not self._is_optimizer_op(op1) and not self._is_optimizer_op(op2): ufind.union(op1, op2) # find all ops which is related with lr var for op1 in block.ops: diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index 49ba2cfd55bc881ed753fcefbd41f5b8fd4ebaf7..43d51b03e81895d7322d9e28a9c40b6d7cc69206 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -163,7 +163,7 @@ class InferenceTranspiler(object): next_op = self.block.ops[i + 1] if next_op.type == 'relu': # modify bnorm OP to include relu - current_op.set_attr("fuse_with_relu", True) + current_op._set_attr("fuse_with_relu", True) # remove relu OP self.block._remove_op(i + 1) i = i + 1 @@ -377,7 +377,7 @@ class InferenceTranspiler(object): type=old_var.type, dtype=old_var.dtype, shape=old_var.shape) - op.rename_input(old_param_name, new_param_name) + op._rename_input(old_param_name, new_param_name) self.scope.var(new_param_name) tensor = self.scope.find_var(new_param_name).get_tensor() @@ -463,8 +463,8 @@ class InferenceTranspiler(object): current_op = self.block.ops[i] for input_arg in current_op.input_arg_names: if input_arg in self.input_map: - current_op.rename_input(input_arg, - self.input_map[input_arg]) + current_op._rename_input(input_arg, + self.input_map[input_arg]) def _remove_unused_var(self): '''